# Evaluation script for the new GPT planners

In [1]:
import sys
import os
import json
import yaml
import pathlib
import glob
import pandas as pd
from pathlib import Path

In [2]:
# Project layout. All folders are derived from ROOT_PATH, which is the parent
# of the directory the notebook kernel was started in.
LOCAL_MACHINE = "am1"
# `__file__` is not defined inside a notebook, so anchor on the working
# directory explicitly instead of resolving a placeholder file name.
ROOT_PATH = pathlib.Path.cwd().resolve().parent
EXP_FOLDER = os.path.join(ROOT_PATH, "experiments")
CONFIGS_FOLDER = os.path.join(ROOT_PATH, "cos_eor", "configs", "local")
ENVS_FILE_PATH = os.path.join(CONFIGS_FOLDER, "envs_paper.yaml")

# The OpenAI key is not hardcoded here: it is read from api_key.yaml in the
# local configs folder (keep that file out of version control).
with open(os.path.join(CONFIGS_FOLDER, "api_key.yaml")) as kfile:
    k = yaml.safe_load(kfile)

In [3]:
# Experiment ids of the personalised planners: source scene -> stage -> id.
PAIRS = {
    'merom_1_int': {
        'bootstrap': 'bt_6_m1_train_pair',
        'finetune iter 1': 'ft_24_iter_1_from_bt_6_pair_personal',
        'finetune iter 2': 'ft_28_iter_2_from_ft_24_pair_personal',
    },
    'pomaria_1_int': {
        'bootstrap': 'bt_7_p1_train_pair',
        'finetune iter 1': 'ft_23_iter_1_from_bt_7_pair_personal',
        'finetune iter 2': 'ft_27_iter_2_from_ft_23_pair_personal',
    },
    'pomaria_2_int': {
        'bootstrap': 'bt_8_p2_train_pair',
        'finetune iter 1': 'ft_22_iter_1_from_bt_8_pair_personal',
        'finetune iter 2': 'ft_26_iter_2_from_ft_22_pair_personal',
    },
    'rs_int': {
        'bootstrap': 'bt_9_r_train_pair',
        'finetune iter 1': 'ft_25_iter_1_from_bt_9_pair_personal',
        'finetune iter 2': 'ft_29_iter_2_from_ft_25_pair_personal',
    },
}
# Baseline experiment ids; these have no source scene.
SINGLES = ['single_run_20_turbo_benevolence1_pomaria1', 'single_run_22_turbo_other_5_scenes']
SAYPLAN = ['sayplan_paper']
SAYCAN = ['saycan_paper']


# VARIANT_NAME_MAPPING: experiment id -> display label.
# SOURCE_MAPPING: experiment id -> source scene, or the string 'None' for baselines.
VARIANT_NAME_MAPPING = {}
SOURCE_MAPPING = {}
for source_scene, stage_to_experiment in PAIRS.items():
    for stage_name, experiment in stage_to_experiment.items():
        VARIANT_NAME_MAPPING[experiment] = f"{stage_name} | {source_scene}"
        SOURCE_MAPPING[experiment] = source_scene

baseline_groups = (
    ('zero-shot gpt-3.5-turbo', SINGLES),
    ('SayPlan', SAYPLAN),
    ('SayCan', SAYCAN),
)
for baseline_label, baseline_experiments in baseline_groups:
    for experiment in baseline_experiments:
        SOURCE_MAPPING[experiment] = 'None'
        VARIANT_NAME_MAPPING[experiment] = baseline_label

def flatten_list(input_list):
    """Concatenate the items of every sub-iterable of `input_list` into one list.

    Only one level of nesting is removed; the relative order of items is kept.
    """
    flattened = []
    for sublist in input_list:
        flattened.extend(sublist)
    return flattened


# The four scene ids that appear as keys of PAIRS.
SOURCE_SCENES = ['merom_1_int', 'pomaria_1_int', 'pomaria_2_int', 'rs_int']
# Every experiment id registered in VARIANT_NAME_MAPPING, in insertion order;
# this is the list of experiments whose logs are loaded further down.
COMPARE_EXP = list(VARIANT_NAME_MAPPING.keys())

# Root folder under which get_experiment_log_paths() globs for log files.
LOGS_FOLDER = os.path.join(ROOT_PATH, "logs")

# Constants
# String keys used for the record / annotation dicts and the DataFrame columns.
ANNOTATION = "annotation"
DIFF_CORRECT_LOC = "diff_correct_loc"
EXPERIMENT = "experiment"
FLAG = "flag"
NUM_OBJECTS_DISCOVERED = "num_objects_discovered"
NUM_RECS_DISCOVERED = "num_recs_discovered"
OUTCOME = "outcome"
PROMPT = "prompt"
REWARD = "reward"
# Per-key weights summed by reward(): the change in correctly placed objects
# counts ten times as much as each discovery count.
REWARD_WEIGHTS = {NUM_OBJECTS_DISCOVERED: 1, NUM_RECS_DISCOVERED: 1, DIFF_CORRECT_LOC: 10}
SCENE = "scene"
SPLIT = "split"
SPLIT_SCENE = "split_scene"
# Value a log entry's 'flag' is compared against to count a step as successful.
SUC = "succeeded"
SUC_STEPS = "successful_steps"

# The first NUM_TRAIN_SCENES entries of the scene list are labelled "train".
NUM_TRAIN_SCENES = 4
# NOTE(review): not referenced in the visible part of this notebook.
NUM_TEST_SCENES = 0

# Read the scene ids from the envs file: the YAML document is loaded as one
# string and split on whitespace. The first NUM_TRAIN_SCENES ids are labelled
# "train" and every later id "test".
with open(ENVS_FILE_PATH, 'r') as envs_file:
    scenes_list = yaml.safe_load(envs_file).split()
scenes = {
    scene_name: ("train" if position < NUM_TRAIN_SCENES else "test")
    for position, scene_name in enumerate(scenes_list)
}
print('scenes', scenes)

scenes {'pomaria_1_int': 'train', 'pomaria_2_int': 'train', 'merom_1_int': 'train', 'rs_int': 'train'}


In [4]:
# Load the scene IDs from the envs.yaml file
def load_scenes(file_path):
    """Parse the YAML file at `file_path` with `yaml.safe_load` and return the result."""
    with open(file_path, 'r') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

# Function to get log file paths for a given experiment ID and scene ID
def get_experiment_log_paths(experiment_id, scene_id):
    """Map every log file of one experiment/scene to 'train' or 'test'.

    Files matching ``{LOGS_FOLDER}/{experiment_id}/demo/{scene_id}/data_*.json``
    are ordered by path name and numbered from 0.

    * If the module-level ``scenes`` marks the scene as 'test', every file is
      labelled 'test'.
    * If it marks the scene as 'train', files are grouped into cycles whose
      length depends on the experiment id; the first five files of each cycle
      are labelled 'test' and the remaining ones 'train'.
    * For any other label an empty dict is returned.

    Returns:
        dict mapping path -> 'train' / 'test', in sorted path order.

    Raises:
        KeyError: if `scene_id` is not a key of ``scenes``.
    """
    # Cycle length by keyword found in the experiment id; the first match wins
    # and ids matching no keyword use a cycle of 10.
    cycle_by_keyword = (
        ('ablation', 35),
        ('sayplan', 25),
        ('saycan', 25),
        ('pair', 25),
    )
    cycle_length = next(
        (length for keyword, length in cycle_by_keyword if keyword in experiment_id),
        10,
    )

    # NOTE(review): the ordering is lexicographic on the path; it matches
    # chronological order only if the file names sort that way — confirm.
    pattern = f'{LOGS_FOLDER}/{experiment_id}/demo/{scene_id}/data_*.json'
    ordered_paths = sorted(glob.glob(pattern))

    scene_split = scenes[scene_id]
    if scene_split == 'test':
        return {path: 'test' for path in ordered_paths}
    if scene_split == 'train':
        return {
            path: ('train' if position % cycle_length >= 5 else 'test')
            for position, path in enumerate(ordered_paths)
        }
    return {}

def reward(result):
    """Return the weighted sum of `result[key]` over every key of REWARD_WEIGHTS."""
    total = 0
    for key, weight in REWARD_WEIGHTS.items():
        total += result[key] * weight
    return total

def annotate_record(record, experiment_id, scene_id):
    """Build the per-record annotation dict.

    Reads ``record[OUTCOME]`` for the discovered objects / receptacles and the
    start and end counts of correctly placed objects, and stores the scene id,
    experiment id, the two discovery counts, the change in the correct count
    and the reward computed by `reward` from those values.
    """
    outcome = record[OUTCOME]
    correct_counts = outcome["count_correct"]
    annotation = {
        SCENE: scene_id,
        EXPERIMENT: experiment_id,
        NUM_OBJECTS_DISCOVERED: len(outcome["objects_discovered"]),
        NUM_RECS_DISCOVERED: len(outcome["recs_discovered"]),
        DIFF_CORRECT_LOC: correct_counts["end"] - correct_counts["start"],
    }
    # The reward is derived from the counts stored just above.
    annotation[REWARD] = reward(annotation)
    return annotation

def annotate_episode(records, experiment_id, scene_id, split='train'):
    """Summarise one episode, i.e. the list of records stored in one log file.

    Args:
        records: Non-empty list of record dicts. Each record is read for
            ``record[OUTCOME]`` ("count_correct", "count_wrong",
            "objects_discovered", "recs_discovered") and ``record["logs"]``.
            Each log entry is read for ``log['flag']`` and ``log[OUTCOME]``
            ("count_correct", "objects_moved"). "objects_moved" maps an object
            to a sequence whose first item is where it came from and whose
            last item is where it ended up; the value "agent" marks the agent.
        experiment_id: Stored under 'experiment'.
        scene_id: Stored under 'scene'.
        split: Stored under 'split'.

    Returns:
        dict with these keys, in this order: DIFF_CORRECT_LOC, 'diff_gt',
        'success_rate', 'objects_discovered', 'recs_discovered', 'experiment',
        'scene', 'steps', 'successful_steps', 'pick_steps',
        'correct_placement_steps', 'wrong_placement_steps', 'placement_steps',
        'split', 'movements'.

        'success_rate' is the gain in correctly placed objects over the whole
        episode as a percentage of the objects that were wrongly placed at the
        start. It is NaN when nothing was wrongly placed at the start, because
        the ratio is undefined (previously this raised ZeroDivisionError).

    Raises:
        IndexError: if `records` is empty.
    """
    first_outcome = records[0][OUTCOME]
    start_correct = first_outcome["count_correct"]["start"]
    start_wrong = first_outcome["count_wrong"]["start"]
    diff_correct = records[-1][OUTCOME]["count_correct"]["end"] - start_correct

    # Undefined ratio -> NaN, which pandas aggregations skip by default.
    if start_wrong:
        success_rate = diff_correct / start_wrong * 100
    else:
        success_rate = float('nan')

    num_objects_discovered = 0
    num_recs_discovered = 0
    steps = 0
    successful_steps = 0
    num_pick = 0
    num_placement = 0
    num_correct_placement = 0
    num_wrong_placement = 0
    movements = []

    for record in records:
        outcome = record[OUTCOME]
        # NOTE(review): discoveries are summed per record without
        # de-duplication; an item listed by two records counts twice — confirm.
        num_objects_discovered += len(outcome["objects_discovered"])
        num_recs_discovered += len(outcome["recs_discovered"])

        logs = record["logs"]
        steps += len(logs)
        for log in logs:
            if log['flag'] == SUC:
                successful_steps += 1

            correct_counts = log[OUTCOME]["count_correct"]
            # Every object moved within this log entry is judged by the same diff.
            diff = correct_counts['end'] - correct_counts['start']
            for obj, trail in log[OUTCOME]["objects_moved"].items():
                destination = trail[-1]
                if destination == "agent":
                    # Pick only; could turn out right or wrong later.
                    num_pick += 1
                    continue

                # Placement: agent -> receptacle or receptacle -> receptacle.
                num_placement += 1
                if trail[0] == 'agent':
                    # Putting down a held object must raise the correct count.
                    is_wrong = diff <= 0
                else:
                    # Receptacle -> receptacle is wrong only if the count drops
                    # (0 covers a correct -> correct move).
                    is_wrong = diff < 0
                if is_wrong:
                    num_wrong_placement += 1
                else:
                    num_correct_placement += 1
                movements.append((obj, destination))

    return {
        DIFF_CORRECT_LOC: diff_correct,
        'diff_gt': start_wrong,
        'success_rate': success_rate,
        'objects_discovered': num_objects_discovered,
        'recs_discovered': num_recs_discovered,
        'experiment': experiment_id,
        'scene': scene_id,
        'steps': steps,
        'successful_steps': successful_steps,
        'pick_steps': num_pick,
        'correct_placement_steps': num_correct_placement,
        'wrong_placement_steps': num_wrong_placement,
        'placement_steps': num_placement,
        'split': split,
        'movements': movements,
    }
    

# Function to load experiment logs and add the scene name
def load_and_annotate_logs(experiment_id, scenes):
    """Load every log file of one experiment and annotate its contents.

    Args:
        experiment_id: Experiment whose log files are looked up.
        scenes: Iterable of scene ids to visit. Only the ids are used here;
            the train/test label of a scene is looked up by
            `get_experiment_log_paths` in the module-level ``scenes``.

    Returns:
        (all_records, all_episodes): the records of every file, each extended
        in place with an ANNOTATION entry from `annotate_record`, and one
        `annotate_episode` summary per file.
    """
    record_pool = []
    episode_summaries = []
    for scene_id in scenes:
        # Maps each log file path to 'train' or 'test'.
        split_by_path = get_experiment_log_paths(experiment_id, scene_id)
        for path, split in split_by_path.items():
            with open(path, 'r') as handle:
                records = json.load(handle)[:]
            # The episode summary is computed before the records are annotated.
            episode_summaries.append(
                annotate_episode(records, experiment_id, scene_id, split)
            )
            for record in records:
                record[ANNOTATION] = annotate_record(record, experiment_id, scene_id)
            record_pool.extend(records)
    return record_pool, episode_summaries

# Load and annotate logs
# Per-experiment results plus flat lists pooled over all experiments.
annotated_logs = {}
annotated_episodes = {}
annotated_episode_list = []
annotations = []
for experiment_id in COMPARE_EXP:
    experiment_records, experiment_episodes = load_and_annotate_logs(experiment_id, scenes)
    annotated_logs[experiment_id] = experiment_records
    annotated_episodes[experiment_id] = experiment_episodes
    annotated_episode_list.extend(experiment_episodes)
    annotations.extend(record[ANNOTATION] for record in experiment_records)


In [5]:
# One row per episode; the columns are the keys produced by annotate_episode.
df_episodes = pd.DataFrame(annotated_episode_list)
# Derived columns are plain dictionary lookups on a single column, so they are
# computed column-wise with Series.map rather than row-wise with
# DataFrame.apply(axis=1). Indexing the dict inside the callable keeps an
# unknown id a KeyError instead of a silent NaN.
df_episodes[SPLIT_SCENE] = df_episodes[SCENE].map(lambda scene_id: scenes[scene_id])
df_episodes['variant'] = df_episodes[EXPERIMENT].map(lambda exp_id: VARIANT_NAME_MAPPING[exp_id])
df_episodes['source'] = df_episodes[EXPERIMENT].map(lambda exp_id: SOURCE_MAPPING[exp_id])

In [6]:
pd.set_option('display.max_rows', 200)
# Keep the baselines (source 'None') and, for the personalised planners, only
# the episodes run in the planner's own source scene. The mask is built
# column-wise instead of with a row-wise apply.
is_baseline = df_episodes['source'] == 'None'
is_own_scene = df_episodes['source'] == df_episodes['scene']
train_instances = df_episodes[is_baseline | is_own_scene]

# Results for the paper

### The main results: in-domain adaptation success metric

In [7]:
# Work on an independent (deep) copy so the columns added below do not touch
# train_instances.
df_main = train_instances.copy()

In [8]:
# Scene names used in the tables, as applied by `name_map`:
#   Scene 1: pomaria_1_int
#   Scene 2: merom_1_int
#   Scene 3: pomaria_2_int
#   Scene 4: rs_int
# NOTE(review): an earlier version of this comment listed rs_int as scene 3
# and pomaria_2 as scene 4, contradicting the mapping. The mapping is what
# produces the tables below — confirm it matches the paper.
name_map = {
    'pomaria_1_int': 'Scene 1',
    'merom_1_int': 'Scene 2',
    'pomaria_2_int': 'Scene 3',
    'rs_int': 'Scene 4',
}
variant_map = {
    'zero-shot gpt-3.5-turbo': 'LLM-Planner',
    'SayPlan': 'SayPlan',
    'SayPlan-nofeed': 'SayPlan-nofeed',
    'SayCan': 'SayCan',
    'bootstrap': 'LLM-Personalize (bootstrap)',
    'finetune iter 1': 'LLM-Personalize (SI Iter=1)',
    'finetune iter 2': 'LLM-Personalize (SI Iter=2)',
    'bt10': 'ablation_bt_large_1',
    'bt11': 'ablation_bt_small_1',
    'bt12': 'ablation_bt_large_2',
    'bt14': 'ablation_bt_large_3',
    'ft31': 'ablation_ft_small_iter1'
}
# Column-wise lookups; indexing the dicts inside the callable keeps an unknown
# key a KeyError rather than a silent NaN.
df_main['scene name'] = df_main['scene'].map(lambda scene_id: name_map[scene_id])
# A variant label is either a bare name or '<stage> | <source scene>'; only
# the part before the '|' selects the display name.
df_main['variant name'] = df_main['variant'].map(
    lambda label: variant_map[label.split('|')[0].strip(' ')]
)
df_main['task set'] = df_main['split']

# Step 1: mean, standard error and episode count of the success rate for
# every (scene, task set, variant) combination.
grouped = df_main.groupby(['scene name', 'task set', 'variant name'])['success_rate'].agg(['mean', 'sem', 'count'])
# Step 2: move 'scene name' and 'task set' into the columns, leaving one row
# per variant. The column levels are then (statistic, scene name, task set).
reshaped = grouped.unstack(level=[0, 1])
# Step 3: reorder the column levels to (scene name, task set, statistic) and
# sort them. The sort is alphabetical, so 'test' comes before 'train'.
reshaped = reshaped.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1)
reshaped


scene name,Scene 1,Scene 1,Scene 1,Scene 1,Scene 1,Scene 1,Scene 2,Scene 2,Scene 2,Scene 2,...,Scene 3,Scene 3,Scene 3,Scene 3,Scene 4,Scene 4,Scene 4,Scene 4,Scene 4,Scene 4
task set,test,test,test,train,train,train,test,test,test,train,...,test,train,train,train,test,test,test,train,train,train
Unnamed: 0_level_2,count,mean,sem,count,mean,sem,count,mean,sem,count,...,sem,count,mean,sem,count,mean,sem,count,mean,sem
variant name,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
LLM-Personalize (SI Iter=1),25,25.8,6.687551,100,17.983333,3.78648,25,21.733333,5.629354,100,...,6.345515,100,32.416667,3.202165,25,10.266667,6.708673,100,24.233333,3.279226
LLM-Personalize (SI Iter=2),25,29.6,5.4,100,25.516667,3.143384,25,25.2,4.046398,100,...,4.354648,100,33.583333,3.845018,25,20.4,6.819281,100,29.15,2.897665
LLM-Personalize (bootstrap),25,17.6,6.165225,100,4.166667,2.630916,25,12.6,9.068709,100,...,4.414832,100,22.616667,2.639833,25,25.733333,6.776238,100,10.933333,3.31995
LLM-Planner,25,-3.6,4.860727,25,5.333333,4.439928,25,-9.866667,6.350066,25,...,3.387176,25,-14.266667,3.997777,25,-30.266667,4.836857,25,-29.6,5.264557
SayCan,5,0.0,0.0,20,-2.666667,1.897983,5,-10.666667,6.863753,20,...,4.898979,20,-3.333333,2.294157,5,0.0,0.0,20,-1.666667,1.666667
SayPlan,5,-5.0,5.0,20,-7.0,5.91435,5,-1.666667,9.279607,20,...,8.306624,20,0.416667,4.502882,5,-12.333333,15.344199,20,-10.75,3.888946


In [9]:
def _mean_pm_sem(values):
    """Format a group's values as 'mean ± standard error' with two decimals."""
    return f'{values.mean():.2f} ± {values.sem():.2f}'


# Same grouping as the numeric table, rendered as one string per cell.
grouped1 = (
    df_main
    .groupby(['scene name', 'task set', 'variant name'])['success_rate']
    .agg(_mean_pm_sem)
)
reshaped1 = grouped1.unstack(level=[0, 1])
reshaped1

scene name,Scene 1,Scene 1,Scene 2,Scene 2,Scene 3,Scene 3,Scene 4,Scene 4
task set,test,train,test,train,test,train,test,train
variant name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
LLM-Personalize (SI Iter=1),25.80 ± 6.69,17.98 ± 3.79,21.73 ± 5.63,19.48 ± 2.99,41.40 ± 6.35,32.42 ± 3.20,10.27 ± 6.71,24.23 ± 3.28
LLM-Personalize (SI Iter=2),29.60 ± 5.40,25.52 ± 3.14,25.20 ± 4.05,18.52 ± 2.66,43.33 ± 4.35,33.58 ± 3.85,20.40 ± 6.82,29.15 ± 2.90
LLM-Personalize (bootstrap),17.60 ± 6.17,4.17 ± 2.63,12.60 ± 9.07,-3.38 ± 3.08,24.33 ± 4.41,22.62 ± 2.64,25.73 ± 6.78,10.93 ± 3.32
LLM-Planner,-3.60 ± 4.86,5.33 ± 4.44,-9.87 ± 6.35,-8.40 ± 3.84,-4.07 ± 3.39,-14.27 ± 4.00,-30.27 ± 4.84,-29.60 ± 5.26
SayCan,0.00 ± 0.00,-2.67 ± 1.90,-10.67 ± 6.86,-1.25 ± 1.25,-8.00 ± 4.90,-3.33 ± 2.29,0.00 ± 0.00,-1.67 ± 1.67
SayPlan,-5.00 ± 5.00,-7.00 ± 5.91,-1.67 ± 9.28,-6.83 ± 2.78,-13.00 ± 8.31,0.42 ± 4.50,-12.33 ± 15.34,-10.75 ± 3.89
