# Script for annotating collected rollouts

In [208]:
import sys
import os
import json
from pprint import pprint
import yaml
import pathlib
import glob
import pandas as pd
import openai
from pathlib import Path

In [209]:
# Paths and identifiers for this run; edit experiment_id / logs_id below before running.
# NOTE: "__file__" is a string literal here (not the __file__ variable), so it is resolved
# against the current working directory and ROOT_PATH is the parent of that directory.
ROOT_PATH = pathlib.Path("__file__").resolve().parent.parent
EXP_FOLDER = os.path.join(ROOT_PATH, "experiments")
# Define the experiment ID (also the name of the output folder under EXP_FOLDER)
experiment_id = 'ft_23_from_bt_7_pair_personal' # Replace with your actual experiment ID
logs_id = 'bt_7_p1_train_pair' # folder of the last iteration of collected logs, which the new model learns from
OUTPUT_PATH = os.path.join(EXP_FOLDER, experiment_id)
LOGS_FOLDER = os.path.join(ROOT_PATH, "logs")
CONFIGS_FOLDER = os.path.join(ROOT_PATH, "cos_eor", "configs", "local")
ENVS_FILE_PATH = os.path.join(CONFIGS_FOLDER , "envs_demo.yaml")


# OpenAI credentials are read from api_key.yaml in CONFIGS_FOLDER:
# a required 'key' entry and an optional 'organization' entry.
with open(os.path.join(CONFIGS_FOLDER, "api_key.yaml")) as kfile:
    k = yaml.safe_load(kfile)
openai.api_key = k['key'] # the 'key' entry of api_key.yaml
if 'organization' in k:
    openai.organization = k['organization']
FINETUNE_MODEL = "TO BE ADDED" # placeholder: set to the base model from the last iteration
SUFFIX = f"TO BE ADDED" # placeholder: suffix for the new fine-tuned model, e.g., ft_23_from_bt

# Names of the files written into OUTPUT_PATH
TRAIN_FILE_NAME_OUTPUT = "train.jsonl"
VALID_FILE_NAME_OUTPUT = "valid.jsonl"
TEST_FILE_NAME_OUTPUT = "test.jsonl"
META_FILE_NAME = "info.txt"

# String constants: keys of log records / annotations, and the flag value of a successful step
ANNOTATION = "annotation"
DIFF_CORRECT_LOC = "diff_correct_loc"
EXPERIMENT = "experiment"
FLAG = "flag"
FINETUNE_MSG = "finetune_message"
NUM_OBJECTS_DISCOVERED = "num_objects_discovered"
NUM_RECS_DISCOVERED = "num_recs_discovered"
OUTCOME = "outcome"
PROMPT = "prompt"
REWARD = "reward"
# Per-field weights of the weighted sum computed by reward()
REWARD_WEIGHTS = {NUM_OBJECTS_DISCOVERED: 1, NUM_RECS_DISCOVERED: 1, DIFF_CORRECT_LOC: 10}
SCENE = "scene"
SUC = "succeeded"
SUC_STEPS = "successful_steps"

# Read the scene IDs from ENVS_FILE_PATH (envs_demo.yaml).
# NOTE(review): .split() implies the YAML document loads as a single
# whitespace-separated string of scene IDs — confirm against the file.
with open(ENVS_FILE_PATH , 'r') as file:
    scenes = yaml.safe_load(file).split()

In [210]:
# Load the content of a YAML file (used for the scene-ID file)
def load_scenes(file_path):
    """Parse the YAML file at ``file_path`` and return the loaded object."""
    with open(file_path, 'r') as handle:
        parsed = yaml.safe_load(handle)
    return parsed

# Function to get log file paths for a given logs ID and scene ID
def get_experiment_log_paths(logs_id, scene_id, mode='train'):
    """Return the ``data_*.json`` log paths of one scene, filtered by ``mode``.

    Files are looked up under ``LOGS_FOLDER/<logs_id>/demo/<scene_id>/`` and
    sorted ascending by path.

    mode == 'all'   -> every matching path.
    mode == 'train' -> within each consecutive group of ``mod_val`` files, the
                       first five are skipped and the rest are returned.
                       ``mod_val`` is chosen from the first matching substring
                       of ``logs_id`` (see ``size_rules``), defaulting to 10.
    any other mode  -> an empty list.
    """
    pattern = f'{LOGS_FOLDER}/{logs_id}/demo/{scene_id}/data_*.json'
    all_paths = glob.glob(pattern)
    all_paths.sort()  # ascending, i.e. ordered by the time encoded in the file name

    if mode == 'all':
        return all_paths
    if mode != 'train':
        return []

    # Ordered rules: the first substring found in logs_id decides the group size.
    size_rules = (
        ('ablationlarge', 35),
        ('ablationsmall', 10),
        ('small', 10),
        ('large', 25),
        ('pair', 25),
    )
    mod_val = next((size for tag, size in size_rules if tag in logs_id), None)
    if mod_val is None:
        mod_val = 10
        print ('no experiment size provided, assume small experiment')
    print ('mod val', mod_val)

    # Positions 0-4 of every group are held out for testing; the rest are for training.
    return [path for position, path in enumerate(all_paths) if position % mod_val >= 5]

def reward(result):
    """Return the weighted sum of the ``result`` fields listed in ``REWARD_WEIGHTS``."""
    total = 0
    for field, weight in REWARD_WEIGHTS.items():
        total += result[field] * weight
    return total

def compile_steps(steps):
    """Join ``steps`` into a newline-separated plan string.

    The first step is emitted as-is; every later step is prefixed with
    ``"step N: "`` where N is its 1-based position. An empty ``steps``
    yields an empty string.

    The prefix is omitted only for the first step, rather than removed with
    a global string replace, so a step whose own text happens to contain
    ``"step 1: "`` is left intact.
    """
    lines = [
        f"{step}" if index == 1 else f"step {index}: {step}"
        for index, step in enumerate(steps, start=1)
    ]
    return "\n".join(lines)

def prepare_example_conversation(system_msg, user_msg, assistant_message):
    """Build a chat example: a dict with a "messages" list of system, user and assistant turns."""
    turns = (
        ("system", system_msg),
        ("user", user_msg),
        ("assistant", assistant_message),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}

def annotate_record(record, logs_id, scene_id):
    """Compute the annotation dict for a single log record.

    The annotation holds the scene and logs IDs, the number of discovered
    objects and receptacles, the change in the correct-placement count
    (end minus start), the raw text of the steps flagged as succeeded, a
    fine-tuning conversation built from the record's low-level prompt and
    those steps, and the weighted reward.
    """
    outcome = record[OUTCOME]
    result = {
        SCENE: scene_id,
        EXPERIMENT: logs_id,
        NUM_OBJECTS_DISCOVERED: len(outcome["objects_discovered"]),
        NUM_RECS_DISCOVERED: len(outcome["recs_discovered"]),
        DIFF_CORRECT_LOC: outcome["count_correct"]["end"] - outcome["count_correct"]["start"],
    }

    # The assistant response is crafted from the successful steps only
    successful = [entry['step_raw'] for entry in record["logs"] if entry[FLAG] == SUC]
    result[SUC_STEPS] = successful

    prompt = record["low_level"]["prompt"]
    result[FINETUNE_MSG] = prepare_example_conversation(
        system_msg=prompt["system"],
        user_msg=prompt["user"],
        assistant_message=compile_steps(successful),
    )
    result[REWARD] = reward(result)
    return result

# Function to load the training logs of every scene and annotate each record
def load_and_annotate_logs(logs_id, scenes):
    """Load the 'train' log files of each scene and annotate every record in place.

    Returns a pair ``(all_records, all_episodes)``: the flat list of all
    records, and a list holding one list of records per log file.
    """
    flat_records, per_file_records = [], []
    for scene_id in scenes:
        for path in get_experiment_log_paths(logs_id, scene_id, mode='train'):
            with open(path, 'r') as handle:
                file_records = json.load(handle)
            # Attach the annotation (including the scene name) to each record
            for rec in file_records:
                rec[ANNOTATION] = annotate_record(rec, logs_id, scene_id)
            flat_records.extend(file_records)
            per_file_records.append(file_records)
    return flat_records, per_file_records

# Load and annotate the training logs of every scene in `scenes`.
# annotated_logs is the flat list of records; annotated_episodes keeps one list of records per log file.
annotated_logs, annotated_episodes = load_and_annotate_logs(logs_id, scenes)


mod val 35


In [211]:
from collections import Counter

# Records whose correct-placement count went up (end minus start > 0).
positive_records = [log for log in annotated_logs if log[ANNOTATION][DIFF_CORRECT_LOC] > 0]
# NOTE(review): this only tests that the annotation dict is non-empty, which holds for
# every record annotated by annotate_record — the intended "pick" filter is unclear; confirm.
pick_records = [log for log in annotated_logs if log[ANNOTATION]]

# Number of positive records per scene (displayed as the cell output).
Counter(log[ANNOTATION][SCENE] for log in positive_records)

Counter({'pomaria_1_int': 327})

In [212]:
# (episode index, record index) of every record whose correct-placement count went up.
correct_episode_steps = [
    (ie, ir)
    for ie, episode in enumerate(annotated_episodes)
    for ir, record in enumerate(episode)
    if record[ANNOTATION][DIFF_CORRECT_LOC] > 0
]

In [213]:
from pprint import pprint

# Summary of the positive steps found above
num_correct_steps = len(correct_episode_steps)
num_episodes = len(annotated_episodes)
print ('Number of correct step episodes', num_correct_steps)
print ('Avg correct steps per episode', num_correct_steps/num_episodes)
print (correct_episode_steps)

Number of correct step episodes 327
Avg correct steps per episode 2.18
[(0, 1), (0, 3), (0, 6), (1, 2), (1, 4), (2, 4), (2, 5), (3, 2), (3, 3), (4, 1), (4, 2), (4, 4), (4, 5), (5, 3), (6, 1), (6, 9), (7, 2), (7, 3), (7, 4), (8, 2), (8, 3), (8, 6), (8, 8), (9, 1), (9, 2), (10, 1), (10, 2), (10, 3), (11, 1), (13, 1), (14, 1), (14, 5), (14, 7), (15, 1), (16, 1), (16, 2), (16, 5), (16, 7), (17, 1), (17, 4), (18, 2), (19, 1), (20, 1), (20, 2), (20, 6), (20, 8), (20, 9), (20, 10), (20, 11), (21, 1), (21, 6), (22, 1), (22, 2), (22, 5), (23, 3), (23, 5), (24, 1), (24, 5), (25, 2), (25, 3), (26, 3), (26, 5), (27, 1), (27, 2), (27, 3), (28, 1), (28, 2), (28, 5), (29, 1), (29, 5), (29, 6), (30, 4), (30, 5), (30, 9), (30, 10), (31, 1), (32, 3), (32, 6), (32, 7), (33, 2), (33, 4), (34, 1), (34, 3), (34, 4), (34, 5), (35, 7), (36, 1), (36, 4), (36, 6), (37, 1), (37, 3), (37, 5), (38, 3), (38, 4), (38, 6), (39, 1), (39, 4), (40, 2), (40, 5), (40, 8), (42, 2), (43, 5), (44, 1), (44, 3), (45, 1), (46, 

In [214]:
# Inspect one annotated record, addressed by (episode index, step index)
EPISODE_INDEX = 0
STEP_INDEX = 1
example_record = annotated_episodes[EPISODE_INDEX][STEP_INDEX]
example_annotation = example_record[ANNOTATION]
pprint (example_annotation[SCENE])
pprint (example_annotation['successful_steps'])
pprint (example_record['correct_objects'])

'pomaria_1_int'
['go to candy 1',
 'look at candy 1',
 'pick up candy 1',
 'go to pantry room 0 top cabinet 55',
 'look at pantry room 0 top cabinet 55',
 'place candy 1 on pantry room 0 top cabinet 55']
{'end': {'candy 1': 'pantry room 0 top cabinet 55',
         'cloth 1': 'pantry room 0 top cabinet 54',
         'dumbbell rack 1': 'pantry room 0 top cabinet 54',
         'umbrella 1': 'pantry room 0 top cabinet 55'},
 'start': {'cloth 1': 'pantry room 0 top cabinet 54',
           'dumbbell rack 1': 'pantry room 0 top cabinet 54',
           'umbrella 1': 'pantry room 0 top cabinet 55'}}


In [215]:
# Keep only the records whose correct-placement count went up (end minus start > 0).
finetune_records = [log for log in annotated_logs if log[ANNOTATION][DIFF_CORRECT_LOC] > 0]
print ('finetune records', len(finetune_records))
# Fine-tuning conversation of the first kept record (displayed as the cell output).
finetune_records[0][ANNOTATION][FINETUNE_MSG]

finetune records 327


{'messages': [{'role': 'system',
   'content': 'You are a one-handed household robot.'},
  {'role': 'user',
   'content': 'There are objects misplaced on wrong receptacles and potentially in the wrong room. \nYou are holding nothing.\nIn corridor 0, found receptacles: corridor 0 carpet 13, corridor 0 shelf 14, corridor 0 table 0. \nIn kitchen 0, found receptacles: kitchen 0 counter 40, kitchen 0 dishwasher 36, kitchen 0 bottom cabinet 35, kitchen 0 top cabinet 42, kitchen 0 sink 37, kitchen 0 counter 49, kitchen 0 bottom cabinet 44, kitchen 0 counter 48, kitchen 0 carpet 34, kitchen 0 cooktop 64, kitchen 0 chair 12, kitchen 0 oven 46, kitchen 0 plant 5, kitchen 0 chair 7, kitchen 0 table 6, kitchen 0 bottom cabinet 47, kitchen 0 counter 33, kitchen 0 bottom cabinet 32, kitchen 0 counter 31, kitchen 0 oven 21, kitchen 0 oven 22, kitchen 0 bottom cabinet 26, kitchen 0 bottom cabinet 29, kitchen 0 bottom cabinet 30, kitchen 0 top cabinet 27, kitchen 0 fridge 20, kitchen 0 microwave 23, ki

## Remove duplicates
(Group the records by 'object_moved', 'rec_before', 'rec_after' and 'correct_before', and keep the first record of each group.)

In [216]:
# One row per annotated record; the columns added below form the de-duplication key.
df_all = pd.DataFrame(annotated_logs)

# First (object, (rec_before, rec_after)) entry of 'objects_moved' for each record,
# or Nones when the record moved nothing. Computed once and reused for three columns.
NO_MOVE = (None, (None, None))
first_moves = [
    next(iter(log['outcome']['objects_moved'].items()), NO_MOVE)
    for log in annotated_logs
]
df_all['object_moved'] = [moved_object for moved_object, _ in first_moves]
df_all['rec_before'] = [receptacles[0] for _, receptacles in first_moves]
df_all['rec_after'] = [receptacles[1] for _, receptacles in first_moves]

# Canonical key for the set of objects that were correct at the start. The names are
# sorted because str(set(...)) depends on set iteration order, which can differ between
# equal sets and would let duplicates slip through the groupby.
df_all['correct_before'] = [
    str(sorted(log['correct_objects']['start'])) for log in annotated_logs
]

# Keep the first record of each key.
# NOTE: groupby drops rows whose key contains None/NaN (pandas default dropna=True),
# so records that moved nothing are not present in df_all_remove_duplicate.
df_all_remove_duplicate = df_all.groupby(['object_moved', 'rec_before', 'rec_after', 'correct_before']).first().reset_index()


In [217]:
# let's see if we could de-duplicate before merging
# De-duplicate the positive records before building the fine-tuning set.
df_finetune = pd.DataFrame(finetune_records)

# First (object, (rec_before, rec_after)) entry of 'objects_moved' for each record.
# A record with an empty 'objects_moved' raises IndexError here, as before.
first_moves = [
    list(log['outcome']['objects_moved'].items())[0] for log in finetune_records
]
df_finetune['object_moved'] = [moved_object for moved_object, _ in first_moves]
df_finetune['rec_before'] = [receptacles[0] for _, receptacles in first_moves]
df_finetune['rec_after'] = [receptacles[1] for _, receptacles in first_moves]

# Canonical key for the set of objects that were correct at the start. The names are
# sorted because str(set(...)) depends on set iteration order, which can differ between
# equal sets and would let duplicates slip through the groupby.
df_finetune['correct_before'] = [
    str(sorted(log['correct_objects']['start'])) for log in finetune_records
]

# Keep the first record of each (object, before, after, correct-at-start) key.
df_remove_duplicate = df_finetune.groupby(['object_moved', 'rec_before', 'rec_after', 'correct_before']).first().reset_index()
finetune_msg_simulation = [annotation[FINETUNE_MSG] for annotation in df_remove_duplicate[ANNOTATION]]
print ('all finetune records before duplicate removal', len(df_finetune))
print ('finetune records after duplicate removal', len(finetune_msg_simulation))
print ('num correct objects', len(df_finetune.groupby(['object_moved']).first()))

all finetune records before duplicate removal 327
finetune records after duplicate removal 259
num correct objects 48


In [218]:
# Deliberate stop: raising here interrupts execution before the JSONL files are written
# (presumably to halt a "run all"); run the following cells manually to continue.
raise Exception('move forward to write messages to jsonl')

Exception: move forward to write messages to jsonl

In [219]:
# From here on, finetune_records holds the de-duplicated fine-tuning messages
# (finetune_msg_simulation), not the raw annotated log records.
finetune_records = finetune_msg_simulation
NUM_VALID = 0  # number of trailing messages held out for validation
NUM_TRAIN = len(finetune_records) - NUM_VALID
print ('all positive records', len(finetune_records))
# The first NUM_TRAIN messages are used for training, the remainder for validation.
training_data = finetune_records[:NUM_TRAIN]
validation_data = finetune_records[NUM_TRAIN:]

all positive records 259


In [221]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Write every item of ``data_list`` to ``filename`` as one JSON document per line.

    The file is overwritten; each line ends with a newline character.
    """
    serialized = (json.dumps(item) + "\n" for item in data_list)
    with open(filename, "w") as out:
        out.writelines(serialized)
            
def write_metadata_file(filepath):
    """Write a short run summary to ``filepath`` and echo it to stdout.

    The summary lists the sizes of the module-level ``training_data`` and
    ``validation_data`` and the source ``logs_id``.
    """
    with open(filepath, 'w') as out:
        summary = "\n".join([
            f"Number of training samples: {len(training_data)}",
            f"Number of validation samples: {len(validation_data)}",
            f"Source folder: {logs_id}",
        ])
        out.write(summary)
        print (summary)

In [222]:
# Create the experiment output folder if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Training messages are always written
training_file_name = os.path.join(OUTPUT_PATH, TRAIN_FILE_NAME_OUTPUT)
write_jsonl(training_data, training_file_name)

# Validation messages are written only when some were held out
if NUM_VALID > 0:
    validation_file_name = os.path.join(OUTPUT_PATH, VALID_FILE_NAME_OUTPUT)
    write_jsonl(validation_data, validation_file_name)

write_metadata_file(os.path.join(OUTPUT_PATH, META_FILE_NAME))


Number of training samples: 259
Number of validation samples: 0
Source folder: ft_36_from_bt_14_ablationlarge


In [223]:
# Deliberate stop: raising here interrupts execution before any file is uploaded or any
# fine-tuning job is created (presumably to halt a "run all"); run the following cells manually.
raise Exception('move forward to create finetune jobs')

Exception: move forward to create finetune jobs

In [224]:
def create_finetune_upload_response():
    """Upload the training file (and the validation file, if any) for fine-tuning.

    Reads the module-level ``training_file_name`` and, when ``NUM_VALID > 0``,
    ``validation_file_name``. Each file is opened in a ``with`` block so its
    handle is closed once the upload call returns.

    Returns a dict with the upload response under "training_response" and,
    when a validation file was uploaded, under "validation_response".
    """
    with open(training_file_name, "rb") as training_file:
        training_response = openai.File.create(file=training_file, purpose="fine-tune")
    training_file_id = training_response["id"]
    print("Training file ID:", training_file_id)
    result = {"training_response": training_response}

    if NUM_VALID > 0:
        with open(validation_file_name, "rb") as validation_file:
            validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
        validation_file_id = validation_response["id"]
        print("Validation file ID:", validation_file_id)
        result["validation_response"] = validation_response

    return result

def create_finetune_response_and_log(training_file_id, validation_file_id=None):
    """Create a one-epoch fine-tuning job and record its details in job_info.txt.

    The job uses ``FINETUNE_MODEL`` and ``SUFFIX``; ``validation_file_id`` is
    passed to the API only when it is not None. The file IDs, job ID, model
    and suffix are written to ``OUTPUT_PATH/job_info.txt``.

    Returns the job-creation response.
    """
    job_args = {"training_file": training_file_id}
    if validation_file_id is not None:
        job_args["validation_file"] = validation_file_id
    job_args.update(
        model=FINETUNE_MODEL,
        suffix=SUFFIX,
        hyperparameters={"n_epochs": 1},
    )
    response = openai.FineTuningJob.create(**job_args)
    job_id = response["id"]

    with open(os.path.join(OUTPUT_PATH, "job_info.txt"), 'w') as out:
        job_summary = "\n".join([
            f"training file id: {training_file_id}",
            f"validation file id: {validation_file_id}",
            f"finetune job id: {job_id}",
            f"finetune model: {FINETUNE_MODEL}",
            f"finetune suffix: {SUFFIX}",
        ])
        out.write(job_summary)
        print("Status:", response["status"])
        print("Job ID:", response["id"])
    return response

In [225]:
# Upload the data files, then start the fine-tuning job with the returned file IDs
upload_result = create_finetune_upload_response()
training_response = upload_result['training_response']
training_file_id = training_response['id']

if NUM_VALID > 0:
    validation_response = upload_result['validation_response']
    validation_file_id = validation_response['id']
else:
    # No validation file was uploaded
    validation_file_id = None

finetune_response = create_finetune_response_and_log(training_file_id, validation_file_id)
job_id = finetune_response['id']

Training file ID: file-iPycS4D30kHVZX5nYTtV0eh9
Status: validating_files
Job ID: ftjob-RwpQtzgv7GZt2qIlw7FO0tIy


In [226]:
# Fetch the current state of the fine-tuning job and print its main fields
response = openai.FineTuningJob.retrieve(job_id)

status_fields = (
    ("Job ID:", "id"),
    ("Status:", "status"),
    ("Trained Tokens:", "trained_tokens"),
)
for label, field in status_fields:
    print(label, response[field])

Job ID: ftjob-RwpQtzgv7GZt2qIlw7FO0tIy
Status: validating_files
Trained Tokens: None
