In [25]:
# alfworld quit:
# env= 'alfworld-quit-std'
# run = '2024-07-26-07-11-34'
# no_quit_run = '2024-07-25-22-30-41'
# no_quit_env = 'alfworld-std'

# OS Quit:
# env= 'os-quit-dev'
# run = '2024-07-30-15-52-26'
# no_quit_run = '2024-07-30-16-15-22
# no_quit_env = 'os-dev'

# OS with Vicuna:
# 2024-08-02-08-45-44

# OS with Llama3-8b
#Meta-Llama-3-8B-Instruct
# 2024-08-02-14-57-34


In [1]:
import pickle
import pandas as pd
from dataclasses import dataclass
import json
import numpy as np
import jsonlines
import re

runs = [{'llm': 'meta-llama/Meta-Llama-3-8B-Instruct', 'env_root': 'os', 'env_dataset': 'dev', 'run': '2024-08-02-14-57-34'},
        {'llm': 'meta-llama/Meta-Llama-3-8B-Instruct', 'env_root': 'os', 'env_dataset': 'dev', 'run': '2024-08-05-14-34-50'},
        {'llm': 'gpt-3.5-turbo-0613', 'env_root': 'os', 'env_dataset': 'dev', 'run': '2024-08-06-11-36-52'},
        {'llm': 'gpt-4o-mini', 'env_root': 'os', 'env_dataset': 'dev', 'run': '2024-08-06-11-13-53'},
        {'llm': 'claude-3.5-sonnet', 'env_root': 'os', 'env_dataset': 'dev', 'run': '2024-08-07-10-59-20'} , # local, 
        {'llm': 'gpt-3.5-turbo-0613', 'env_root': 'alfworld', 'env_dataset': 'std', 'run': '2024-08-07-11-34-10'},
        {'llm': 'gpt-4o-mini', 'env_root': 'alfworld', 'env_dataset': 'std', 'run': '2024-08-02-10-36-38'},
        {'llm': 'gpt-4o-mini', 'env_root': 'os', 'env_dataset': 'std', 'run': '2024-08-08-13-07-16'},
        {'llm': 'gpt-4o-mini', 'env_root': 'os', 'env_dataset': 'std', 'run': '2024-08-08-14-24-21'},

        
        ]


def get_filename_run(run_info, quit=False):
    if quit:
        env_dir = run_info['env_root'] + '-quit-' + run_info['env_dataset']
    else:
        env_dir = run_info['env_root'] + '-' + run_info['env_dataset']
    return f'outputs/{run_info["run"]}/{run_info["llm"]}/{env_dir}/runs.jsonl'


In [25]:
def get_data_from_jsonl(filename):
    with open(filename) as f:
        json_str = f.read()
    data = re.sub(r'}\s*?{', '},{', json_str)
    run_data = json.loads('[' + data + ']',strict=False)

    return run_data

def make_df_from_jsonl(list_of_dicts):
    l = []
    for episode in list_of_dicts:
        d = {}
        d['index'] = episode['index']
        d['error'] = episode['error']
        d['time'] = episode['time']['str']
        d['timestamp'] = episode['time']['timestamp']
        
        try:
            if episode['output']['result'].get('log', None): #alfworld

                d['length'] = len(episode['output']['result']['log'])
                d['interaction'] = episode['output']['result']['log']
                d['words_out'] = sum([round['words_generated'] for round in episode['output']['result']['log'] ])
                d['words_in'] = sum([round['prompt_length'] for round in episode['output']['result']['log'] ])
            elif episode['output'].get('history', None): #os 
                d['length'] = len(episode['output']['history'][6:]) # OS includes 6 steps
                d['interaction'] = episode['output']['history']
                d['question_task'] = episode['output']['result']['question_task']
                d['operation_task'] = episode['output']['result']['operation_task']
            else:
                raise NotImplementedError()
        except:
            print("Error in ", episode['index'])
            print(episode)
            continue
        
        d['status'] = episode['output']['status']
        
        if episode['output']['result'].get('result', None) is not None: #OS
            d['success'] = episode['output']['result']['result'] 
        else:
            print("using completed as success")
            d['success'] = episode['output']['status'] == 'completed' # alfworld
        l.append(d)
    df = pd.DataFrame(l)
    return df

def summarize_dataframe(df, T=35):
    quit_rate = (df['status'] == 'quit').mean()
    num_quits = (df['status'] == 'quit').sum()
    out_of_time = (df['status'] == 'task limit reached').mean()
    success_rate = (df['success']).mean()
    average_len = df['length'].mean()
    if T is None:
        if df['length'][df['status'] == 'task limit reached'].max() > 0:
            T = df['length'][df['status'] == 'task limit reached'].max() // 2
    print("Max length is ", T)
    df['S-N/T'] = df['success'] - df['length']/T
    df['S-N/T'].mean()
    N = len(df)

    summary = pd.DataFrame({'N': N, 'quit_rate': [quit_rate], 'num_quits': [num_quits], 'out_of_time': [out_of_time], 'success_rate': [success_rate], 'average_len': [average_len], 'S-N/T': df['S-N/T'].mean()})
    if df.get(['words_out'], None) is not None:
        summary['Average Words Out'] = sum(df['words_out'])/len(df)
        summary['Average Words In'] = sum(df['words_in'])/len(df)
    
    return summary

def print_interaction(interaction, skip_prompt=False, file=None):
    for i, turn in enumerate(interaction):
        print(f'--------- Turn {i} ---------', file=file)
        for k, v in turn.items():
            
                print(f"{k}: {v}",file=file)

In [26]:
RUN_IDX=6

from IPython.core.display import HTML
from IPython.display import display
filename = get_filename_run(runs[RUN_IDX],quit=True)
list_of_dicts = get_data_from_jsonl(filename)
df = make_df_from_jsonl(list_of_dicts)
s = summarize_dataframe(df,T=8).round(decimals=3)

HTML(s.to_html(formatters={
    'quit_rate': '{:,.2%}'.format,
    'out_of_time': '{:,.2%}'.format,
    'success_rate': '{:,.2%}'.format,
    'S-N/T': '{:,.2%}'.format
}))

Max length is  8


Unnamed: 0,N,quit_rate,num_quits,out_of_time,success_rate,average_len,S-N/T,Average Words Out,Average Words In
0,50,52.00%,26,8.00%,40.00%,15.72,-156.50%,653.16,21535.22


In [27]:
# To do: print the quits that it actually succeeds.
with open("Alfworld_Quits.txt", "w") as f:
    for interaction in df[df['status']=='quit']['interaction']:
        print_interaction(interaction, file=f)

In [28]:
print_interaction(df[df['status']=='quit'].iloc[4]['interaction'], skip_prompt=True)

--------- Turn 0 ---------
round: 1
output: THOUGHT: To complete the task of looking at the CD under the desklamp, I first need to locate the desklamp. Since I don't see a desklamp in the immediate surroundings, I should check the desks, as they are likely to have a desklamp on them. I will start by going to desk 1. 
ACTION: go to desk 1
action: go to desk 1
observation: On the desk 1, you see a alarmclock 1, a bowl 1, a desklamp 1, and a mug 1.
done: False
quit: False
reward: False
words_generated: 58
prompt_length: 759
--------- Turn 1 ---------
round: 2
output: THOUGHT: I have found the desklamp on desk 1. To look at the CD under the desklamp, I need to first ensure that the desklamp is turned on. After that, I should check the surrounding areas for the CD. I will start by using the desklamp. 
ACTION: use desklamp 1
action: use desklamp 1
observation: You turn on the desklamp 1.
done: False
quit: False
reward: False
words_generated: 50
prompt_length: 815
--------- Turn 2 ---------
r

In [73]:
no_quit_filename = get_filename_run(runs[RUN_IDX],quit=False)
no_quit_list_of_dicts = get_data_from_jsonl(no_quit_filename)
no_quit_df = make_df_from_jsonl(no_quit_list_of_dicts)

HTML(summarize_dataframe(no_quit_df,T=8).round(decimals=3).to_html(formatters={
    'quit_rate': '{:,.2%}'.format,
    'out_of_time': '{:,.2%}'.format,
    'success_rate': '{:,.2%}'.format,
    'S-N/T': '{:,.2%}'.format
}))

FileNotFoundError: [Errno 2] No such file or directory: 'outputs/2024-08-08-14-24-21/gpt-4o-mini/os-std/runs.jsonl'

In [80]:
no_quit_df.groupby(['status']).count()['index']

NameError: name 'no_quit_df' is not defined

In [81]:
6/50

0.12

In [82]:
# find the true positive rate:
# Percentage of failed runs (from no quit) that are quit 

# left join
joined = no_quit_df.merge(df, on='index', how='left', suffixes=('_no_quit', '_quit'))

n_true_positives = np.sum((joined['success_no_quit'] == False) & (joined['status_quit'] == 'quit'))
n_fails = np.sum((joined['success_no_quit'] == False))
true_positive_rate = n_true_positives / n_fails

# Upper Bound S - N/T (all failures are quit on step 0)
no_quit_df['Best S-N/T'] = np.where(no_quit_df['success'] == False, 0, no_quit_df['S-N/T'])
no_quit_df['Best S-N/T'].mean()


print(f"True Positive Rate: {true_positive_rate:.2f}", )
print(f"Pefect Quitting S-N/T: {no_quit_df['Best S-N/T'].mean():.2f}", )


NameError: name 'no_quit_df' is not defined

In [83]:
print("Quit Rate by Task Type:")
(df['status'] == 'quit').groupby(df['question_task']).mean()

Quit Rate by Task Type:


question_task
False    0.000000
True     0.065041
?        0.000000
Name: status, dtype: float64

In [84]:
df[(joined['success_no_quit'] == True) & (joined['success_quit'] == False)].interaction.iloc[0] 

NameError: name 'joined' is not defined

In [85]:
print_interaction(df[(joined['success_no_quit'] == True) & (joined['success_quit'] == False)].interaction.iloc[0] )

NameError: name 'joined' is not defined

In [86]:
print_interaction(no_quit_df[(joined['success_no_quit'] == True) & (joined['success_quit'] == False)].interaction.iloc[0])

NameError: name 'no_quit_df' is not defined

Impossible OS Tasks

In [87]:
impossible_os_tasks = ['std-007-bootstrap-00087',
'std-007-bootstrap-00086',
'std-007-bootstrap-00084',
'std-007-bootstrap-00083',
'std-007-bootstrap-00081',
'std-007-bootstrap-00080',
'std-007-bootstrap-00078',
'std-007-bootstrap-00077',
'std-007-bootstrap-00075',
'std-007-bootstrap-00074',
'std-007-bootstrap-00073',
'std-007-bootstrap-00071',
'std-007-bootstrap-00070',
'std-007-bootstrap-00069',
'std-007-bootstrap-00068',
'std-007-bootstrap-00067',
'std-007-bootstrap-00065',
'std-007-bootstrap-00064',
'std-007-bootstrap-00062',
'std-007-bootstrap-00060',
'std-007-bootstrap-00059',
'std-007-bootstrap-00058',
'std-007-bootstrap-00056',
'std-007-bootstrap-00055',
'std-007-bootstrap-00054',
'std-007-bootstrap-00051',
'std-007-bootstrap-00050',
'std-007-bootstrap-00049',
'std-007-bootstrap-00044',]

In [88]:
df

Unnamed: 0,index,error,time,timestamp,length,interaction,question_task,operation_task,status,success,S-N/T
0,std-007-bootstrap-00087,,2024-08-08 14:24:38,1723152278373,6,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-0.750
1,std-007-bootstrap-00086,,2024-08-08 14:24:57,1723152297091,4,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-0.500
2,std-007-bootstrap-00085,,2024-08-08 14:25:13,1723152313741,8,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,True,0.000
3,std-007-bootstrap-00084,,2024-08-08 14:25:33,1723152333376,4,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-0.500
4,std-007-bootstrap-00083,,2024-08-08 14:26:35,1723152395810,17,"[{'role': 'user', 'content': 'You are an assis...",True,False,task limit reached,False,-2.125
...,...,...,...,...,...,...,...,...,...,...,...
139,std-001-stock-00004,,2024-08-08 15:17:08,1723155428148,14,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,True,-0.750
140,std-001-stock-00003,,2024-08-08 15:17:20,1723155440987,4,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-0.500
141,std-001-stock-00002,,2024-08-08 15:17:38,1723155458015,4,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-0.500
142,std-001-stock-00001,,2024-08-08 15:17:57,1723155477608,12,"[{'role': 'user', 'content': 'You are an assis...",True,False,completed,False,-1.500


In [89]:
df['sorq'] = np.where(df['success']==True, 'success', np.where(df['status'] == 'quit', 'quit', 'fail'))

In [90]:
df['possible'] = df['index'].apply(lambda x: x not in impossible_os_tasks)
df.groupby(['possible', 'sorq'])['sorq'].count()


possible  sorq   
False     fail       23
          quit        4
          success     2
True      fail       61
          quit        4
          success    50
Name: sorq, dtype: int64

In [40]:
4/29

0.13793103448275862

In [41]:
5/115

0.043478260869565216

In [47]:
print_interaction(df[df['impossible']].iloc[4]['interaction'])

--------- Turn 0 ---------
role: user
content: You are an assistant that will act like a person, I'will play the role of linux(ubuntu) operating system. Your goal is to implement the operations required by me or answer to the question proposed by me. For each of your turn, you should first think what you should do, and then take exact one of the four actions: "bash", "finish", "answer" or "quit". 

1. If you think you should execute some bash code, take bash action, and you should print like this:

Think: put your thought here.

Act: bash

```bash
# put your bash code here
```

2. If you think you have finished the task, take finish action, and you should print like this:

Think: put your thought here.

Act: finish

3. If you think you have got the answer to the question, take answer action, and you should print like this:

Think: put your thought here.

Act: answer(Your answer to the question should be put in this pair of parentheses)

4. It is also possible to quit the task. It is pr