In [1]:
# alfworld quit:
# env= 'alfworld-quit-std'
# run = '2024-07-26-07-11-34'
# no_quit_run = '2024-07-25-22-30-41'
# no_quit_env = 'alfworld-std'

# OS Quit:
# env= 'os-quit-dev'
# run = '2024-07-30-15-52-26'
# no_quit_run = '2024-07-30-16-15-22'
# no_quit_env = 'os-dev'

# OS with Vicuna:
# 2024-08-02-08-45-44

# OS with Llama3-8b
#Meta-Llama-3-8B-Instruct
# 2024-08-02-14-57-34


In [57]:
import pickle
import pandas as pd
from dataclasses import dataclass
import json
import numpy as np
import jsonlines
import re

# Registry of the runs this notebook can analyse. Every entry carries the four
# keys that get_filename_run() reads: 'llm', 'env_root', 'env_dataset', 'run'.
runs = [
    {
        'llm': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'env_root': 'os',
        'env_dataset': 'dev',
        'run': '2024-08-02-14-57-34',
    },
    {
        'llm': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'env_root': 'os',
        'env_dataset': 'dev',
        'run': '2024-08-05-14-34-50',
    },
    {
        'llm': 'gpt-3.5-turbo-0613',
        'env_root': 'os',
        'env_dataset': 'dev',
        'run': '2024-08-06-11-36-52',
    },
    {
        'llm': 'gpt-4o-mini',
        'env_root': 'os',
        'env_dataset': 'dev',
        'run': '2024-08-06-11-13-53',
    },
]


def get_filename_run(run_info, quit=False):
    """Return the path of the ``runs.jsonl`` file belonging to one run.

    Args:
        run_info: mapping with the string entries 'env_root', 'env_dataset',
            'run' and 'llm'.
        quit: when truthy, the environment directory is named
            ``<env_root>-quit-<env_dataset>``; otherwise
            ``<env_root>-<env_dataset>``.

    Returns:
        ``outputs/<run>/<llm>/<env_dir>/runs.jsonl`` as a string.
    """
    joiner = '-quit-' if quit else '-'
    env_dir = run_info['env_root'] + joiner + run_info['env_dataset']
    run_dir = run_info["run"]
    model_dir = run_info["llm"]
    return f'outputs/{run_dir}/{model_dir}/{env_dir}/runs.jsonl'


In [58]:
def get_data_from_jsonl(filename):
    """Read a file holding several concatenated JSON documents.

    The documents may be pretty-printed across multiple lines and are
    separated only by optional whitespace (so this is more permissive than
    strict one-object-per-line JSONL).

    Each document is decoded in turn with ``JSONDecoder.raw_decode`` instead
    of patching separators into the raw text with a regular expression: a
    textual ``}{`` -> ``},{`` substitution also rewrites ``} {`` sequences
    that occur *inside* string values (e.g. shell snippets), silently
    corrupting the loaded data.

    Args:
        filename: path of the file to read.

    Returns:
        list of the decoded documents, in file order (empty list for an
        empty or whitespace-only file).

    Raises:
        json.JSONDecodeError: if the file contains malformed JSON.
    """
    with open(filename) as f:
        text = f.read()

    # strict=False keeps accepting raw control characters (such as literal
    # newlines) inside JSON strings.
    decoder = json.JSONDecoder(strict=False)
    run_data = []
    pos, end = 0, len(text)
    while True:
        # raw_decode does not skip leading whitespace itself.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            break
        document, pos = decoder.raw_decode(text, pos)
        run_data.append(document)

    return run_data

def make_df_from_jsonl(list_of_dicts):
    """Flatten raw episode records into one DataFrame row per episode.

    Two record layouts are recognised:

    * ``output.result.log`` is present and non-empty (marked "alfworld" in
      this notebook): length is the number of log entries, and the
      per-entry 'words_generated' / 'prompt_length' values are summed into
      'words_out' / 'words_in'.
    * otherwise ``output.history`` is present and non-empty (marked "os"):
      length counts the history entries after the first six, and the
      'question_task' / 'operation_task' flags are copied from the result.

    'success' is taken from ``output.result.result`` when that is not None;
    otherwise it is ``status == 'completed'`` and a notice is printed.

    Args:
        list_of_dicts: iterable of episode dicts.

    Returns:
        pandas.DataFrame with one row per episode.

    Raises:
        NotImplementedError: if an episode has neither a log nor a history.
    """
    rows = []
    for episode in list_of_dicts:
        row = {
            'index': episode['index'],
            'error': episode['error'],
            'time': episode['time']['str'],
            'timestamp': episode['time']['timestamp'],
        }

        output = episode['output']
        result = output['result']

        log = result.get('log', None)
        if log:  # alfworld
            row['length'] = len(log)
            row['interaction'] = log
            row['words_out'] = sum(step['words_generated'] for step in log)
            row['words_in'] = sum(step['prompt_length'] for step in log)
        else:
            history = output.get('history', None)
            if not history:
                raise NotImplementedError()
            # os: the first 6 history entries are excluded from the length
            # (original note: "OS includes 6 steps" -- presumably the fixed
            # prompt preamble; TODO confirm).
            row['length'] = len(history[6:])
            row['interaction'] = history
            row['question_task'] = result['question_task']
            row['operation_task'] = result['operation_task']

        row['status'] = output['status']

        outcome = result.get('result', None)
        if outcome is None:
            # alfworld: no explicit result, fall back to the status string
            print("using completed as success")
            row['success'] = output['status'] == 'completed'
        else:
            # OS: explicit result flag
            row['success'] = outcome
        rows.append(row)

    return pd.DataFrame(rows)

def summarize_dataframe(df, default_T=8):
    """Build a one-row summary of a per-episode frame.

    Side effect: a per-episode 'S-N/T' column (success minus length / T) is
    written into ``df`` in place; later cells of this notebook read it.

    T is half of the longest 'length' among episodes whose status is
    'task limit reached'. When no such episode exists, or that value would
    be zero, ``default_T`` is used instead and a notice is printed.

    NOTE(review): T is derived from *half* the longest limit-reached length
    while N is the un-halved 'length', so N/T can exceed 1 -- confirm this
    is the intended normalisation.

    Args:
        df: frame with 'status', 'success' and 'length' columns, and
            optionally 'words_out' / 'words_in'.
        default_T: fallback horizon used when T cannot be derived from the
            data (defaults to 8).

    Returns:
        pandas.DataFrame with a single row holding N, quit_rate, num_quits,
        out_of_time, success_rate, average_len and the mean S-N/T, plus
        'Average Words Out' / 'Average Words In' when 'words_out' is a
        column of ``df``.
    """
    is_quit = df['status'] == 'quit'
    hit_limit = df['status'] == 'task limit reached'

    # max() of an empty selection is NaN and `NaN > 0` is False, which
    # routes the "no limit-reached episode" case to the fallback as well.
    longest_limited = df.loc[hit_limit, 'length'].max()
    T = longest_limited // 2 if longest_limited > 0 else 0
    if T <= 0:
        # Also covers a longest length of 1, where `// 2` would give T == 0
        # and the division below would produce infinities.
        T = default_T
        print(f"Need to put in max length manually, putting in T={T}")
    print("Max length is ", T)

    df['S-N/T'] = df['success'] - df['length'] / T

    summary = pd.DataFrame({
        'N': [len(df)],
        'quit_rate': [is_quit.mean()],
        'num_quits': [is_quit.sum()],
        'out_of_time': [hit_limit.mean()],
        'success_rate': [df['success'].mean()],
        'average_len': [df['length'].mean()],
        'S-N/T': [df['S-N/T'].mean()],
    })
    if 'words_out' in df.columns:
        summary['Average Words Out'] = sum(df['words_out']) / len(df)
        summary['Average Words In'] = sum(df['words_in']) / len(df)

    return summary

In [64]:
RUN_IDX=2

from IPython.core.display import HTML
from IPython.display import display

# Load the quit-enabled variant of the selected run and summarise it.
filename = get_filename_run(runs[RUN_IDX], quit=True)
list_of_dicts = get_data_from_jsonl(filename)
df = make_df_from_jsonl(list_of_dicts)
s = summarize_dataframe(df).round(decimals=3)

# Rate-like columns are rendered as percentages with two decimals.
percent_columns = ('quit_rate', 'out_of_time', 'success_rate', 'S-N/T')
HTML(s.to_html(formatters={column: '{:,.2%}'.format for column in percent_columns}))

Need to put in max length manually, putting in T=8
Max length is  8


Unnamed: 0,N,quit_rate,num_quits,out_of_time,success_rate,average_len,S-N/T
0,26,0.00%,0,0.00%,42.30%,5.615,-27.90%


In [61]:
print("Quit Rate by Task Type:")
# Share of episodes whose status is 'quit', per question_task value.
df['status'].eq('quit').groupby(df['question_task']).mean()

Quit Rate by Task Type:


question_task
False    0.0
True     0.0
?        0.0
Name: status, dtype: float64

In [65]:
# Same run as above, but the variant without the quit action.
no_quit_filename = get_filename_run(runs[RUN_IDX], quit=False)
no_quit_list_of_dicts = get_data_from_jsonl(no_quit_filename)
no_quit_df = make_df_from_jsonl(no_quit_list_of_dicts)

# summarize_dataframe also writes the per-episode 'S-N/T' column into no_quit_df.
no_quit_summary = summarize_dataframe(no_quit_df).round(decimals=3)

as_percent = '{:,.2%}'.format
HTML(no_quit_summary.to_html(formatters={
    'quit_rate': as_percent,
    'out_of_time': as_percent,
    'success_rate': as_percent,
    'S-N/T': as_percent,
}))

Max length is  8


Unnamed: 0,N,quit_rate,num_quits,out_of_time,success_rate,average_len,S-N/T
0,26,0.00%,0,7.70%,53.80%,5.615,-16.30%


In [66]:
# find the true positive rate:
# Percentage of failed runs (from no quit) that are quit 

# Left join on the episode 'index': every no-quit episode is kept, with the
# matching quit-run columns attached under the '_quit' suffix.
joined = no_quit_df.merge(df, on='index', how='left', suffixes=('_no_quit', '_quit'))

failed_without_quit = joined['success_no_quit'] == False
n_true_positives = np.sum(failed_without_quit & (joined['status_quit'] == 'quit'))
n_fails = np.sum(failed_without_quit)
# With no failed episodes the rate is undefined: report NaN explicitly
# rather than evaluating 0 / 0.
true_positive_rate = n_true_positives / n_fails if n_fails else float('nan')

# Upper Bound S - N/T (all failures are quit on step 0)
# Needs the 'S-N/T' column that summarize_dataframe(no_quit_df) adds in place.
no_quit_df['Best S-N/T'] = np.where(no_quit_df['success'] == False, 0, no_quit_df['S-N/T'])


print(f"True Positive Rate: {true_positive_rate:.2f}")
print(f"Pefect Quitting S-N/T: {no_quit_df['Best S-N/T'].mean():.2f}")


True Positive Rate: 0.00
Pefect Quitting S-N/T: 0.25


In [18]:
# Episodes that succeeded without the quit option but failed with it.
# Select from `joined` itself: the mask carries joined's row labels (which
# follow no_quit_df's order), so applying it to `df` is only correct if both
# frames happen to list the episodes in the same order.
regressed = (joined['success_no_quit'] == True) & (joined['success_quit'] == False)
joined.loc[regressed, 'interaction_quit'].iloc[0]

[{'role': 'user',
  'content': 'You are an assistant that will act like a person, I\'will play the role of linux(ubuntu) operating system. Your goal is to implement the operations required by me or answer to the question proposed by me. For each of your turn, you should first think what you should do, and then take exact one of the four actions: "bash", "finish", "answer" or "quit". \n\n1. If you think you should execute some bash code, take bash action, and you should print like this:\n\nThink: put your thought here.\n\nAct: bash\n\n```bash\n# put your bash code here\n```\n\n2. If you think you have finished the task, take finish action, and you should print like this:\n\nThink: put your thought here.\n\nAct: finish\n\n3. If you think you have got the answer to the question, take answer action, and you should print like this:\n\nThink: put your thought here.\n\nAct: answer(Your answer to the question should be put in this pair of parentheses)\n\n4. It is also possible to quit the task

In [None]:
def print_interaction(interaction, skip_prompt=False):
    """Print a conversation turn by turn.

    A ``--------- Turn <i> ---------`` header is printed for every turn,
    followed by one ``key: value`` line per item of the turn's dict.

    Args:
        interaction: sequence of dict-like turns.
        skip_prompt: when truthy, the body of turn 0 is omitted (its header
            is still printed).
    """
    for turn_number, turn in enumerate(interaction):
        print(f'--------- Turn {turn_number} ---------')
        hide_body = skip_prompt and turn_number == 0
        if not hide_body:
            for field, value in turn.items():
                print(f"{field}: {value}")
            
# First episode that succeeded without the quit option but failed with it
# (quit-run transcript). The row is taken from `joined`, whose labels match
# the mask; indexing `df` with this mask assumes df and no_quit_df list the
# episodes in the same order.
print_interaction(
    joined.loc[
        (joined['success_no_quit'] == True) & (joined['success_quit'] == False),
        'interaction_quit',
    ].iloc[0]
)

--------- Turn 0 ---------
role: user
content: You are an assistant that will act like a person, I'will play the role of linux(ubuntu) operating system. Your goal is to implement the operations required by me or answer to the question proposed by me. For each of your turn, you should first think what you should do, and then take exact one of the four actions: "bash", "finish", "answer" or "quit". 

1. If you think you should execute some bash code, take bash action, and you should print like this:

Think: put your thought here.

Act: bash

```bash
# put your bash code here
```

2. If you think you have finished the task, take finish action, and you should print like this:

Think: put your thought here.

Act: finish

3. If you think you have got the answer to the question, take answer action, and you should print like this:

Think: put your thought here.

Act: answer(Your answer to the question should be put in this pair of parentheses)

4. It is also possible to quit the task. It is pr

In [None]:
# Same episode as seen in the run without the quit option. Reading the
# '_no_quit' column from `joined` keeps mask and data on the same row labels
# even if the merge produced a different number of rows than no_quit_df.
print_interaction(
    joined.loc[
        (joined['success_no_quit'] == True) & (joined['success_quit'] == False),
        'interaction_no_quit',
    ].iloc[0]
)

--------- Turn 0 ---------
role: user
content: You are an assistant that will act like a person, I'will play the role of linux(ubuntu) operating system. Your goal is to implement the operations required by me or answer to the question proposed by me. For each of your turn, you should first think what you should do, and then take exact one of the three actions: "bash", "finish" or "answer". 

1. If you think you should execute some bash code, take bash action, and you should print like this:

Think: put your thought here.

Act: bash

```bash
# put your bash code here
```

2. If you think you have finished the task, take finish action, and you should print like this:

Think: put your thought here.

Act: finish

3. If you think you have got the answer to the question, take answer action, and you should print like this:

Think: put your thought here.

Act: answer(Your answer to the question should be put in this pair of parentheses)

If the output is too long, I will truncate it. The trunc

```bash
for i in {1..3}; do
  echo "Checking interval: $i seconds"
  count_before=$(grep -c 'succeed' /root/log)
  sleep $i
  count_after=$(grep -c 'succeed' /root/log)
  if [ $count_after -gt $count_before ]; then
    echo $i
    break
  fi
done

```bash
# Check for 'succeed' every second for 3 seconds
for i in {1..3}; do
  sleep $i
  if grep -q 'succeed' /root/log; then
    echo $i
    break
  fi
done
```

```bash
getent group | awk -F: 'NR <= 70' | wc -l
```

```bash
getent group | awk -F: '$3 <= 70' | wc -l
```