# VIMA evaluation

## Load all evaluation results

In [None]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

# Evaluation levels in order; the position k of a name in this list is turned
# into the short label "L{k+1}" below.
levels = ['placement_generalization',
          'combinatorial_generalization',
          'novel_object_generalization',
          'novel_task_generalization']

# NOTE(review): the pattern presumably targets result files whose name starts
# with a literal '[' (e.g. "[tag]model(...).json") -- confirm against ./results.
files = glob('./results/[*.json')
result = []

for f in files:
    # Derive the model name from the path: drop the ".json" suffix, remove any
    # parenthesised part, and keep only what follows the last ']'.
    model_name = f[:-5]
    model_name = re.sub(r'\(.*\)', '', model_name).split(']')[-1]

    # Use a context manager so the file handle is closed deterministically.
    with open(f, 'r') as fp:
        js = json.load(fp)

    # about prompt mode
    pm = ''
    pid = -1
    prop = []
    model_path = None  # read from the 'global' entry; not used further in this cell
    for key, entry in js.items():
        if key == 'global':
            # Run-wide settings shared by every record of this file.
            pm = entry.get('prompt_mode', 'N/A')
            pid = entry.get('prompt_id', -2)
            model_path = entry.get('model_path', None)
        else:
            # Drop the two history fields independently; a missing
            # 'lm_prompt_hist' must not prevent 'lm_answer_hist' from being
            # removed, and unrelated errors are no longer silently swallowed.
            entry.pop('lm_prompt_hist', None)
            entry.pop('lm_answer_hist', None)
            prop.append(entry)

    # Second pass: the 'global' entry may appear anywhere in the file, so the
    # records are annotated only after the whole file has been scanned.
    for record in prop:
        record['level'] = f"L{levels.index(record['level']) + 1}"
        if pid < 0:
            # these methods use random user_prompt for action generation (default setting reported in paper)
            record['method'] = model_name
        elif pid < 15:
            # these methods use a fixed user_prompt for action generation
            record['method'] = model_name + f'_prompt{pid:03d}'
        else:
            # these methods omit the user_prompt for action generation
            record['method'] = model_name + '_no_prompt'
        record['prompt_mode'] = pm
    result.extend(prop)


# One row per record; missing values are replaced by empty strings.
df = pd.DataFrame(result).fillna('')
display(df.head())

## Show the success rate

In [None]:
def show_results(data, data_total):
    """Display a per-method table of "count / total (percent%)" per level.

    Both frames are grouped by ``method``, ``prompt_mode`` and ``level`` and
    the group sizes are pivoted so that each level becomes a column.

    Args:
        data: rows to count as the numerator of every cell.
        data_total: rows to count as the denominator of every cell; its
            groups define the rows and columns of the displayed table.

    Cells whose denominator is zero are shown as ``'N/A'``; cells that have a
    denominator but no matching group in ``data`` are shown as zero counts.
    The table is rendered with ``display`` and nothing is returned.
    """
    group_keys = ['method', 'prompt_mode', 'level']
    hit_counts = data.groupby(group_keys).size().unstack(fill_value=0)
    all_counts = data_total.groupby(group_keys).size().unstack(fill_value=0)

    def format_cell(row_key, level):
        # Build the text for one (row, level) cell of the table.
        n_all = all_counts.loc[row_key, level]
        if n_all > 0:
            try:
                n_hit = hit_counts.loc[row_key, level]
                return f"{n_hit} / {n_all} ({n_hit / n_all * 100:.1f}%)"
            except KeyError:
                # The row or the level is absent from the numerator table.
                return f"0 / {n_all} (0.0%)"
        return 'N/A'

    # Start from a string-typed copy so the layout matches the totals table.
    table = all_counts.copy().astype(str)
    for level in all_counts.columns:
        for row_key in all_counts.index:
            table.loc[row_key, level] = format_cell(row_key, level)

    display(table)
    
# Caveat for readers of the table below.
print('Please note that results for L4 are not valid because there is no rotation data when the end effector is a spatula.')
# Numerator: rows flagged as successful; denominator: every row.
show_results(data=df[df['success']], data_total=df)