In [1]:
import pandas as pd 
import re 
import numpy as np 
import json 
from fuzzywuzzy import process



In [2]:
df = pd.read_csv('data/single_turn_trials_march_2.csv')

0

In [3]:
ec_ans = df['EC Answer'].values
tom_ans = df['TOM Answer'].values
jp_ans = df['JP Answer'].values


ec_ans_ord = df['EC Answer Ordinal'].values
tom_ans_ord = df['TOM Answer Ordinal'].values
jp_ans_ord = df['JP Answer Ordinal'].values


ec_questions = df['EC Question'].values
tom_questions = df['TOM Question'].values
jp_questions = df['JP Question'].values

scenarios = df['State Description'].values 

In [4]:
def find_answer_fuzzy(question_string, inference_string):
    selected_ordinal = '-2'
    def get_answer_ordinal(inference_string):
        match = re.search(r'([A-Z])[\.\)]', inference_string)
        if match:
            return match.group(1)
        else:
            return None

    def get_answer_text(inference_string):
        # print(inference_string)
        action_regex = r"Action:\s*(.*)"
        answer_regex = r"Answer:\s*(.*)"
        match = re.search(action_regex, inference_string)
        if match:
            return match.group(1)
        
        match = re.search(answer_regex, inference_string)
        if match:
            return match.group(1)
        
        return None
    
    # If there is only one instance of Action: action then we get the action string 
    answer_text = get_answer_text(inference_string)
    # extract ordinal from the answer 
    if answer_text == None:
        answer_text = inference_string

    selected_ordinal = get_answer_ordinal(answer_text)
    
    if selected_ordinal is None: 
        action_list = []
        # print("QUESTION STRING: ", question_string)
        if 'actions' in question_string.strip().lower():
            action_list = question_string.strip().lower().split('available actions:')[-1].split('\n')
        elif 'answer' in question_string.strip().lower():
            action_list = question_string.strip().lower().split('available answers:')[-1].split('\n') 
            # print(action_list)
        if '' in action_list:
            action_list = [a for a in action_list if a != '']
        if ' ' in action_list:
            action_list = [a for a in action_list if a != ' ']
        action_list = [re.sub(r'^[a-z]\.\s*', '', action).strip() for action in action_list]
        action_start = answer_text.find("Action:")
        if action_start == -1:
            action_start = answer_text.find("Answer:")

        if action_start == -1:
            answer_text = re.sub(r'^[a-z]\.\s*', '', answer_text).strip()
            best_match, score = process.extractOne(answer_text, action_list)
            selected_ordinal = chr(action_list.index(best_match) + 65)
        else:
            answer_text = answer_text[action_start + 7:].strip()
            answer_text = re.sub(r'^[a-z]\.\s*', '', answer_text).strip()
            best_match, score = process.extractOne(answer_text, action_list)
            selected_ordinal = chr(action_list.index(best_match) + 65)
    
    return selected_ordinal

In [5]:
all_models = ['mistral_Mistral-7B-Instruct-v02', 'mistral_Mixtral-8x7B-Instruct-v01', 'vicuna_vicuna-7b-v15', 'vicuna_vicuna-13b-v15', 'llama_Llama-2-13b-chat-hf', 'llama_Llama-2-70b-chat-hf', 'openai_gpt-4-0125'] 

In [6]:
logs = {all_models[i]: json.load(open(f'logs/all_{all_models[i]}0.json')) for i in range(len(all_models))}

In [7]:
def fuzzy_search_test(find_answer_fuzzy):
    text = '''Explanation: Bob has a Yellow 2 card which is immediately playable on the Yellow Stack. By revealing Bob's rank 2 cards, I will be providing a Play Clue for the Yellow 2, which aligns with our convention to prioritize Play Clues over Save Clues. This action directly contributes to our progress without risking a life or wasting a turn on a discard when a productive play is available.
    Action:N. Reveal Bob's rank 2 cards.'''

    question = '''It is currently My (Alice) turn. Current Stacks: Red - R1, Yellow - Y1, Green - G0, White - W0, Blue - B0 
    My cards based on my knowledge:  
    Card 0 could be: [Red, Yellow, Green, White, Blue] [3]
    Card 1 could be: [Red, Yellow, Green, White, Blue] [1, 2, 4, 5]
    Card 2 could be: [Red, Yellow, Green, White, Blue] [3]
    Card 3 could be: [Red, Yellow, Green, White, Blue] [1, 2, 4, 5]
    Card 4 could be: [Red, Yellow, Green, White, Blue] [1, 2, 4, 5]
    I can see Bob's Cards are:  
    [Card 0: Yellow 3]  
    [Card 1: White 5]  
    [Card 2: Yellow 2]  
    [Card 3: Yellow 4]  
    [Card 4: Blue 3]  
    Bob's Knowledge about his cards:  
    Bob believes his Card 0 could be:¬† [Red, Yellow, Green, White, Blue] [1, 2, 3, 4, 5]
    Bob believes his Card 1 could be:¬† [Red, Yellow, Green, White, Blue] [5]
    Bob believes his Card 2 could be:¬† [Red, Yellow, Green, White, Blue] [2, 3, 4]
    Bob believes his Card 3 could be:¬† [Red, Yellow, Green, White, Blue] [2, 3, 4]
    Bob believes his Card 4 could be:¬† [Red, Yellow, Green, White, Blue] [2, 3, 4]
    Remaining Reveal Tokens: 5 
    Remaining Lives: 3  
    Deck Size: 38 
    The discard pile is: []
    My Action History: [Reveal Bob's Rank 1 Cards, Reveal Bob's Rank 5 Cards]
    The next playable cards for each stack are:  
    Only Red 2 can be played on Red Stack  
    Only Yellow 2 can be played on Yellow Stack  
    Only Green 1 can be played on Green Stack  
    Only White 1 can be played on White Stack  
    Only Blue 1 can be played on Blue Stack


    What action should I take next? 
    Available Actions: 
    A. Play Card 0.
    B. Play Card 1.
    C. Play Card 2.
    D. Play Card 3.
    E. Play Card 4.
    F. Discard Card 0.
    G. Discard Card 1.
    H. Discard Card 2.
    I. Discard Card 3.
    J. Discard Card 4.
    K. Reveal Bob's Yellow color cards.
    L. Reveal Bob's White color cards.
    M. Reveal Bob's Blue color cards.
    N. Reveal Bob's rank 2 cards.
    O. Reveal Bob's rank 3 cards.
    P. Reveal Bob's rank 4 cards.
    Q. Reveal Bob's rank 5 cards.
    '''

    find_answer_fuzzy(question,text)

In [8]:
# scenario_len = len(ec_questions)
# model_cols = []
# for m in all_models: 
#     model_cols.extend([f'{m} Answer', f'{m} Answer Ordinal', f'{m} Answer Correct'])
# error_logs = pd.DataFrame(columns=['Scenario', 'Type', 'Question', 'GT Answer', 'GT Answer Ordinal'] + model_cols)


# for i in range(scenario_len):
#     print({'Scenario': scenarios[i], 'Question': ec_questions[i], 'GT Answer': ec_ans[i], 'GT Ordinal': ec_ans_ord[i], 'Type': 'EC'})
#     new_row = pd.DataFrame({'Scenario': scenarios[i], 'Question': ec_questions[i], 'GT Answer': ec_ans[i], 'GT Ordinal': ec_ans_ord[i], 'Type': 'EC'})
#     for m in all_models:
#         new_row[f'{m} Answer'] = logs[m]['EC_ANSWERS'][i]
#         llm_ec_answer_ordinal = find_answer_fuzzy(ec_questions[i], logs[model]['EC_ANSWERS'][i])
#         new_row[f'{m} Answer Ordinal'] = llm_ec_answer_ordinal
#         new_row[f'{m} Answer Correct'] = 1 if llm_ec_answer_ordinal in ec_ans_ord[i] else 0 
#         error_logs = pd.concat([error_logs, new_row], ignore_index=True)
    
#     new_row = pd.DataFrame({'Scenario': scenarios[i], 'Question': tom_questions[i], 'GT Answer': tom_ans[i], 'GT Ordinal': tom_ans_ord[i], 'Type': 'TOM'})
#     for m in all_models:
#         new_row[f'{m} Answer'] = logs[m]['TOM_ANSWERS'][i]
#         llm_tom_answer_ordinal = find_answer_fuzzy(tom_questions[i], logs[model]['TOM_ANSWERS'][i])
#         new_row[f'{m} Answer Ordinal'] = llm_tom_answer_ordinal
#         new_row[f'{m} Answer Correct'] = 1 if llm_tom_answer_ordinal in tom_ans_ord[i] else 0 
#         error_logs = pd.concat([error_logs, new_row], ignore_index=True)
    
#     new_row = pd.DataFrame({'Scenario': scenarios[i], 'Question': jp_questions[i], 'GT Answer': jp_ans[i], 'GT Ordinal': jp_ans_ord[i], 'Type': 'JP'})
#     for m in all_models:
#         new_row[f'{m} Answer'] = logs[m]['JP_ANSWERS'][i]
#         llm_jp_answer_ordinal = find_answer_fuzzy(jp_questions[i], logs[model]['JP_ANSWERS'][i])
#         new_row[f'{m} Answer Ordinal'] = llm_jp_answer_ordinal
#         new_row[f'{m} Answer Correct'] = 1 if llm_jp_answer_ordinal in jp_ans_ord[i] else 0 
#         error_logs = pd.concat([error_logs, new_row], ignore_index=True)
        
scenario_len = len(ec_questions)
model_cols = []
for m in all_models: 
    model_cols.extend([f'{m} Answer', f'{m} Answer Ordinal', f'{m} Answer Correct'])

data = []

for i in range(scenario_len):
    for type, questions, ans, ans_ord, answers_key in [('EC', ec_questions, ec_ans, ec_ans_ord, 'EC_ANSWERS'), 
                                                        ('TOM', tom_questions, tom_ans, tom_ans_ord, 'TOM_ANSWERS'), 
                                                        ('JP', jp_questions, jp_ans, jp_ans_ord, 'JP_ANSWERS')]:
        row = {'Scenario': scenarios[i], 'Question': questions[i], 'GT Answer': ans[i], 'GT Ordinal': ans_ord[i], 'Type': type}
        for m in all_models:
            row[f'{m} Answer'] = logs[m][answers_key][i]
            answer_ordinal = find_answer_fuzzy(questions[i], logs[m][answers_key][i])
            row[f'{m} Answer Ordinal'] = answer_ordinal
            row[f'{m} Answer Correct'] = 1 if answer_ordinal in ans_ord[i] else 0 
        data.append(row)

error_logs = pd.DataFrame(data, columns=['Scenario', 'Type', 'Question', 'GT Answer', 'GT Answer Ordinal'] + model_cols)    


        
        

    



In [13]:
logs[m]['EC_ANSWERS'][0]

'Explanation: Based on the information provided, all stacks are at 0, indicating that no cards have been played yet as the game starts with all stacks at 0 and cards are added in ascending order. Answer: B. No.'

In [10]:
ec_tags_string = '''
Counting/Arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, dynamic_variables
counting/arrangement, layout-related
counting/arrangement, dynamic_variables
counting/arrangement, dynamic_variables
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, dynamic_variables
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, layout-related, dynamic_variables,
counting/arrangement, layout-related, dynamic_variables,
counting/arrangement, dynamic_variables
counting/arrangement, dynamic_variables
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, dynamic_variables
counting/arrangement, layout-related
counting/arrangement, layout-related
counting/arrangement, dynamic_variables
counting/arrangement, dynamic_variables
counting/arrangement, layout-related, dynamic_variables,
counting/arrangement, layout-related, dynamic_variables,
counting/arrangement, dynamic_variables
layout-related, dynamic_variables
layout-related, counting/arrangement
layout-related, counting/arrangement
layout-related, dynamic_variables
layout-related
layout-related
layout-related
counting/arrangement, dynamic_variables
counting/arrangement, layout-related, dynamic_variables,
counting/arrangement, dynamic_variables
layout-related
layout-related
counting/arrangement, layout-related, dynamic_variables,
layout-related, dynamic_variables
layout-related, dynamic_variables
layout-related, dynamic_variables
layout-related, dynamic_variables
counting/arrangement, dynamic_variables
layout-related, counting/arrangement
dynamic_variables
layout-related, dynamic_variables
layout-related, dynamic_variables
layout-related, dynamic_variables
layout-related, dynamic_variables
layout-related, counting/arrangement
layout-related, dynamic_variables
layout-related
layout-related
counting/arrangement, layout-related, dynamic_variables,
layout-related, dynamic_variables
layout-related
layout-related
layout-related
counting/arrangement, dynamic_variables
Counting/arrangement, dynamic_variables
Counting/arrangement, dynamic_variables
Counting/arrangement, dynamic_variables
Counting/arrangement, dynamic_variables
'''

ec_tags = ec_tags_string.split('\n')
ec_tags = [t.strip().lower().split(', ') for t in ec_tags if t != '']


In [14]:
un = set()
for tt in ec_tags:
    for rag in tt:
        un.add(rag)


In [15]:
un

{'counting/arrangement',
 'dynamic_variables',
 'dynamic_variables,',
 'layout-related'}

In [25]:
# Find error percentage for each type of tag 

m = 'openai_gpt-4-0125'
scenario_len = len(ec_questions)

for m in all_models:
    counting_errors = 0
    layout_related_errors = 0
    dynamic_variable_errors = 0
    total_errors = 0
    counting_questions = 0
    layout_questions = 0
    for i in range(scenario_len):
        answer_ordinal = find_answer_fuzzy(ec_questions[i], logs[m]['EC_ANSWERS'][i])
        # print(ec_questions[i], logs[m]['EC_ANSWERS'][i])
        # print('EC: ', answer_ordinal, ec_ans_ord[i])
        is_ec_correct = 1 if answer_ordinal in ec_ans_ord[i] else 0
        if 'counting/arrangement' in ec_tags[i]:
            counting_questions += 1
            
        if 'layout-related' in ec_tags[i]:
            layout_questions += 1
        if is_ec_correct == 0:
            # for tag in ec_tags[i]:
            #     if tag == 'counting/arrangement':
            #         counting_errors += 1
            #     elif tag == 'layout-related':
            #         layout_related_errors += 1
            #     elif tag == 'dynamic_variables':
            #         dynamic_variable_errors += 1
            # total_errors += 1
            total_errors += 1 
            
            if 'counting/arrangement' in ec_tags[i]:
                counting_errors += 1
            
            if 'layout-related' in ec_tags[i]:
                layout_related_errors += 1
            print('TOTAL ERRORS: ', total_errors)
            print('COUNTING ERRORS: ', counting_errors)
            print('LAYOUT ERRORS: ', layout_related_errors)

    print(f'{m} Counting/Arrangement Error Percentage: {counting_errors / counting_questions}')
    print(f'{m} Layout Related Error Percentage: {layout_related_errors / layout_questions}')
    # print(f'{m} Dynamic Variable Error Percentage: {dynamic_variable_errors / total_errors}')




TOTAL ERRORS:  1
COUNTING ERRORS:  1
LAYOUT ERRORS:  1
TOTAL ERRORS:  2
COUNTING ERRORS:  2
LAYOUT ERRORS:  2
TOTAL ERRORS:  3
COUNTING ERRORS:  3
LAYOUT ERRORS:  2
TOTAL ERRORS:  4
COUNTING ERRORS:  4
LAYOUT ERRORS:  3
TOTAL ERRORS:  5
COUNTING ERRORS:  5
LAYOUT ERRORS:  3
TOTAL ERRORS:  6
COUNTING ERRORS:  6
LAYOUT ERRORS:  4
TOTAL ERRORS:  7
COUNTING ERRORS:  7
LAYOUT ERRORS:  5
TOTAL ERRORS:  8
COUNTING ERRORS:  8
LAYOUT ERRORS:  6
TOTAL ERRORS:  9
COUNTING ERRORS:  9
LAYOUT ERRORS:  7
TOTAL ERRORS:  10
COUNTING ERRORS:  10
LAYOUT ERRORS:  8
TOTAL ERRORS:  11
COUNTING ERRORS:  11
LAYOUT ERRORS:  8
TOTAL ERRORS:  12
COUNTING ERRORS:  12
LAYOUT ERRORS:  9
TOTAL ERRORS:  13
COUNTING ERRORS:  13
LAYOUT ERRORS:  10
TOTAL ERRORS:  14
COUNTING ERRORS:  14
LAYOUT ERRORS:  11
TOTAL ERRORS:  15
COUNTING ERRORS:  15
LAYOUT ERRORS:  12
TOTAL ERRORS:  16
COUNTING ERRORS:  16
LAYOUT ERRORS:  12
TOTAL ERRORS:  17
COUNTING ERRORS:  17
LAYOUT ERRORS:  12
TOTAL ERRORS:  18
COUNTING ERRORS:  18
LAYOU

In [18]:
# Find percent cases where both EC and ToM were correct for a model, but JP was still wrong

m = 'openai_gpt-4-0125'
jp_wrong_total = 0
synthesis_failure_total = 0
component_failure_toral = 0
for i in range(scenario_len):
    # for type, questions, ans, ans_ord, answers_key in [('EC', ec_questions, ec_ans, ec_ans_ord, 'EC_ANSWERS'), 
    #                                                     ('TOM', tom_questions, tom_ans, tom_ans_ord, 'TOM_ANSWERS'), 
    #                                                     ('JP', jp_questions, jp_ans, jp_ans_ord, 'JP_ANSWERS')]:
    #     row = {'Scenario': scenarios[i], 'Question': questions[i], 'GT Answer': ans[i], 'GT Ordinal': ans_ord[i], 'Type': type}
    #     for m in all_models:
    #         row[f'{m} Answer'] = logs[m][answers_key][i]
    #         answer_ordinal = find_answer_fuzzy(questions[i], logs[m][answers_key][i])
    #         row[f'{m} Answer Ordinal'] = answer_ordinal
    #         row[f'{m} Answer Correct'] = 1 if answer_ordinal in ans_ord[i] else 0 
    #     data.append(row)

    answer_ordinal = find_answer_fuzzy(ec_questions[i], logs[m]['EC_ANSWERS'][i])
    # print(ec_questions[i], logs[m]['EC_ANSWERS'][i])
    print('EC: ', answer_ordinal, ec_ans_ord[i])
    is_ec_correct = 1 if answer_ordinal in ec_ans_ord[i] else 0
    

    answer_ordinal = find_answer_fuzzy(tom_questions[i], logs[m]['TOM_ANSWERS'][i])
    # print(tom_questions[i], logs[m]['TOM_ANSWERS'][i])
    print('TOM: ', answer_ordinal, tom_ans_ord[i])
    is_tom_correct = 1 if answer_ordinal in tom_ans_ord[i] else 0

    answer_ordinal = find_answer_fuzzy(jp_questions[i], logs[m]['JP_ANSWERS'][i])
    # print(jp_questions[i], logs[m]['JP_ANSWERS'][i])
    print('JP: ', answer_ordinal, jp_ans_ord[i])
    is_jp_correct = 1 if answer_ordinal in jp_ans_ord[i] else 0

    if not is_jp_correct:
        jp_wrong_total += 1

        if is_ec_correct and is_tom_correct:
            synthesis_failure_total+=1 
        
        if not is_ec_correct or not is_tom_correct:
            component_failure_toral+=1


print(jp_wrong_total / scenario_len)
print('SYNTHESIS FAILURE: ', synthesis_failure_total / jp_wrong_total)
print('COMPONENT FAILURE: ', component_failure_toral / jp_wrong_total)





EC:  B B
TOM:  B E
JP:  L O
EC:  E E
TOM:  E E
JP:  L O
EC:  B B
TOM:  A D
JP:  N N
EC:  B B
TOM:  E A
JP:  K K
EC:  C A
TOM:  E E
JP:  E E
EC:  D D
TOM:  J F,G
JP:  G G,H
EC:  C C
TOM:  G G
JP:  G G
EC:  A A
TOM:  A F
JP:  K P,N
EC:  C B
TOM:  G G
JP:  Q Q
EC:  B B
TOM:  C G
JP:  M Q
EC:  A A
TOM:  B H
JP:  O R
EC:  B B
TOM:  D F
JP:  N Q
EC:  A A
TOM:  F H
JP:  P R
EC:  D D
TOM:  E E
JP:  O O
EC:  B B
TOM:  C H
JP:  M R
EC:  A A
TOM:  B F
JP:  N P
EC:  B B
TOM:  A A
JP:  K K
EC:  A A
TOM:  J B
JP:  M A
EC:  B A
TOM:  A A
JP:  K K
EC:  B B
TOM:  A A
JP:  K K
EC:  B B
TOM:  B G
JP:  K Q
EC:  B A
TOM:  E F
JP:  K P
EC:  A A
TOM:  B D
JP:  M N
EC:  B B
TOM:  G G
JP:  A G
EC:  B B
TOM:  F F,G
JP:  G F
EC:  A A
TOM:  E E
JP:  E E
EC:  B B
TOM:  B D
JP:  L N
EC:  A A
TOM:  A B
JP:  K L
EC:  B B
TOM:  B B
JP:  D C
EC:  B B
TOM:  A A
JP:  A C
EC:  B B
TOM:  B B
JP:  A C
EC:  B B
TOM:  D A
JP:  B B
EC:  C C
TOM:  J J,D,E
JP:  A C,D
EC:  B B
TOM:  E E
JP:  A D
EC:  B B
TOM:  E E,G,I
JP:  C C
EC

In [9]:
error_logs

Unnamed: 0,Scenario,Type,Question,GT Answer,GT Answer Ordinal,mistral_Mistral-7B-Instruct-v02 Answer,mistral_Mistral-7B-Instruct-v02 Answer Ordinal,mistral_Mistral-7B-Instruct-v02 Answer Correct,mistral_Mixtral-8x7B-Instruct-v01 Answer,mistral_Mixtral-8x7B-Instruct-v01 Answer Ordinal,...,vicuna_vicuna-13b-v15 Answer Correct,llama_Llama-2-13b-chat-hf Answer,llama_Llama-2-13b-chat-hf Answer Ordinal,llama_Llama-2-13b-chat-hf Answer Correct,llama_Llama-2-70b-chat-hf Answer,llama_Llama-2-70b-chat-hf Answer Ordinal,llama_Llama-2-70b-chat-hf Answer Correct,openai_gpt-4-0125 Answer,openai_gpt-4-0125 Answer Ordinal,openai_gpt-4-0125 Answer Correct
0,It is currently My (Alice) turn. Current Stack...,EC,It is currently My (Alice) turn. Current Stack...,No.,,"Based on the information provided, there have...",B,1,Explanation: Based on the information provide...,B,...,0,"Based on the information provided, the answe...",C,0,Explanation: Based on the information provid...,B,1,Explanation: Based on the information provided...,B,1
1,It is currently My (Alice) turn. Current Stack...,TOM,It is currently My (Alice) turn. Current Stack...,Reveal Bob's rank 1 cards.,,Explanation: Based on the current state of th...,E,1,Explanation: Based on the current state of th...,A,...,0,"Based on the information provided, it is mos...",B,0,E. Reveal Bob's rank 1 cards.\n\nExplanation...,E,1,Explanation: Since the game is at the beginnin...,B,0
2,It is currently My (Alice) turn. Current Stack...,JP,It is currently My (Alice) turn. Current Stack...,Reveal Bob's rank 1 cards.,,Explanation: Based on the current state of th...,K,0,Explanation: Based on the information provide...,N,...,0,Explanation: Based on the current informatio...,D,0,Explanation: We want to play a card that wil...,K,0,Explanation: Since it's the start of the game ...,L,0
3,It is currently My (Alice) turn. Current Stack...,EC,It is currently My (Alice) turn. Current Stack...,1 stack.,,Explanation: Based on the current state of th...,C,0,"Explanation: Based on the current stacks, onl...",D,...,0,"Based on the information provided, we can se...",B,0,"Based on the information provided, the curre...",A,0,Explanation: The game's current state shows th...,E,1
4,It is currently My (Alice) turn. Current Stack...,TOM,It is currently My (Alice) turn. Current Stack...,Reveal Bob's rank 2 cards.,,Explanation: Bob needs information to play hi...,E,1,Explanation: Bob has a Yellow 1 in his hand w...,A,...,1,"Based on the information provided, it would ...",A,0,"Based on the information provided, the best ...",E,1,"Explanation: Given the current stacks, the nex...",E,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,I am holding nothing. Bob is holding plate. Co...,TOM,I am holding nothing. Bob is holding plate. Co...,wait,,Explanation: Bob is holding an empty plate an...,A,0,Explanation: Since Bob is currently holding a...,A,...,0,Explanation: Based on the current state info...,A,0,Explanation: Since Bob is holding a plate an...,A,0,Explanation: Since the cooker contains 2 onion...,A,0
194,I am holding nothing. Bob is holding plate. Co...,JP,You are holding nothing. Bob is holding plate....,Pick up onion or tomato,,"Explanation: Since Bob is holding a plate, I ...",D,0,Explanation: Since Bob is holding a plate and...,B,...,1,Explanation: Since I am holding nothing and ...,A,1,Explanation: We want to maximize the number ...,C,0,Explanation: Since the cooker already has the ...,D,0
195,I am holding nothing. Bob is holding tomato. C...,EC,I am holding nothing. Bob is holding tomato. C...,1 tomato.,,Explanation: In order to cook soup in Overcoo...,A,0,Explanation: The cooker needs one onion to st...,A,...,0,Explanation: In order to begin cooking the s...,A,0,Explanation: The cooker already has 2 onions...,F,0,Explanation: Based on the game's rules and you...,C,1
196,I am holding nothing. Bob is holding tomato. C...,TOM,I am holding nothing. Bob is holding tomato. C...,Place tomato in cooker.,,Explanation: Since Bob is holding a tomato an...,B,0,Explanation: Since Bob is already holding the...,C,...,0,Explanation: Based on the current state info...,A,1,Explanation: Since Bob is holding a tomato a...,A,1,Explanation: Since the cooker already has 2 on...,A,1


In [None]:
# JP Synthesis Failures 

jp_sf = error_logs[error_logs[f'{m} Answer Correct']]


# JP Component Failures 

In [34]:

cols = [f'{m} Answer Correct' for m in all_models]

jp_error_logs = error_logs.loc[error_logs['Type'] == 'JP']
jp_error_logs_wrong = jp_error_logs.loc[jp_error_logs[cols].sum(axis=1) < 2]

tom_error_logs = error_logs.loc[error_logs['Type'] == 'TOM']
tom_error_logs_wrong = tom_error_logs.loc[tom_error_logs[cols].sum(axis=1) < 2]

ec_error_logs = error_logs.loc[error_logs['Type'] == 'EC']
ec_error_logs_wrong = ec_error_logs.loc[ec_error_logs[cols].sum(axis=1) < 2]





In [36]:
jp_error_logs_wrong.to_csv('jp_error_logs.csv')

In [10]:
errors = {all_models[i]: {'ec': [], 'tom': [], 'jp': []} for i in range(len(all_models))}
errors_df = pd.DataFrame(columns=['Model', 'Question', 'Inference', 'Ground Truth', 'Ground Truth Ordinal', 'Type'])
# for all models in logs, check the EC Answers, TOM Answers and JP Answers with the ground truth answers from the df, if the answer does not match add it to the corresponding dict key in errors
for model in all_models:
    for i in range(len(ec_ans)):
        
        if find_answer_fuzzy(ec_questions[i],logs[model]['EC_ANSWERS'][i]) not in ec_ans_ord[i]:
            new_row = pd.DataFrame({'Model': [model], 'Question': [ec_questions[i]], 'Inference': logs[model]['EC_ANSWERS'][i], 'Ground Truth': [ec_ans[i]], 'Ground Truth Ordinal': ec_ans_ord[i], 'Type': ['EC']})
            errors_df = pd.concat([errors_df, new_row], ignore_index=True)
            errors[model]['ec'].append(logs[model]['EC_ANSWERS'][i])
        
        if find_answer_fuzzy(tom_questions[i],logs[model]['TOM_ANSWERS'][i]) not in tom_ans_ord[i]:
            new_row = pd.DataFrame({'Model': [model], 'Question': [tom_questions[i]], 'Inference': logs[model]['TOM_ANSWERS'][i], 'Ground Truth': [tom_ans[i]],  'Ground Truth Ordinal': tom_ans_ord[i], 'Type': ['TOM']})
            errors_df = pd.concat([errors_df, new_row], ignore_index=True)
            errors[model]['tom'].append(logs[model]['TOM_ANSWERS'][i])
        
        if find_answer_fuzzy(jp_questions[i],logs[model]['JP_ANSWERS'][i]) not in jp_ans_ord[i]:
            new_row = pd.DataFrame({'Model': [model], 'Question': [jp_questions[i]], 'Inference': logs[model]['JP_ANSWERS'][i], 'Ground Truth': [jp_ans[i]], 'Ground Truth Ordinal': jp_ans_ord[i], 'Type': ['JP']})
            errors_df = pd.concat([errors_df, new_row], ignore_index=True)
            errors[model]['jp'].append(logs[model]['JP_ANSWERS'][i])



In [15]:
error_logs

NameError: name 'error_logs' is not defined

In [None]:
errors_df.to_csv('errors.csv')

In [58]:
ls = errors['llama_Llama-2-13b-chat-hf']['ec']
# print this list to a file like a bullet point list, remove all \n and \t
ls = [re.sub(r'\n', '', l) for l in ls]
ls = [re.sub(r'\t', '', l) for l in ls]
with open('errors_llama_Llama-2-13b-chat-hf.txt', 'w') as f:
    for l in ls:
        f.write(f'{l}\n')


In [63]:
# find errors for ec that are there in errors of all models 
ec_errors = set(errors[all_models[0]]['ec'])
# for model in all_models:
#     ec_errors = ec_errors.intersection(set(errors[model]['ec']))

In [64]:
ec_errors

{' Analysis: Since I am closer to Room 2 where the thief is, opening the door between Room 1 and 2 will increase the chances of trapping the thief.\n\nAction: A. I should open one of the doors.\nAnswer: A. Alice should open one of the doors.',
 ' Answer: A. No, we can get to the gate perfectly fine.\n\nExplanation: Since the killer is in room 7 and the exit gate is in room 1 (connected to room 7), and we have already fixed the generators, Alice and Bob can safely make their way to the exit gate and escape without the need to lure the killer away.',
 " Answer: C. Not enough information.\n\nExplanation: Since the killer is in the Backyard area, which is connected to the Gate area, there is no need for the survivors to lure the killer away to access the gate, as long as they can reach it before or at the same time as the killer. However, without knowing their exact positions and movement possibilities in the next turn, it's impossible to confirm that for certain.",
 ' Based on the given c

In [67]:
import pandas as pd

# save a new errors df that contains the model name, the question, the inference string and the ground truth answer, also add another field for the type of question
errors_df = pd.DataFrame(columns=['Model', 'Question', 'Inference', 'Ground Truth', 'Type'])
for model in all_models:
    for i in range(len(errors[model]['ec'])):
        new_row = pd.DataFrame({'Model': [model], 'Question': [ec_questions[i]], 'Inference': [errors[model]['ec'][i]], 'Ground Truth': [ec_ans[i]], 'Type': ['EC']})
        errors_df = pd.concat([errors_df, new_row], ignore_index=True)
    for i in range(len(errors[model]['tom'])):
        new_row = pd.DataFrame({'Model': [model], 'Question': [tom_questions[i]], 'Inference': [errors[model]['tom'][i]], 'Ground Truth': [tom_ans[i]], 'Type': ['TOM']})
        errors_df = pd.concat([errors_df, new_row], ignore_index=True)
    for i in range(len(errors[model]['jp'])):
        new_row = pd.DataFrame({'Model': [model], 'Question': [jp_questions[i]], 'Inference': [errors[model]['jp'][i]], 'Ground Truth': [jp_ans[i]], 'Type': ['JP']})
        errors_df = pd.concat([errors_df, new_row], ignore_index=True)

errors_df.to_csv('errors.csv', index=False)