In [9]:
# Jupyter Notebook Script

import pandas as pd
import os

# Locations of the per-model result CSVs for the GSM8K few-shot CoT run.
llama2_results_csv = '../results/llama2/gsm8k/few_shot_cot_number/20240603-114449626099_f68528eb5c7f4637a7e34112039be56b_answered.csv'
openai_results_csv = '../results/openAi/gsm8k/few_shot_cot_number/20240526-011547328226_bd607da7b1094ebf88a525b0859baef6_answered.csv'

# Read each model's results into its own dataframe.
data_llama2 = pd.read_csv(llama2_results_csv)
data_openai = pd.read_csv(openai_results_csv)

# Preview the top rows of both dataframes to check their layout.
for heading, frame in (("Llama2 Data:", data_llama2), ("OpenAI Data:", data_openai)):
    print(heading)
    display(frame.head())

# Summing the 'correct' column counts the rows flagged as correct.
num_correct_llama2, num_correct_openai = (
    frame['correct'].sum() for frame in (data_llama2, data_openai)
)

# Row counts: one row per question present in each results file.
total_questions_llama2 = data_llama2.shape[0]
total_questions_openai = data_openai.shape[0]

def count_common_wrong_answers(df1, df2):
    """Find the questions that both result sets got wrong.

    The two frames are inner-joined on their 'question' column; columns
    that appear in both get the suffix '_llama' (from ``df1``) or
    '_openai' (from ``df2``). Only rows whose 'correct_llama' and
    'correct_openai' values both equal False are kept.

    Args:
        df1: Results frame with at least 'question' and 'correct' columns.
        df2: Second results frame with the same two columns.

    Returns:
        A ``(count, rows)`` tuple: the number of shared wrong answers and
        the joined dataframe restricted to those rows (join index kept).
    """
    joined = pd.merge(df1, df2, on='question', suffixes=('_llama', '_openai'))
    llama_wrong = joined['correct_llama'].eq(False)
    openai_wrong = joined['correct_openai'].eq(False)
    both_wrong = joined.loc[llama_wrong & openai_wrong]
    return both_wrong.shape[0], both_wrong

# Rows (and their count) where both models answered the same question wrongly.
num_common_wrong, common_mistakes_df = count_common_wrong_answers(data_llama2, data_openai)

# Columns shown when comparing the two models' answers side by side.
comparison_columns = [
    'question',
    'predicted label_llama',
    'predicted label_openai',
    'ground truth_llama',
    'ground truth_openai',
]

print(f"Total number of common mistakes: {num_common_wrong}")
display(common_mistakes_df[comparison_columns])

# Print out the results
print(f"Total number of answered questions in Llama2 dataset: {total_questions_llama2}")
print(f"Number of questions answered correctly by Llama-2: {num_correct_llama2}")
print(f"Total number of answered questions in OpenAI dataset: {total_questions_openai}")
print(f"Number of questions answered correctly by OpenAI GPT-3.5 Turbo: {num_correct_openai}")

# Reuse the frame computed above rather than repeating the identical merge
# and filter; the `common_mistakes` name is kept for code that refers to it.
common_mistakes = common_mistakes_df

# Output locations; the directory is created if it does not exist yet.
directory = '../result_responses'
summary_file_path = os.path.join(directory, 'correct_answers_summary_fewShot.txt')
common_mistakes_file_path = os.path.join(directory, 'common_mistakes_fewShot.csv')
os.makedirs(directory, exist_ok=True)

# Display the common mistakes
print("Common mistakes where both models failed:")
display(common_mistakes[comparison_columns])

# Save the common mistakes next to the summary file. This used to write to a
# hard-coded 'common_mistakes.csv' in the working directory, which left
# `common_mistakes_file_path` unused.
common_mistakes.to_csv(common_mistakes_file_path, index=False)

# Save the summary of correct answers to a text file
with open(summary_file_path, 'w', encoding='utf-8') as f:
    f.write(f"Total number of answered questions in Llama2 dataset: {total_questions_llama2}\n")
    f.write(f"Number of questions answered correctly by Llama-2: {num_correct_llama2}\n")
    f.write(f"Total number of answered questions in OpenAI dataset: {total_questions_openai}\n")
    f.write(f"Number of questions answered correctly by OpenAI GPT-3.5 Turbo: {num_correct_openai}\n")
    f.write(f"Total number of common mistakes: {num_common_wrong}\n")



Llama2 Data:


Unnamed: 0,index,question,predicted label,ground truth,confidence,correct
0,1,Weng earns $12 an hour for babysitting. Yester...,9,10.0,0.95,False
1,2,Betty is saving money for a new wallet which c...,115,5.0,0.85,False
2,3,"Julie is reading a 120-page book. Yesterday, s...",42,42.0,0.8,True
3,4,James writes a 3-page letter to 2 different fr...,160,624.0,0.95,False
4,5,Mark has a garden with flowers. He planted pla...,90,35.0,0.9,False


OpenAI Data:


Unnamed: 0,index,question,predicted label,ground truth,confidence,correct
0,2,Betty is saving money for a new wallet which c...,5,5.0,1.0,True
1,3,"Julie is reading a 120-page book. Yesterday, s...",30,42.0,0.95,False
2,6,Albert is wondering how much pizza he can eat ...,40,48.0,1.0,False
3,8,Alexis is applying for a new job and bought a ...,A,41.0,0.95,False
4,9,Tina makes $18.00 an hour. If she works more ...,1170,990.0,0.9,False


Total number of common mistakes: 1214


Unnamed: 0,question,predicted label_llama,predicted label_openai,ground truth_llama,ground truth_openai
3,Alexis is applying for a new job and bought a ...,119,A,41.0,41.0
4,Tina makes $18.00 an hour. If she works more ...,1,1170,990.0,990.0
6,Tobias is buying a new pair of shoes that cost...,28,2,5.0,5.0
10,James creates a media empire. He creates a mo...,18,410000,448000.0,448000.0
12,"In a truck, there are 26 pink hard hats, 15 gr...",34,49,43.0,43.0
...,...,...,...,...,...
6573,Jim decides to open up a bike shop. The most ...,500,4700,3000.0,3000.0
6574,Mary went to the store to buy fruit. Apples co...,16,16,15.0,15.0
6575,Mark deposited $88 in a bank. Bryan deposited ...,-112,88,400.0,400.0
6582,Hilary is shucking corn from ears that grew on...,257,259200,237600.0,237600.0


Total number of answered questions in Llama2 dataset: 7300
Number of questions answered correctly by Llama-2: 1510
Total number of answered questions in OpenAI dataset: 6741
Number of questions answered correctly by OpenAI GPT-3.5 Turbo: 5367
Common mistakes where both models failed:


Unnamed: 0,question,predicted label_llama,predicted label_openai,ground truth_llama,ground truth_openai
3,Alexis is applying for a new job and bought a ...,119,A,41.0,41.0
4,Tina makes $18.00 an hour. If she works more ...,1,1170,990.0,990.0
6,Tobias is buying a new pair of shoes that cost...,28,2,5.0,5.0
10,James creates a media empire. He creates a mo...,18,410000,448000.0,448000.0
12,"In a truck, there are 26 pink hard hats, 15 gr...",34,49,43.0,43.0
...,...,...,...,...,...
6573,Jim decides to open up a bike shop. The most ...,500,4700,3000.0,3000.0
6574,Mary went to the store to buy fruit. Apples co...,16,16,15.0,15.0
6575,Mark deposited $88 in a bank. Bryan deposited ...,-112,88,400.0,400.0
6582,Hilary is shucking corn from ears that grew on...,257,259200,237600.0,237600.0
