In [1]:
# Jupyter Notebook Script

import os

import pandas as pd

# Read the answered-question result CSVs for both models (relative paths)
data_llama2 = pd.read_csv(
    '../results/llama2/gsm8k/zero_shot_cot_gsm8k/20240603-112531205664_289c2160bbb5425186b314e0fb9e4551_answered.csv'
)
data_openai = pd.read_csv(
    '../results/openAi/gsm8k/zero_shot_cot_gsm8k/20240525-174150582952_72dcbbf5bc124790b148e5ddce48fc59_answered.csv'
)

# Preview the top rows of each frame under its own heading to show the column layout
for preview_heading, preview_frame in (
    ("Llama2 Data:", data_llama2),
    ("OpenAI Data:", data_openai),
):
    print(preview_heading)
    display(preview_frame.head())

# Dataset sizes: the row count of each frame (reported below as answered questions)
total_questions_llama2 = len(data_llama2)
total_questions_openai = len(data_openai)

# Correct-answer counts: the sum of each frame's 'correct' column
num_correct_llama2 = data_llama2['correct'].sum()
num_correct_openai = data_openai['correct'].sum()

# Report totals and correct counts, one line per figure
for report_line in (
    f"Total number of answered questions in Llama2 dataset: {total_questions_llama2}",
    f"Number of questions answered correctly by Llama-2: {num_correct_llama2}",
    f"Total number of answered questions in OpenAI dataset: {total_questions_openai}",
    f"Number of questions answered correctly by OpenAI GPT-3.5 Turbo: {num_correct_openai}",
):
    print(report_line)

# Pair up the two models' rows that share the same question text. merge() defaults
# to an inner join, so questions present in only one result file are dropped;
# column names common to both frames get the '_llama' / '_openai' suffixes.
merged_answers = data_llama2.merge(data_openai, on='question', suffixes=('_llama', '_openai'))

# Keep only the questions that BOTH models answered incorrectly.
# `== False` (rather than `~`) stays a plain element-wise comparison even if the
# 'correct' column was not parsed as a boolean dtype.
both_failed = (merged_answers['correct_llama'] == False) & (merged_answers['correct_openai'] == False)
common_mistakes = merged_answers[both_failed]

# Output locations: both artefacts live side by side in one results directory
directory = '../result_responses'
summary_file_path = os.path.join(directory, 'correct_answers_summary.txt')
common_mistakes_file_path = os.path.join(directory, 'common_mistakes.csv')
os.makedirs(directory, exist_ok=True)

# Display the common mistakes
print("Common mistakes where both models failed:")
display(common_mistakes[['question', 'predicted label_llama', 'predicted label_openai', 'ground truth_llama', 'ground truth_openai']])

# Save the common mistakes next to the summary file. The path built above is used
# instead of a hard-coded absolute location that may not exist on this machine.
common_mistakes.to_csv(common_mistakes_file_path, index=False)

# Save the summary of correct answers to a text file
with open(summary_file_path, 'w') as f:
    f.write(f"Total number of answered questions in Llama2 dataset: {total_questions_llama2}\n")
    f.write(f"Number of questions answered correctly by Llama-2: {num_correct_llama2}\n")
    f.write(f"Total number of answered questions in OpenAI dataset: {total_questions_openai}\n")
    f.write(f"Number of questions answered correctly by OpenAI GPT-3.5 Turbo: {num_correct_openai}\n")
