In [3]:
import pandas as pd
import numpy as np
from bert_score import score

def calculate_avg_bertscore_for_question(correct, distractors):
    """
    Return the mean BERTScore F1 between a correct answer and its distractors.

    Every input is converted with ``str``; values for which ``pd.notnull`` is
    false are treated as empty strings. Empty distractors are skipped, and an
    empty correct answer means nothing can be scored.

    Args:
        correct: The correct answer for the question.
        distractors: Iterable of incorrect answer choices.

    Returns:
        The mean F1 over the scorable distractors, or ``np.nan`` when there
        is nothing to score.
    """
    reference = str(correct) if pd.notnull(correct) else ""
    if not reference:
        # Without a reference no distractor can be scored.
        return np.nan

    # Keep only distractors that are present and non-empty once stringified.
    candidates = [str(d) for d in distractors if pd.notnull(d)]
    candidates = [c for c in candidates if c]
    if not candidates:
        return np.nan

    # Score all distractors in a single batched call instead of one call per
    # distractor. NOTE(review): each score() call appears to set the model up
    # again (the init warning is emitted once per call), so batching avoids
    # paying that cost per distractor.
    _, _, f1 = score(candidates, [reference] * len(candidates), lang='en', verbose=False)
    return np.mean(f1.tolist())

def compute_question_bertscore(row, suffix, num_choices=3):
    """
    Compute the average BERTScore for one question row.

    Reads ``Correct_Answer{suffix}`` (a 1-indexed choice number) and the
    ``Choice_<i>{suffix}`` columns, then scores every non-correct choice
    against the correct one via ``calculate_avg_bertscore_for_question``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the columns.
        suffix: Column-name suffix selecting the dataset, e.g. ``'_gen'``.
        num_choices: Number of ``Choice_<i>`` columns scanned for
            distractors. Defaults to 3 (Choice_1 .. Choice_3).

    Returns:
        The average score, or ``np.nan`` (after printing a message) when the
        correct-answer value cannot be read as an integer or a required
        choice column is missing.
    """
    try:
        # int(float(...)) also accepts values such as 2.0 or "2.0", which can
        # appear when a numeric column is stored as float or text.
        correct_index = int(float(row[f'Correct_Answer{suffix}']))  # 1-indexed answer
    except (KeyError, TypeError, ValueError, OverflowError) as e:
        print(f"Error converting correct answer: {e}")
        return np.nan

    try:
        correct_choice = row[f'Choice_{correct_index}{suffix}']
        distractors = [
            row[f'Choice_{i}{suffix}']
            for i in range(1, num_choices + 1)
            if i != correct_index
        ]
    except KeyError as e:
        # An answer index with no matching column (or a missing choice column)
        # should yield NaN for this row instead of aborting the whole run.
        print(f"Missing choice column: {e}")
        return np.nan

    return calculate_avg_bertscore_for_question(correct_choice, distractors)

# ------------------------
# Input files: baseline questions and generated questions
# ------------------------
baseline_file = 'Data/baseline_morphology_questions.csv'  # saved from the generator's mc_data
generated_file = 'Data/generated_morphology_questions.csv'

baseline_df = pd.read_csv(baseline_file)
generated_df = pd.read_csv(generated_file)

# ------------------------
# Normalise the merge keys so that case and surrounding whitespace do not
# prevent a match
# ------------------------
for df in (baseline_df, generated_df):
    for key_column in ('Word', 'Task'):
        df[key_column] = df[key_column].astype(str).str.strip().str.lower()

# ------------------------
# Pair generated and baseline questions that share the same Word and Task.
# Overlapping columns get the '_gen' / '_base' suffixes.
# ------------------------
merged_df = generated_df.merge(baseline_df, on=['Word', 'Task'], suffixes=('_gen', '_base'))

if not merged_df.empty:
    # ------------------------
    # Score each matched question once per dataset
    # ------------------------
    merged_df['Baseline_Avg_BERTScore'] = merged_df.apply(
        compute_question_bertscore, axis=1, args=('_base',)
    )
    merged_df['Generated_Avg_BERTScore'] = merged_df.apply(
        compute_question_bertscore, axis=1, args=('_gen',)
    )

    # Generated minus baseline, per question
    merged_df['Difference'] = merged_df['Generated_Avg_BERTScore'].sub(
        merged_df['Baseline_Avg_BERTScore']
    )

    # ------------------------
    # Write the paired comparison to disk
    # ------------------------
    output_file = 'Data/1to1_BERTScore_comparison.csv'
    merged_df.to_csv(output_file, index=False)

    # Show the key columns and where the full table was saved
    summary_columns = ['Word', 'Task', 'Baseline_Avg_BERTScore', 'Generated_Avg_BERTScore', 'Difference']
    print("1:1 BERTScore Comparison:")
    print(merged_df[summary_columns])
    print(f"\nComparison saved to {output_file}")
else:
    print("No matching questions found between baseline and generated datasets. Check your merge keys!")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1:1 BERTScore Comparison:
      Word                                               Task  \
0  hushing  selecting or providing the correct definition ...   

   Baseline_Avg_BERTScore  Generated_Avg_BERTScore  Difference  
0                0.826615                      NaN         NaN  

Comparison saved to Data/1to1_BERTScore_comparison.csv
