In [9]:
import json
import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
from rouge_score import rouge_scorer
import Levenshtein
import matplotlib as plt

In [10]:
# Load the data for evaluation.
# `loki_translated` holds the Loki-generated atomic claims and `dataset_translated`
# holds the dataset entries they are compared against later in the notebook.
# The encoding is passed explicitly: JSON is UTF-8 by specification, while open()
# would otherwise fall back to the platform locale and can fail on non-ASCII text.
with open('translate/atomic_claims_FCGPT_claude_translated.json', 'r', encoding='utf-8') as fh:
    loki_translated = json.load(fh)

with open('translate/translated.json', 'r', encoding='utf-8') as fh:
    dataset_translated = json.load(fh)


In [11]:
def _flatten_claims(entries):
    """Collect the claims of every entry into one flat list per entry.

    Each element of an entry's ``'claims'`` value is handled by type:

    * ``list`` -- its elements are appended as-is (not inspected further),
    * ``dict`` -- the value under ``'claim'`` is appended (``''`` if absent),
    * ``str``  -- the string itself is appended.

    Elements of any other type are silently ignored. A missing or ``None``
    ``'claims'`` value yields an empty list for that entry.

    Args:
        entries: iterable of dict-like objects exposing ``.get``
            (presumably one per comment -- confirm against the JSON schema).

    Returns:
        A list with one list of claims per entry, in input order.
    """
    comments = []
    for entry in entries:
        claims = []
        # `or []` also covers an explicit `"claims": null` in the JSON.
        for item in entry.get('claims') or []:
            if isinstance(item, list):
                claims.extend(item)
            elif isinstance(item, dict):
                claims.append(item.get('claim', ''))
            elif isinstance(item, str):
                claims.append(item)
        comments.append(claims)
    return comments


# Organize claims by comment: index k of each list belongs to entry k of its file.
gt_comments = _flatten_claims(dataset_translated)
loki_comments = _flatten_claims(loki_translated)

In [12]:
# Shared scorer instance, built once and reused for every claim pair.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)


def compute_rouge_f1(reference, candidate):
    """Return the mean of the ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        reference: text passed to ``scorer.score`` as the first argument.
        candidate: text passed to ``scorer.score`` as the second argument.

    Returns:
        The unweighted average of the three ``fmeasure`` values
        (as computed by ``np.mean``).
    """
    result = scorer.score(reference, candidate)

    # Take the F-measure of each variant, in a fixed order, and average them.
    f_measures = [result[variant].fmeasure for variant in ('rouge1', 'rouge2', 'rougeL')]
    return np.mean(f_measures)

In [13]:
# Accumulators filled by the loop below:
#   all_scores      -- similarity of every matched (GT, generated) pair, all comments
#   all_pairs       -- one dict per matched pair (texts, score, owning comment index)
#   comment_metrics -- one dict per evaluated comment (average score, recall, precision, counts)
all_scores = []
all_pairs = []
comment_metrics = []
# A matched pair counts as a "good" match when its averaged ROUGE F1 is >= threshold.
threshold = 0.7

# zip() stops at the shorter list, so the two claim lists are assumed to be
# aligned entry by entry -- TODO confirm both files cover the same comments.
for comment_idx, (gt_claims_comment, gen_claims_comment) in enumerate(zip(gt_comments, loki_comments)):
    # Skip comments where either side has no claims.
    # NOTE(review): skipped comments do not contribute to the averaged
    # recall/precision computed from `comment_metrics` -- confirm this is intended.
    if not gt_claims_comment or not gen_claims_comment:
        continue

    num_gt = len(gt_claims_comment)
    num_gen = len(gen_claims_comment)

    # Square matrix padded with zeros: rows/columns beyond the real claims
    # represent "no claim" and have similarity 0.
    max_claims = max(num_gt, num_gen)
    similarity_matrix = np.zeros((max_claims, max_claims))

    # Fill the real (unpadded) part of the matrix. The indices deliberately do
    # NOT reuse the outer loop variable, so `comment_idx` stays intact below.
    for gt_idx, gt_claim in enumerate(gt_claims_comment):
        for gen_idx, gen_claim in enumerate(gen_claims_comment):
            similarity_matrix[gt_idx, gen_idx] = compute_rouge_f1(gt_claim, gen_claim)

    # Hungarian algorithm: linear_sum_assignment minimizes cost, so negate the
    # similarities to obtain the assignment with maximal total similarity.
    cost_matrix = -similarity_matrix
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Keep only assignments between two real claims (drop padded rows/columns).
    valid_matches = [(r, c) for r, c in zip(row_ind, col_ind)
                     if r < num_gt and c < num_gen]

    # Defensive guard; with both sides non-empty at least one real pair exists.
    if not valid_matches:
        continue

    # Similarity of each matched pair and their mean for this comment
    comment_scores = [similarity_matrix[r, c] for r, c in valid_matches]
    avg_score = np.mean(comment_scores)

    # Track matched pairs
    comment_pairs = [
        {
            'Comment_Index': comment_idx,
            'GT_Claim': gt_claims_comment[r],
            'Generated_Claim': gen_claims_comment[c],
            'Score': similarity_matrix[r, c]
        }
        for r, c in valid_matches
    ]

    # Comment-level metrics: share of GT claims (recall) and of generated claims
    # (precision) that ended up in a good match. Both counts are >= 1 here
    # because empty comments were skipped above, so no zero-division guard is needed.
    good_matches = sum(1 for score in comment_scores if score >= threshold)
    recall = good_matches / num_gt
    precision = good_matches / num_gen

    comment_metrics.append({
        'Comment_Index': comment_idx,
        'Avg_Score': avg_score,
        'Recall': recall,
        'Precision': precision,
        'GT_Claims': num_gt,
        'Gen_Claims': num_gen,
        'Good_Matches': good_matches
    })

    # Add to overall scores and pairs
    all_scores.extend(comment_scores)
    all_pairs.extend(comment_pairs)

In [14]:
# Tabulate the per-comment metrics so the columns can be aggregated directly
df_metrics = pd.DataFrame(comment_metrics)

# Overall figures: the score is averaged over every matched pair, while
# recall and precision are averaged over comments (one row per comment).
overall_avg_score = np.mean(all_scores)
overall_recall = df_metrics['Recall'].mean()
overall_precision = df_metrics['Precision'].mean()

summary_lines = [
    f"Overall Average Score: {overall_avg_score:.3f}",
    f"Overall Recall: {overall_recall:.2%}",
    f"Overall Precision: {overall_precision:.2%}",
]
print("\n".join(summary_lines))

Overall Average Score: 0.527
Overall Recall: 22.73%
Overall Precision: 20.11%
