In [None]:
import json
from itertools import combinations
from pathlib import Path

import pandas as pd

# Load the data
def load_json(path):
    """Read the JSON file at `path` and return the decoded Python object.

    The file is opened explicitly as UTF-8 (the encoding JSON interchange
    files are expected to use) so that decoding does not depend on the
    platform's default locale encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


annotations = load_json('input/ttcw_annotations.json')
short_stories = load_json('input/ttcw_short_stories.json')
tests = load_json('input/ttcw_all_tests.json')

# Convert to DataFrames.
# NOTE(review): df_short_stories and df_tests are not referenced by the cells
# shown here; they are kept because other cells may rely on them.
df_annotations = pd.DataFrame(annotations)
df_short_stories = pd.DataFrame(short_stories)
df_tests = pd.DataFrame(tests)

# Compute each story's total score per expert: an annotation counts as 1 when
# its verdict is exactly "Yes" and 0 otherwise, and the counts are summed for
# every (expert_idx, story_id) pair.
df_annotations['binary_score'] = df_annotations['binary_verdict'].eq("Yes").astype('int64')
df_scores = (
    df_annotations
    .groupby(['expert_idx', 'story_id'])['binary_score']
    .sum()
    .rename('total_score')
    .reset_index()
)

# Generate pairwise comparisons between the stories scored by the same expert.
pairwise_data = []
ties_count = 0

for expert, group in df_scores.groupby('expert_idx'):
    user_id = f'Expert{expert}'
    # (story_id, total_score) tuples in the group's row order.
    scored_stories = list(zip(group['story_id'], group['total_score']))

    for (story_a, score_a), (story_b, score_b) in combinations(scored_stories, 2):
        # Decide which story of the pair scored higher.
        if score_a > score_b:
            winner, loser = story_a, story_b
        elif score_a < score_b:
            winner, loser = story_b, story_a
        else:
            # Neither story scored higher: count the tie and emit no rows.
            ties_count += 1
            continue

        # Row with label 1: the higher-scoring story is in 'preferred_text'.
        pairwise_data.append({
            'user_id': user_id,
            'preferred_text': winner,
            'other_text': loser,
            'label': 1
        })
        # Balancing row with label 0: same pair with the positions swapped, so
        # here 'preferred_text' holds the lower-scoring story.
        pairwise_data.append({
            'user_id': user_id,
            'preferred_text': loser,
            'other_text': winner,
            'label': 0
        })

# Convert the comparison records to a DataFrame. The columns are listed
# explicitly so the frame keeps its schema even when no record was produced
# (for example, when every comparison ended in a tie).
pairwise_df = pd.DataFrame(
    pairwise_data,
    columns=['user_id', 'preferred_text', 'other_text', 'label'],
)

# Report how many comparisons were skipped as ties
print(f"Number of ties found: {ties_count}")

# Display the result
pairwise_df


Número de empates encontrados: 153


Unnamed: 0,user_id,preferred_text,other_text,label
0,Expert0,10_Claude,10_GPT3.5,1
1,Expert0,10_GPT3.5,10_Claude,0
2,Expert0,10_GPT4,10_Claude,1
3,Expert0,10_Claude,10_GPT4,0
4,Expert0,10_NewYorker,10_Claude,1
...,...,...,...,...
2457,Expert10,9_Claude,9_NewYorker,0
2458,Expert10,9_NewYorker,9_GPT3.5,1
2459,Expert10,9_GPT3.5,9_NewYorker,0
2460,Expert10,9_NewYorker,9_GPT4,1


In [8]:
# Write the balanced pairs to CSV. The target directory is created first so
# the export also works when 'output/' does not exist yet.
# The row index is still written as the first, unnamed column (to_csv default).
output_path = Path('output/pairs_balanced_ttcw.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
pairwise_df.to_csv(output_path)