In [None]:
import pandas as pd
import json
import os

input_dir = '1_2_datasets_final/1_pairwise_preferences/by_user'
output_dir = '1_2_datasets_final/1_pairwise_preferences/majority_voting'
os.makedirs(output_dir, exist_ok=True)

# Never updated by this cell; kept so that other cells referencing the name
# keep working.
contradiction_count = 0
# Running total of tied pairs across every file processed by this cell.
tie_count = 0


def _majority_vote_rows(df):
    """Collapse per-annotator preferences into one majority verdict per pair.

    A pair is identified by its two texts regardless of the order in which
    they appear. A row whose ``preferred_text`` is the lexicographically
    smaller text counts as +1 for the sorted order (A, B); a row with the
    larger text preferred counts as -1. A row whose two texts are identical
    contributes 0.

    Args:
        df: DataFrame with ``preferred_text`` and ``other_text`` columns.

    Returns:
        A ``(rows, ties)`` tuple. ``rows`` holds two dicts per pair, emitted
        in sorted pair order: the majority ordering with ``label`` 1 followed
        by the reversed ordering with ``label`` 0. ``ties`` is the number of
        pairs whose net vote was exactly 0; those are resolved in favour of
        the lexicographic order (A, B).

    NOTE(review): only ``preferred_text`` / ``other_text`` are read. If the
    input rows also carry their own ``label`` and already contain both
    orderings of each pair, every such pair nets to 0 and is counted as a
    tie -- confirm against the input schema.
    """
    # DataFrame truthiness is ambiguous, so test the row count explicitly.
    # An empty input has no columns to read; it yields an empty dataset.
    if len(df) == 0:
        return [], 0

    net_votes = {}
    preferred_texts = df['preferred_text'].tolist()
    other_texts = df['other_text'].tolist()
    for preferred, other in zip(preferred_texts, other_texts):
        first, second = sorted((preferred, other))
        # +1 when the smaller text won, -1 when the larger one won, and 0
        # when both texts are equal (both comparisons are then true).
        vote = int(preferred == first) - int(preferred == second)
        key = (first, second)
        net_votes[key] = net_votes.get(key, 0) + vote

    rows = []
    ties = 0
    for first, second in sorted(net_votes):
        net_vote = net_votes[(first, second)]
        if net_vote == 0:
            ties += 1
        # Ties fall back to the lexicographic order, same as a positive vote.
        if net_vote >= 0:
            majority_preferred, majority_other = first, second
        else:
            majority_preferred, majority_other = second, first

        # Emit both orderings so the resulting dataset is label-balanced.
        rows.append({
            'preferred_text': majority_preferred,
            'other_text': majority_other,
            'label': 1,
            'user_id': 'majority_voting'
        })
        rows.append({
            'preferred_text': majority_other,
            'other_text': majority_preferred,
            'label': 0,
            'user_id': 'majority_voting'
        })
    return rows, ties


# Sorted so that processing (and log) order is reproducible across runs.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.json'):
        continue

    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    # Load the original per-user dataset.
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    df = pd.DataFrame(data)

    balanced_rows, file_tie_count = _majority_vote_rows(df)
    tie_count += file_tie_count

    # Save the balanced dataset under the same file name.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(balanced_rows, outfile, ensure_ascii=False, indent=4)
    # Report this file's ties only; the running total stays in tie_count.
    print(f"Processed {filename} - Empates resueltos: {file_tie_count}")

Processed ttcw_pairs_balanced.json - Empates resueltos: 830
Processed pronvsprompt_pairs_balanced.json - Empates resueltos: 13331
Processed hanna_pairs_balanced.json - Empates resueltos: 555952
Processed confederacy_pairs_balanced.json - Empates resueltos: 556342
Processed poetry_pairs_balanced.json - Empates resueltos: 556387
Processed slm_pairs_balanced.json - Empates resueltos: 560045
