In [None]:
import pandas as pd
import json,os
from tqdm import tqdm
import numpy as np
from scipy.optimize import minimize

In [None]:
# Load the pairwise matchup records that every later cell works from.
df = pd.read_csv(filepath_or_buffer="matchups.csv")

In [None]:
# Tag every row with an integer id per distinct image_path, then group the
# matchups by that id so whole images can be resampled together later.
df['image_group'] = df.groupby(by='image_path').ngroup()
image_groups = df.groupby(by='image_group')

In [None]:
# Every model that appears on either side of a matchup, in first-seen order
# (winners column first, then losers column).
all_models = pd.unique(pd.concat([df['selected_model'], df['other_model']]))
n_models = len(all_models)
# Position of each model in all_models; used as its row/column in win matrices.
model_to_idx = dict(zip(all_models, range(n_models)))

In [None]:
def fit_bradley_terry(df, all_models, model_to_idx, n_models):
    """Fit Bradley-Terry strengths to pairwise matchups by maximum likelihood.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per matchup; ``selected_model`` is the winner and
        ``other_model`` is the loser.
    all_models : sequence
        Not used by the fit; accepted so existing call sites keep working.
    model_to_idx : dict
        Maps each model name to its row/column index in the win matrix.
    n_models : int
        Number of models, i.e. the length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Length-``n_models`` vector of positive strengths that sums to 1,
        ordered according to ``model_to_idx``.

    Raises
    ------
    KeyError
        If ``df`` mentions a model that is missing from ``model_to_idx``.
    """
    if n_models == 0:
        # Nothing to fit; avoid handing an empty problem to the optimizer.
        return np.empty(0)

    n_rows = len(df)
    # Direct dict lookups (instead of Series.map) keep the KeyError for unknown models.
    winner_idx = np.fromiter(
        (model_to_idx[name] for name in df['selected_model']), dtype=np.intp, count=n_rows
    )
    loser_idx = np.fromiter(
        (model_to_idx[name] for name in df['other_model']), dtype=np.intp, count=n_rows
    )

    # wins[i, j] counts how often model i was selected over model j.
    wins = np.zeros((n_models, n_models))
    # np.add.at accumulates repeated (winner, loser) pairs; a plain
    # fancy-indexed ``+=`` would count each distinct pair only once.
    np.add.at(wins, (winner_idx, loser_idx), 1)

    def neg_log_likelihood(log_strengths):
        # log P(i beats j) = theta_i - log(exp(theta_i) + exp(theta_j)).
        # Working in log space is equivalent to exponentiating and normalising
        # first (the normaliser cancels in the ratio) but cannot hit log(0).
        log_pair_totals = np.logaddexp(log_strengths[:, None], log_strengths[None, :])
        log_p = log_strengths[:, None] - log_pair_totals
        return -np.sum(wins * log_p)

    # Initialize strengths equally
    initial_strengths = np.ones(n_models) / n_models

    result = minimize(neg_log_likelihood, np.log(initial_strengths), method='BFGS')

    # Softmax of the fitted log-strengths, shifted by the max for stability.
    strengths = np.exp(result.x - np.max(result.x))
    return strengths / strengths.sum()

In [None]:
np.random.seed(42)

n_bootstrap = 1000
bootstrap_strengths = []

# Cluster bootstrap: resample whole images (with replacement) so that all
# matchups belonging to one image stay together in each replicate.
image_group_ids = list(image_groups.groups.keys())  # loop-invariant, built once

for _ in tqdm(range(n_bootstrap)):

    sampled_images = np.random.choice(image_group_ids, size=len(image_groups), replace=True)
    sampled_data = pd.concat([image_groups.get_group(img) for img in sampled_images])

    results = fit_bradley_terry(sampled_data, all_models, model_to_idx, n_models)
    # One strength per known model; compare against n_models rather than a
    # hard-coded count so the notebook works for any number of models.
    assert len(results) == n_models
    bootstrap_strengths.append(results)

In [None]:
# Stack the per-replicate strength vectors (one row per bootstrap replicate),
# then summarise each model's column by its mean and a 2.5-97.5 percentile band.
bootstrap_strengths = np.array(bootstrap_strengths)
strengths_mean = np.mean(bootstrap_strengths, axis=0)
confidence_intervals = np.percentile(bootstrap_strengths, q=[2.5, 97.5], axis=0)

In [None]:
# One row per model: bootstrap mean strength plus the lower/upper percentile bounds.
final_results = pd.DataFrame(
    dict(
        model=all_models,
        mean_strength=strengths_mean,
        ci_lower=confidence_intervals[0],
        ci_upper=confidence_intervals[1],
    )
)

In [None]:
# Strongest model first, with a fresh 0..n-1 index.
final_results = (
    final_results
    .sort_values(by='mean_strength', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Persist the ranked table without the positional index column.
final_results.to_csv(path_or_buf="final_results.csv", index=False)

# Rank by image

In [None]:
# Read the caption records that the per-image ranking below annotates.
with open("captions_for_web.json", mode="r") as caption_file:
    caption_data = json.loads(caption_file.read())

In [None]:
# For every image, fit a Bradley-Terry model on that image's matchups only and
# store each caption's model strength on the caption record.
for datum in caption_data:

    image_name = os.path.basename(datum['url'])
    image_matchups = df.loc[df.image_path == image_name]

    # Image-local model set, kept under separate names so the notebook-level
    # all_models / model_to_idx / n_models are not overwritten by this loop.
    image_models = pd.concat([image_matchups['selected_model'], image_matchups['other_model']]).unique()
    image_model_to_idx = {model: idx for idx, model in enumerate(image_models)}

    # Reuse the shared fitter: neg_log_likelihood only exists as a closure
    # inside fit_bradley_terry and is not callable from notebook scope.
    image_strengths = fit_bradley_terry(
        image_matchups, image_models, image_model_to_idx, len(image_models)
    )
    strength_by_model = dict(zip(image_models, image_strengths))

    for caption in datum["captions"]:
        # Raises KeyError if this caption's model never appeared in a matchup
        # for the image. float() stores a plain Python number in the JSON data.
        caption["strength"] = float(strength_by_model[caption["model"]])

In [None]:
# Write the annotated caption records back over the file they were read from.
with open("captions_for_web.json", mode="w") as caption_file:
    json.dump(obj=caption_data, fp=caption_file, indent=4)

# Stabilization

In [None]:
# Rebuild the full model list from all matchups (winners column first, then losers).
all_models = pd.unique(pd.concat([df['selected_model'], df['other_model']]))

In [None]:
# Map each model to its position in all_models.
model_to_idx = dict(zip(all_models, range(len(all_models))))

In [None]:
# Number of distinct models across all matchups.
n_models = all_models.shape[0]

In [None]:
# Track how the model ranking evolves as matchups accumulate: refit on the
# first i+1 matchups and record the ranking as a string of model indices.
rankstrings = []
# NOTE(review): 1350 is presumably the number of rows in matchups.csv — confirm,
# or replace with len(df).
for i in range(1350):

    # .loc slicing is label-based and inclusive, so this keeps index labels up to i.
    prefix_matchups = df.loc[:i]

    # Reuse the shared fitter: neg_log_likelihood only exists as a closure
    # inside fit_bradley_terry and is not callable from notebook scope.
    strengths = fit_bradley_terry(prefix_matchups, all_models, model_to_idx, n_models)

    results = pd.DataFrame({'model': all_models, 'strength': strengths})
    results = results.sort_values('strength', ascending=False).reset_index(drop=True)

    # Concatenated model indices, strongest first. This is only unambiguous
    # while every index is a single digit (at most 10 models).
    rankstring = "".join([str(model_to_idx[item]) for item in results.model])
    rankstrings.append((i, rankstring))