In [None]:
# %%

from pathlib import Path

import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols  # type: ignore
from statsmodels.stats.anova import anova_lm  # type: ignore
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from embedding_utils import (
    load_embeddings,
    extract_llm_human_similarities_for_anova,
)

In [None]:
# %%

# (label, CSV path) pairs, in the order the labels are handed to the ANOVA
# extraction step. "Base" is the un-suffixed embeddings file.
embedding_sources = [
    ("Base", "../data/embeddings.csv"),
    ("Portuguese", "../data/embeddings_br.csv"),
    ("German", "../data/embeddings_de.csv"),
    ("Spanish", "../data/embeddings_es.csv"),
    ("French", "../data/embeddings_fr.csv"),
]

# One loaded embedding table per label; dicts keep insertion order, so the
# keys double as the ordered list of labels.
all_embeddings = {label: load_embeddings(path) for label, path in embedding_sources}

language_codes = list(all_embeddings)

In [None]:
# %%

# Gather the similarity rows for every language into a single frame. The
# columns relied on further down are "actor1", "language" and "similarity".
df_anova = extract_llm_human_similarities_for_anova(all_embeddings, language_codes)

# Gemini and Bison are left out of the analysis: keep every other actor.
is_excluded_actor = df_anova["actor1"].isin(["gemini", "bison"])
df_anova = df_anova.loc[~is_excluded_actor]

Processing languages: 100%|██████████| 5/5 [00:42<00:00,  8.59s/it]


In [None]:
# %%

# Persist the filtered ANOVA input table so it can be reused outside the
# notebook. to_csv does not create missing parent directories (it raises
# OSError instead), so make sure the results folder exists first; this keeps
# "Restart & Run All" working on a fresh checkout.
output_file = "../results/anova_data.csv"
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_anova.to_csv(output_file, index=False)

In [None]:
# %%

print("\n=== DESCRIPTIVE STATISTICS ===")

# Print the same describe() summary of the similarity score (count, mean,
# std, min, quartiles, max) once per grouping column.
for heading, group_col in (
    ("\nBy Actor (LLM + reasoning type):", "actor1"),
    ("\nBy Language:", "language"),
):
    print(heading)
    print(df_anova.groupby(group_col)["similarity"].describe())

# --- Disabled: one-way ANOVAs, one factor at a time. Kept for reference ---
# --- only; the two-way model fitted further down covers both factors.   ---
# # %%
# print("\n=== ONE-WAY ANOVA: Comparing Actors ===")
# actor_model = ols("similarity ~ C(actor1)", data=df_anova).fit()
# actor_anova_table = anova_lm(actor_model, typ=2)
# print(actor_anova_table)

# # %%
# print("\n=== ONE-WAY ANOVA: Comparing Languages ===")
# language_model = ols("similarity ~ C(language)", data=df_anova).fit()
# language_anova_table = anova_lm(language_model, typ=2)
# print(language_anova_table)


=== DESCRIPTIVE STATISTICS ===

By Actor (LLM + reasoning type):
           count      mean       std       min       25%       50%       75%  \
actor1                                                                         
claude   35200.0  0.443907  0.149189 -0.093526  0.344198  0.457664  0.553225   
gemma    56812.0  0.412250  0.147336 -0.123496  0.312816  0.420206  0.518762   
gpt3.5   35200.0  0.431731  0.152288 -0.157070  0.327947  0.441166  0.544672   
gpt4     24394.0  0.449698  0.161081 -0.112287  0.340416  0.461528  0.566727   
llama    35200.0  0.398551  0.169290 -0.193726  0.294246  0.418923  0.522491   
mistral  35200.0  0.390742  0.158669 -0.131123  0.283102  0.400584  0.506198   

              max  
actor1             
claude   0.884231  
gemma    0.841023  
gpt3.5   0.874765  
gpt4     0.904920  
llama    0.852901  
mistral  0.856723  

By Language:
               count      mean       std       min       25%       50%  \
language                                     

In [None]:
# %%

print("\n=== TWO-WAY ANOVA: Actor × Language ===")

# `*` in the formula expands to both main effects plus their interaction;
# C(...) marks each column as categorical.
two_way_formula = "similarity ~ C(actor1) * C(language)"
model = ols(two_way_formula, data=df_anova).fit()

# Type II sums of squares (typ=2) for the fitted model.
anova_table = anova_lm(model, typ=2)
print(anova_table)


=== TWO-WAY ANOVA: Actor × Language ===
                            sum_sq        df            F  PR(>F)
C(actor1)               102.718378       5.0   919.657495     0.0
C(language)             243.908706       4.0  2729.702238     0.0
C(actor1):C(language)   158.380327      20.0   354.502420     0.0
Residual               4958.588359  221976.0          NaN     NaN


In [None]:
# %%

print("\n=== POST-HOC TEST: Tukey's HSD ===")

# Each actor/language combination is one group, labelled "<actor>_<language>".
# alpha=0.005 is stricter than the library default of 0.05, so the "reject"
# column and the lower/upper bounds below refer to that level.
# NOTE(review): use_var="unequal" is understood to switch statsmodels from
# classic Tukey HSD to the Games-Howell variant (no equal-variance
# assumption) and needs a statsmodels release that accepts this keyword —
# confirm, and consider renaming the printed title accordingly.
tukey_results = pairwise_tukeyhsd(
    df_anova["similarity"],
    df_anova["actor1"] + "_" + df_anova["language"],
    alpha=0.005,
    use_var="unequal",
)

# summary() is the public accessor for the results table (the object that
# was previously reached through the private `_results_table` attribute).
# Its .data is a list of rows whose first row holds the column headers.
tukey_table = tukey_results.summary().data
tukey_df = pd.DataFrame(tukey_table[1:], columns=tukey_table[0])

# Recover actor and language from the group labels. The split happens on the
# first underscore only, so actor names must not contain "_".
tukey_df[["model1", "language1"]] = tukey_df["group1"].str.split("_", n=1, expand=True)
tukey_df[["model2", "language2"]] = tukey_df["group2"].str.split("_", n=1, expand=True)

# Keep only the comparisons that hold one factor fixed: same actor across
# languages, or same language across actors. The adjustment itself was still
# computed over all pairs of groups, before this filter.
same_model = tukey_df["model1"] == tukey_df["model2"]
same_language = tukey_df["language1"] == tukey_df["language2"]
filtered_tukey = tukey_df[same_model | same_language]

print(
    filtered_tukey[
        ["group1", "group2", "meandiff", "p-adj", "lower", "upper", "reject"]
    ].to_string(index=False)
)


=== POST-HOC TEST: Tukey's HSD ===
            group1             group2  meandiff  p-adj   lower   upper  reject
       claude_Base      claude_French    0.0226 0.0000  0.0081  0.0372    True
       claude_Base      claude_German   -0.1352 0.0000 -0.1499 -0.1204    True
       claude_Base  claude_Portuguese    0.0980 0.0000  0.0799  0.1160    True
       claude_Base     claude_Spanish    0.0828 0.0000  0.0545  0.1110    True
       claude_Base         gemma_Base   -0.0303 0.0000 -0.0348 -0.0257    True
       claude_Base        gpt3.5_Base   -0.0128 0.0000 -0.0180 -0.0076    True
       claude_Base          gpt4_Base    0.0079 0.0000  0.0019  0.0140    True
       claude_Base         llama_Base   -0.0242 0.0000 -0.0293 -0.0192    True
       claude_Base       mistral_Base   -0.0439 0.0000 -0.0490 -0.0387    True
     claude_French      claude_German   -0.1578 0.0000 -0.1779 -0.1378    True
     claude_French  claude_Portuguese    0.0753 0.0000  0.0528  0.0979    True
     claude_Fren