In [None]:
# %%

from pathlib import Path

import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols  # type: ignore
from statsmodels.stats.anova import anova_lm  # type: ignore
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from embedding_utils import (
    load_embeddings,
    extract_llm_human_similarities_for_anova,
)

In [None]:
# %%

# Language label -> embeddings CSV. Insertion order matters: it fixes both the
# load order and the order of `language_codes` below.
embedding_files = {
    "Base": "../data/embeddings.csv",
    "Portuguese": "../data/embeddings_br.csv",
    "German": "../data/embeddings_de.csv",
    "Spanish": "../data/embeddings_es.csv",
    "French": "../data/embeddings_fr.csv",
}

# One loaded embedding set per language label.
all_embeddings = {
    label: load_embeddings(csv_path) for label, csv_path in embedding_files.items()
}

# Labels in load order, derived from the dict so the two can never drift apart.
language_codes = list(all_embeddings)

In [None]:
# %%

# Build the table that the analysis cells below read `similarity`, `actor1`
# and `language` from, then persist it so the extraction (the slow step of
# this notebook) does not have to be repeated just to inspect the data.
output_file = "../results/anova_data.csv"
df_anova = extract_llm_human_similarities_for_anova(all_embeddings, language_codes)

# DataFrame.to_csv does not create missing parent directories; without this a
# fresh checkout lacking ../results would fail only after the extraction ran.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_anova.to_csv(output_file, index=False)

Processing languages: 100%|██████████| 5/5 [00:42<00:00,  8.56s/it]


In [None]:
# %%

# Post-hoc pairwise comparison of mean similarity between every
# "<actor>_<language>" group, reported only for pairs that share the actor or
# share the language (cross-actor/cross-language pairs are dropped).
print("\n=== POST-HOC TEST: Tukey's HSD ===")
tukey_results = pairwise_tukeyhsd(
    df_anova["similarity"],
    # Group label is "<actor>_<language>"; it is split back apart below.
    df_anova["actor1"] + "_" + df_anova["language"],
    alpha=0.005,  # significance level used for the reject decision / interval
    # NOTE(review): per the statsmodels docs, "unequal" drops the
    # equal-variance assumption (Games-Howell style) -- confirm for the
    # installed version, since the printed heading still says Tukey's HSD.
    use_var="unequal",
)

# summary() is the public accessor for the results table; its .data is a list
# of rows whose first entry is the header row.
summary_rows = tukey_results.summary().data
tukey_df = pd.DataFrame(summary_rows[1:], columns=summary_rows[0])

# Split on the LAST underscore: the language labels used in this notebook
# contain no "_", whereas an actor name might, and a left split would then
# push part of the actor name into the language column.
tukey_df[["model1", "language1"]] = tukey_df["group1"].str.rsplit(
    "_", n=1, expand=True
)
tukey_df[["model2", "language2"]] = tukey_df["group2"].str.rsplit(
    "_", n=1, expand=True
)

# Keep same-actor (across languages) or same-language (across actors) pairs.
filtered_tukey = tukey_df[
    (tukey_df["model1"] == tukey_df["model2"])
    | (tukey_df["language1"] == tukey_df["language2"])
]
print(
    filtered_tukey[
        ["group1", "group2", "meandiff", "p-adj", "lower", "upper", "reject"]
    ].to_string(index=False)
)


=== POST-HOC TEST: Tukey's HSD ===
            group1             group2  meandiff  p-adj   lower   upper  reject
        bison_Base        claude_Base   -0.0237 0.0000 -0.0293 -0.0181    True
        bison_Base         gemma_Base   -0.0540 0.0000 -0.0591 -0.0489    True
        bison_Base        gpt3.5_Base   -0.0365 0.0000 -0.0422 -0.0309    True
        bison_Base          gpt4_Base   -0.0158 0.0000 -0.0223 -0.0093    True
        bison_Base         llama_Base   -0.0480 0.0000 -0.0535 -0.0424    True
        bison_Base       mistral_Base   -0.0676 0.0000 -0.0732 -0.0620    True
       claude_Base      claude_French    0.0226 0.0000  0.0078  0.0375    True
       claude_Base      claude_German   -0.1352 0.0000 -0.1501 -0.1202    True
       claude_Base  claude_Portuguese    0.0980 0.0000  0.0796  0.1163    True
       claude_Base     claude_Spanish    0.0828 0.0000  0.0541  0.1114    True
       claude_Base         gemma_Base   -0.0303 0.0000 -0.0349 -0.0256    True
       claude_Ba

In [None]:
# %%

print("\n=== DESCRIPTIVE STATISTICS ===")

# One describe() table of `similarity` per grouping column, printed in order.
for heading, group_col in (
    ("\nBy Actor (LLM + reasoning type):", "actor1"),
    ("\nBy Language:", "language"),
):
    print(heading)
    print(df_anova.groupby(group_col)["similarity"].describe())

# Disabled one-way ANOVAs (one factor at a time), kept commented out for
# reference; they are not executed.
# # %%
# print("\n=== ONE-WAY ANOVA: Comparing Actors ===")
# actor_model = ols("similarity ~ C(actor1)", data=df_anova).fit()
# actor_anova_table = anova_lm(actor_model, typ=2)
# print(actor_anova_table)

# # %%
# print("\n=== ONE-WAY ANOVA: Comparing Languages ===")
# language_model = ols("similarity ~ C(language)", data=df_anova).fit()
# language_anova_table = anova_lm(language_model, typ=2)
# print(language_anova_table)


=== DESCRIPTIVE STATISTICS ===

By Actor (LLM + reasoning type):
           count      mean       std       min       25%       50%       75%  \
actor1                                                                         
bison    32418.0  0.468642  0.171823 -0.122836  0.353789  0.482966  0.596684   
claude   35200.0  0.443907  0.149189 -0.093526  0.344198  0.457664  0.553225   
gemini    2782.0  0.420075  0.138107 -0.026329  0.316160  0.429498  0.520325   
gemma    56812.0  0.412250  0.147336 -0.123496  0.312816  0.420206  0.518762   
gpt3.5   35200.0  0.431731  0.152288 -0.157070  0.327947  0.441166  0.544672   
gpt4     24394.0  0.449698  0.161081 -0.112287  0.340416  0.461528  0.566727   
llama    35200.0  0.398551  0.169290 -0.193726  0.294246  0.418923  0.522491   
mistral  35200.0  0.390742  0.158669 -0.131123  0.283102  0.400584  0.506198   

              max  
actor1             
bison    0.925996  
claude   0.884231  
gemini   0.769867  
gemma    0.841023  
gpt3.5   0.87

In [None]:
# %%

# Remove Gemini and Bison from analysis.
# NOTE: this rebinds `df_anova` itself, so any earlier cell re-run after this
# one (e.g. the post-hoc test or descriptive statistics) sees the reduced frame.
keep_rows = ~df_anova["actor1"].isin(["gemini", "bison"])
df_anova = df_anova[keep_rows]

# Two-factor model with interaction: actor, language, and actor:language.
print("\n=== TWO-WAY ANOVA: Actor × Language ===")
model = ols("similarity ~ C(actor1) * C(language)", data=df_anova).fit()

# typ=2 requests the Type II ANOVA table from anova_lm.
anova_table = anova_lm(model, typ=2)
print(anova_table)


=== TWO-WAY ANOVA: Actor × Language ===
                            sum_sq        df            F  PR(>F)
C(actor1)               102.718378       5.0   919.657495     0.0
C(language)             243.908706       4.0  2729.702238     0.0
C(actor1):C(language)   158.380327      20.0   354.502420     0.0
Residual               4958.588359  221976.0          NaN     NaN
