In [None]:
# %%

import pandas as pd
from scipy import stats
from typing import Dict, List, Tuple
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from statsmodels.formula.api import ols  # type: ignore
from statsmodels.stats.anova import anova_lm  # type: ignore

In [None]:
# %%

def load_embeddings(embeddings_file: str) -> Dict[str, np.ndarray]:
    """Load embeddings from CSV and organize them by column.

    Every column whose name ends in ``_embedding`` is parsed from its string
    form (space-separated floats wrapped in square brackets) into a 2-D
    ``float32`` array with one row per CSV row.

    Missing cells become all-zero rows. The width of that placeholder is taken
    from the first parseable value in the same column, so files whose
    embeddings are not 384-dimensional still stack correctly; 384 is used only
    when a column contains no parseable value at all.

    Args:
        embeddings_file: Path to the CSV file containing embeddings

    Returns:
        Dictionary where keys are column names and values are numpy arrays
        of shape (n_rows, embedding_dim)

    Raises:
        ValueError: If the CSV has no rows, or if the vectors parsed from one
            column do not all have the same length (raised by ``np.vstack``).
    """
    fallback_dim = 384  # width used when a column has no non-missing value

    df = pd.read_csv(embeddings_file)
    embedding_cols = [col for col in df.columns if col.endswith("_embedding")]
    embeddings_dict = {}

    for col in embedding_cols:
        # First pass: parse every present cell, keep None for missing ones so
        # the placeholder width can be decided once the real width is known.
        parsed = [
            None
            if pd.isna(cell)
            else np.fromstring(cell.strip("[]"), sep=" ", dtype=np.float32)
            for cell in df[col]
        ]

        dim = next((vec.shape[0] for vec in parsed if vec is not None), fallback_dim)
        placeholder = np.zeros(dim, dtype=np.float32)

        # Second pass: substitute the correctly sized zero vector and stack.
        embeddings_dict[col] = np.vstack(
            [placeholder if vec is None else vec for vec in parsed]
        )

    return embeddings_dict


def identify_actors_and_reasons(
    embeddings_dict: Dict[str, np.ndarray],
) -> Tuple[List[str], List[str]]:
    """Derive the actor names and reason types encoded in embedding column names.

    Column names are read as ``<actor>_<reason>_embedding``: the text before
    the first underscore is the actor, the remainder (after removing
    ``_embedding``) is the reason type. Two columns are special-cased:
    ``selftext_embedding`` is ignored, and ``top_comment_embedding`` maps to
    actor ``redditor`` with reason type ``top_comment``.

    Args:
        embeddings_dict: Dictionary mapping column names to embedding arrays
            (only the keys are inspected)

    Returns:
        Tuple of (actors, reason_types). Actors follow a fixed preferred
        order, with any unrecognised actors appended alphabetically; reason
        types are sorted alphabetically.
    """
    preferred_order = (
        "redditor",
        "gpt3.5",
        "gpt4",
        "claude",
        "bison",
        "gemini",
        "llama",
        "mistral",
        "gemma",
    )
    rank = {name: position for position, name in enumerate(preferred_order)}

    seen_actors = set()
    seen_reasons = set()

    for column in embeddings_dict:
        if column == "selftext_embedding":
            continue

        if column == "top_comment_embedding":
            actor_name, reason_name = "redditor", "top_comment"
        else:
            # Split on the first underscore only: actor first, the rest is the reason.
            stem = column.replace("_embedding", "")
            actor_name, _, reason_name = stem.partition("_")

        seen_actors.add(actor_name)
        seen_reasons.add(reason_name)

    ranked = sorted((a for a in seen_actors if a in rank), key=rank.__getitem__)
    unranked = sorted(a for a in seen_actors if a not in rank)

    return ranked + unranked, sorted(seen_reasons)

In [None]:
# %%

def extract_llm_human_similarities_for_anova(
    all_embeddings_dict: Dict[str, Dict[str, np.ndarray]],
    language_codes: List[str],
) -> pd.DataFrame:
    """Extract all LLM vs human similarity pairs for ANOVA analysis.

    Creates a table with all reasoning comparisons from each model for each scenario,
    allowing statistical comparison of how different models and languages compare to humans.
    All reasonings from the same model are treated as samples from that model.

    For every language, each ``<actor>_<reason>_embedding`` column is compared
    row by row with ``top_comment_embedding`` using cosine similarity. The
    comparison is done on whole matrices (one normalisation per column, then a
    row-wise dot product) instead of one library call per scenario.

    Languages without a ``top_comment_embedding`` column, and languages with
    zero scenarios, contribute no rows.

    NOTE(review): an all-zero embedding row (for example the placeholder that
    ``load_embeddings`` substitutes for a missing value) produces a similarity
    of exactly 0.0 and is kept in the output; filter such rows downstream if
    they should not enter the statistics.

    Args:
        all_embeddings_dict: Dictionary mapping language codes to embedding dictionaries
        language_codes: List of language codes to process

    Returns:
        DataFrame with columns: actor1, actor2, language, similarity, scenario_idx
        where actor1 is the model name (e.g. "claude"), actor2 is always "human".
        The columns are present even when no rows were produced.

    Raises:
        KeyError: If a language code is not a key of ``all_embeddings_dict``.
        ValueError: If an LLM embedding column does not have the same number
            of rows as the human embedding column for that language.
    """
    columns = ["actor1", "actor2", "language", "similarity", "scenario_idx"]
    frames = []

    for language_code in tqdm(language_codes, desc="Processing languages"):
        embeddings_dict = all_embeddings_dict[language_code]

        # The human reference is required; skip languages that lack it.
        if "top_comment_embedding" not in embeddings_dict:
            continue
        human_embeddings = embeddings_dict["top_comment_embedding"]
        n_scenarios = human_embeddings.shape[0]
        if n_scenarios == 0:
            continue

        actors, reason_types = identify_actors_and_reasons(embeddings_dict)

        # Normalise the human side once per language; it is reused for every column.
        human_unit = normalize(human_embeddings, norm="l2")
        scenario_idx = np.arange(n_scenarios)

        for actor in actors:
            if actor == "redditor":
                continue

            # Not every actor has every reason type; probe each combination.
            for reason_type in reason_types:
                col_name = f"{actor}_{reason_type}_embedding"
                if col_name not in embeddings_dict:
                    continue

                llm_embeddings = embeddings_dict[col_name]
                if llm_embeddings.shape[0] != n_scenarios:
                    raise ValueError(
                        f"{col_name!r} has {llm_embeddings.shape[0]} rows but "
                        f"'top_comment_embedding' has {n_scenarios} "
                        f"(language {language_code!r})"
                    )

                # Cosine similarity of unit vectors is their dot product,
                # taken row by row so scenario i is compared only with scenario i.
                llm_unit = normalize(llm_embeddings, norm="l2")
                similarity = np.sum(llm_unit * human_unit, axis=1, dtype=np.float64)

                frames.append(
                    pd.DataFrame(
                        {
                            "actor1": actor,
                            "actor2": "human",
                            "language": language_code,
                            "similarity": similarity,
                            "scenario_idx": scenario_idx,
                        },
                        columns=columns,
                    )
                )

    if not frames:
        # Keep the documented schema so downstream groupby calls do not fail.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

In [None]:
# %%

# Load one embeddings table per language label, in the order listed here.
all_embeddings = {
    label: load_embeddings(csv_path)
    for label, csv_path in (
        ("Base", "../data/embeddings.csv"),
        ("Portuguese", "../data/embeddings_br.csv"),
        ("German", "../data/embeddings_de.csv"),
        ("Spanish", "../data/embeddings_es.csv"),
        ("French", "../data/embeddings_fr.csv"),
    )
}

# Labels to analyse, in load order (dicts preserve insertion order).
language_codes = list(all_embeddings)

In [None]:
# %%

# Build the long-format table of LLM-vs-human similarities for every loaded language
df_anova = extract_llm_human_similarities_for_anova(
    all_embeddings_dict=all_embeddings,
    language_codes=language_codes,
)

Processing languages: 100%|██████████| 5/5 [00:42<00:00,  8.57s/it]


In [None]:
# %%

# Persist the similarity table as CSV, without the DataFrame index column
output_file = "../results/anova_data.csv"
df_anova.to_csv(path_or_buf=output_file, index=False)

In [None]:
# %%

# Descriptive statistics of the similarity scores, per actor and per language
print("\n=== DESCRIPTIVE STATISTICS ===")
# The grouping key is actor1 (the model name) only, so every reasoning type of
# a model is pooled into one group; the label states that explicitly.
print("\nBy Actor (LLM, reasoning types pooled):")
print(df_anova.groupby("actor1")["similarity"].describe())

print("\nBy Language:")
print(df_anova.groupby("language")["similarity"].describe())


=== DESCRIPTIVE STATISTICS ===

By Actor (LLM + reasoning type):
           count      mean       std       min       25%       50%       75%  \
actor1                                                                         
bison    32418.0  0.468642  0.171823 -0.122836  0.353789  0.482966  0.596684   
claude   35200.0  0.443907  0.149189 -0.093526  0.344198  0.457664  0.553225   
gemini    2782.0  0.420075  0.138107 -0.026329  0.316160  0.429498  0.520325   
gemma    56812.0  0.412250  0.147336 -0.123496  0.312816  0.420206  0.518762   
gpt3.5   35200.0  0.431731  0.152288 -0.157070  0.327947  0.441166  0.544672   
gpt4     24394.0  0.449698  0.161081 -0.112287  0.340416  0.461528  0.566727   
llama    35200.0  0.398551  0.169290 -0.193726  0.294246  0.418923  0.522491   
mistral  35200.0  0.390742  0.158669 -0.131123  0.283102  0.400584  0.506198   

              max  
actor1             
bison    0.925996  
claude   0.884231  
gemini   0.769867  
gemma    0.841023  
gpt3.5   0.87

In [None]:
# %%

# One-way ANOVA across actors: SciPy's F-test first, then the statsmodels table
print("\n=== ONE-WAY ANOVA: Comparing Actors ===")
# One array of similarity scores per actor, in groupby (sorted key) order.
actor_groups = [
    scores.to_numpy() for _, scores in df_anova.groupby("actor1")["similarity"]
]
f_stat_actor, p_value_actor = stats.f_oneway(*actor_groups)
print(f"F-statistic: {f_stat_actor:.4f}")
print(f"p-value: {p_value_actor:.4e}")
print(
    "Result: Significant differences exist between actors (p < 0.05)"
    if p_value_actor < 0.05
    else "Result: No significant differences between actors (p >= 0.05)"
)

# Same test expressed as a linear model, printed as an ANOVA table.
print("\nStatsmodels:")
actor_model = ols("similarity ~ C(actor1)", data=df_anova).fit()
actor_anova_table = anova_lm(actor_model, typ=2)
print(actor_anova_table)


=== ONE-WAY ANOVA: Comparing Actors ===
F-statistic: 957.5433
p-value: 0.0000e+00
Result: Significant differences exist between actors (p < 0.05)

Statsmodels:
                sum_sq        df           F  PR(>F)
C(actor1)   166.033028       7.0  957.543295     0.0
Residual   6370.970806  257198.0         NaN     NaN


In [None]:
# %%

# One-way ANOVA across languages: SciPy's F-test first, then the statsmodels table
print("\n=== ONE-WAY ANOVA: Comparing Languages ===")
# One array of similarity scores per language, in groupby (sorted key) order.
language_groups = [
    scores.to_numpy() for _, scores in df_anova.groupby("language")["similarity"]
]
f_stat_lang, p_value_lang = stats.f_oneway(*language_groups)
print(f"F-statistic: {f_stat_lang:.4f}")
print(f"p-value: {p_value_lang:.4e}")
print(
    "Result: Significant differences exist between languages (p < 0.05)"
    if p_value_lang < 0.05
    else "Result: No significant differences between languages (p >= 0.05)"
)

# Same test expressed as a linear model, printed as an ANOVA table.
print("\nStatsmodels:")
language_model = ols("similarity ~ C(language)", data=df_anova).fit()
language_anova_table = anova_lm(language_model, typ=2)
print(language_anova_table)


=== ONE-WAY ANOVA: Comparing Languages ===
F-statistic: 2710.8010
p-value: 0.0000e+00
Result: Significant differences exist between languages (p < 0.05)

Statsmodels:
                  sum_sq        df            F  PR(>F)
C(language)   264.441706       4.0  2710.801022     0.0
Residual     6272.562127  257201.0          NaN     NaN


In [None]:
# %%

# Two-way ANOVA (actor × language) using statsmodels
print("\n=== TWO-WAY ANOVA: Actor × Language ===")

# Show how many observations each actor × language cell has. If any cell is
# empty, the interaction design matrix is rank-deficient and Type II contrasts
# computed on the full interaction model are unreliable (they can even report
# a negative sum of squares for a main effect).
cell_counts = pd.crosstab(df_anova["actor1"], df_anova["language"])
print("\nObservations per actor × language cell:")
print(cell_counts)

# Full model with interaction; kept under the name `model`.
model = ols(
    "similarity ~ C(actor1) + C(language) + C(actor1):C(language)", data=df_anova
).fit()
# Main-effects-only model: stays full rank even when some cells are empty.
additive_model = ols("similarity ~ C(actor1) + C(language)", data=df_anova).fit()

# Main effects: each factor adjusted for the other (Type II), taken from the
# additive model, so the error term is the additive model's residual.
print("\nMain effects (Type II, additive model):")
anova_table = anova_lm(additive_model, typ=2)
print(anova_table)

# Interaction: nested comparison of the additive model against the full model.
# Degrees of freedom come from the fitted models' residual df, so they reflect
# the cells that are actually estimable.
print("\nInteraction (additive model vs. model with interaction):")
interaction_table = anova_lm(additive_model, model)
print(interaction_table)


=== TWO-WAY ANOVA: Actor × Language ===
                             sum_sq        df           F    PR(>F)
C(actor1)              1.515557e+02       7.0  936.781786  0.000000
C(language)           -1.579529e-07       4.0   -0.000002  1.000000
C(actor1):C(language)  6.679330e-01      28.0    1.032141  0.408768
Residual               5.943711e+03  257171.0         NaN       NaN


