In [None]:
# %%

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from statsmodels.formula.api import ols  # type: ignore
from statsmodels.stats.anova import anova_lm  # type: ignore
from tqdm import tqdm

In [None]:
# %%

def load_embeddings(
    embeddings_file: str, default_dim: int = 384
) -> Dict[str, np.ndarray]:
    """Load embeddings from CSV and organize them by column.

    Every column whose name ends with ``_embedding`` is parsed. Each cell is
    expected to hold a bracketed, whitespace-separated list of floats (for
    example ``"[0.1 0.2 0.3]"``). Missing cells become all-zero vectors so that
    row ``i`` of every returned array still corresponds to row ``i`` of the CSV.

    Args:
        embeddings_file: Path to the CSV file containing embeddings
        default_dim: Vector length used for the zero filler when a column has
            no parseable cell to infer the length from. Defaults to 384, the
            length that was previously hard-coded.

    Returns:
        Dictionary where keys are column names and values are float32 numpy
        arrays of shape (number of CSV rows, embedding length)
    """
    df = pd.read_csv(embeddings_file)
    embedding_cols = [col for col in df.columns if col.endswith("_embedding")]
    embeddings_dict = {}

    for col in embedding_cols:
        # Parse present cells; keep None as a placeholder for missing ones so
        # the filler length can be chosen after the real length is known.
        parsed = [
            None
            if pd.isna(cell)
            else np.fromstring(cell.strip("[]"), sep=" ", dtype=np.float32)
            for cell in df[col]
        ]

        # Infer the embedding length from the first real vector rather than
        # assuming 384; a wrong filler length would make np.vstack fail.
        dim = next((vec.shape[0] for vec in parsed if vec is not None), default_dim)

        if not parsed:
            # A CSV with a header but no rows: np.vstack rejects empty input.
            embeddings_dict[col] = np.empty((0, dim), dtype=np.float32)
            continue

        filler = np.zeros(dim, dtype=np.float32)
        embeddings_dict[col] = np.vstack(
            [filler if vec is None else vec for vec in parsed]
        )

    return embeddings_dict


def identify_actors_and_reasons(
    embeddings_dict: Dict[str, np.ndarray],
) -> Tuple[List[str], List[str]]:
    """Derive the actor names and reason types encoded in embedding column names.

    Column names are read as ``<actor>_<reason>_embedding``: the text before the
    first underscore is the actor, the remainder (minus ``_embedding``) is the
    reason type. Two columns are special-cased: ``selftext_embedding`` is
    ignored, and ``top_comment_embedding`` maps to actor ``"redditor"`` with
    reason type ``"top_comment"``.

    Args:
        embeddings_dict: Dictionary mapping column names to embedding arrays
            (only the keys are inspected)

    Returns:
        Tuple of (actors, reason_types). Actors follow a fixed preferred order,
        with any unlisted actors appended alphabetically; reason types are
        sorted alphabetically.
    """
    preferred_actor_order = [
        "redditor",
        "gpt3.5",
        "gpt4",
        "claude",
        "bison",
        "gemini",
        "llama",
        "mistral",
        "gemma",
    ]

    found_actors = set()
    found_reasons = set()

    for column in embeddings_dict:
        if column == "selftext_embedding":
            # The post body belongs to no actor.
            continue

        if column == "top_comment_embedding":
            actor_name, reason_name = "redditor", "top_comment"
        else:
            # Split at the first underscore only: actor first, reason after.
            stem = column.replace("_embedding", "")
            actor_name, _, reason_name = stem.partition("_")

        found_actors.add(actor_name)
        found_reasons.add(reason_name)

    # Known actors in preferred order, then everything else alphabetically.
    ranked_actors = [name for name in preferred_actor_order if name in found_actors]
    extra_actors = sorted(found_actors.difference(preferred_actor_order))

    return ranked_actors + extra_actors, sorted(found_reasons)

In [None]:
# %%

def extract_llm_human_similarities_for_anova(
    all_embeddings_dict: Dict[str, Dict[str, np.ndarray]],
    language_codes: List[str],
) -> pd.DataFrame:
    """Extract all LLM vs human similarity pairs for ANOVA analysis.

    Creates a table with individual reasoning comparisons (not averaged) for each scenario,
    allowing statistical comparison of how different models and languages compare to humans.

    For every language, each ``<actor>_<reason_type>_embedding`` column is
    compared row by row with ``top_comment_embedding`` using cosine similarity.
    Languages without a ``top_comment_embedding`` column are skipped.

    Args:
        all_embeddings_dict: Dictionary mapping language codes to embedding dictionaries
        language_codes: List of language codes to process

    Returns:
        DataFrame with columns: actor1, actor2, language, similarity, scenario_idx
        where actor1 is like "claude_reason_2", actor2 is always "human", and
        scenario_idx is the row position of the scenario within its embedding
        array. The columns are present even when no comparison was possible.

    Raises:
        KeyError: If a language code is missing from ``all_embeddings_dict``.
        ValueError: If an LLM embedding array does not have the same number of
            rows as the human embedding array of the same language.
    """
    columns = ["actor1", "actor2", "language", "similarity", "scenario_idx"]
    frames = []

    for language_code in tqdm(language_codes, desc="Processing languages"):
        embeddings_dict = all_embeddings_dict[language_code]

        # Without the human reference there is nothing to compare against.
        if "top_comment_embedding" not in embeddings_dict:
            continue
        human_embeddings = embeddings_dict["top_comment_embedding"]
        n_scenarios = human_embeddings.shape[0]
        if n_scenarios == 0:
            continue

        actors, reason_types = identify_actors_and_reasons(embeddings_dict)

        # Normalize once per language instead of once per scenario. Rows that
        # are all zeros stay all zeros, so their similarity comes out as 0.0.
        # NOTE(review): such 0.0 rows are kept in the output and therefore enter
        # the downstream statistics as real observations; confirm whether rows
        # with missing embeddings should be excluded instead.
        human_unit = normalize(human_embeddings, norm="l2")
        scenario_indices = np.arange(n_scenarios)

        for actor in actors:
            # "redditor" is the human side of every comparison.
            if actor == "redditor":
                continue

            for reason_type in reason_types:
                col_name = f"{actor}_{reason_type}_embedding"
                # Not every actor has every reason type.
                if col_name not in embeddings_dict:
                    continue

                llm_embeddings = embeddings_dict[col_name]
                if llm_embeddings.shape[0] != n_scenarios:
                    raise ValueError(
                        f"{col_name} has {llm_embeddings.shape[0]} rows but "
                        f"top_comment_embedding has {n_scenarios} "
                        f"(language {language_code!r})"
                    )

                # Cosine similarity of unit vectors is their dot product; the
                # einsum takes it row by row without building an n x n matrix.
                # Values can differ from a per-row sklearn cosine_similarity
                # call at float rounding level only.
                llm_unit = normalize(llm_embeddings, norm="l2")
                similarities = np.einsum("ij,ij->i", llm_unit, human_unit)

                frames.append(
                    pd.DataFrame(
                        {
                            "actor1": f"{actor}_{reason_type}",
                            "actor2": "human",
                            "language": language_code,
                            "similarity": similarities.astype(np.float64),
                            "scenario_idx": scenario_indices,
                        }
                    )
                )

    if not frames:
        # Keep the schema so downstream groupby/formula code finds its columns.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)[columns]

In [None]:
# %%

# One embeddings CSV per language label. The insertion order of this mapping
# is also the order in which the languages are processed later on.
embedding_paths = {
    "Base": "../data/embeddings.csv",
    "Portuguese": "../data/embeddings_br.csv",
    "German": "../data/embeddings_de.csv",
    "Spanish": "../data/embeddings_es.csv",
    "French": "../data/embeddings_fr.csv",
}

all_embeddings = {
    label: load_embeddings(csv_path) for label, csv_path in embedding_paths.items()
}

# Language labels in load order (dicts preserve insertion order).
language_codes = list(all_embeddings)

In [None]:
# %%

# Build the long-format table: one row per (language, actor + reason type,
# scenario) holding the cosine similarity to the human top comment.
df_anova = extract_llm_human_similarities_for_anova(
    all_embeddings_dict=all_embeddings,
    language_codes=language_codes,
)

Processing languages: 100%|██████████| 5/5 [00:42<00:00,  8.48s/it]


In [None]:
# %%

# Save the data
output_file = "../results/anova_data.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists first (a fresh checkout may not contain ../results yet).
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_anova.to_csv(output_file, index=False)

In [None]:
# %%

# Summary statistics of the similarity scores, grouped first by actor
# (LLM + reasoning type) and then by language.
print("\n=== DESCRIPTIVE STATISTICS ===")
for heading, group_col in (
    ("\nBy Actor (LLM + reasoning type):", "actor1"),
    ("\nBy Language:", "language"),
):
    print(heading)
    print(df_anova.groupby(group_col)["similarity"].describe())


=== DESCRIPTIVE STATISTICS ===

By Actor (LLM + reasoning type):
                    count      mean       std       min       25%       50%  \
actor1                                                                        
bison_reason_1    10806.0  0.468715  0.172539 -0.113236  0.354980  0.482823   
bison_reason_2    10806.0  0.468603  0.171504 -0.089466  0.352683  0.483204   
bison_reason_3    10806.0  0.468608  0.171439 -0.122836  0.354023  0.482802   
claude_reason_1   12197.0  0.443336  0.148981 -0.093526  0.343479  0.457004   
claude_reason_2   12197.0  0.443532  0.149045 -0.081075  0.343165  0.457342   
claude_reason_3   10806.0  0.444976  0.149595 -0.064444  0.345770  0.459301   
gemini_reason_1    1391.0  0.420860  0.137378  0.010950  0.319416  0.431945   
gemini_reason_2    1391.0  0.419289  0.138878 -0.026329  0.311234  0.428001   
gemma_reason_1    12197.0  0.402712  0.147288 -0.086656  0.302422  0.410749   
gemma_reason_2    12197.0  0.404031  0.148104 -0.123496  0.305200

In [None]:
# %%

# One-way ANOVA on similarity with actor (LLM + reasoning type) as the only
# factor, computed with scipy and then again with statsmodels as a cross-check.
print("\n=== ONE-WAY ANOVA: Comparing Actors ===")

# One array of similarity scores per actor group.
actor_groups = [
    scores.to_numpy() for _, scores in df_anova.groupby("actor1")["similarity"]
]
f_stat_actor, p_value_actor = stats.f_oneway(*actor_groups)

print(f"F-statistic: {f_stat_actor:.4f}")
print(f"p-value: {p_value_actor:.4e}")
print(
    "Result: Significant differences exist between actors (p < 0.05)"
    if p_value_actor < 0.05
    else "Result: No significant differences between actors (p >= 0.05)"
)

# Same test expressed as a linear model; the table also shows sums of squares.
print("\nStatsmodels:")
actor_model = ols("similarity ~ C(actor1)", data=df_anova).fit()
actor_anova_table = anova_lm(actor_model, typ=2)
print(actor_anova_table)


=== ONE-WAY ANOVA: Comparing Actors ===
F-statistic: 317.7166
p-value: 0.0000e+00
Result: Significant differences exist between actors (p < 0.05)

Statsmodels:
                sum_sq        df           F  PR(>F)
C(actor1)   180.608452      23.0  317.716596     0.0
Residual   6356.395381  257182.0         NaN     NaN


In [None]:
# %%

# One-way ANOVA on similarity with language as the only factor, computed with
# scipy and then again with statsmodels as a cross-check.
print("\n=== ONE-WAY ANOVA: Comparing Languages ===")

# One array of similarity scores per language group.
language_groups = [
    scores.to_numpy() for _, scores in df_anova.groupby("language")["similarity"]
]
f_stat_lang, p_value_lang = stats.f_oneway(*language_groups)

print(f"F-statistic: {f_stat_lang:.4f}")
print(f"p-value: {p_value_lang:.4e}")
print(
    "Result: Significant differences exist between languages (p < 0.05)"
    if p_value_lang < 0.05
    else "Result: No significant differences between languages (p >= 0.05)"
)

# Same test expressed as a linear model; the table also shows sums of squares.
print("\nStatsmodels:")
language_model = ols("similarity ~ C(language)", data=df_anova).fit()
language_anova_table = anova_lm(language_model, typ=2)
print(language_anova_table)


=== ONE-WAY ANOVA: Comparing Languages ===
F-statistic: 2710.8010
p-value: 0.0000e+00
Result: Significant differences exist between languages (p < 0.05)

Statsmodels:
                  sum_sq        df            F  PR(>F)
C(language)   264.441706       4.0  2710.801022     0.0
Residual     6272.562127  257201.0          NaN     NaN


In [None]:
# %%

# Two-way ANOVA (Type II) for actor, language and their interaction.
#
# Not every actor has data in every language (the per-actor row counts differ),
# so the interaction design matrix can be rank-deficient. Calling
# anova_lm(model, typ=2) on such a fit reported main-effect sums of squares
# close to zero even though both one-way analyses show large effects. The
# table is therefore built from nested-model comparisons, which only use each
# fit's residual sum of squares and rank-aware residual df:
#   actor        : language-only  vs. additive
#   language     : actor-only     vs. additive
#   interaction  : additive       vs. interaction model
print("\n=== TWO-WAY ANOVA: Actor × Language ===")
actor_only_fit = ols("similarity ~ C(actor1)", data=df_anova).fit()
language_only_fit = ols("similarity ~ C(language)", data=df_anova).fit()
additive_fit = ols("similarity ~ C(actor1) + C(language)", data=df_anova).fit()
model = ols(
    "similarity ~ C(actor1) + C(language) + C(actor1):C(language)", data=df_anova
).fit()

anova_table = pd.DataFrame(
    {
        "sum_sq": [
            language_only_fit.ssr - additive_fit.ssr,
            actor_only_fit.ssr - additive_fit.ssr,
            additive_fit.ssr - model.ssr,
            model.ssr,
        ],
        "df": [
            language_only_fit.df_resid - additive_fit.df_resid,
            actor_only_fit.df_resid - additive_fit.df_resid,
            additive_fit.df_resid - model.df_resid,
            model.df_resid,
        ],
    },
    index=["C(actor1)", "C(language)", "C(actor1):C(language)", "Residual"],
)

# Every effect is tested against the residual mean square of the largest model.
residual_mean_square = model.ssr / model.df_resid
anova_table["F"] = (anova_table["sum_sq"] / anova_table["df"]) / residual_mean_square
anova_table["PR(>F)"] = stats.f.sf(
    anova_table["F"], anova_table["df"], model.df_resid
)
# The residual row has no test of its own.
anova_table.loc["Residual", ["F", "PR(>F)"]] = np.nan
print(anova_table)


=== TWO-WAY ANOVA: Actor × Language ===
                             sum_sq        df             F    PR(>F)
C(actor1)              7.318789e-08      23.0  1.377462e-07  1.000000
C(language)            1.116527e-09       4.0  1.208308e-08  0.999912
C(actor1):C(language)  6.014597e-01      92.0  2.830003e-01  0.989146
Residual               5.939928e+03  257128.0           NaN       NaN


