ICC between manual and automatic language scores (overall and for bias subgroups)

In [None]:
# setup
import os
import sys
import numpy as np
import pandas as pd
import pingouin as pg
from scipy import stats

sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from regression.model_evaluation_helpers import adjust_pvals

# paths
# Every input/output lives under a single project root. Set the
# MASTERS_THESIS_ROOT environment variable to run the notebook from another
# checkout; without it the default below yields the original locations.
project_root = os.environ.get(
    "MASTERS_THESIS_ROOT",
    "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn",
)
demographics = f"{project_root}/data/demographics_data.csv"
scores_path = f"{project_root}/data/language_scores_all_subjects.csv"
manual_scores = f"{project_root}/resources/Score_Validierung.xlsx"
out_dir = f"{project_root}/results/regression/bias/ICC"
# create the results folder up front so later to_csv calls cannot fail on a missing directory
os.makedirs(out_dir, exist_ok=True)

# helpers 
def compute_icc_rows(df_pair, score_m, score_auto, label_score_type):
    """Compute the ICC2 agreement between a manual and an automatic score column.

    Rows of ``df_pair`` with a missing value in either column are dropped, the
    remaining pairs are reshaped to long format (one row per subject x rater)
    and passed to ``pg.intraclass_corr``. Only the ``ICC2`` row of the result
    is kept.

    Parameters
    ----------
    df_pair : pandas.DataFrame
        Frame containing both score columns.
    score_m : str
        Name of the manual score column.
    score_auto : str
        Name of the automatic score column.
    label_score_type : str
        Label written to the ``"Score"`` field of every returned row.

    Returns
    -------
    list of dict
        One dict per retained ICC row with the keys ``Score``, ``ICC_Type``,
        ``ICC``, ``CI95_lower``, ``CI95_upper``, ``F``, ``df1``, ``df2``,
        ``p_value`` and ``n`` (number of complete pairs). Empty list when
        fewer than three complete pairs are available.
    """
    sub = df_pair[[score_m, score_auto]].dropna()
    n = len(sub)
    # pingouin.intraclass_corr asserts at least 5 long-format ratings, i.e. at
    # least 3 complete manual/automatic pairs; with fewer it raises instead of
    # returning a table, so bail out early with "no result".
    if n < 3:
        return []

    # Long format: the first n rows are the manual ratings, the next n rows the
    # automatic ratings; subject ids and rater labels follow the same order.
    df_long = pd.DataFrame({
        "subject": np.tile(np.arange(n), 2),
        "rater": np.repeat(["manual", "automatic"], repeats=n),
        "score": pd.concat(
            [sub[score_m].reset_index(drop=True),
             sub[score_auto].reset_index(drop=True)],
            ignore_index=True,
        ),
    })

    icc_tbl = pg.intraclass_corr(data=df_long, targets="subject", raters="rater", ratings="score")
    icc2_only = icc_tbl[icc_tbl["Type"] == "ICC2"]

    rows = []
    for _, r in icc2_only.iterrows():
        ci_lower, ci_upper = r["CI95%"]
        rows.append({
            "Score": label_score_type,
            "ICC_Type": r["Type"],
            "ICC": r["ICC"],
            "CI95_lower": ci_lower,
            "CI95_upper": ci_upper,
            # NOTE(review): the original marked F/df1/df2/p_value as candidates
            # for removal; they are kept so the saved CSV schema is unchanged.
            "F": r["F"],
            "df1": r["df1"],
            "df2": r["df2"],
            "p_value": r["pval"],
            "n": n,
        })
    return rows


In [None]:
# load data
# demographics: Subject_ID, Gender, Country, Age
demo = pd.read_csv(demographics)
required_demo = {"Subject_ID", "Gender", "Country", "Age"}
missing_demo = required_demo - set(demo.columns)
if missing_demo:
    raise ValueError(f"Demographics file is missing columns: {missing_demo}")
# keep one row per subject (first occurrence) so the later merge cannot multiply rows
demo = demo.drop_duplicates("Subject_ID", keep="first")

# automatic scores (CamelCase headers are normalised to snake_case)
auto = pd.read_csv(scores_path).rename(columns={
    "SemanticFluencyScore": "semantic_fluency_score",
    "PhonemicFluencyScore": "phonemic_fluency_score",
    "PictureNamingScore":   "picture_naming_score",
})
required_auto = {"Subject_ID", "semantic_fluency_score", "phonemic_fluency_score", "picture_naming_score"}
missing_auto = required_auto - set(auto.columns)
if missing_auto:
    raise ValueError(f"Automatic scores file is missing columns: {missing_auto}")
auto = auto.drop_duplicates("Subject_ID", keep="first")

# manual scores – sheet "scores_manuell" (falls back to the first sheet if absent)
# The workbook is opened once inside a context manager so the file handle is
# closed afterwards, and the sheet is parsed from that same handle instead of
# re-opening the file.
with pd.ExcelFile(manual_scores, engine="openpyxl") as xls:
    sheet_name = "scores_manuell" if "scores_manuell" in xls.sheet_names else xls.sheet_names[0]
    man = pd.read_excel(xls, sheet_name=sheet_name).rename(columns={
        "SemanticFluencyScore_m": "semantic_fluency_score_m",
        "PhonemicFluencyScore_m": "phonemic_fluency_score_m",
        "PictureNamingScore_m":   "picture_naming_score_m",
        # if manual sheet used unsuffixed names:
        "semantic_fluency_score": "semantic_fluency_score_m",
        "phonemic_fluency_score": "phonemic_fluency_score_m",
        "picture_naming_score":   "picture_naming_score_m",
    })
required_man = {"Subject_ID", "semantic_fluency_score_m", "phonemic_fluency_score_m", "picture_naming_score_m"}
missing_man = required_man - set(man.columns)
if missing_man:
    raise ValueError(f"Manual scores sheet is missing columns: {missing_man}")
man = man.drop_duplicates("Subject_ID", keep="first")


In [None]:
# merge: manual & automatic on Subject_ID, then demographics
auto_cols = ["Subject_ID", "semantic_fluency_score", "phonemic_fluency_score", "picture_naming_score"]
demo_cols = ["Subject_ID", "Gender", "Country", "Age"]

# inner join keeps only subjects scored both manually and automatically;
# the left join then attaches demographics without dropping any of them
man_auto = pd.merge(man, auto[auto_cols], on="Subject_ID", how="inner")
scores = pd.merge(man_auto, demo[demo_cols], on="Subject_ID", how="left")

print("Merged shape (manual∩auto + demographics):", scores.shape)
for demo_col in ("Country", "Gender"):
    print(f"{demo_col} counts:\n", scores[demo_col].value_counts(dropna=False))

# pairing: score label -> (manual column, automatic column)
pairing_map = {
    score_label: (f"{auto_col}_m", auto_col)
    for score_label, auto_col in (
        ("SemanticFluency", "semantic_fluency_score"),
        ("PhonemicFluency", "phonemic_fluency_score"),
        ("PictureNaming", "picture_naming_score"),
    )
}


In [11]:
# overall ICCs (2,1): one row per score type across all paired subjects
overall_rows = []
for label, (mcol, acol) in pairing_map.items():
    overall_rows += compute_icc_rows(scores, mcol, acol, label)

overall_df = pd.DataFrame(overall_rows)
overall_path = os.path.join(out_dir, "icc_overall.csv")
overall_df.to_csv(overall_path, index=False)
print(f"Saved overall ICCs -> {overall_path}")

# bias subgroups (UK, US, m, f, and combinations) -> ICC (2,1)
# Build each membership mask once and combine them, instead of repeating the
# same comparisons for every subgroup.
country_lower = scores["Country"].str.lower()
is_uk = country_lower == "uk"
is_us = country_lower == "usa"
is_male = scores["Gender"] == "m"
is_female = scores["Gender"] == "f"

subgroups = {
    "UK": scores[is_uk],
    "US": scores[is_us],
    "Male": scores[is_male],
    "Female": scores[is_female],
    "UK_Male": scores[is_uk & is_male],
    "US_Male": scores[is_us & is_male],
    "UK_Female": scores[is_uk & is_female],
    "US_Female": scores[is_us & is_female],
}

rows = []
ns = []
for name, df_sub in subgroups.items():
    for label, (mcol, acol) in pairing_map.items():
        # number of complete manual/automatic pairs in this subgroup
        n = len(df_sub[[mcol, acol]].dropna())
        ns.append({"Subgroup": name, "Score": label, "n": n})
        if n < 2:
            continue
        # compute_icc_rows takes exactly four arguments and derives n itself;
        # passing n as a fifth argument raised a TypeError.
        rows += [{**r, "Subgroup": name} for r in compute_icc_rows(df_sub, mcol, acol, label)]

subgroup_icc = pd.DataFrame(rows)

subgroup_icc_path = os.path.join(out_dir, "icc_bias_subgroups.csv")
subgroup_icc.to_csv(subgroup_icc_path, index=False)
print(f"Saved subgroup ICCs -> {subgroup_icc_path}")

# The per-subgroup sample sizes were collected but never persisted; write them
# next to the ICC table so subgroups that were skipped remain visible.
subgroup_n = pd.DataFrame(ns)
subgroup_n_path = os.path.join(out_dir, "icc_bias_subgroups_nsamples.csv")
subgroup_n.to_csv(subgroup_n_path, index=False)
print(f"Saved subgroup n per (Subgroup×Score) -> {subgroup_n_path}")

print("\noverall ICC2 head:")
print(overall_df[overall_df["ICC_Type"] == "ICC2"].head())
print("\nSubgroup ICC2 head:")
print(subgroup_icc[subgroup_icc["ICC_Type"] == "ICC2"].head())


Merged shape (manual∩auto + demographics): (40, 15)
Country counts:
 Country
usa    20
uk     20
Name: count, dtype: int64
Gender counts:
 Gender
f    25
m    15
Name: count, dtype: int64
[sanity] SemanticFluency: n=40  r=0.983
[sanity] PhonemicFluency: n=40  r=0.984
[sanity] PictureNaming: n=40  r=0.970
Saved overall ICCs -> /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regression/bias/ICC/icc_overall.csv
Saved subgroup ICCs -> /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regression/bias/ICC/icc_bias_subgroups.csv
Saved subgroup n per (Subgroup×Score) -> /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regression/bias/ICC/icc_bias_subgroups_nsamples.csv
Saved ICC(2,1) only (overall) -> /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regression/bias/ICC/icc_overall_ICC2_only.csv
Saved ICC(2,1) only (subgroups) -> /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regressi

In [13]:
# PictureNaming: mean difference between manual and automatic score (paired t-tests, overall and by subgroup)

# subgroup membership as boolean masks over `scores` (same groups as above, plus "Overall")
country_lc = scores["Country"].str.lower()
in_uk, in_us = country_lc.eq("uk"), country_lc.eq("usa")
is_m, is_f = scores["Gender"].eq("m"), scores["Gender"].eq("f")
subgroups = {
    "Overall":   pd.Series(True, index=scores.index),
    "UK":        in_uk,
    "US":        in_us,
    "Male":      is_m,
    "Female":    is_f,
    "UK Male":   in_uk & is_m,
    "UK Female": in_uk & is_f,
    "US Male":   in_us & is_m,
    "US Female": in_us & is_f,
}

# choose only Picture Naming  (automatic − manual)
mcol, acol, label = "picture_naming_score_m", "picture_naming_score", "PictureNaming"

rows = []
for name, mask in subgroups.items():
    pairs = scores.loc[mask, [mcol, acol]].dropna()
    n = len(pairs)

    # statistics stay NaN unless at least two complete pairs are available
    mean_diff = ci_low = ci_high = p = np.nan
    if n >= 2:
        diff = pairs[acol] - pairs[mcol]  # >0 => automatic higher
        mean_diff = float(diff.mean())
        se = float(diff.std(ddof=1)) / np.sqrt(n)
        bounds = stats.t.interval(confidence=0.95, df=n-1, loc=mean_diff, scale=se)
        ci_low, ci_high = float(bounds[0]), float(bounds[1])
        p = float(stats.ttest_rel(pairs[acol], pairs[mcol]).pvalue)

    rows.append({
        "Subgroup": name,
        "Score": label,
        "n": n,
        "MeanDiff(Auto-Manual)": mean_diff,
        "95%CI_low": ci_low,
        "95%CI_high": ci_high,
        "Paired_t_pvalue": p,
    })

tbl = pd.DataFrame(rows)

# Holm–Bonferroni across subgroup tests for multiple testing (exclude "Overall")
mask_adj = tbl["Subgroup"] != "Overall"
raw_pvals = tbl.loc[mask_adj, "Paired_t_pvalue"].astype(float).values
tbl.loc[mask_adj, "p_adj"] = adjust_pvals(raw_pvals, method="holm", alpha=0.05)

# ensure order and save
order = ["Overall","UK","US","Male","Female","UK Male","UK Female","US Male","US Female"]
tbl["Subgroup"] = pd.Categorical(tbl["Subgroup"], categories=order, ordered=True)
tbl = tbl.sort_values(["Score","Subgroup"]).reset_index(drop=True)

print("\nPicture Naming mean differences (auto-manual) by subgroup:")
print(tbl)

out_csv = os.path.join(out_dir, "mean_diffs_PictureNaming_by_subgroup.csv")
tbl.to_csv(out_csv, index=False)
print(f"\nsaved -> {out_csv}")



Paired bias (Auto − Manual) for PictureNaming by subgroup:
    Subgroup          Score   n  MeanDiff(Auto-Manual)  95%CI_low  95%CI_high  \
0    Overall  PictureNaming  40              -0.675000  -0.919397   -0.430603   
1         UK  PictureNaming  20              -0.850000  -1.286799   -0.413201   
2         US  PictureNaming  20              -0.500000  -0.740086   -0.259914   
3       Male  PictureNaming  15              -0.666667  -1.164924   -0.168410   
4     Female  PictureNaming  25              -0.680000  -0.964987   -0.395013   
5    UK Male  PictureNaming   8              -0.750000  -1.723935    0.223935   
6  UK Female  PictureNaming  12              -0.916667  -1.420490   -0.412843   
7    US Male  PictureNaming   7              -0.571429  -1.065779   -0.077078   
8  US Female  PictureNaming  13              -0.461538  -0.775091   -0.147986   

   Paired_t_pvalue     p_adj  
0         0.000002       NaN  
1         0.000649  0.003891  
2         0.000338  0.002365  
3    