manual vs automatic score validation: subgroup ICC and bias tests

In [4]:
import os
import numpy as np
import pandas as pd
from scipy import stats

from regression.model_evaluation import adjust_pvals
from additional_analyses.icc import compute_icc
from config.constants import GIT_DIRECTORY

# paths (all relative to the repository root configured in config.constants)
demographics_path = os.path.join(GIT_DIRECTORY, "data", "demographics_data.csv")
scores_path       = os.path.join(GIT_DIRECTORY, "data", "language_scores_all_subjects.csv")
manual_scores_xls = os.path.join(GIT_DIRECTORY, "resources", "Score_Validierung.xlsx")

# load demographics: Subject_ID, Gender, Country, Age
demographics = pd.read_csv(demographics_path)
required_demo = {"Subject_ID", "Gender", "Country", "Age"}
missing_demo = required_demo - set(demographics.columns)
# Fail fast with an explicit message; the column selection in the merge below
# would raise a KeyError for the same condition, just less readably.
if missing_demo:
    raise KeyError(
        f"Demographics file {demographics_path} is missing required columns: "
        f"{sorted(missing_demo)}"
    )

# load automatic scores
automatic_scores = pd.read_csv(scores_path)

# load manual scores (excel sheet 'scores_manuell')
# The workbook is opened once and closed when the `with` block exits.
sheet_name = "scores_manuell"
with pd.ExcelFile(manual_scores_xls, engine="openpyxl") as xls:
    if sheet_name not in xls.sheet_names:
        raise ValueError(
            f"Sheet {sheet_name!r} not found in {manual_scores_xls}; "
            f"available sheets: {xls.sheet_names}"
        )
    manual_scores = pd.read_excel(xls, sheet_name=sheet_name)

# merge manual & automatic on Subject_ID, then add demographics
# inner join: keep only subjects that have both a manual and an automatic score;
# left join: keep every such subject even if demographics are missing (-> NaN).
scores = manual_scores.merge(automatic_scores, on="Subject_ID", how="inner").merge(
    demographics[["Subject_ID", "Gender", "Country", "Age"]],
    on="Subject_ID",
    how="left",
)

# dropna=False so subjects without demographics show up as NaN in the counts
print("Merged shape (manual∩auto + demographics):", scores.shape)
print("Country counts:\n", scores["Country"].value_counts(dropna=False))
print("Gender counts:\n", scores["Gender"].value_counts(dropna=False))

Merged shape (manual∩auto + demographics): (40, 15)
Country counts:
 Country
usa    20
uk     20
Name: count, dtype: int64
Gender counts:
 Gender
f    25
m    15
Name: count, dtype: int64


In [5]:
# Each score label maps to its (manual column, automatic column) pair.
pairing_map = dict(
    SemanticFluency=("semantic_fluency_score_m", "SemanticFluencyScore"),
    PhonemicFluency=("phonemic_fluency_score_m", "PhonemicFluencyScore"),
    PictureNaming=("picture_naming_score_m", "PictureNamingScore"),
)

# Base boolean masks, computed once and combined below.
# Country is compared case-insensitively; Gender is compared as stored.
country_lower = scores["Country"].str.lower()
in_uk = country_lower.eq("uk")
in_us = country_lower.eq("usa")
is_male = scores["Gender"].eq("m")
is_female = scores["Gender"].eq("f")

# ICC(2,1) per subgroup (country, gender, combinations)
subgroups_icc = {
    "UK":        in_uk,
    "US":        in_us,
    "Male":      is_male,
    "Female":    is_female,
    "UK_Male":   in_uk & is_male,
    "US_Male":   in_us & is_male,
    "UK_Female": in_uk & is_female,
    "US_Female": in_us & is_female,
}

# Collect one record per (subgroup, score, ICC type) as returned by compute_icc,
# tagging every record with the subgroup it was computed on.
icc_rows = []
for subgroup_name, mask in subgroups_icc.items():
    df_sub = scores.loc[mask].copy()
    for label, (mcol, acol) in pairing_map.items():
        for icc_record in compute_icc(df_sub, mcol, acol, label):
            icc_record["Subgroup"] = subgroup_name
            icc_rows.append(icc_record)

# Keep only the rows labelled "ICC2" and renumber them from zero.
all_icc = pd.DataFrame(icc_rows)
subgroup_icc = all_icc.loc[all_icc["ICC_Type"].eq("ICC2")].reset_index(drop=True)
subgroup_icc

Unnamed: 0,Score,ICC_Type,ICC,CI95_lower,CI95_upper,n,Subgroup
0,SemanticFluency,ICC2,0.984665,0.92,1.0,20,UK
1,PhonemicFluency,ICC2,0.96864,0.8,0.99,20,UK
2,PictureNaming,ICC2,0.755906,0.2,0.92,20,UK
3,SemanticFluency,ICC2,0.97318,0.93,0.99,20,US
4,PhonemicFluency,ICC2,0.979168,0.9,0.99,20,US
5,PictureNaming,ICC2,0.982972,0.86,1.0,20,US
6,SemanticFluency,ICC2,0.984262,0.88,1.0,15,Male
7,PhonemicFluency,ICC2,0.976977,0.85,0.99,15,Male
8,PictureNaming,ICC2,0.962444,0.82,0.99,15,Male
9,SemanticFluency,ICC2,0.972897,0.93,0.99,25,Female


In [6]:
# PictureNaming: mean difference (automatic − manual) with paired t-tests

# Base boolean masks, computed once. Missing Country/Gender values compare as
# False, so such subjects only appear in "Overall".
country_lower = scores["Country"].str.lower()
in_uk = country_lower.eq("uk")
in_us = country_lower.eq("usa")
is_male = scores["Gender"].eq("m")
is_female = scores["Gender"].eq("f")

subgroups_ttest = {
    "Overall":   pd.Series(True, index=scores.index),
    "UK":        in_uk,
    "US":        in_us,
    "Male":      is_male,
    "Female":    is_female,
    "UK Male":   in_uk & is_male,
    "UK Female": in_uk & is_female,
    "US Male":   in_us & is_male,
    "US Female": in_us & is_female,
}

mcol = "picture_naming_score_m"
acol = "PictureNamingScore"
score_label = "PictureNaming"

rows = []
for subgroup_name, mask in subgroups_ttest.items():
    # Pairwise-complete cases only: a subject needs both scores to contribute.
    sub = scores.loc[mask, [mcol, acol]].dropna()
    n = len(sub)

    # Start from an all-NaN row so that untestable subgroups (n < 2) still
    # appear in the table with their sample size.
    row = {
        "Subgroup": subgroup_name,
        "Score": score_label,
        "n": n,
        "MeanDiff(Auto-Manual)": np.nan,
        "95%CI_low": np.nan,
        "95%CI_high": np.nan,
        "Paired_t_pvalue": np.nan,
    }

    if n >= 2:
        diff = sub[acol] - sub[mcol]  # > 0 => automatic higher than manual
        mean_diff = float(diff.mean())
        sd = float(diff.std(ddof=1))
        se = sd / np.sqrt(n)
        # Two-sided 95% t-interval for the mean paired difference (n - 1 dof).
        ci_low, ci_high = stats.t.interval(
            confidence=0.95,
            df=n - 1,
            loc=mean_diff,
            scale=se,
        )
        _, p = stats.ttest_rel(sub[acol], sub[mcol])

        row["MeanDiff(Auto-Manual)"] = mean_diff
        row["95%CI_low"] = float(ci_low)
        row["95%CI_high"] = float(ci_high)
        row["Paired_t_pvalue"] = float(p)

    rows.append(row)

table = pd.DataFrame(rows)

# Holm–Bonferroni across subgroup tests (exclude "Overall").
# Only tests that produced a finite p-value belong to the correction family:
# rows skipped for n < 2 (or with an undefined t statistic) carry NaN and must
# not be passed to the correction or counted as tests.
table["p_adj"] = np.nan
mask_adj = (table["Subgroup"] != "Overall") & table["Paired_t_pvalue"].notna()
if mask_adj.any():
    table.loc[mask_adj, "p_adj"] = adjust_pvals(
        table.loc[mask_adj, "Paired_t_pvalue"].astype(float).values,
        method="holm",
        alpha=0.05,
    )

# Fixed presentation order: overall first, then main effects, then combinations.
order = ["Overall", "UK", "US", "Male", "Female", "UK Male", "UK Female", "US Male", "US Female"]
table["Subgroup"] = pd.Categorical(table["Subgroup"], categories=order, ordered=True)
table = table.sort_values(["Score", "Subgroup"]).reset_index(drop=True)

table


Unnamed: 0,Subgroup,Score,n,MeanDiff(Auto-Manual),95%CI_low,95%CI_high,Paired_t_pvalue,p_adj
0,Overall,PictureNaming,40,-0.675,-0.919397,-0.430603,2e-06,
1,UK,PictureNaming,20,-0.85,-1.286799,-0.413201,0.000649,0.003891
2,US,PictureNaming,20,-0.5,-0.740086,-0.259914,0.000338,0.002365
3,Male,PictureNaming,15,-0.666667,-1.164924,-0.16841,0.01236,0.037079
4,Female,PictureNaming,25,-0.68,-0.964987,-0.395013,5e-05,0.000403
5,UK Male,PictureNaming,8,-0.75,-1.723935,0.223935,0.111416,0.111416
6,UK Female,PictureNaming,12,-0.916667,-1.42049,-0.412843,0.00207,0.010351
7,US Male,PictureNaming,7,-0.571429,-1.065779,-0.077078,0.03002,0.060039
8,US Female,PictureNaming,13,-0.461538,-0.775091,-0.147986,0.007532,0.030129
