Compare tasks (cookieTheft, picnicScene, journaling) to see which task's feature set best predicts each target score.

Uses the intersection of subjects, i.e. only subjects with complete data for the full model in all three tasks.

New: bootstrapping for confidence intervals (CIs) and mean out-of-fold R^2.

In [1]:
# setup
import sys
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY
from config.feature_sets import get_linguistic_features, get_acoustic_features
from regression.regression_helpers import stratified_cross_validation
from regression.model_evaluation_helpers import (
    load_task_dataframe, get_model_feature_list, complete_subjects, normalize_oof_df, bootstrap_summary_from_oof, compare_models_bootstrap, bootstrap_metrics_from_oof
)

# --- run configuration -------------------------------------------------
# speech tasks whose feature sets are compared against each other
tasks = [
    "cookieTheft",
    "picnicScene",
    "journaling",
]
# target scores; each one is modelled separately
score_names = [
    "PictureNamingScore",
    "SemanticFluencyScore",
    "PhonemicFluencyScore",
]
# every result file of this notebook is written below this directory
oof_dir = os.path.join(GIT_DIRECTORY, "results", "oof_results2")
os.makedirs(oof_dir, exist_ok=True)

# --- estimator ---------------------------------------------------------
# estimator class for all non-baseline models
# (LinearRegression or RandomForestRegressor)
model_type = RandomForestRegressor
# keyword arguments handed to the estimator for every non-baseline model
model_params = {
    "n_estimators": 625,
    "random_state": 42,
    "min_samples_leaf": 4,
    "max_features": "sqrt",
    "bootstrap": True,
    "max_depth": 14,
    "min_samples_split": 9,
}

In [2]:
def _prepare_demographics(demographics):
    """Encode the categorical demographic columns numerically (in place).

    Gender, Education and Country are lower-cased and stripped before being
    mapped to numeric codes; values missing from a mapping become NaN.
    Socioeconomic is coerced to numeric, and a "Language" column is dropped
    when present.

    Returns the same (mutated) dataframe for convenient chaining.
    """
    for col in ("Gender", "Education", "Country"):
        demographics[col] = demographics[col].astype("string").str.lower().str.strip()
    demographics["Socioeconomic"] = pd.to_numeric(demographics["Socioeconomic"], errors="coerce")
    demographics["Gender"] = demographics["Gender"].map({"f": 0, "m": 1})
    education_map = {
        "less_than_highschool": 1,
        "high_school": 2,
        "vocational": 3,
        "bachelor": 4,
        "master": 5,
        "phd": 6,
        "no_answer": np.nan,
    }
    demographics["Education"] = demographics["Education"].map(education_map)
    # collapse the six education codes into three levels (0, 1, 2)
    demographics["Education_level"] = demographics["Education"].map({1: 0, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2})
    demographics["Country"] = demographics["Country"].map({"uk": 0, "usa": 1})
    demographics.drop(columns=["Language"], inplace=True, errors="ignore")
    return demographics


def _build_model_configs():
    """Return the mapping model name -> requested feature names.

    "baseline" requests no features at all; "full" is the sorted union of the
    linguistic and acoustic features followed by the demographic columns.
    """
    demographic = ["Age", "Gender", "Education_level", "Country", "Socioeconomic"]
    linguistic = get_linguistic_features()
    acoustic = get_acoustic_features()
    # NOTE(review): the `|` union assumes both helpers return sets - confirm
    speech = sorted(linguistic | acoustic)
    return {
        "baseline": [],
        "demographics": demographic,
        "acoustic": sorted(acoustic),
        "linguistic": sorted(linguistic),
        "linguistic+acoustic": speech,
        "full": speech + demographic,
    }


def _attach_demographic_labels(oof_df, demographics):
    """Left-merge demographics into the OOF rows and add readable labels.

    Adds Gender_label, Country_label and AgeGroup so the stored predictions
    can be subset later on.
    """
    cols_keep = ["Subject_ID", "Age", "Gender", "Education_level", "Country"]
    oof_df = oof_df.merge(demographics[cols_keep], on="Subject_ID", how="left")
    oof_df["Gender_label"] = oof_df["Gender"].map({0: "f", 1: "m"})
    oof_df["Country_label"] = oof_df["Country"].map({0: "uk", 1: "usa"})
    # With pd.cut's default right=True the bins are (-inf, 65], (65, 75],
    # (75, inf): age 65 is labelled "<65" and age 75 falls into the middle bin.
    # The middle label is written with an explicit en-dash escape; the literal
    # was previously stored mis-encoded, which put a garbled label into the CSV.
    oof_df["AgeGroup"] = pd.cut(
        oof_df["Age"],
        bins=[-np.inf, 65, 75, np.inf],
        labels=["<65", "65\u201375", ">75"],
    )
    return oof_df


def main():
    """Run every model config for every score and task on a fixed subject pool.

    For each target score the subject pool is the intersection, across all
    tasks, of the subjects returned by complete_subjects for the "full"
    feature list. Each model config is cross-validated per task on that pool;
    per-fold metrics and out-of-fold (OOF) predictions are collected.

    Files written to oof_dir:
      - {score}_full_subjects.csv
      - bootstrap_summary_{score}.csv, bootstrap_metrics_{score}.csv
      - pairwise_bootstrap_diffs_{score}.csv
      - oof_preds_all_scores.csv, cv_folds_all_scores.csv
    """
    scores_df = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))
    demographics = _prepare_demographics(
        pd.read_csv(os.path.join(GIT_DIRECTORY, "data/demographics_data.csv"))
    )
    model_configs = _build_model_configs()
    group_cols = ("target", "task", "model")

    all_oof_rows = []
    fold_rows = []

    # loop all scores once
    for tgt in score_names:
        # per-task dataframes for this score
        task_dfs = {t: load_task_dataframe(t, tgt, scores_df, demographics) for t in tasks}

        # feature-complete intersection of subjects (based on full model)
        full_requested = model_configs["full"]
        full_cols_by_task = {
            t: get_model_feature_list(task_dfs[t].columns, full_requested, tgt) for t in tasks
        }
        subject_sets = [complete_subjects(task_dfs[t], full_cols_by_task[t], tgt) for t in tasks]
        full_subjects = set.intersection(*subject_sets)
        pd.Series(sorted(full_subjects), name="Subject_ID").to_csv(
            os.path.join(oof_dir, f"{tgt}_full_subjects.csv"), index=False
        )
        print(f"{tgt}: intersection of subjects across all tasks: N={len(full_subjects)}")

        # OOF frames of this score only, so the per-score summaries below do
        # not have to re-concatenate and filter everything collected so far
        score_oof_rows = []

        # fit all models for this score & each task on fixed subject-pool
        for t in tasks:
            df_t = task_dfs[t]
            df_t = df_t[df_t["Subject_ID"].isin(full_subjects)].copy()

            for model_name, selected in model_configs.items():
                if model_name == "baseline":
                    # baseline via DummyRegressor on a dummy feature column of ones
                    df_use = df_t.dropna(subset=[tgt]).copy()
                    X = pd.DataFrame(np.ones((len(df_use), 1)), index=df_use.index, columns=["__dummy__"])
                    fcols = ["__dummy__"]
                    mtype, mparams = DummyRegressor, {"strategy": "mean"}
                else:
                    # other models using features; skip configs without usable
                    # columns or without any complete rows
                    fcols = get_model_feature_list(df_t.columns, selected, tgt)
                    if not fcols:
                        continue
                    df_use = df_t.dropna(subset=[tgt] + fcols).copy()
                    if df_use.empty:
                        continue
                    X = df_use[fcols]
                    mtype, mparams = model_type, model_params

                model_df = pd.concat([df_use[["Subject_ID", "fold"]], X, df_use[tgt].rename(tgt)], axis=1)
                print(f"{t} | {model_name} | N={len(model_df)} | features used={0 if model_name=='baseline' else len(fcols)}")

                # cross-validation on the precomputed "fold" column
                r2_list, rmse_list, mae_list, all_preds = stratified_cross_validation(
                    df=model_df,
                    fold_column="fold",
                    model_type=mtype,
                    model_params=mparams,
                    target_column=tgt,
                    feature_columns=fcols,
                )

                # collect fold metrics (k is the position in the returned lists)
                fold_rows.extend(
                    {"target": tgt, "task": t, "model": model_name, "fold": k,
                     "r2": r2, "rmse": rmse, "mae": mae, "estimator": mtype.__name__}
                    for k, (r2, rmse, mae) in enumerate(zip(r2_list, rmse_list, mae_list))
                )

                # store normalized out-of-fold (OOF) predictions per subject
                all_preds = all_preds.rename(columns={"y_test": "y_true"})
                oof_df = normalize_oof_df(all_preds, target_col=tgt)
                oof_df["task"] = t
                oof_df["model"] = model_name
                oof_df["target"] = tgt

                # merge demographics into OOF rows for later subsetting
                score_oof_rows.append(_attach_demographic_labels(oof_df, demographics))

        all_oof_rows.extend(score_oof_rows)

        # per-score summaries (bootstrap)
        oof_score = pd.concat(score_oof_rows, ignore_index=True)

        _, summ_df = bootstrap_summary_from_oof(oof_score, group_cols=group_cols, n_boot=1000, ci=0.95, random_state=42)
        summ_df.to_csv(os.path.join(oof_dir, f"bootstrap_summary_{tgt}.csv"), index=False)

        met = bootstrap_metrics_from_oof(oof_score, group_cols=group_cols, n_boot=1000, ci=0.95, random_state=42)
        met.to_csv(os.path.join(oof_dir, f"bootstrap_metrics_{tgt}.csv"), index=False)

        # per-score pairwise differences between models, one frame per task
        pairwise = [
            compare_models_bootstrap(oof_score, task=t, target=tgt, n_boot=1000, random_state=42)
            for t in tasks
        ]
        pd.concat(pairwise, ignore_index=True).sort_values(["task", "p_boot", "model_a", "model_b"]).to_csv(
            os.path.join(oof_dir, f"pairwise_bootstrap_diffs_{tgt}.csv"), index=False
        )

    # store results
    oof_all = pd.concat(all_oof_rows, ignore_index=True)
    oof_all.to_csv(os.path.join(oof_dir, "oof_preds_all_scores.csv"), index=False)
    pd.DataFrame(fold_rows).to_csv(os.path.join(oof_dir, "cv_folds_all_scores.csv"), index=False)


if __name__ == "__main__":
    main()

PictureNamingScore: intersection of subjects across all tasks: N=959
cookieTheft | baseline | N=959 | features used=0
cookieTheft | demographics | N=959 | features used=5
cookieTheft | acoustic | N=959 | features used=19
cookieTheft | linguistic | N=959 | features used=38
cookieTheft | linguistic+acoustic | N=959 | features used=57
cookieTheft | full | N=959 | features used=62
picnicScene | baseline | N=959 | features used=0
picnicScene | demographics | N=959 | features used=5
picnicScene | acoustic | N=959 | features used=19
picnicScene | linguistic | N=959 | features used=36
picnicScene | linguistic+acoustic | N=959 | features used=55
picnicScene | full | N=959 | features used=60
journaling | baseline | N=959 | features used=0
journaling | demographics | N=959 | features used=5
journaling | acoustic | N=959 | features used=19
journaling | linguistic | N=959 | features used=36
journaling | linguistic+acoustic | N=959 | features used=55
journaling | full | N=959 | features used=60
Sema