Goal: figure out why the performance on fold 2 is so bad.

Ideas: plot the distribution of scores for each split; plot actual vs. predicted scores for each split (to check for outliers); maybe compare the mean of certain features across splits.


In [6]:
# setup
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# add project root
# The location of the project's `src` directory can be overridden with the
# THESIS_SRC_DIR environment variable, so the notebook also runs on machines
# where the repository lives somewhere else. The original path is the default.
SRC_DIR = os.environ.get(
    "THESIS_SRC_DIR",
    "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src",
)
# append only once, so re-running this cell does not pile up duplicate entries
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from config.constants import GIT_DIRECTORY
from regression.regression_helpers import stratified_cross_validation
from regression.evaluation_helpers import plot_per_fold_predictions, format_title

# use one font family for every figure produced in this notebook
plt.rcParams.update({'font.family': 'Arial'})

# analysis parameters: target score, speech task, model label, number of folds
target, task_name = "PictureNamingScore", "cookieTheft"
model_type, n_folds = "LinearRegression", 5

# read the filtered feature table, the language scores and the fold assignment
features_path = os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv")
scores_path = os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv")
folds_path = os.path.join(GIT_DIRECTORY, f"data/{task_name}_stratified_folds.csv")

features = pd.read_csv(features_path)
scores = pd.read_csv(scores_path)
folds = pd.read_csv(folds_path)

# attach the target score to the features (rows containing any NaN are dropped),
# then attach the fold label of each subject
target_scores = scores[["Subject_ID", target]]
fold_labels = folds[["Subject_ID", "fold"]]
df = features.merge(target_scores, on="Subject_ID").dropna()
df = df.merge(fold_labels, on="Subject_ID")

# feature matrix = everything except the identifier, the target and the fold label
non_feature_columns = ["Subject_ID", target, "fold"]
X = df.drop(columns=non_feature_columns)
y = df[target]

In [7]:
# run cross-validation on the predefined folds
# arguments are collected first so the call itself stays short
cv_arguments = dict(
    df=df,
    fold_column="fold",
    model_class=LinearRegression,
    model_params=None,
    target_column=target,
    feature_columns=X.columns,
    model_name=model_type,
)
cv_results = stratified_cross_validation(**cv_arguments)

# unpack the four results returned by the helper
r2_list, rmse_list, mae_list, all_preds_df = cv_results

In [8]:
# plot predicted vs. actual for each fold

# the target folder is passed to the plotting helper as its output directory
plots_root = os.path.join(GIT_DIRECTORY, "results", "plots")
output_dir = os.path.join(plots_root, "fold_performance")

plot_per_fold_predictions(
    all_preds_df,
    task_name,
    target,
    model_type,
    output_dir,
    n_folds=n_folds,
)

plot: actual vs predicted scores for fold 1 saved to /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/fold2/fold1_actual_vs_predicted_PictureNamingScore.png
plot: actual vs predicted scores for fold 2 saved to /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/fold2/fold2_actual_vs_predicted_PictureNamingScore.png
plot: actual vs predicted scores for fold 3 saved to /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/fold2/fold3_actual_vs_predicted_PictureNamingScore.png
plot: actual vs predicted scores for fold 4 saved to /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/fold2/fold4_actual_vs_predicted_PictureNamingScore.png
plot: actual vs predicted scores for fold 5 saved to /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/fold2/fold5_actual_vs_predicted_PictureNamingScore.png


In [9]:
# plot distribution of scores for each split
format_target = format_title(target)
format_task = format_title(task_name)

# all distribution plots go into one folder; create it up front so that
# savefig does not fail when the folder does not exist yet
distribution_dir = os.path.join(GIT_DIRECTORY, "results", "plots", "fold_performance")
os.makedirs(distribution_dir, exist_ok=True)

for fold in range(1, n_folds + 1):
    fold_df = df[df["fold"] == fold]
    # dedicated name: calling this `scores` would overwrite the scores table
    # loaded in the setup cell and break a re-run of that merge
    fold_scores = fold_df[target]

    # a fold without subjects has no min/max to build bins from
    if fold_scores.empty:
        print(f"fold {fold}: no subjects, skipping score distribution plot")
        continue

    # One bin per integer score. The edges must run up to max + 1: the last
    # histogram bin is closed on both sides, so stopping at max would merge the
    # two highest scores into one bar (and give a single edge, i.e. no bin at
    # all, when every score in the fold is identical).
    lowest = int(np.floor(fold_scores.min()))
    highest = int(np.floor(fold_scores.max()))
    bin_edges = range(lowest, highest + 2)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(fold_scores, bins=bin_edges, color='slateblue', edgecolor='black')
    ax.set_xlabel(f"{format_target}", fontsize=14, fontweight='bold')
    ax.set_ylabel("Number of People", fontsize=14, fontweight='bold')
    ax.set_title(f"{format_target} Distribution (Fold {fold}, {format_task})", fontsize=14, fontweight='bold')
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    plot_path = os.path.join(distribution_dir, f"fold{fold}_{target}_distribution.png")
    fig.savefig(plot_path, dpi=300)
    # close explicitly so figures do not accumulate in memory across folds
    plt.close(fig)
