In [5]:
# test the effect of different alphas for Lasso and Ridge Regression and different number of trees for Random Forest Regression

# setup
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY
from regression.regression_functions import stratified_cross_validation

# define paths and load data
task_name = "cookieTheft"
target = "SemanticFluencyScore"
save_dir = os.path.join(GIT_DIRECTORY, "results/plots/hyperparameter_tuning")
os.makedirs(save_dir, exist_ok=True)

features = pd.read_csv(os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv"))
scores = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))
df = pd.merge(features, scores[["Subject_ID", target]], on="Subject_ID").dropna()

# load folds
fold_df = pd.read_csv(os.path.join(GIT_DIRECTORY, "data", f"{task_name}_stratified_folds.csv"))
df = pd.merge(df, fold_df[["Subject_ID", "fold"]], on="Subject_ID")
X = df.drop(columns=["Subject_ID", target])
y = df[target]


In [6]:
# ridge & lasso
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
ridge_results, lasso_results = [], []

for alpha in alphas:
    for model_class, result_list, name in [(Ridge, ridge_results, "Ridge"), (Lasso, lasso_results, "Lasso")]:
        r2_list, _, _, _ = stratified_cross_validation(
            df=df,
            fold_column="fold",
            model_class=model_class,
            model_params={"alpha": alpha},
            target_column=target,
            feature_columns=X.columns
        )
        result_list.append({"alpha": alpha, "mean_r2": np.mean(r2_list)})

Fold 2: R² = 0.071, RMSE = 5.47, MAE = 4.27
Fold 3: R² = -1.626, RMSE = 9.11, MAE = 4.51
Fold 4: R² = 0.125, RMSE = 4.81, MAE = 3.96
Fold 5: R² = 0.071, RMSE = 5.57, MAE = 4.29
Fold 6: R² = 0.133, RMSE = 5.28, MAE = 4.06
Fold 2: R² = 0.075, RMSE = 5.46, MAE = 4.26
Fold 3: R² = -1.566, RMSE = 9.01, MAE = 4.49
Fold 4: R² = 0.130, RMSE = 4.79, MAE = 3.95


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Fold 5: R² = 0.074, RMSE = 5.56, MAE = 4.29
Fold 6: R² = 0.137, RMSE = 5.27, MAE = 4.05
Fold 2: R² = 0.071, RMSE = 5.47, MAE = 4.27
Fold 3: R² = -1.625, RMSE = 9.11, MAE = 4.51
Fold 4: R² = 0.125, RMSE = 4.81, MAE = 3.96
Fold 5: R² = 0.071, RMSE = 5.57, MAE = 4.29
Fold 6: R² = 0.133, RMSE = 5.28, MAE = 4.06
Fold 2: R² = 0.098, RMSE = 5.39, MAE = 4.21
Fold 3: R² = -1.201, RMSE = 8.34, MAE = 4.43
Fold 4: R² = 0.143, RMSE = 4.76, MAE = 3.91


  model = cd_fast.enet_coordinate_descent(


Fold 5: R² = 0.090, RMSE = 5.52, MAE = 4.26
Fold 6: R² = 0.145, RMSE = 5.25, MAE = 4.04
Fold 2: R² = 0.072, RMSE = 5.47, MAE = 4.27
Fold 3: R² = -1.616, RMSE = 9.10, MAE = 4.51
Fold 4: R² = 0.126, RMSE = 4.80, MAE = 3.96
Fold 5: R² = 0.072, RMSE = 5.57, MAE = 4.29
Fold 6: R² = 0.134, RMSE = 5.28, MAE = 4.06
Fold 2: R² = 0.125, RMSE = 5.31, MAE = 4.17
Fold 3: R² = -0.005, RMSE = 5.64, MAE = 4.17
Fold 4: R² = 0.148, RMSE = 4.74, MAE = 3.92
Fold 5: R² = 0.106, RMSE = 5.47, MAE = 4.20
Fold 6: R² = 0.160, RMSE = 5.20, MAE = 4.00
Fold 2: R² = 0.078, RMSE = 5.45, MAE = 4.25
Fold 3: R² = -1.530, RMSE = 8.95, MAE = 4.49
Fold 4: R² = 0.131, RMSE = 4.79, MAE = 3.94
Fold 5: R² = 0.075, RMSE = 5.56, MAE = 4.28
Fold 6: R² = 0.136, RMSE = 5.27, MAE = 4.05
Fold 2: R² = 0.049, RMSE = 5.53, MAE = 4.42
Fold 3: R² = 0.064, RMSE = 5.44, MAE = 4.33
Fold 4: R² = 0.059, RMSE = 4.98, MAE = 4.11
Fold 5: R² = 0.052, RMSE = 5.63, MAE = 4.43
Fold 6: R² = 0.070, RMSE = 5.47, MAE = 4.31
Fold 2: R² = 0.098, RMSE = 5.

In [11]:
# random forest
rf_results = []
n_trees_list = [10, 50, 100, 150, 200]

for n in n_trees_list:
    r2_list, _, _, _ = stratified_cross_validation(
        df=df,
        fold_column="fold",
        model_class=RandomForestRegressor,
        model_params={"n_estimators": n, "random_state": 42},
        target_column=target,
        feature_columns=X.columns
    )
    rf_results.append({"n_trees": n, "mean_r2": np.mean(r2_list)})

Fold 2: R² = 0.006, RMSE = 5.66, MAE = 4.33
Fold 3: R² = 0.117, RMSE = 5.29, MAE = 4.24
Fold 4: R² = -0.050, RMSE = 5.26, MAE = 4.29
Fold 5: R² = 0.037, RMSE = 5.67, MAE = 4.38
Fold 6: R² = 0.054, RMSE = 5.52, MAE = 4.37
Fold 2: R² = 0.068, RMSE = 5.48, MAE = 4.26
Fold 3: R² = 0.167, RMSE = 5.13, MAE = 4.06
Fold 4: R² = 0.097, RMSE = 4.88, MAE = 4.02
Fold 5: R² = 0.084, RMSE = 5.53, MAE = 4.26
Fold 6: R² = 0.080, RMSE = 5.44, MAE = 4.26
Fold 2: R² = 0.088, RMSE = 5.42, MAE = 4.19
Fold 3: R² = 0.186, RMSE = 5.07, MAE = 4.00
Fold 4: R² = 0.110, RMSE = 4.85, MAE = 3.93
Fold 5: R² = 0.110, RMSE = 5.45, MAE = 4.23
Fold 6: R² = 0.083, RMSE = 5.44, MAE = 4.23
Fold 2: R² = 0.088, RMSE = 5.42, MAE = 4.19
Fold 3: R² = 0.173, RMSE = 5.12, MAE = 4.05
Fold 4: R² = 0.124, RMSE = 4.81, MAE = 3.89
Fold 5: R² = 0.122, RMSE = 5.42, MAE = 4.19
Fold 6: R² = 0.091, RMSE = 5.41, MAE = 4.18
Fold 2: R² = 0.086, RMSE = 5.43, MAE = 4.19
Fold 3: R² = 0.173, RMSE = 5.11, MAE = 4.06
Fold 4: R² = 0.126, RMSE = 4.80

In [12]:
# plot results
def plot_results(data, x, y, title, xlabel, filename, xscale=None):
    plt.figure(figsize=(6, 4))
    plt.plot(data[x], data[y], marker="o")
    if xscale:
        plt.xscale(xscale)
    plt.xlabel(xlabel)
    plt.ylabel("Mean R²")
    plt.title(title)
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, filename), dpi=300)
    plt.close()

plot_results(pd.DataFrame(ridge_results), "alpha", "mean_r2",
             "Ridge: Effect of Alpha", "Alpha (log scale)", "ridge_alpha_effect.png", xscale="log")

plot_results(pd.DataFrame(lasso_results), "alpha", "mean_r2",
             "Lasso: Effect of Alpha", "Alpha (log scale)", "lasso_alpha_effect.png", xscale="log")

plot_results(pd.DataFrame(rf_results), "n_trees", "mean_r2",
             "Random Forest: Effect of n_estimators", "Number of Trees", "rf_n_estimators_effect.png")
