In [5]:
# setup
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# project path setup
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY
from regression.train_regression_models import train_and_evaluate_regression_model
from feature_selection.feature_selection_functions import load_filtered_features


# Task and regression target analysed in this notebook.
task_name = "cookieTheft"
target = "PhonemicFluencyScore"

# Feature columns requested from the filtered feature table.
# The list is assembled from thematic groups (grouping is by column name only);
# the resulting order is exactly the order of a single flat list.
selected_columns = (
    # word-level / lexical measures
    [
        "n_words", "ttr", "mattr", "filler_word_ratio", "concreteness_score",
        "aoa_average", "average_word_length", "brunets_index",
        "honores_statistic", "guirauds_statistic",
    ]
    # part-of-speech tag columns
    + [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
        "NOUN", "NUM", "PART", "PRON", "SCONJ", "VERB",
    ]
    # part-of-speech ratios and summary measures
    + [
        "NOUN/VERB", "PRON/NOUN", "DET/NOUN", "AUX/VERB",
        "OPEN/CLOSED", "POS_ENTROPY", "LEXICAL_DENSITY",
    ]
    # timing / pause columns
    + [
        "speech_rate", "pause_ratio", "n_pauses",
        "avg_pause_duration", "articulation_rate", "hesitation_ratio",
    ]
    # eGeMAPS columns
    + [
        "eGeMAPS_jitterLocal_sma3nz_amean",
        "eGeMAPS_shimmerLocaldB_sma3nz_amean",
    ]
)

# load filtered features; rows without a Subject_ID cannot be matched to a score
features = load_filtered_features(
    task_name=task_name,
    features_dir=os.path.join(GIT_DIRECTORY, "results/features"),
    selected_columns=selected_columns
).dropna(subset=["Subject_ID"])

# load scores; keep only subjects that have a value for the target
scores = pd.read_csv(os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv"))
scores = scores[["Subject_ID", target]].dropna()

# Pair every feature row with its score by joining on Subject_ID.
# Sorting the two tables independently and trusting that row i of one matches
# row i of the other is only correct when both tables hold exactly one row per
# subject; `validate="one_to_one"` turns a violation of that assumption into an
# explicit MergeError instead of silently mispaired (X, y) rows.
merged = features.merge(scores, on="Subject_ID", how="inner", validate="one_to_one")

# subjects present in both tables (kept for inspection)
shared_ids = set(merged["Subject_ID"])

# Deterministic row order (by subject), then drop the ID and any row with a
# missing feature value. Features and target live in one frame here, so they
# stay aligned through the dropna by construction.
model_data = (
    merged
    .sort_values("Subject_ID")
    .drop(columns=["Subject_ID"])
    .dropna()
    .reset_index(drop=True)
)

X = model_data.drop(columns=[target])
y = model_data[target]

assert len(X) == len(y) and X.index.equals(y.index), "X and y are not row-aligned"

# scale features
# NOTE(review): the scaler is fit on all rows, i.e. before any train/test split
# that happens downstream, so held-out rows contribute to the mean/std used here.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Fit and evaluate a LinearRegression through the project helper.
# The helper's eight return values are unpacked in the order it yields them.
(
    model,
    metrics,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred_train,
    y_pred_test,
) = train_and_evaluate_regression_model(
    X_scaled,
    y,
    model_class=LinearRegression,
    model_params=None,
)

# Report the split sizes, then the metrics returned by the helper.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Evaluation metrics:", metrics)

X_train shape: (765, 38)
X_test shape: (192, 38)
Evaluation metrics: {'R2': -0.0009174411008896488, 'RMSE': np.float64(4.524395189815523), 'MAE': 3.588877665037545}


In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of a linear regression on the selected features.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics and per-fold prediction tables
r2_list, rmse_list, mae_list = [], [], []
fold_predictions = []

# Split the *unscaled* features and standardise inside each fold: the scaler is
# fit on the training rows only and then applied to the held-out rows, so no
# statistic of a test fold enters preprocessing. (Cutting the folds out of a
# frame that was scaled on all rows leaks test-fold mean/std into training.)
# For plain least squares the predictions should not depend on feature scaling,
# so the numbers are expected to be unchanged up to floating point; the
# difference matters as soon as a scale-sensitive model is swapped in.
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # X and y are row-aligned by position, hence .iloc on both
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    fold_scaler = StandardScaler().fit(X.iloc[train_index])
    X_train = pd.DataFrame(
        fold_scaler.transform(X.iloc[train_index]),
        columns=X.columns,
        index=y_train.index,
    )
    X_test = pd.DataFrame(
        fold_scaler.transform(X.iloc[test_index]),
        columns=X.columns,
        index=y_test.index,
    )

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # Store metrics
    r2_list.append(r2)
    rmse_list.append(rmse)
    mae_list.append(mae)

    # Save predictions (`fold` is stored 0-based; the printout below is 1-based)
    fold_predictions.append(pd.DataFrame({
        "y_test": y_test.values,
        "y_pred": y_pred,
        "fold": fold
    }))

    print(f"Fold {fold + 1}: R² = {r2:.3f}, RMSE = {rmse:.3f}, MAE = {mae:.3f}")

# Summarise across folds so the collected metrics are actually reported
print(
    f"Mean over {len(r2_list)} folds: "
    f"R² = {np.mean(r2_list):.3f} ± {np.std(r2_list):.3f}, "
    f"RMSE = {np.mean(rmse_list):.3f} ± {np.std(rmse_list):.3f}, "
    f"MAE = {np.mean(mae_list):.3f} ± {np.std(mae_list):.3f}"
)

In [None]:
# CORRELATION MATRIX for selected features (plus the target), saved as CSV and PNG

# set style
# seaborn first: sns.set() rewrites matplotlib's rcParams (font family included),
# so the font has to be chosen afterwards or the setting is overwritten.
sns.set(style="whitegrid")
plt.rcParams["font.family"] = "Arial"

# combine scaled features with target (y is attached by position via .values)
X_with_target = X_scaled.copy()
X_with_target[target] = y.values

# compute correlation matrix
corr = X_with_target.corr()

# save as CSV (create the output folder if it does not exist yet)
corr_csv_path = os.path.join(GIT_DIRECTORY, f"results/regression/correlation_matrix_{task_name}_{target}_selected.csv")
os.makedirs(os.path.dirname(corr_csv_path), exist_ok=True)
corr.to_csv(corr_csv_path)
print(f"Correlation matrix saved to CSV:\n{corr_csv_path}")

# plot heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(
    corr,
    annot=False,
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.8},
    linewidths=0.5,
    ax=ax
)

plt.setp(ax.get_xticklabels(), rotation=90, fontsize=6)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=6)

# Capitalise only the first letter: str.title() would lower-case the rest of a
# camelCase task name ("cookieTheft" -> "Cookietheft").
task_label = task_name[:1].upper() + task_name[1:]
ax.set_title(f"{task_label} – Correlation Matrix ({target})", fontsize=16, fontweight="bold", pad=20)
fig.tight_layout()

# save plot (create the output folder if it does not exist yet)
plot_path = os.path.join(GIT_DIRECTORY, f"results/plots/correlation_matrix_{task_name}_{target}_selected.png")
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path, dpi=300, bbox_inches="tight")
plt.close(fig)  # the figure is only written to disk, not shown inline

print(f"Heatmap saved to:\n{plot_path}")

In [10]:
# List feature pairs whose absolute Pearson correlation exceeds a threshold.

# Single source of truth for the cut-off: used by the filter AND the printed header
threshold = 0.7

# Compute feature–feature correlation matrix (excluding target)
feature_corr = X_scaled.corr()

# Mask the upper triangle *including the diagonal* so every unordered pair
# appears once and self-correlations are excluded
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

# Unstack the remaining lower triangle to long format: one row per pair
high_corr_pairs = (
    feature_corr.where(~mask)  # masked cells become NaN
    .stack()                   # convert to long format
    .reset_index()             # make a DataFrame
)
high_corr_pairs.columns = ['Feature1', 'Feature2', 'Correlation']

# Keep |r| > threshold (a NaN never satisfies the comparison, so any masked
# cell that survives stacking is dropped here), strongest pairs first
high_corr_pairs = (
    high_corr_pairs[high_corr_pairs['Correlation'].abs() > threshold]
    .sort_values('Correlation', key=lambda r: r.abs(), ascending=False)
)

print(f"Highly intercorrelated feature pairs (|r| > {threshold}):\n")
print(high_corr_pairs)


Highly intercorrelated feature pairs (|r| > 0.7):

              Feature1           Feature2  Correlation
626   hesitation_ratio        pause_ratio     0.998570
22       brunets_index                ttr    -0.950396
496           n_pauses            n_words     0.939208
296          PRON/NOUN               PRON     0.923080
123               INTJ  filler_word_ratio     0.918097
338           AUX/VERB                AUX     0.837596
275          NOUN/VERB               VERB    -0.812903
591  articulation_rate        speech_rate     0.766562
293          PRON/NOUN               NOUN    -0.743734
433    LEXICAL_DENSITY        OPEN/CLOSED     0.739118
270          NOUN/VERB               NOUN     0.732008
395        POS_ENTROPY               NOUN    -0.730987
