In [19]:
# setup
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Make the project's `src` directory importable so that the `config` and
# `regression` packages below resolve.
# NOTE(review): this is a machine-specific absolute path, so the notebook only
# runs unchanged on the original author's machine — consider deriving it from
# the notebook location or an environment variable.
SRC_DIRECTORY = "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src"
if SRC_DIRECTORY not in sys.path:
    # guard so that re-running this cell does not pile up duplicate entries
    sys.path.append(SRC_DIRECTORY)

from config.constants import GIT_DIRECTORY
from regression.train_regression_models import train_and_evaluate_regression_model

# --- experiment configuration -------------------------------------------------
# which speech task's features to use, and which score column to predict
task_name = "cookieTheft"
target = "PictureNamingScore"

# --- input files ---------------------------------------------------------------
# both paths are resolved relative to the repository root (GIT_DIRECTORY)
scores_path = os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv")
features_path = os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv")

scores = pd.read_csv(scores_path)
features = pd.read_csv(features_path)

# --- modelling table -----------------------------------------------------------
# join the features with the target score on Subject_ID (default inner join),
# then discard every row that still contains a missing value
df = features.merge(scores[["Subject_ID", target]], on="Subject_ID").dropna()

# target vector, and predictor matrix = everything except the ID and the target
y = df[target]
X = df.drop(columns=["Subject_ID", target])

# --- fit and evaluate ----------------------------------------------------------
# model_class can be swapped for other regressors
# (Linear Regression, Ridge, Lasso, Random Forest)
(
    model,
    metrics,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred_train,
    y_pred_test,
) = train_and_evaluate_regression_model(
    X,
    y,
    model_class=LinearRegression,
    model_params=None,
    test_size=0.2,
    random_state=42,
)

In [20]:
# CORRELATION MATRIX
# Computes the pairwise correlation matrix of all features plus the target,
# writes it to CSV and saves it as a heatmap PNG.

# set style
# The font is passed to seaborn directly: sns.set() resets matplotlib's
# "font.family" rcParam (to its default font="sans-serif"), so assigning
# plt.rcParams["font.family"] before this call would be silently overwritten.
sns.set(style="whitegrid", font="Arial")

# standardize the filtered feature set
# (Pearson correlation is unaffected by this rescaling; the scaled frame is
# kept so that `scaler` and `X_scaled` remain available to other cells)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# add target column to standardized data
# y.values is assigned positionally: X_scaled has a fresh default index, so
# assigning the Series itself would align on index labels instead of row order
X_with_target = X_scaled.copy()
X_with_target[target] = y.values

# compute correlations
corr = X_with_target.corr()

# save full correlation matrix as CSV
corr_csv_path = os.path.join(GIT_DIRECTORY, f"results/regression/correlation_matrix_{task_name}_{target}.csv")
# create the output folder if needed so the cell also runs on a fresh checkout
os.makedirs(os.path.dirname(corr_csv_path), exist_ok=True)
corr.to_csv(corr_csv_path)
print(f"\ncorrelation matrix saved as CSV to:\n{corr_csv_path}")

# plot full heatmap with small, rotated labels
# explicit figure/axes so styling targets the heatmap axes, not "whatever is current"
fig, ax = plt.subplots(figsize=(18, 16))  # make it big to handle many labels
sns.heatmap(
    corr,
    annot=False,           # skip annotation (numbers), too cluttered
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.8},
    linewidths=0.5,
    ax=ax
)

ax.tick_params(axis="x", labelrotation=90, labelsize=6)  # rotate + shrink
ax.tick_params(axis="y", labelrotation=0, labelsize=6)

ax.set_title(f"Correlation Matrix with {target}", fontsize=16, fontweight="bold", pad=20)
fig.tight_layout()

# save full plot
full_corr_path = os.path.join(GIT_DIRECTORY, f"results/plots/correlation_matrix_{task_name}_{target}.png")
os.makedirs(os.path.dirname(full_corr_path), exist_ok=True)
fig.savefig(full_corr_path, dpi=300, bbox_inches="tight")
plt.close(fig)  # the figure is only written to disk, not shown inline

print(f"heatmap saved to:\n{full_corr_path}")



correlation matrix saved as CSV to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/regression/correlation_matrix_cookieTheft_PictureNamingScore.csv
heatmap saved to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/correlation_matrix_cookieTheft_PictureNamingScore.png
