In [14]:
# test different kinds of transformations for Picture Naming Score
# square root, cube root, logarithm, reciprocal, box-cox, Yeo-Johnson

# setup
import numpy as np
import pandas as pd
from scipy.stats import boxcox, yeojohnson
import matplotlib.pyplot as plt
import sys
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY

# define save path
save_dir = os.path.join(GIT_DIRECTORY, "results/plots/picturenaming_transformation")
os.makedirs(save_dir, exist_ok=True)

# task and target
task_name = "cookieTheft"
target = "PictureNamingScore"

# load features and target
features = pd.read_csv(os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv"))
scores = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))

# merge and drop missing
df = pd.merge(features, scores[["Subject_ID", target]], on="Subject_ID").dropna()

# extract X and y
X = df.drop(columns=["Subject_ID", target])
y = df[target]

# load pre-generated folds
folds_path = os.path.join(GIT_DIRECTORY, "data", f"{task_name}_stratified_folds.csv")
fold_df = pd.read_csv(folds_path)

# merge fold info into df
df = pd.merge(df, fold_df[["Subject_ID", "fold"]], on="Subject_ID")

In [15]:
# function to test different transformations
def transform_score(y, method="none"):
    """
    Apply a transformation to the target variable.
    - method: ['none', 'sqrt', 'cbrt', 'log', 'reciprocal', 'boxcox', 'yeojohnson']
    """
    y = y.copy()
    name = method

    if method == "none":
        return y, "None"

    elif method == "sqrt":
        y_transformed = np.sqrt(y)

    elif method == "cbrt":
        y_transformed = np.cbrt(y)

    elif method == "log":
        if (y <= 0).any():
            y = y + 1 - y.min()  # shift to make all values positive
        y_transformed = np.log(y)

    elif method == "reciprocal":
        if (y == 0).any():
            y = y + 1e-6
        y_transformed = 1 / y

    elif method == "boxcox":
        if (y <= 0).any():
            y = y + 1 - y.min()
        y_transformed, _ = boxcox(y)

    elif method == "yeojohnson":
        y_transformed, _ = yeojohnson(y)

    return pd.Series(y_transformed, index=y.index), name

# adapted function for transformation of score
def stratified_cross_validation2(
    df, fold_column, model_class, model_params,
    target_column, n_folds=5, feature_columns=None, target_transform="none"
):
    r2_scores = []
    rmse_scores = []
    mae_scores = []
    all_preds = []

    for fold in range(n_folds):
        train_df = df[df[fold_column] != fold]
        test_df = df[df[fold_column] == (fold + 1)]

        X_train = train_df[feature_columns]
        y_train = train_df[target_column]
        X_test = test_df[feature_columns]
        y_test = test_df[target_column]

        # optional: apply transformation to score
        y_train_transformed, _ = transform_score(y_train, method=target_transform)
        y_test_transformed, _ = transform_score(y_test, method=target_transform)

        # standardize features
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

        # train model
        model = model_class(**model_params) if model_params else model_class()
        model.fit(X_train_scaled, y_train_transformed)
        y_pred = model.predict(X_test_scaled)

        # evaluate using transformed target
        r2 = r2_score(y_test_transformed, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test_transformed, y_pred))
        mae = mean_absolute_error(y_test_transformed, y_pred)

        r2_scores.append(r2)
        rmse_scores.append(rmse)
        mae_scores.append(mae)

        fold_df = pd.DataFrame({
            "y_test": y_test_transformed.values,
            "y_pred": y_pred,
            "fold": fold
        })
        all_preds.append(fold_df)

        print(f"Fold {fold+1}: R² = {r2:.3f}, RMSE = {rmse:.2f}, MAE = {mae:.2f}")

    all_preds_df = pd.concat(all_preds, ignore_index=True)

    return r2_scores, rmse_scores, mae_scores, all_preds_df


In [16]:
transformations = ["none", "sqrt", "cbrt", "log", "reciprocal", "boxcox", "yeojohnson"]

for tfm in transformations:
    print(f"\nTrying transformation: {tfm}")

    r2_list, rmse_list, mae_list, all_preds = stratified_cross_validation2(
        df=df,
        fold_column="fold",
        model_class=LinearRegression,
        model_params=None,
        target_column="PictureNamingScore",
        feature_columns=X.columns,
        n_folds=5,
        target_transform=tfm
    )

    # plot predictions
    plt.figure(figsize=(6, 5))
    for fold in all_preds['fold'].unique():
        fold_df = all_preds[all_preds['fold'] == fold]
        plt.scatter(fold_df["y_test"], fold_df["y_pred"], alpha=0.6, label=f"Fold {fold+1}")

    plt.plot(
        [all_preds["y_test"].min(), all_preds["y_test"].max()],
        [all_preds["y_test"].min(), all_preds["y_test"].max()],
        linestyle="--", color="gray", label="Perfect Prediction"
    )

    plt.xlabel("Actual Score")
    plt.ylabel("Predicted Score")
    plt.title(f"{target} - {tfm} transformation")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()

    # save
    plot_path = os.path.join(save_dir, f"{tfm}_prediction_plot.png".lower())
    plt.savefig(plot_path, dpi=300)
    plt.close()
    print(f"Saved plot to: {plot_path}")

    print(f"R² mean: {np.mean(r2_list):.3f}, RMSE: {np.mean(rmse_list):.2f}, MAE: {np.mean(mae_list):.2f}")


Trying transformation: none
Fold 1: R² = -0.017, RMSE = 2.36, MAE = 1.76
Fold 2: R² = 0.200, RMSE = 2.29, MAE = 1.75
Fold 3: R² = 0.246, RMSE = 2.27, MAE = 1.66
Fold 4: R² = 0.446, RMSE = 1.97, MAE = 1.50
Fold 5: R² = 0.242, RMSE = 2.45, MAE = 1.72
Saved plot to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/none_prediction_plot.png
R² mean: 0.223, RMSE: 2.27, MAE: 1.68

Trying transformation: sqrt
Fold 1: R² = -0.016, RMSE = 0.32, MAE = 0.22
Fold 2: R² = 0.185, RMSE = 0.32, MAE = 0.23
Fold 3: R² = 0.243, RMSE = 0.30, MAE = 0.21
Fold 4: R² = 0.518, RMSE = 0.26, MAE = 0.19
Fold 5: R² = 0.238, RMSE = 0.33, MAE = 0.22
Saved plot to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/sqrt_prediction_plot.png
R² mean: 0.234, RMSE: 0.31, MAE: 0.21

Trying transformation: cbrt
Fold 1: R² = -0.017, RMSE = 0.14, MAE = 0.10
Fold 2: R² = 0.179, RMSE = 0.14, MAE = 0.10
Fold 3: R² = 0

In [23]:
score = df[target]
methods = transformations

for m in methods:
    y_trans, name = transform_score(score, method=m)

    if name == "None":
        bins = np.arange(0, 22) - 0.5  # one bin per score
    else:
        bins = 30  # default for transformed

    plt.figure(figsize=(5, 3))
    plt.hist(y_trans, bins=bins, color="steelblue", edgecolor="white")
    plt.title(f"{name} transformation")
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()

    # Save histograms too
    hist_path = os.path.join(save_dir, f"{name}_histogram.png".lower())
    plt.savefig(hist_path, dpi=300)
    plt.close()
    print(f"Saved histogram to: {hist_path}")


Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/none_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/sqrt_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/cbrt_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/log_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/reciprocal_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/boxcox_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformati