In [1]:
# test different kinds of transformations for Picture Naming Score
# square root, cube root, logarithm, reciprocal, box-cox

# setup
import numpy as np
import pandas as pd
from scipy.stats import boxcox, yeojohnson
import matplotlib.pyplot as plt
import sys
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY

# directory that receives every plot written by this notebook
save_dir = os.path.join(GIT_DIRECTORY, "results/plots/picturenaming_transformation")
os.makedirs(save_dir, exist_ok=True)

# task whose features are used, and the score column to predict
task_name, target = "cookieTheft", "PictureNamingScore"

# read the filtered feature table and the language scores
features = pd.read_csv(os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv"))
scores = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))

# attach the target to the features (pandas' default inner join on Subject_ID)
# and keep only rows without any missing value
df = features.merge(scores[["Subject_ID", target]], on="Subject_ID").dropna(axis=0, how="any")

# predictors: everything except the subject id and the target; y: the target column
X = df.drop(["Subject_ID", target], axis=1)
y = df.loc[:, target]

# read the pre-generated fold assignment
folds_path = os.path.join(GIT_DIRECTORY, "data", f"{task_name}_stratified_folds.csv")
fold_df = pd.read_csv(folds_path)

# add each subject's fold number to df (subjects without a fold row drop out of the inner join)
df = df.merge(fold_df[["Subject_ID", "fold"]], on="Subject_ID")

In [2]:
# function to test different transformations
def transform_score(y, method="none", fit_lambda=True, train_params=None, max_score=20):
    """Apply one of several transformations to a score vector.

    Parameters
    ----------
    y : pandas.Series
        Scores to transform. The input is copied and never modified.
    method : str
        One of "none", "sqrt", "cbrt", "log", "reciprocal", "boxcox".
    fit_lambda : bool
        Only used by "boxcox". True (training): estimate lambda and the
        positivity shift from `y`. False (testing): reuse `train_params`.
    train_params : tuple or None
        ``(lmbda, shift)`` as returned by a previous "boxcox" call with
        ``fit_lambda=True``. Required for "boxcox" with ``fit_lambda=False``.
    max_score : number
        Reflection point used by "log" (``max_score - y``).
        NOTE(review): 20 is presumably the maximum attainable score - confirm.

    Returns
    -------
    tuple
        ``(transformed, name, params)`` where `name` is `method` and `params`
        is None for "none"/"sqrt"/"cbrt", the additive shift for
        "log"/"reciprocal", and ``(lmbda, shift)`` for "boxcox".

    Raises
    ------
    ValueError
        If `method` is unknown, or "boxcox" is applied with
        ``fit_lambda=False`` but without `train_params`.
    """
    y = y.copy()

    if method == "none":
        return y, method, None

    if method == "sqrt":
        return np.sqrt(y), method, None

    if method == "cbrt":
        return np.cbrt(y), method, None

    if method == "log":  # compresses high values
        shift = 1  # avoids log(0) when a score equals max_score
        y_reversed = max_score - y  # reflect because the distribution is left-skewed
        return np.log(y_reversed + shift), method, shift

    if method == "reciprocal":  # very strong compression of large values
        shift = 1e-6  # avoids division by zero for a score of 0
        return 1 / (y + shift), method, shift

    if method == "boxcox":  # lambda is chosen by scipy to normalize the data
        if fit_lambda:
            # Box-Cox needs strictly positive data: shift so the minimum becomes 1.
            shift = 0
            if (y <= 0).any():
                shift = 1 - y.min()
            values, lmbda = boxcox(np.asarray(y + shift, dtype=float))
        else:
            if train_params is None:
                raise ValueError(
                    "method='boxcox' with fit_lambda=False requires "
                    "train_params=(lmbda, shift) from the training call"
                )
            lmbda, shift = train_params
            # Values that stay non-positive after the training shift are not
            # checked here; scipy is expected to return nan for negative input.
            values = boxcox(np.asarray(y + shift, dtype=float), lmbda=lmbda)

        # scipy returns a bare ndarray; restore the original index so the
        # result stays aligned with the rows it came from.
        if isinstance(y, pd.Series):
            values = pd.Series(values, index=y.index, name=y.name)
        return values, method, (lmbda, shift)

    raise ValueError(f"Unknown transformation method: {method!r}")

# adapted function for transformation of score
def stratified_cross_validation2(
    df, fold_column, model_class, model_params,
    target_column, n_folds=5, feature_columns=None, target_transform="none"
):
    """Cross-validate a model on pre-assigned folds with a transformed target.

    For each fold number 1..n_folds, rows whose `fold_column` equals that
    number form the test set and all remaining rows form the training set.
    The target is transformed with `transform_score` (parameters fitted on the
    training part and re-applied to the test part), the features are
    standardized with a scaler fitted on the training part only, and a fresh
    model is fitted and evaluated.

    Note: R² is computed between the *transformed* test target and the
    predictions, i.e. on the transformed scale. Scores obtained with different
    `target_transform` values are therefore not measured on the same scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features, the target and the fold assignment.
    fold_column : str
        Column holding the fold number of each row.
    model_class : callable
        Estimator class; instantiated once per fold.
    model_params : dict or None
        Keyword arguments for `model_class`; None/empty uses its defaults.
    target_column : str
        Column holding the untransformed target.
    n_folds : int
        Number of folds; fold labels are expected to be 1..n_folds.
    feature_columns : list-like of str
        Feature columns to use. Must be provided.
    target_transform : str
        Method name forwarded to `transform_score`.

    Returns
    -------
    (list of float, pandas.DataFrame)
        Per-fold R² values, and a frame with columns "y_test" (transformed),
        "y_pred" and "fold" for all test rows (index reset).

    Raises
    ------
    ValueError
        If `feature_columns` is None.
    """
    if feature_columns is None:
        # The previous behaviour was an opaque `KeyError: None` on first use.
        raise ValueError("feature_columns must be provided (got None)")

    r2_scores = []
    fold_predictions = []

    for fold in range(1, n_folds + 1):
        train_df = df[df[fold_column] != fold]
        test_df = df[df[fold_column] == fold]

        X_train = train_df[feature_columns]
        y_train = train_df[target_column]
        X_test = test_df[feature_columns]
        y_test = test_df[target_column]

        # Fit the target transformation on the training scores only, then
        # re-apply the fitted parameters to the held-out scores.
        y_train_transformed, _, train_params = transform_score(
            y_train, method=target_transform, fit_lambda=True
        )
        y_test_transformed, _, _ = transform_score(
            y_test, method=target_transform, fit_lambda=False, train_params=train_params
        )

        # Standardize features with statistics from the training part only.
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test), columns=X_test.columns, index=X_test.index
        )

        # Train a fresh model for this fold.
        model = model_class(**model_params) if model_params else model_class()
        model.fit(X_train_scaled, y_train_transformed)
        y_pred = model.predict(X_test_scaled)

        # Evaluate on the transformed target scale.
        r2 = r2_score(y_test_transformed, y_pred)
        r2_scores.append(r2)

        fold_predictions.append(pd.DataFrame({
            "y_test": y_test_transformed,
            "y_pred": y_pred,
            "fold": fold
        }, index=y_test.index))

        print(f"Fold {fold}: R² = {r2:.3f}")

    all_preds_df = pd.concat(fold_predictions, ignore_index=True)

    return r2_scores, all_preds_df


In [3]:
transformations = ["none", "sqrt", "cbrt", "log", "reciprocal", "boxcox"]

for tfm in transformations:
    print(f"\ntransformation: {tfm}")

    r2_list, all_preds = stratified_cross_validation2(
        df=df,
        fold_column="fold",
        model_class=LinearRegression,
        model_params=None,
        target_column=target,  # same column as configured in the setup cell
        feature_columns=X.columns,
        n_folds=5,
        target_transform=tfm
    )

    # plot predictions (values are on the transformed scale returned by the CV function)
    fig, ax = plt.subplots(figsize=(6, 5))
    for fold in all_preds['fold'].unique():
        # named fold_preds so the notebook-level `fold_df` (fold assignment
        # table loaded in the setup cell) is not overwritten
        fold_preds = all_preds[all_preds['fold'] == fold]
        ax.scatter(fold_preds["y_test"], fold_preds["y_pred"], alpha=0.6, label=f"Fold {fold}")

    # identity line spanning the observed range of the test target
    diag = [all_preds["y_test"].min(), all_preds["y_test"].max()]
    ax.plot(diag, diag, linestyle="--", color="gray", label="Perfect Prediction")

    ax.set_xlabel("Actual Score")
    ax.set_ylabel("Predicted Score")
    ax.set_title(f"{target} - {tfm} transformation")
    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.5)
    fig.tight_layout()

    # save
    plot_path = os.path.join(save_dir, f"{tfm}_prediction_plot.png".lower())
    fig.savefig(plot_path, dpi=300)
    plt.close(fig)  # release the figure; one is created per transformation
    print(f"Saved plot to: {plot_path}")

    print(f"R² mean: {np.mean(r2_list):.3f}")


transformation: none
Fold 1: R² = -0.198
Fold 2: R² = -0.147
Fold 3: R² = 0.112
Fold 4: R² = 0.315
Fold 5: R² = 0.173
Saved plot to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/none_prediction_plot.png
R² mean: 0.051

transformation: sqrt
Fold 1: R² = -0.182
Fold 2: R² = -0.412
Fold 3: R² = 0.114
Fold 4: R² = 0.367
Fold 5: R² = 0.175
Saved plot to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/sqrt_prediction_plot.png
R² mean: 0.012

transformation: cbrt
Fold 1: R² = -0.176
Fold 2: R² = -0.537
Fold 3: R² = 0.112
Fold 4: R² = 0.379
Fold 5: R² = 0.175
Saved plot to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/cbrt_prediction_plot.png
R² mean: -0.009

transformation: log
Fold 1: R² = -0.241
Fold 2: R² = -0.005
Fold 3: R² = 0.035
Fold 4: R² = 0.096
Fold 5: R² = 0.152
Saved plot to: /Users/gilanorup/Desktop/S

In [4]:
score = df[target]
methods = transformations

for m in methods:
    # `_params` instead of `_`, which IPython uses for the last cell output
    y_trans, name, _params = transform_score(score, method=m)

    # transform_score echoes the lowercase method string back as `name`,
    # so the untransformed case is "none" (the old check against "None" never matched)
    if name == "none":
        bins = np.arange(0, 22) - 0.5  # one bin per integer score 0..20
    else:
        bins = 30  # default for transformed

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.hist(y_trans, bins=bins, color="steelblue", edgecolor="white")
    ax.set_title(f"{name} transformation")
    ax.grid(True, linestyle="--", alpha=0.5)
    fig.tight_layout()

    # Save histograms too
    hist_path = os.path.join(save_dir, f"{name}_histogram.png".lower())
    fig.savefig(hist_path, dpi=300)
    plt.close(fig)  # release the figure; one is created per method
    print(f"Saved histogram to: {hist_path}")


Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/none_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/sqrt_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/cbrt_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/log_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/reciprocal_histogram.png
Saved histogram to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/picturenaming_transformation/boxcox_histogram.png
