In [16]:
# === Learning curves (white bg, bigger labels, A/B/C panel) ===
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# --- uniform white figure style (no grey backgrounds) ---
# Backgrounds: figure canvas, axes area, and saved files are all white.
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["savefig.facecolor"] = "white"
# Axes frame: black edges, with the top and right spines hidden.
plt.rcParams["axes.edgecolor"] = "black"
# Grid: light grey, slightly transparent.
plt.rcParams["grid.color"] = "#E5E5E5"
plt.rcParams["grid.alpha"] = 0.8
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

# add project root
# NOTE(review): machine-specific absolute path; presumably needed so that the
# `config` package imported below can be found — confirm before running on
# another machine.
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
# GIT_DIRECTORY is used below as the base directory for the scores CSV and the
# plot output folder.
from config.constants import GIT_DIRECTORY

# ---------------- inputs ----------------
# task_name only appears in the output file names written further down.
task_name = "cookieTheft"
# Each entry is a column name looked up in the scores table; one learning
# curve is computed per entry.
targets = [
    "PictureNamingScore",
    "SemanticFluencyScore",
    "PhonemicFluencyScore",
]  # panel will be A,B,C in this order

# paths
# NOTE(review): features_path is a machine-specific absolute path outside
# GIT_DIRECTORY — confirm it exists before running elsewhere.
features_path = "/Users/gilanorup/Desktop/Studium/MSc/MA/x_old_files/old_results/features/cookieTheft_filtered1.csv"
scores_path   = os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv")

# output folder (created if missing; an existing folder is not an error)
output_dir = os.path.join(GIT_DIRECTORY, "results", "plots", "learning_curves")
os.makedirs(output_dir, exist_ok=True)

# ---------------- data ----------------
# Both tables are later joined on their shared "Subject_ID" column.
features = pd.read_csv(features_path)
scores   = pd.read_csv(scores_path)

# helper to compute one learning curve (scaling is fitted on training rows only)
def compute_learning_curve(X, y, test_size=0.2, random_state=42, n_points=10):
    """Compute test R² of a linear regression for growing training subsets.

    The data is split once into a train/validation part and a fixed held-out
    test part. For each fraction in ``np.linspace(0.1, 1.0, n_points)`` a
    random subset of the train/validation part is drawn, features are
    standardized, a ``LinearRegression`` is fitted, and R² is measured on the
    held-out test part.

    Parameters
    ----------
    X : feature table (rows are samples, columns are features).
    y : target values aligned with the rows of ``X``.
    test_size : fraction of the data held out as the fixed test set.
    random_state : seed used for the test split and for every subsampling.
    n_points : number of training-set sizes to evaluate.

    Returns
    -------
    (sizes_pct, r2_scores)
        ``sizes_pct`` is an integer array with the training sizes in percent
        of the train/validation part (not of the full dataset);
        ``r2_scores`` is a list with the matching test R² values.
    """
    # Split BEFORE scaling: fitting the scaler on all rows would let the mean
    # and variance of the held-out test rows leak into the training features.
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    train_sizes = np.linspace(0.1, 1.0, n_points)
    r2_scores = []

    for frac in train_sizes:
        if frac >= 1.0:
            # train_test_split needs a non-empty remainder, so the full
            # train/validation part is used directly.
            X_sub, y_sub = X_trainval, y_trainval
        else:
            X_sub, _, y_sub, _ = train_test_split(
                X_trainval, y_trainval, train_size=frac, random_state=random_state
            )

        # Standardize with statistics from the current training subset only,
        # then apply the same transformation to the test rows.
        scaler = StandardScaler().fit(X_sub)
        model = LinearRegression()
        model.fit(scaler.transform(X_sub), y_sub)
        y_pred = model.predict(scaler.transform(X_test))
        r2_scores.append(r2_score(y_test, y_pred))

    # Round before casting: plain astype(int) truncates, so a float such as
    # 28.999999999999996 would be reported as 28 instead of 29.
    return np.rint(train_sizes * 100).astype(int), r2_scores

# collect per-target results for the panel
# Maps target column name -> (training sizes in percent, test R² values).
panel_results = {}

print("\nRunning learning curves...\n")
for t in targets:
    # Join features with this target's score on Subject_ID, then drop every
    # row that has a missing value in any column.
    df = pd.merge(features, scores[["Subject_ID", t]], on="Subject_ID").dropna()
    # All remaining columns except the ID and the target are used as predictors.
    X = df.drop(columns=["Subject_ID", t])
    y = df[t]

    train_sizes_pct, r2_scores = compute_learning_curve(X, y, test_size=0.2, random_state=42, n_points=10)
    panel_results[t] = (train_sizes_pct, r2_scores)

    # save per-score CSV
    per_csv = os.path.join(output_dir, f"learning_curve_{task_name}_{t}.csv")
    pd.DataFrame({"TrainingSize%": train_sizes_pct, "Test_R2": r2_scores}).to_csv(per_csv, index=False)

    # save per-score plot (single)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(train_sizes_pct, r2_scores, marker="o", linestyle="-")
    ax.set_xlabel("Training Set Size (%)", fontsize=14)
    ax.set_ylabel("Test $R^2$", fontsize=14)
    ax.grid(True)
    ax.tick_params(axis="both", labelsize=12)
    fig.tight_layout()
    per_png = os.path.join(output_dir, f"learning_curve_plot_{task_name}_{t}.png")
    fig.savefig(per_png, dpi=300)
    # Close the figure so figures do not accumulate across loop iterations.
    plt.close(fig)

    print(f"{t}: saved CSV → {per_csv}")
    print(f"{t}: saved PNG → {per_png}")

# ---------------- vertical panel (one stacked row per target: A, B, C, ...) ----------------
# The row count and the panel letters follow `targets`, so editing that list
# can neither leave an empty axis nor silently drop a curve in the zip below.
n_panels = len(targets)
letters = [chr(ord("A") + i) for i in range(n_panels)]
# Display names per target; currently not drawn on the panels.
titles  = {
    "PictureNamingScore": "Picture Naming",
    "SemanticFluencyScore": "Semantic Fluency",
    "PhonemicFluencyScore": "Phonemic Fluency",
}

# one row per target, one column
# squeeze=False always yields a 2-D array of axes (even for a single target);
# it is flattened so the loop can iterate over it directly. The height scales
# with the number of rows and equals 14 inches for three panels.
fig, axes = plt.subplots(
    n_panels, 1, figsize=(8, 14 * n_panels / 3),
    sharex=False, sharey=False, squeeze=False,
)
axes = axes.ravel()

for ax, letter, t in zip(axes, letters, targets):
    train_sizes_pct, r2_scores = panel_results[t]
    ax.plot(train_sizes_pct, r2_scores, marker="o", linestyle="-", color="steelblue")
    ax.set_xlabel("Training Set Size (%)", fontsize=16)
    ax.set_ylabel("Test R²", fontsize=16)
    ax.grid(True)
    ax.tick_params(axis="both", labelsize=14)
    # Dashed reference line at x = 80.
    # NOTE(review): the meaning of the 80 % mark is not evident from this script — confirm.
    ax.axvline(x=80, color="grey", linestyle="--", linewidth=1, alpha=0.7)
    # letter label (optional), placed just outside the top-left corner of the axes
    ax.text(-0.10, 1.05, letter, transform=ax.transAxes,
            fontsize=18, fontweight="bold", va="top", ha="left")

fig.tight_layout()
panel_png = os.path.join(output_dir, f"learning_curves_panel_vertical_{task_name}.png")
fig.savefig(panel_png, dpi=300, bbox_inches="tight")
plt.close(fig)

print(f"\nSaved vertical panel plot to:\n{panel_png}\n")




Running learning curves...

PictureNamingScore: saved CSV → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/learning_curves/learning_curve_cookieTheft_PictureNamingScore.csv
PictureNamingScore: saved PNG → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/learning_curves/learning_curve_plot_cookieTheft_PictureNamingScore.png
SemanticFluencyScore: saved CSV → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/learning_curves/learning_curve_cookieTheft_SemanticFluencyScore.csv
SemanticFluencyScore: saved PNG → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/learning_curves/learning_curve_plot_cookieTheft_SemanticFluencyScore.png
PhonemicFluencyScore: saved CSV → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/plots/learning_curves/learning_curve_cookieTheft_PhonemicFluencyScore.csv
PhonemicFluencyScore: saved PNG → /Users/gilanorup/Desktop/Studium/MSc/MA/