In [6]:
# TODO: outlier removal is currently based on the IQR rule; read more about it and consider an alternative method

# setup
import pandas as pd
import os
import sys

# add the project source directory to sys.path (needed for the `config.constants` import below)
# NOTE(review): this is an absolute, machine-specific path; the cell will only
# work where this exact directory exists.
PROJECT_SRC_DIR = "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src"
if PROJECT_SRC_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(PROJECT_SRC_DIR)

from config.constants import GIT_DIRECTORY

# which task's features to load and which score column to attach
task_name = "cookieTheft"
target = "PhonemicFluencyScore"

# input locations, both relative to the repository root
features_path = os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv")
scores_path = os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv")

# read both tables from disk
features = pd.read_csv(features_path)
scores = pd.read_csv(scores_path)

# attach the target score to the features (inner join on Subject_ID),
# then discard every row that still contains a missing value in any column
target_scores = scores[["Subject_ID", target]]
df = features.merge(target_scores, on="Subject_ID").dropna()
print(f"Original merged size: {len(df)}")


Original merged size: 957


In [7]:
# function to remove outliers based on IQR (and return removed subject IDs)

def remove_iqr_outliers(df, column, subject_id="Subject_ID", verbose=True, iqr_factor=1.5):
    """Split ``df`` into inliers and outliers of ``column`` using the IQR rule.

    A value is kept when it lies inside
    ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` (bounds inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    column : str
        Name of the numeric column to screen for outliers.
    subject_id : str, default "Subject_ID"
        Name of the identifier column returned alongside the outlier values.
    verbose : bool, default True
        When true, print a one-line summary of how many rows were removed.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to build the lower/upper fences.

    Returns
    -------
    filtered : pandas.DataFrame
        Rows of ``df`` whose ``column`` value lies within the fences.
    removed : pandas.DataFrame
        The ``[subject_id, column]`` columns of the rows outside the fences.

    Notes
    -----
    Rows where ``column`` is NaN fail every comparison, so they appear in
    neither ``filtered`` nor ``removed``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - iqr_factor * iqr
    upper = q3 + iqr_factor * iqr

    filtered = df[(values >= lower) & (values <= upper)]
    removed_df = df[(values < lower) | (values > upper)]

    # an empty input frame would otherwise raise ZeroDivisionError here
    percent = 100 * len(removed_df) / len(df) if len(df) else 0.0
    if verbose:
        print(f"{column}: Removed {len(removed_df)} rows ({percent:.2f}%) | Range kept: {lower:.2f} to {upper:.2f}")
    return filtered, removed_df[[subject_id, column]]


In [9]:
# clean all three scores and collect removed subject IDs
targets = ["PhonemicFluencyScore", "SemanticFluencyScore", "PictureNamingScore"]
removed_ids_all = []

# NOTE: the "Before Cleaning" plots are not drawn here. plot_score_distribution is
# only defined in the plotting cell further down, so calling it at this point raises
# NameError on a fresh kernel (Restart & Run All). That later cell writes the same
# "<score>_before_cleaning.png" files.

# remove outliers: only the removed Subject_IDs are kept per score; the actual
# filtering of `scores` happens once all three ID lists have been combined
for score in targets:
    _, removed = remove_iqr_outliers(scores, score)
    removed_ids_all.append(removed["Subject_ID"])

PhonemicFluencyScore: Removed 7 rows (0.70%) | Range kept: 3.00 to 27.00
SemanticFluencyScore: Removed 9 rows (0.90%) | Range kept: 5.00 to 37.00
PictureNamingScore: Removed 80 rows (7.98%) | Range kept: 14.00 to 22.00


In [10]:
# combine all removed subject IDs (a subject flagged for several scores is counted once)
all_removed_ids = pd.concat(removed_ids_all).drop_duplicates()
print(f"Total unique subjects removed: {len(all_removed_ids)}")

# filter scores → keep only non-outlier subjects
# (a subject that is an outlier on any one of the scores is dropped entirely)
scores_cleaned = scores[~scores["Subject_ID"].isin(all_removed_ids)]

# NOTE: the "After Cleaning" plots are not drawn here. plot_score_distribution is
# only defined in the plotting cell further down, so calling it at this point raises
# NameError on a fresh kernel (Restart & Run All). That later cell writes the same
# "<score>_after_cleaning.png" files.

Total unique subjects removed: 94


In [11]:
# save final cleaned scores
combined_path = os.path.join(GIT_DIRECTORY, "resources/combined_language_scores_cleaned.csv")
# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(combined_path), exist_ok=True)
scores_cleaned.to_csv(combined_path, index=False)
print(f"Cleaned combined scores saved to:\n{combined_path}")

# save outlier Subject_IDs (one column, one row per removed subject)
outlier_log_path = os.path.join(GIT_DIRECTORY, "results/preprocessing/removed_subject_ids_combined.csv")
# same guard as above: on a fresh checkout results/preprocessing may not exist yet
os.makedirs(os.path.dirname(outlier_log_path), exist_ok=True)
all_removed_ids.to_frame(name="Subject_ID").to_csv(outlier_log_path, index=False)
print(f"Removed subject IDs saved to:\n{outlier_log_path}")

Cleaned combined scores saved to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/resources/combined_language_scores_cleaned.csv
Removed subject IDs saved to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/preprocessing/removed_subject_ids_combined.csv


In [12]:
# new score distribution plots
import matplotlib.pyplot as plt
# global matplotlib setting: request the Arial font family for all figures created after this line
plt.rcParams["font.family"] = "Arial"

def plot_score_distribution(data, column, title_suffix, filename_suffix=None):
    """Save a bar chart of how often each value of ``column`` occurs in ``data``.

    The figure is written to ``<GIT_DIRECTORY>/results/plots/<column>_<filename_suffix>.png``
    (the folder is created if needed) and then closed; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``column``.
    column : str
        Score column whose value counts are plotted.
    title_suffix : str
        Text shown in parentheses in the plot title, e.g. "Before Cleaning".
    filename_suffix : str, optional
        Suffix of the output file name. When omitted or empty it is derived
        from ``title_suffix`` (lower-cased, spaces replaced by underscores).
        A leading ``"<column>_"`` is stripped so that callers passing either
        ``"before_cleaning"`` or ``"<column>_before_cleaning"`` get the same file.
    """
    score_counts = data[column].value_counts().sort_index()

    # resolve the output file name; previously the argument was silently overwritten
    if not filename_suffix:
        filename_suffix = title_suffix.lower().replace(" ", "_")
    column_prefix = f"{column}_"
    if filename_suffix.startswith(column_prefix):
        # avoid "<column>_<column>_..." when the caller already included the column name
        filename_suffix = filename_suffix[len(column_prefix):]
    filename = f"{column}_{filename_suffix}.png"

    # save to plots folder
    output_plot_path = os.path.join(GIT_DIRECTORY, "results/plots", filename)
    os.makedirs(os.path.dirname(output_plot_path), exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.bar(score_counts.index, score_counts.values, color="slateblue", edgecolor="black", width=1.0)
        ax.set_xlabel("Score", fontsize=14, fontweight="bold")
        ax.set_ylabel("Number of People", fontsize=14, fontweight="bold")
        ax.set_title(f"{column} Score Distribution ({title_suffix})", fontsize=16, fontweight="bold")
        ax.grid(axis="y", linestyle="--", alpha=0.7)
        fig.savefig(output_plot_path, dpi=300, bbox_inches="tight")
    finally:
        # always release the figure, even if drawing or saving fails
        plt.close(fig)

# for every score column, draw the distribution before and after outlier removal
plot_variants = (
    (scores, "Before Cleaning", "before_cleaning"),
    (scores_cleaned, "After Cleaning", "after_cleaning"),
)
for score in targets:
    for frame, title, suffix in plot_variants:
        plot_score_distribution(frame, score, title, suffix)

