null values
- how many missing (NaN) values and how many zeros are there per feature?


In [30]:
# setup
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# add the project's source directory to the import path so that the
# `config.constants` import below can be resolved
# NOTE(review): this is a machine-specific absolute path, so the notebook only
# runs unchanged on this machine — consider a relative path or an environment
# variable instead.
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

# GIT_DIRECTORY is the base directory that data/result paths are joined onto below
from config.constants import GIT_DIRECTORY

# set parameters
task_name = "cookieTheft"   # selects results/features/<task_name>.csv
id_column = "Subject_ID"    # key shared by the feature and target tables

# load features and targets
features = pd.read_csv(os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv"))
targets = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))

# join on the configured id column (instead of repeating the literal) so the
# merge key and the feature filter below cannot drift apart.
# pd.merge defaults to an inner join: only subjects present in both files are kept.
df = pd.merge(features, targets, on=id_column)

target_cols = ["SemanticFluencyScore", "PhonemicFluencyScore", "PictureNamingScore"]

# every merged column that is neither the id nor a target is treated as a feature
# NOTE(review): any additional columns in the targets file would also end up here — confirm
non_feature_cols = {id_column, *target_cols}
feature_cols = [col for col in df.columns if col not in non_feature_cols]


In [31]:
# per-feature tally of NaN entries and of entries equal to 0,
# both as absolute counts and as a percentage of all rows in df

total_rows = len(df)
feature_values = df[feature_cols]

missing_counts = feature_values.isna().sum()
zero_counts = (feature_values == 0).sum()

# one row per feature; column order: missing_count, missing_percent, zero_count, zero_percent
missing_data = missing_counts.to_frame(name="missing_count").assign(
    missing_percent=missing_counts / total_rows * 100,
    zero_count=zero_counts,
    zero_percent=zero_counts / total_rows * 100,
)

In [32]:
# flag features whose share of missing values or of zeros exceeds the threshold

# quick look at the features with the highest share of missing values
print(missing_data.sort_values(by="missing_percent", ascending=False).head())

threshold = 20  # percent of rows; a feature is flagged when strictly above this
too_many_missing = missing_data["missing_percent"] > threshold
too_many_zeros = missing_data["zero_percent"] > threshold
bad_features = missing_data[too_many_missing | too_many_zeros]

# header is built from `threshold` so the message stays correct if the cutoff changes
print(f"features with >{threshold}% missing or zero:")
print(bad_features)

             missing_count  missing_percent  zero_count  zero_percent
fam_verbs              246        24.723618           0           0.0
img_verbs              246        24.723618           0           0.0
fam_nouns              194        19.497487           0           0.0
img_nouns              194        19.497487           0           0.0
img_content             35         3.517588           0           0.0
features with >20% missing or zero:
                      missing_count  missing_percent  zero_count  zero_percent
empty_word_ratio                  1         0.100503         740     74.371859
adjacent_repetitions              1         0.100503         512     51.457286
fam_verbs                       246        24.723618           0      0.000000
img_verbs                       246        24.723618           0      0.000000
um_ratio                          1         0.100503         220     22.110553
uh_ratio                          1         0.100503         400     40.201005

In [33]:
# save count for missing values and zero values (features only)
# the counts are recomputed from df, so this cell only needs df and feature_cols
missing_counts = df[feature_cols].isna().sum()
zero_counts = (df[feature_cols] == 0).sum()
total_rows = len(df)

# one row per feature, worst offenders first (by missing share, then by zero share)
feature_missing_data = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_percent": (missing_counts / total_rows) * 100,
    "zero_count": zero_counts,
    "zero_percent": (zero_counts / total_rows) * 100,
}).sort_values(by=["missing_percent", "zero_percent"], ascending=False)

save_path = os.path.join(GIT_DIRECTORY, f"results/data_preparation/missing_values/{task_name}_missing_data.csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing
os.makedirs(os.path.dirname(save_path), exist_ok=True)
feature_missing_data.to_csv(save_path)