null values
- How many missing (NaN) values and zero values are there per feature?


In [8]:
# setup
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY

# set parameters
# task_name selects the feature file to load and is reused in output file names
task_name = "cookieTheft"
# id_column is the subject identifier: join key below, and excluded from the features
id_column = "Subject_ID"

# load features and targets
features = pd.read_csv(os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv"))
targets = pd.read_csv(os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv"))

# Join on the configured id column (not a repeated literal) so the merge key and
# the column excluded from feature_cols can never drift apart.
# pd.merge defaults to an inner join: only ids present in both tables are kept.
df = pd.merge(features, targets, on=id_column)

target_cols = ["SemanticFluencyScore", "PhonemicFluencyScore", "PictureNamingScore"]
# every column that is neither the id nor a target is treated as a feature
feature_cols = [col for col in df.columns if col not in [id_column] + target_cols]


In [9]:
# Per-feature tally of NaN entries and exact-zero entries,
# reported both as raw counts and as a percentage of all rows in df.

total_rows = df.shape[0]
missing_counts = df[feature_cols].isna().sum()
zero_counts = df[feature_cols].eq(0).sum()

# one row per feature; percentages are relative to total_rows
missing_data = pd.DataFrame(
    {
        "missing_count": missing_counts,
        "missing_percent": missing_counts.div(total_rows).mul(100),
        "zero_count": zero_counts,
        "zero_percent": zero_counts.div(total_rows).mul(100),
    }
)

In [12]:
# flag features whose share of missing values or of zeros exceeds `threshold` percent

# quick look: the five features with the highest share of missing values
print(missing_data.sort_values(by="missing_percent", ascending=False).head())

threshold = 20  # percent of rows
bad_features = missing_data[
    (missing_data["missing_percent"] > threshold)
    | (missing_data["zero_percent"] > threshold)
]
# header is derived from threshold so the message stays correct when it is tuned
print(f"features with >{threshold}% missing or zero:")
print(bad_features)

           missing_count  missing_percent  zero_count  zero_percent
img_verbs            309        30.838323           0           0.0
fam_verbs            309        30.838323           0           0.0
mattr_50              41         4.091816           0           0.0
mattr_40              23         2.295409           0           0.0
img_nouns             21         2.095808           0           0.0
features with >20% missing or zero:
                      missing_count  missing_percent  zero_count  zero_percent
empty_word_ratio                  3         0.299401         936     93.413174
adjacent_repetitions              3         0.299401         652     65.069860
fam_verbs                       309        30.838323           0      0.000000
img_verbs                       309        30.838323           0      0.000000
um_ratio                          3         0.299401         374     37.325349
uh_ratio                          3         0.299401         487     48.602794
er_

In [11]:
# save count for missing values and zero values for all features and targets
# (same tally as the per-feature table, but over every column of df)
missing_counts = df.isna().sum()
zero_counts = (df == 0).sum()
total_rows = len(df)

all_missing_data = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_percent": (missing_counts / total_rows) * 100,
    "zero_count": zero_counts,
    "zero_percent": (zero_counts / total_rows) * 100,
})
# worst columns first: by missing share, ties broken by zero share
all_missing_data_sorted = all_missing_data.sort_values(by=["missing_percent", "zero_percent"], ascending=False)

save_path = os.path.join(GIT_DIRECTORY, f"results/data_preparation/missing_values/{task_name}_missing_data.csv")
# to_csv does not create parent directories; make sure the target folder exists
# so the cell also works on a fresh checkout
os.makedirs(os.path.dirname(save_path), exist_ok=True)
all_missing_data_sorted.to_csv(save_path)