Check how many subjects are missing from the stratified splits (compared with the original sample size) and how many subjects have missing values on the language scores.


In [2]:
import os
import pandas as pd

from config.constants import GIT_DIRECTORY, SCORES

# Input locations: all three tables live under <GIT_DIRECTORY>/data
data_dir = os.path.join(GIT_DIRECTORY, "data")
demographics_path = os.path.join(data_dir, "demographics_data.csv")
folds_path = os.path.join(data_dir, "stratified_folds.csv")
scores_path = os.path.join(data_dir, "language_scores_all_subjects.csv")

# Load each CSV into its own DataFrame (same order as the paths above)
demographics, folds, scores = (
    pd.read_csv(csv_path)
    for csv_path in (demographics_path, folds_path, scores_path)
)

In [4]:
# Sample-size bookkeeping: how many subjects were dropped before the
# stratified split, and how the remaining rows are spread across the folds.
# Works on the `demographics` and `folds` frames loaded in the first cell
# instead of reading the same CSV files from disk a second time.

# 1) full N: unique subjects in the demographics table
full_N = demographics["Subject_ID"].nunique()

# 2) new N: unique subjects in the stratified-folds table
df_folds = folds  # same table the first cell loaded; no need to re-read it
new_N = df_folds["Subject_ID"].nunique()

# 3) calculate difference (removed before splitting)
removed = full_N - new_N

print(f"Full N (demographics): {full_N}")
print(f"Kept N (stratified_folds.csv): {new_N}")
print(f"Removed before splitting: {removed}")

# 4) table with N for each fold (1-5)
# value_counts() counts rows per fold, which equals subjects per fold only if
# every Subject_ID appears once in the folds table.
# reindex() fixes the row order to 1..5 and keeps a fold in the table with
# N=0 even when no row carries that label, so no extra sort is needed.
# rename_axis() + reset_index(name=...) name both output columns explicitly
# rather than relying on the default "index" column name.
fold_table = (
    df_folds["fold"]
    .value_counts()
    .reindex([1, 2, 3, 4, 5], fill_value=0)
    .rename_axis("fold")
    .reset_index(name="N")
)

print("\nPer-fold N:")
print(fold_table.to_string(index=False))


Full N (demographics): 1003
Kept N (stratified_folds.csv): 988
Removed before splitting: 15

Per-fold N:
 fold   N
    1 198
    2 198
    3 198
    4 197
    5 197


In [None]:
# Report (a) which subjects were dropped before the stratified split and
# (b) which subjects lack a value for each language score, both over the whole
# scores file and restricted to the subjects kept in the folds.

# sets for comparisons
demo_ids = set(demographics["Subject_ID"])
kept_ids = set(folds["Subject_ID"])
removed_ids = sorted(demo_ids - kept_ids)

print("\nRemoved before splitting (Subject_ID):")
print(removed_ids if removed_ids else "None")

# missing scores per target
# Only score columns that exist in the scores file can be inspected. Name any
# configured score that is absent instead of dropping it without a trace.
configured_scores = list(SCORES)
score_cols = [c for c in configured_scores if c in scores.columns]
absent_cols = [c for c in configured_scores if c not in scores.columns]
if absent_cols:
    print(f"\nScore columns listed in SCORES but not found in the scores file: {absent_cols}")

# Cast the score-file IDs to the dtype used in the demographics table so the
# set comparisons below do not fail silently on a dtype mismatch.
# NOTE(review): the comparison is against IDs from `folds`; presumably that
# table uses the same dtype as demographics - confirm.
# This overwrites the column on the shared `scores` frame; the cast is
# idempotent, so re-running the cell is safe.
scores["Subject_ID"] = scores["Subject_ID"].astype(demographics["Subject_ID"].dtype)

all_score_ids = set(scores["Subject_ID"])

# Kept subjects with no row at all in the scores file. The per-column NaN
# checks below only see subjects that have a row, so these would otherwise go
# unreported.
kept_without_scores = sorted(kept_ids - all_score_ids)

missing_overall = {} # IDs missing the score among all subjects present in the scores file
missing_among_kept = {} # IDs missing the score restricted to the kept_id set (after demographics filtering)

for col in score_cols:
    # IDs that appear in the score file but have NaN for this score
    ids_missing = set(scores.loc[scores[col].isna(), "Subject_ID"])
    missing_overall[col] = sorted(ids_missing)
    # among kept subjects only:
    missing_among_kept[col] = sorted(ids_missing & kept_ids)

# print summaries
print("\nKept subjects with no row in the scores file:")
print(kept_without_scores if kept_without_scores else "None")

print("\nMissing scores (overall):")
for col in score_cols:
    print(f"- {col}: {len(missing_overall[col])} IDs missing")

print("\nMissing scores (among kept subjects only):")
for col in score_cols:
    print(f"- {col}: {len(missing_among_kept[col])} IDs missing")

# Show the actual ID lists
for col in score_cols:
    print(f"\nIDs missing {col} (overall):")
    print(missing_overall[col] if missing_overall[col] else "None")
    print(f"IDs missing {col} (among kept):")
    print(missing_among_kept[col] if missing_among_kept[col] else "None")


Removed before splitting (Subject_ID):
[98, 169, 172, 278, 488, 498, 631, 707, 867, 1059, 1069, 1125, 1128, 1319, 1349]

Missing scores (overall):
- PhonemicFluencyScore: 1 IDs missing
- PictureNamingScore: 13 IDs missing
- SemanticFluencyScore: 1 IDs missing

Missing scores (among kept subjects only):
- PhonemicFluencyScore: 1 IDs missing
- PictureNamingScore: 13 IDs missing
- SemanticFluencyScore: 1 IDs missing

IDs missing PhonemicFluencyScore (overall):
[43]
IDs missing PhonemicFluencyScore (among kept):
[43]

IDs missing PictureNamingScore (overall):
[41, 43, 44, 46, 49, 50, 54, 56, 59, 61, 253, 303, 1079]
IDs missing PictureNamingScore (among kept):
[41, 43, 44, 46, 49, 50, 54, 56, 59, 61, 253, 303, 1079]

IDs missing SemanticFluencyScore (overall):
[99]
IDs missing SemanticFluencyScore (among kept):
[99]
