In [1]:
# check how many missing values there are

import pandas as pd

# Load the demographics table that every check below reads from.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs on a machine where this exact file exists.
demographics_csv = "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/data/demographics_data.csv"
df = pd.read_csv(demographics_csv)

In [4]:
# demographics: age
column_to_check = "Age"
invalid_entries = [" "]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")


Total invalid entries in column 'Age': 0
Row indices: []


In [5]:
# demographics: gender
column_to_check = "Gender"
invalid_entries = [" ", "other", "no_answer"]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")


Total invalid entries in column 'Gender': 7
Row indices: [70, 324, 492, 620, 772, 779, 964]


In [6]:
# demographics: education
column_to_check = "Education"
invalid_entries = [" ", "other", "no_answer"]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")


Total invalid entries in column 'Education': 4
Row indices: [72, 314, 428, 492]


In [7]:
# demographics: language
column_to_check = "Language"
invalid_entries = [" ", "english_other", "no_answer"]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")


Total invalid entries in column 'Language': 13
Row indices: [49, 53, 55, 172, 247, 429, 662, 825, 909, 947, 948, 950, 953]


In [8]:
# demographics: country
column_to_check = "Country"
invalid_entries = [" ", "other", "no_answer"]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")


Total invalid entries in column 'Country': 1
Row indices: [72]


In [2]:
# demographics: SES
column_to_check = "Socioeconomic"
invalid_entries = [" ", "no_answer"]

# A cell is invalid when it is a real NaN, or when its string form equals one
# of the placeholder values (the str cast keeps the comparison safe for
# columns with mixed types).
column_values = df[column_to_check]
mask = column_values.isna() | column_values.astype(str).isin(invalid_entries)

# report the number of flagged rows and their index labels
print(f"Total invalid entries in column '{column_to_check}': {mask.sum()}")
print(f"Row indices: {df[mask].index.tolist()}")

Total invalid entries in column 'Socioeconomic': 5
Row indices: [155, 779, 820, 821, 984]


In [3]:
# total affected subjects
#
# Collect every Subject_ID that has a NaN or a placeholder answer in at least
# one of the demographic columns.

# Placeholder values are column-specific: these are the same lists the
# per-column checks in this notebook use. A single shared list would also flag,
# for example, "other" in Language or Socioeconomic, where the per-column
# checks do not treat it as invalid.
invalid_entries_by_column = {
    "Age": [" "],
    "Gender": [" ", "other", "no_answer"],
    "Education": [" ", "other", "no_answer"],
    "Language": [" ", "english_other", "no_answer"],
    "Country": [" ", "other", "no_answer"],
    "Socioeconomic": [" ", "no_answer"],
}
columns_to_check = list(invalid_entries_by_column)

# convert specified columns to string in case of mixed types
col_data = df[columns_to_check].astype(str)

# create masks for invalid values and NaNs; passing a dict to DataFrame.isin
# compares each column only against the values listed under its own key
invalid_mask = col_data.isin(invalid_entries_by_column)
nan_mask = df[columns_to_check].isna()
combined_mask = invalid_mask | nan_mask

# find rows where any of the specified columns have invalid entries
rows_with_invalid = combined_mask.any(axis=1)

# get affected rows and unique Subject_IDs (order of first appearance)
affected_rows = df[rows_with_invalid]
affected_subjects = affected_rows["Subject_ID"].unique().tolist()

# output
print(f"Total affected subjects: {len(affected_subjects)}")
print(f"Affected Subject_IDs: {affected_subjects}")

Total affected subjects: 27
Affected Subject_IDs: [138, 142, 145, 169, 172, 278, 303, 400, 488, 498, 631, 632, 707, 867, 922, 1059, 1069, 1125, 1128, 1133, 1244, 1297, 1298, 1302, 1308, 1319, 1349]


In [4]:
### check language scores

# load data
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# exact file exists.
df_scores = pd.read_csv("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/data/language_scores_all_subjects.csv")
id_column = "Subject_ID"
score_cols = ["SemanticFluencyScore", "PhonemicFluencyScore", "PictureNamingScore"]

total_n = len(df_scores)

rows = []                    # one summary record per score column
missing_subjects_long = []   # one {"score", id} record per missing value
zero_subjects_long = []      # one {"score", id} record per zero value

for col in score_cols:
    # coerce to numeric so entries that cannot be parsed become NaN and are
    # counted as missing
    numeric_scores = pd.to_numeric(df_scores[col], errors="coerce")
    miss_mask = numeric_scores.isna()
    zero_mask = numeric_scores.eq(0)

    rows.append({
        "score": col,
        "n_total": total_n,
        "n_missing": miss_mask.sum(),
        "pct_missing": miss_mask.mean() * 100,
        "n_zero": zero_mask.sum(),
        "pct_zero": zero_mask.mean() * 100,
    })

    # append the affected subject ids to the matching long-form list
    for flags, long_list in ((miss_mask, missing_subjects_long), (zero_mask, zero_subjects_long)):
        for sid in df_scores.loc[flags, id_column].tolist():
            long_list.append({"score": col, id_column: sid})

# worst columns first: most missing, then most zeros, ties broken by name
summary = pd.DataFrame(rows).sort_values(
    by=["pct_missing", "pct_zero", "score"],
    ascending=[False, False, True],
)

print("\nPer-score missing/zero summary:")
print(summary)

# per-score listing of affected subjects, first for missing values, then zeros
for header, long_rows in (
    ("\nSubjects with missing per score:", missing_subjects_long),
    ("\nSubjects with zero per score:", zero_subjects_long),
):
    print(header)
    for col in score_cols:
        ids = [entry[id_column] for entry in long_rows if entry["score"] == col]
        print(f"  {col}: {len(ids)} -> {ids}")

# de-duplicated, sorted ids across all score columns
affected_any_missing = sorted({entry[id_column] for entry in missing_subjects_long})
affected_any_zero = sorted({entry[id_column] for entry in zero_subjects_long})
print("\nTotal subjects with any missing:", len(affected_any_missing), "->", affected_any_missing)
print("Total subjects with any zero:", len(affected_any_zero), "->", affected_any_zero)



Per-score missing/zero summary:
                  score  n_total  n_missing  pct_missing  n_zero  pct_zero
2    PictureNamingScore     1003         13     1.296112       0  0.000000
0  SemanticFluencyScore     1003          1     0.099701       2  0.199402
1  PhonemicFluencyScore     1003          1     0.099701       0  0.000000

Subjects with missing per score:
  SemanticFluencyScore: 1 -> [99]
  PhonemicFluencyScore: 1 -> [43]
  PictureNamingScore: 13 -> [41, 43, 44, 46, 49, 50, 54, 56, 59, 61, 253, 303, 1079]

Subjects with zero per score:
  SemanticFluencyScore: 2 -> [389, 1097]
  PhonemicFluencyScore: 0 -> []
  PictureNamingScore: 0 -> []

Total subjects with any missing: 14 -> [41, 43, 44, 46, 49, 50, 54, 56, 59, 61, 99, 253, 303, 1079]
Total subjects with any zero: 2 -> [389, 1097]
