In [1]:
import pandas as pd

# read the stratified-folds table
folds_csv = "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/data/stratified_folds.csv"
df = pd.read_csv(folds_csv)

# long format: one row per age_group / edu_group combination that occurs in the
# data, with the number of rows in that combination stored in column "n"
group_cols = ["age_group", "edu_group"]
group_sizes = df.groupby(group_cols).size()
counts = group_sizes.reset_index(name="n")
print(counts)

# wide format: age_group as rows, edu_group as columns;
# margins=True appends the "All" row and column holding the totals
xtab = pd.crosstab(index=df["age_group"], columns=df["edu_group"], margins=True)
print(xtab)

# marginal totals of each grouping variable, ordered by group label
by_age = df["age_group"].value_counts().sort_index()
by_edu = df["edu_group"].value_counts().sort_index()
for totals in (by_age, by_edu):
    print(totals)


  age_group edu_group    n
0    65to75      high  279
1    65to75       low   13
2    65to75    medium  166
3    over75      high   27
4    over75    medium   11
5   under65      high  288
6   under65       low   11
7   under65    medium  193
edu_group  high  low  medium  All
age_group                        
65to75      279   13     166  458
over75       27    0      11   38
under65     288   11     193  492
All         594   24     370  988
age_group
65to75     458
over75      38
under65    492
Name: count, dtype: int64
edu_group
high      594
low        24
medium    370
Name: count, dtype: int64


In [2]:
# Collapse age and education into two bins each and store them as new columns on df.
# .replace() leaves any value that is not a key of the mapping unchanged.
# NOTE(review): "65to75" is relabelled ">65" here, while the export cell further
# down labels the same merged group "≥65" - confirm which label is intended.
age_simple_labels = {"under65": "<65", "65to75": ">65", "over75": ">65"}
edu_simple_labels = {"low": "low+medium", "medium": "low+medium", "high": "high"}
df["age_simple"] = df["age_group"].replace(age_simple_labels)
df["edu_simple"] = df["edu_group"].replace(edu_simple_labels)

# 2 × 2 table of the simplified groups, with "All" totals from margins=True
xtab = pd.crosstab(index=df["age_simple"], columns=df["edu_simple"], margins=True)
print(xtab)


edu_simple  high  low+medium  All
age_simple                       
<65          288         204  492
>65          306         190  496
All          594         394  988


In [6]:
# Build classification_groups.csv: one row per subject with the binary age and
# education bins plus the language scores, joined on Subject_ID.

# paths
# NOTE: absolute, machine-specific location - change data_dir to run elsewhere
data_dir = "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/data"
stratified_folds_path = f"{data_dir}/stratified_folds.csv"
scores_path = f"{data_dir}/language_scores_all_subjects.csv"
# despite its name, out_dir is the path of the output CSV *file*; the name is
# kept so that any other cell referring to it keeps working
out_dir = f"{data_dir}/classification_groups.csv"

# load
demographics = pd.read_csv(stratified_folds_path)
scores = pd.read_csv(scores_path)

# add the simplified education- and age-groups to a new csv-file with the scores
# (a group label without an entry in a map keeps its original value via fillna)
age_map = {"under65": "<65", "65to75": "≥65", "over75": "≥65"}
demographics["age_bin"] = demographics["age_group"].map(age_map).fillna(demographics["age_group"])

edu_map = {"low": "low+medium", "medium": "low+medium", "high": "high"}
demographics["education_bin"] = demographics["edu_group"].map(edu_map).fillna(demographics["edu_group"])

cols_scores = [
    "Subject_ID",
    "PictureNamingScore",
    "SemanticFluencyScore",
    "PhonemicFluencyScore",
]

# the merge key is mandatory: fail early with a clear message instead of an
# opaque KeyError raised from inside merge()
if "Subject_ID" not in scores.columns:
    raise KeyError(f"'Subject_ID' column not found in {scores_path}")

# the score columns stay best-effort, but say so instead of dropping them silently
missing_score_cols = [c for c in cols_scores if c not in scores.columns]
if missing_score_cols:
    print(f"warning: columns missing from {scores_path}, left out of the export: {missing_score_cols}")
need_in_scores = [c for c in cols_scores if c in scores.columns]

merged = (
    demographics[["Subject_ID", "age_bin", "education_bin"]]
    .merge(scores[need_in_scores], on="Subject_ID", how="inner")
)

# the inner join drops every subject that has no row in the scores file;
# report it so the exported sample size is never reduced unnoticed
n_unmatched = int((~demographics["Subject_ID"].isin(scores["Subject_ID"])).sum())
if n_unmatched:
    print(f"warning: {n_unmatched} subject(s) without a row in the scores file were dropped")

# quick counts to check
print("\ncounts by age_bin × education_bin")
print(pd.crosstab(merged["age_bin"], merged["education_bin"], margins=True))

# save
merged.to_csv(out_dir, index=False)



counts by age_bin × education_bin
education_bin  high  low+medium  All
age_bin                             
<65             288         204  492
≥65             306         190  496
All             594         394  988
