### **Computing the demographics of the study population**

This notebook will mostly pull all demographic information for each follow-up and produce
a neat demographic table.

In [10]:
# Imports
import os

import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np
import pandas as pd
import seaborn as sns

In [8]:
# Setting up data and output paths.
repository_dir = "~/code/Gagnon_BrainAge_2025/"
data_abcd = "/Volumes/T7/BrainProfilesAssociations/ABCD/"
output_folder = "/Volumes/T7/BrainAgeStudy/"

# Create output folder if it doesn't exist.
output_dir = f"{output_folder}/Demographics/"
os.makedirs(output_dir, exist_ok=True)

In [9]:
# Fetch Harding font.
font_files = []
for fontpath in font_manager.findSystemFonts(fontpaths=None, fontext='ttf'):
    if "Harding".lower() in fontpath.lower():
        font_files.append(fontpath)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)

# Set Harding font.
plt.rcParams['font.family'] = 'Harding Text Web'

# Set the Set2 color palette as an iterable.
cmap = sns.color_palette("Set2", 8)

In [13]:
# Load data for baseline follow-up.
BAG_df = pd.read_csv(f"{output_folder}/BrainAge/brainAgeGaps.csv")
dx_df = pd.read_excel(f"{output_folder}/ABCD/abcd_dx_labels.xlsx")
demo_df = pd.read_csv(f"{output_folder}/ABCD/abcd_p_demo.csv")

# Format the diagnosis data, starting by dropping the "_" in subjectkey and adding "sub-".
dx_df['subjectkey'] = dx_df['subjectkey'].str.replace('_', '')
dx_df['subjectkey'] = 'sub-' + dx_df['subjectkey'].astype(str)

# Locking to baseline only.
demo_df_baseline = demo_df.loc[demo_df['eventname'] == 'baseline_year_1_arm_1']

demo_vars = [
    "src_subject_id",
    "demo_sex_v2",
    "race_ethnicity",
    "demo_prnt_ed_v2",
    "demo_prtnr_ed_v2",
    "demo_comb_income_v2"
]
demo_df_baseline = demo_df_baseline[demo_vars]
demo_df_baseline.columns = ["subjectkey", "Sex", "Ethnicity", "Parent_ed1", "Parent_ed2", "Income"]

# Compute some demographics variables.
# Highest education level (parent). Taking the highest amongst the two parents.
demo_df_baseline.loc[:, 'Parent_ed2'] = demo_df_baseline['Parent_ed2'].replace([777, 999, np.nan], 0)
demo_df_baseline.loc[:, 'high_edu'] = demo_df_baseline[['Parent_ed1', 'Parent_ed2']].values.max(1)

# Group levels together (<13 = 1, no high school, 13-14 = 2, high school, ged or equivalent,
# 15-17 = 3, some college, 18 = 4, bachelor, >19 = 5, postgraduate)
def create_edu_groups(x):
    if x < 13:
        return 1
    elif x in [13, 14]:
        return 2
    elif x in [15, 16, 17]:
        return 3
    elif x == 18:
        return 4
    elif x in [19, 20, 21]:
        return 5
    else:
        return 0

demo_df_baseline.loc[:, 'edu_groups'] = demo_df_baseline['high_edu'].apply(create_edu_groups)

# Group levels of income together ( <6 = 1, < 50 000, 6-8 = 2, 50-100 000, >9 = 3, >100 000).
def create_income_groups(x):
    if x < 6:
        return 1
    elif x in [6, 7, 8]:
        return 2
    elif x in [9, 10]:
        return 3
    else:
        return 0

demo_df_baseline.loc[:, 'income_groups'] = demo_df_baseline['Income'].apply(create_income_groups)

# Selecting the data to merge.
demo_df_baseline = demo_df_baseline.rename(columns={"subjectkey": "sid"})
demo_df_baseline["sid"] = demo_df_baseline["sid"].str.replace("_", "")
demo_df_baseline["sid"] = "sub-" + demo_df_baseline["sid"]
demo_df_baseline = demo_df_baseline[["sid", "income_groups", "edu_groups", "Ethnicity"]]
diagnosis_df = dx_df.rename(columns={"subjectkey": "sid"})

# Merge dataframes.
merged_df = BAG_df.merge(diagnosis_df, on="sid", how="left")
merged_df = merged_df.merge(demo_df_baseline, on="sid", how="left")

  demo_df = pd.read_csv(f"{output_folder}/ABCD/abcd_p_demo.csv")


In [22]:
# Split into male/female.
male_df = merged_df[merged_df['sex'] == 1]
female_df = merged_df[merged_df['sex'] == 2]

# Ethnicity groups.
abcd_ethn_m = male_df['Ethnicity'].value_counts()
abcd_ethn_f = female_df['Ethnicity'].value_counts()

# Parental education groups.
abcd_edu_m = male_df['edu_groups'].value_counts()
abcd_edu_f = female_df['edu_groups'].value_counts()

# Income groupsù
abcd_inc_m = male_df['income_groups'].value_counts()
abcd_inc_f = female_df['income_groups'].value_counts()

In [24]:
# Fetching the mean values and appending them to a list.
male_desc = [
    [male_df.count().iloc[0], np.round(male_df.count().iloc[0] * 100 / merged_df.shape[0], 2)],
    [np.round(male_df.age.mean(), 2), np.round(male_df.age.std(), 2)],
    [abcd_ethn_m.loc[1], np.round(abcd_ethn_m.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_m.loc[2], np.round(abcd_ethn_m.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_m.loc[3], np.round(abcd_ethn_m.loc[3] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_m.loc[4], np.round(abcd_ethn_m.loc[4] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_m.loc[5], np.round(abcd_ethn_m.loc[5] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_m.loc[1], np.round(abcd_edu_m.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_m.loc[2], np.round(abcd_edu_m.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_m.loc[3], np.round(abcd_edu_m.loc[3] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_m.loc[4], np.round(abcd_edu_m.loc[4] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_m.loc[5], np.round(abcd_edu_m.loc[5] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_m.loc[1], np.round(abcd_inc_m.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_m.loc[2], np.round(abcd_inc_m.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_m.loc[3], np.round(abcd_inc_m.loc[3] * 100 / merged_df.shape[0], 2)],
    [male_df.AD.value_counts().iloc[1], np.round(male_df.AD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [male_df.ADHD.value_counts().iloc[1], np.round(male_df.ADHD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [male_df.CD.value_counts().iloc[1], np.round(male_df.CD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [male_df.DD.value_counts().iloc[1], np.round(male_df.DD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [male_df.OCD.value_counts().iloc[1], np.round(male_df.OCD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [male_df.ODD.value_counts().iloc[1], np.round(male_df.ODD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)]
]

female_desc = [
    [female_df.count().iloc[0], np.round(female_df.count().iloc[0] * 100 / merged_df.shape[0], 2)],
    [np.round(female_df.age.mean(), 2), np.round(female_df.age.std(), 2)],
    [abcd_ethn_f.loc[1], np.round(abcd_ethn_f.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_f.loc[2], np.round(abcd_ethn_f.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_f.loc[3], np.round(abcd_ethn_f.loc[3] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_f.loc[4], np.round(abcd_ethn_f.loc[4] * 100 / merged_df.shape[0], 2)],
    [abcd_ethn_f.loc[5], np.round(abcd_ethn_f.loc[5] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_f.loc[1], np.round(abcd_edu_f.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_f.loc[2], np.round(abcd_edu_f.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_f.loc[3], np.round(abcd_edu_f.loc[3] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_f.loc[4], np.round(abcd_edu_f.loc[4] * 100 / merged_df.shape[0], 2)],
    [abcd_edu_f.loc[5], np.round(abcd_edu_f.loc[5] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_f.loc[1], np.round(abcd_inc_f.loc[1] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_f.loc[2], np.round(abcd_inc_f.loc[2] * 100 / merged_df.shape[0], 2)],
    [abcd_inc_f.loc[3], np.round(abcd_inc_f.loc[3] * 100 / merged_df.shape[0], 2)],
    [female_df.AD.value_counts().iloc[1], np.round(female_df.AD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [female_df.ADHD.value_counts().iloc[1], np.round(female_df.ADHD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [female_df.CD.value_counts().iloc[1], np.round(female_df.CD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [female_df.DD.value_counts().iloc[1], np.round(female_df.DD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [female_df.OCD.value_counts().iloc[1], np.round(female_df.OCD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)],
    [female_df.ODD.value_counts().iloc[1], np.round(female_df.ODD.value_counts().iloc[1] * 100 / merged_df.shape[0], 2)]
]

In [26]:
df_2y

Unnamed: 0,subjectkey,Site,Sex,Ethnicity,Parent_ed1,Parent_ed2,Income,high_edu,ParentalEducation,IncomeGroups,...,Internalization,Externalization,Stress,EFPS,MEM,VA,Cluster #3,Cluster #4,Cluster #1,Cluster #2
0,NDAR_INV00CY2MDM,site20,1,1,15,0,6,15,3,2,...,1.020123,3.495262,1.545284,-0.167966,-0.444910,-0.632933,0.161772,0.491750,0.164940,0.181537
1,NDAR_INV00HEV6HB,site12,1,2,13,0,999,13,2,0,...,0.015344,1.786694,0.332668,-0.401209,-0.568049,-0.368084,0.089811,0.637372,0.115641,0.157176
2,NDAR_INV00LH735Y,site03,1,3,13,13,6,13,2,2,...,-0.831512,-0.279471,-0.565088,0.134622,0.715226,0.044369,0.049324,0.099186,0.722971,0.128519
3,NDAR_INV00U4FTRU,site04,2,5,12,14,6,14,2,2,...,0.437570,2.475335,1.578049,-0.451149,-0.727621,-0.591611,0.138785,0.548812,0.135747,0.176656
4,NDAR_INV00X2TBWJ,site14,2,3,18,0,8,18,4,2,...,0.075813,-0.195367,-0.727131,0.397619,0.119776,0.384524,0.013823,0.012353,0.954400,0.019424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7364,NDAR_INVZZLZCKAY,site06,2,1,15,15,9,15,3,3,...,2.219043,0.767139,1.728070,-0.877669,-0.283203,-0.191424,0.608477,0.131070,0.110258,0.150195
7365,NDAR_INVZZNX6W2P,site14,1,1,18,18,9,18,4,3,...,-0.534862,0.522330,-0.044702,-0.030656,0.117436,0.113831,0.069512,0.536851,0.231052,0.162585
7366,NDAR_INVZZPKBDAC,site12,2,1,19,18,10,19,5,3,...,-0.728954,-0.631710,-0.547606,0.342048,-0.096243,0.061500,0.058934,0.066926,0.612647,0.261493
7367,NDAR_INVZZZ2ALR6,site08,2,5,21,21,10,21,5,3,...,-0.517214,-0.653624,-0.741341,0.514459,0.319883,0.657772,0.037168,0.035440,0.875484,0.051908


In [28]:
# Now let's fetch the 2year follow-up and do the same thing.
df_2y = pd.read_excel(f"{output_folder}/fcm_2y.xlsx")

# Set male/female groups.
male_df_2y = df_2y[df_2y['Sex'] == 1]
female_df_2y = df_2y[df_2y['Sex'] == 2]

# Ethnicity groups.
abcd_ethn_m_2y = male_df_2y['Ethnicity'].value_counts()
abcd_ethn_f_2y = female_df_2y['Ethnicity'].value_counts()

# Parental education groups.
abcd_edu_m_2y = male_df_2y['ParentalEducation'].value_counts()
abcd_edu_f_2y = female_df_2y['ParentalEducation'].value_counts()

# Income groups.
abcd_inc_m_2y = male_df_2y['IncomeGroups'].value_counts()
abcd_inc_f_2y = female_df_2y['IncomeGroups'].value_counts()

In [30]:
# Fetching the mean values and appending them to a list.
male_desc_2y = [
    [male_df_2y.count().iloc[0], np.round(male_df_2y.count().iloc[0] * 100 / df_2y.shape[0], 2)],
    [np.round(male_df_2y.AgeMonths.mean(), 2), np.round(male_df_2y.AgeMonths.std(), 2)],
    [abcd_ethn_m_2y.loc[1], np.round(abcd_ethn_m_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_m_2y.loc[2], np.round(abcd_ethn_m_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_m_2y.loc[3], np.round(abcd_ethn_m_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_m_2y.loc[4], np.round(abcd_ethn_m_2y.loc[4] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_m_2y.loc[5], np.round(abcd_ethn_m_2y.loc[5] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_m_2y.loc[1], np.round(abcd_edu_m_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_m_2y.loc[2], np.round(abcd_edu_m_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_m_2y.loc[3], np.round(abcd_edu_m_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_m_2y.loc[4], np.round(abcd_edu_m_2y.loc[4] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_m_2y.loc[5], np.round(abcd_edu_m_2y.loc[5] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_m_2y.loc[1], np.round(abcd_inc_m_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_m_2y.loc[2], np.round(abcd_inc_m_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_m_2y.loc[3], np.round(abcd_inc_m_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.AD.value_counts().iloc[1], np.round(male_df_2y.AD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.ADHD.value_counts().iloc[1], np.round(male_df_2y.ADHD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.CD.value_counts().iloc[1], np.round(male_df_2y.CD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.DD.value_counts().iloc[1], np.round(male_df_2y.DD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.OCD.value_counts().iloc[1], np.round(male_df_2y.OCD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [male_df_2y.ODD.value_counts().iloc[1], np.round(male_df_2y.ODD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)]
]

female_desc_2y = [
    [female_df_2y.count().iloc[0], np.round(female_df_2y.count().iloc[0] * 100 / df_2y.shape[0], 2)],
    [np.round(female_df_2y.AgeMonths.mean(), 2), np.round(female_df_2y.AgeMonths.std(), 2)],
    [abcd_ethn_f_2y.loc[1], np.round(abcd_ethn_f_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_f_2y.loc[2], np.round(abcd_ethn_f_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_f_2y.loc[3], np.round(abcd_ethn_f_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_f_2y.loc[4], np.round(abcd_ethn_f_2y.loc[4] * 100 / df_2y.shape[0], 2)],
    [abcd_ethn_f_2y.loc[5], np.round(abcd_ethn_f_2y.loc[5] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_f_2y.loc[1], np.round(abcd_edu_f_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_f_2y.loc[2], np.round(abcd_edu_f_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_f_2y.loc[3], np.round(abcd_edu_f_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_f_2y.loc[4], np.round(abcd_edu_f_2y.loc[4] * 100 / df_2y.shape[0], 2)],
    [abcd_edu_f_2y.loc[5], np.round(abcd_edu_f_2y.loc[5] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_f_2y.loc[1], np.round(abcd_inc_f_2y.loc[1] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_f_2y.loc[2], np.round(abcd_inc_f_2y.loc[2] * 100 / df_2y.shape[0], 2)],
    [abcd_inc_f_2y.loc[3], np.round(abcd_inc_f_2y.loc[3] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.AD.value_counts().iloc[1], np.round(female_df_2y.AD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.ADHD.value_counts().iloc[1], np.round(female_df_2y.ADHD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.CD.value_counts().iloc[1], np.round(female_df_2y.CD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.DD.value_counts().iloc[1], np.round(female_df_2y.DD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.OCD.value_counts().iloc[1], np.round(female_df_2y.OCD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)],
    [female_df_2y.ODD.value_counts().iloc[1], np.round(female_df_2y.ODD.value_counts().iloc[1] * 100 / df_2y.shape[0], 2)]
]

In [31]:
# Now for the 4y follow-up.
df_4y = pd.read_excel(f"{output_folder}/fcm_4y.xlsx")

# Set into male/female groups.
male_df_4y = df_4y[df_4y['Sex'] == 1]
female_df_4y = df_4y[df_4y['Sex'] == 2]

# Ethnicity groups.
abcd_ethn_m_4y = male_df_4y['Ethnicity'].value_counts()
abcd_ethn_f_4y = female_df_4y['Ethnicity'].value_counts()

# Parental education groups.
abcd_edu_m_4y = male_df_4y['ParentalEducation'].value_counts()
abcd_edu_f_4y = female_df_4y['ParentalEducation'].value_counts()

# Income groups.
abcd_inc_m_4y = male_df_4y['IncomeGroups'].value_counts()
abcd_inc_f_4y = female_df_4y['IncomeGroups'].value_counts()

In [35]:
# Fetching the mean values and appending them to a list.
male_desc_4y = [
    [male_df_4y.count().iloc[0], np.round(male_df_4y.count().iloc[0] * 100 / df_4y.shape[0], 2)],
    [np.round(male_df_4y.AgeMonths.mean(), 2), np.round(male_df_4y.AgeMonths.std(), 2)],
    [abcd_ethn_m_4y.loc[1], np.round(abcd_ethn_m_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_m_4y.loc[2], np.round(abcd_ethn_m_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_m_4y.loc[3], np.round(abcd_ethn_m_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_m_4y.loc[4], np.round(abcd_ethn_m_4y.loc[4] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_m_4y.loc[5], np.round(abcd_ethn_m_4y.loc[5] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_m_4y.loc[1], np.round(abcd_edu_m_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_m_4y.loc[2], np.round(abcd_edu_m_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_m_4y.loc[3], np.round(abcd_edu_m_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_m_4y.loc[4], np.round(abcd_edu_m_4y.loc[4] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_m_4y.loc[5], np.round(abcd_edu_m_4y.loc[5] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_m_4y.loc[1], np.round(abcd_inc_m_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_m_4y.loc[2], np.round(abcd_inc_m_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_m_4y.loc[3], np.round(abcd_inc_m_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    "-", "-", "-", "-", "-", "-",
    #[male_df_4y.AD.value_counts().iloc[1], np.round(male_df_4y.AD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[male_df_4y.ADHD.value_counts().iloc[1], np.round(male_df_4y.ADHD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[male_df_4y.CD.value_counts().iloc[1], np.round(male_df_4y.CD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[male_df_4y.DD.value_counts().iloc[1], np.round(male_df_4y.DD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[male_df_4y.OCD.value_counts().iloc[1], np.round(male_df_4y.OCD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[male_df_4y.ODD.value_counts().iloc[1], np.round(male_df_4y.ODD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)]
]

female_desc_4y = [
    [female_df_4y.count().iloc[0], np.round(female_df_4y.count().iloc[0] * 100 / df_4y.shape[0], 2)],
    [np.round(female_df_4y.AgeMonths.mean(), 2), np.round(female_df_4y.AgeMonths.std(), 2)],
    [abcd_ethn_f_4y.loc[1], np.round(abcd_ethn_f_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_f_4y.loc[2], np.round(abcd_ethn_f_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_f_4y.loc[3], np.round(abcd_ethn_f_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_f_4y.loc[4], np.round(abcd_ethn_f_4y.loc[4] * 100 / df_4y.shape[0], 2)],
    [abcd_ethn_f_4y.loc[5], np.round(abcd_ethn_f_4y.loc[5] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_f_4y.loc[1], np.round(abcd_edu_f_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_f_4y.loc[2], np.round(abcd_edu_f_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_f_4y.loc[3], np.round(abcd_edu_f_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_f_4y.loc[4], np.round(abcd_edu_f_4y.loc[4] * 100 / df_4y.shape[0], 2)],
    [abcd_edu_f_4y.loc[5], np.round(abcd_edu_f_4y.loc[5] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_f_4y.loc[1], np.round(abcd_inc_f_4y.loc[1] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_f_4y.loc[2], np.round(abcd_inc_f_4y.loc[2] * 100 / df_4y.shape[0], 2)],
    [abcd_inc_f_4y.loc[3], np.round(abcd_inc_f_4y.loc[3] * 100 / df_4y.shape[0], 2)],
    "-", "-", "-", "-", "-", "-"
    #[female_df_4y.AD.value_counts().iloc[1], np.round(female_df_4y.AD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[female_df_4y.ADHD.value_counts().iloc[1], np.round(female_df_4y.ADHD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[female_df_4y.CD.value_counts().iloc[1], np.round(female_df_4y.CD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[female_df_4y.DD.value_counts().iloc[1], np.round(female_df_4y.DD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[female_df_4y.OCD.value_counts().iloc[1], np.round(female_df_4y.OCD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)],
    #[female_df_4y.ODD.value_counts().iloc[1], np.round(female_df_4y.ODD.value_counts().iloc[1] * 100 / df_4y.shape[0], 2)]
]

In [36]:
# Merge into a clean dataframe and export as csv.
demo_table = pd.DataFrame({
    "baseline_male": male_desc,
    "baseline_female": female_desc,
    "2y_male": male_desc_2y,
    "2y_female": female_desc_2y,
    "4y_male": male_desc_4y,
    "4y_female": female_desc_4y
})
demo_table.index = [
    "N", "Age (months)", "White", "Black or African American", "Hispanic or Latino",
    "Asian", "Other", "No Highschool", "Highschool, GED, or equivalent", "Some college",
    "Bachelor Degree", "Postgraduate Degree", "Income < 50 000$USD", "Income 50 000-100 000$USD",
    "Income > 100 000$USD", "AD (%)", "ADHD (%)", "CD (%)", "DD (%)", "OCD (%)", "ODD (%)"
]
demo_table.to_csv(f"{output_dir}/DemographicsTable.csv")