In this notebook I prepare the xlsx files so that they are compatible with Centile (https://centilebrain.org/#/model).
I take the cortical thickness extracted in extract_thickenss_info.sh and the demographics xlsx files and create the xlsx files required by Centile, divided by sex and by healthy control (HC)/patient.

In [16]:
import numpy as np
import pandas as pd 
import os
import sys
import importlib

In [2]:
from paths import Paths 

In [3]:
# Project folders exposed by the Paths helper; the raw inputs live in DATA_DIR/raw.
DATA_DIR = Paths.DATA
RESOURCES_DIR = Paths.RESOURCES
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')

## Prepare the data for centile

In [4]:
# Build fs_default once; it is passed to rename_to_fs_lut_labels further down.
# NOTE(review): presumably the FreeSurfer default label table - confirm in scripts.utils.
from scripts.utils import prepare_fs_default
fs_default = prepare_fs_default()

In [5]:
# Cortical-thickness spreadsheet template given by Centile
template_path = f'{RESOURCES_DIR}/template_cortical_thickness.xlsx'
template = pd.read_excel(template_path)

In [6]:
# Reload scripts.utils so that edits made to the module after the kernel
# started are picked up, then re-import the helper from the fresh module.
import scripts.utils
importlib.reload(scripts.utils)
from scripts.utils import get_raw_thickness

# Thickness table built from the raw data folder; indexed by subject id further down.
gmv = get_raw_thickness(RAW_DATA_DIR)

In [7]:
# My demo information: the two demographics spreadsheets
demographics_dir = f'{DATA_DIR}/demographics'
demo1 = pd.read_excel(f'{demographics_dir}/Demographics_data.xlsx')
demo2 = pd.read_excel(f'{demographics_dir}/Demographics_Test_data_2025.08.08.xlsx')

In [8]:
# Adjusting the demo xlsx columns: keep id/sex/age and give both sheets the
# same column names (pid, sex, age) so they can be stacked.
# Note: this cell overwrites demo1/demo2, so it cannot be re-run without
# re-running the loading cell first (the original column names are gone).
columns_to_keep1 = ['No', 'Sex', 'Age']
demo1 = demo1[columns_to_keep1].rename(
    columns={'No': 'pid', 'Sex': 'sex', 'Age': 'age'}
)

# sub2 is built from the very column it is tested against below.
sub2 = np.unique(demo2['SubID'])
columns_to_keep2 = ['ScanID', 'Sex', 'Age']
demo2 = (
    demo2.loc[demo2['SubID'].isin(sub2), columns_to_keep2]
    .rename(columns={'ScanID': 'pid', 'Sex': 'sex', 'Age': 'age'})
)

In [9]:
# Stack the two demographics tables and index the result by pid (as a string)
demo = (
    pd.concat([demo1, demo2])
    .astype({'pid': 'str'})
    .set_index('pid')
)

In [10]:
# Merging demo and thickness information
# `gmv.loc[...]` raises a KeyError as soon as one requested id is missing, so
# first restrict the demographics to the subjects that have thickness data
# and report the ones that are left out instead of failing.
has_thickness = demo.index.isin(gmv.index)
missing_pids = demo.index[~has_thickness].tolist()
if missing_pids:
    print(f'{len(missing_pids)} subject(s) without thickness data were dropped: {missing_pids}')

demo_matched = demo[has_thickness]
# Rows follow the order of the demographics table.
demo_and_gmv = pd.concat([demo_matched, gmv.loc[demo_matched.index]], axis=1, join='inner')

In [11]:
# Dividing the two sexes
from scripts.utils import adjust_thick_template

# adjust_thick_template returns the male table first, then the female one.
m, f = adjust_thick_template(demo_and_gmv, template)

# This is the first cell that writes into the processed folder: create it if
# it does not exist yet so the export also works on a fresh checkout.
os.makedirs(f'{DATA_DIR}/processed', exist_ok=True)

m.to_excel(f'{DATA_DIR}/processed/male_chinese_all_with_fu_cortical_thick.xlsx', index=False)
f.to_excel(f'{DATA_DIR}/processed/female_chinese_all_with_fu_cortical_thick.xlsx', index=False)

In [None]:
# Selecting only healthy controls
# NOTE(review): ids are compared as strings - assumes SubjectID holds str ids; confirm.
hc = [
    '1855', '1856', '1857', '1858', '1859',
    '1860', '1861', '1862', '1863', '1864',
]

# Controls that are actually present in each sex-specific table
female_ids = f['SubjectID'].values
male_ids = m['SubjectID'].values
hc_f = [subject for subject in hc if subject in female_ids]
hc_m = [subject for subject in hc if subject in male_ids]

male_hc = m[m['SubjectID'].isin(hc_m)]
female_hc = f[f['SubjectID'].isin(hc_f)]
male_hc.to_excel(f'{DATA_DIR}/processed/male_chinese_hc_cortical_thick.xlsx', index=False)
female_hc.to_excel(f'{DATA_DIR}/processed/female_chinese_hc_cortical_thick.xlsx', index=False)

In [66]:
# Selecting only the patients: every subject whose id is not in the `hc` list
# defined in the healthy-controls cell (that cell must have been run first).
female_ids = f['SubjectID'].values
male_ids = m['SubjectID'].values
pat_f = [subject for subject in female_ids if subject not in hc]
pat_m = [subject for subject in male_ids if subject not in hc]

male_patients = m[m['SubjectID'].isin(pat_m)]
female_patients = f[f['SubjectID'].isin(pat_f)]
male_patients.to_excel(f'{DATA_DIR}/processed/male_chinese_pat_cortical_thick.xlsx', index=False)
female_patients.to_excel(f'{DATA_DIR}/processed/female_chinese_pat_cortical_thick.xlsx', index=False)

## Run Centile

At this point I can run the Centile model and collect the results.
I store the results in folders named after the input xlsx file, plus the suffix `_centile_results`.

## Merge the results from Centile

Now I can put together the results from the male and female CSV files, specifying which score to use (MAE, zscore, prediction, ...).

In [14]:
from scripts.utils import merge_centile_results

PROCESSED_DIR = f"{DATA_DIR}/processed"

# Centile score to collect from the result files
score = 'zscore'

# Base names of the male/female input spreadsheets; the Centile results are
# stored in folders with the same name plus the `_centile_results` suffix.
xlsx_files = [
    'male_chinese_all_with_fu_cortical_thick',
    'female_chinese_all_with_fu_cortical_thick',
]

# Not used by any visible cell; kept in case a later cell relies on it.
all_dfs = []

df = merge_centile_results(xlsx_files, score, PROCESSED_DIR)

Processing male_chinese_all_with_fu_cortical_thick_centile_results
Processing female_chinese_all_with_fu_cortical_thick_centile_results


In [15]:
from scripts.utils import rename_to_fs_lut_labels

# Note: `df` is rebound here, so re-running this cell alone passes the
# already-renamed table through rename_to_fs_lut_labels a second time.
df = rename_to_fs_lut_labels(df, fs_default)

# The chosen score is part of the output file name.
score_csv = f'{PROCESSED_DIR}/{score}_full_chinese_all_with_fu_cortical_thick.csv'
df.to_csv(score_csv)

## Create one big demographics file 

In [27]:
processed_dir = f'{Paths.DATA}/processed'

# Centile input sheets written earlier in this notebook, indexed by subject id
male_demo = pd.read_excel(f'{processed_dir}/male_chinese_all_with_fu_cortical_thick.xlsx', index_col='SubjectID')
female_demo = pd.read_excel(f'{processed_dir}/female_chinese_all_with_fu_cortical_thick.xlsx', index_col='SubjectID')

# Original demographics sheets, indexed by their own id column.
# Note: this overwrites the demo1/demo2 frames created at the top of the notebook.
demo1= pd.read_excel(f'{DATA_DIR}/demographics/Demographics_data.xlsx', index_col='No')
demo2= pd.read_excel(f'{DATA_DIR}/demographics/Demographics_Test_data_2025.08.08.xlsx', index_col='ScanID')

# The following cells look ids up across these four tables with `.loc`, which
# only matches labels of the same type: the int 1855 and the string '1855' are
# different labels, and a miss on assignment silently appends a new row
# instead of filling the existing one. read_excel infers the id type of each
# sheet separately, so force every index to str - the same normalisation that
# is applied to `pid` when the Centile inputs are built.
# NOTE(review): assumes SubjectID was written from the str-cast pid; confirm in adjust_thick_template.
for id_table in (male_demo, female_demo, demo1, demo2):
    id_table.index = id_table.index.astype(str)

In [30]:
# Stack the male and female tables and keep only the demographic columns
columns_to_keep = ['age', 'sex']
merged_demo = pd.concat([male_demo, female_demo])[columns_to_keep]

In [33]:
# Copy the diagnosis of every subject in demo1 into merged_demo.
# `.loc` assignment creates the 'diagnosis' column on first use and adds a
# new row when the id is not already in merged_demo's index.
for subject_id in demo1.index:
    diagnosis = demo1.loc[subject_id, 'Diagnosis']
    merged_demo.loc[subject_id, 'diagnosis'] = diagnosis

In [35]:
# Same for demo2: copy each scan's diagnosis into merged_demo.
# `.loc` assignment adds a new row when the id is not already in the index.
for scan_id in demo2.index:
    diagnosis = demo2.loc[scan_id, 'Diagnosis']
    merged_demo.loc[scan_id, 'diagnosis'] = diagnosis

In [36]:
# Only demo2 is read for 'Wave'; ids that appear only in demo1 get no value
# written here.
for scan_id in demo2.index:
    wave = demo2.loc[scan_id, 'Wave']
    merged_demo.loc[scan_id, 'wave'] = wave

In [38]:
# Save the combined demographics table next to the source spreadsheets
full_demo_csv = f'{DATA_DIR}/demographics/full_demographics.csv'
merged_demo.to_csv(full_demo_csv)