In [1]:
import simpsom as sps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pickle

from collections import OrderedDict

def scale(x, a, b):
    """Linearly rescale ``x`` so its minimum maps to ``a`` and its maximum to ``b``.

    Parameters
    ----------
    x : array-like supporting arithmetic (e.g. ``numpy.ndarray`` or ``pandas.Series``)
        Values to rescale. The global minimum/maximum over all elements is used.
    a, b : float
        Target values for the minimum and the maximum of ``x``.

    Returns
    -------
    Same container type as ``x - number`` produces, holding the rescaled values.
    If every element of ``x`` is identical, every output element equals ``a``
    (the plain formula would evaluate 0/0 and return NaN).

    Raises
    ------
    ValueError
        If ``x`` is empty (raised by the min/max reduction).
    """
    lo, hi = np.min(x), np.max(x)
    span = hi - lo
    if span == 0:
        # Constant input: avoid 0/0. Multiplying by 0.0 keeps the container type and shape.
        return (x - lo) * 0.0 + a
    return (b - a) * (x - lo) / span + a

%matplotlib inline

In [2]:
# Load the pickled dictionary that holds the per-cohort score norms and subject identifiers
# (accessed below through the 'score_norms' and 'identifier' keys).
# NOTE(security): pickle.load can execute arbitrary code stored in the file — only load
# pickles that come from a trusted source.
with open('data_scorenorm/combined_data_score_norms_v3_cont_asd.pkl', 'rb') as file:
    combined_data = pickle.load(file)

In [3]:
# Pull each cohort's score-norm array out of the loaded dictionary.
_score_norms_by_cohort = combined_data['score_norms']

# ABCD splits and the ABCD-ASD cohort
score_norms_abcd_val = _score_norms_by_cohort['abcd-val']
score_norms_abcd_test = _score_norms_by_cohort['abcd-test']
score_norms_abcd_asd = _score_norms_by_cohort['abcd-asd']

# Conte
score_norms_conte = _score_norms_by_cohort['conte']

# IBIS cohorts
score_norms_ibis_typical = _score_norms_by_cohort['ibis-inlier']
score_norms_ibis_hr_typical = _score_norms_by_cohort['ibis-hr-inliers']
score_norms_ibis_atypical = _score_norms_by_cohort['ibis-atypical']
score_norms_ibis_asd = _score_norms_by_cohort['ibis-asd']
score_norms_ibis_ds = _score_norms_by_cohort['ibis-ds-sa']

In [4]:
# Pool the ABCD validation and test splits into one ABCD cohort (stacked along the first axis).
score_norms_abcd = np.concatenate((score_norms_abcd_val, score_norms_abcd_test))

# Shape check: pooled ABCD cohort next to the IBIS typical cohort.
(score_norms_abcd.shape, score_norms_ibis_typical.shape)

((330, 1, 20), (82, 1, 20))

In [5]:
# Pull each cohort's subject identifiers out of the loaded dictionary
# (same cohort keys as for the score norms).
_identifiers_by_cohort = combined_data['identifier']

identifiers_abcd_val = _identifiers_by_cohort['abcd-val']
identifiers_abcd_test = _identifiers_by_cohort['abcd-test']
identifiers_abcd_asd = _identifiers_by_cohort['abcd-asd']
identifiers_conte = _identifiers_by_cohort['conte']
identifiers_ibis_typical = _identifiers_by_cohort['ibis-inlier']
identifiers_ibis_hr_typical = _identifiers_by_cohort['ibis-hr-inliers']
identifiers_ibis_atypical = _identifiers_by_cohort['ibis-atypical']
identifiers_ibis_asd = _identifiers_by_cohort['ibis-asd']
identifiers_ibis_ds = _identifiers_by_cohort['ibis-ds-sa']

# Pool validation + test identifiers in the same order used for the pooled score norms.
identifiers_abcd = np.concatenate((identifiers_abcd_val, identifiers_abcd_test))
identifiers_abcd.shape

(330,)

In [6]:
# Print the raw array shape of every cohort. The tab characters inside each label
# keep the printed shapes aligned in one column.
for _label, _scores in (
    ('score_norms_abcd_val\t\t', score_norms_abcd_val),
    ('score_norms_abcd_test\t\t', score_norms_abcd_test),
    ('score_norms_abcd\t\t', score_norms_abcd),
    ('score_norms_abcd_asd\t\t', score_norms_abcd_asd),
    ('score_norms_conte\t\t', score_norms_conte),
    ('score_norms_ibis_typical\t', score_norms_ibis_typical),
    ('score_norms_ibis_hr_typical\t', score_norms_ibis_hr_typical),
    ('score_norms_ibis_atypical\t', score_norms_ibis_atypical),
    ('score_norms_ibis_asd\t\t', score_norms_ibis_asd),
    ('score_norms_ibis_ds\t\t', score_norms_ibis_ds),
):
    print(_label, _scores.shape)

score_norms_abcd_val		 (165, 1, 20)
score_norms_abcd_test		 (165, 1, 20)
score_norms_abcd		 (330, 1, 20)
score_norms_abcd_asd		 (161, 1, 20)
score_norms_conte		 (444, 1, 20)
score_norms_ibis_typical	 (82, 1, 20)
score_norms_ibis_hr_typical	 (106, 1, 20)
score_norms_ibis_atypical	 (38, 1, 20)
score_norms_ibis_asd		 (65, 1, 20)
score_norms_ibis_ds		 (28, 1, 20)


In [7]:
# Remove the singleton axis from every cohort array: per the shapes printed above,
# (N, 1, 20) becomes (N, 20).
# NOTE(review): np.squeeze without `axis` removes *every* length-1 axis, so a cohort with
# exactly one subject would also lose its sample axis.
(
    score_norms_abcd_val,
    score_norms_abcd_test,
    score_norms_abcd,
    score_norms_abcd_asd,
    score_norms_conte,
    score_norms_ibis_typical,
    score_norms_ibis_hr_typical,
    score_norms_ibis_atypical,
    score_norms_ibis_asd,
    score_norms_ibis_ds,
) = map(
    np.squeeze,
    (
        score_norms_abcd_val,
        score_norms_abcd_test,
        score_norms_abcd,
        score_norms_abcd_asd,
        score_norms_conte,
        score_norms_ibis_typical,
        score_norms_ibis_hr_typical,
        score_norms_ibis_atypical,
        score_norms_ibis_asd,
        score_norms_ibis_ds,
    ),
)

In [8]:
# Integer label <-> cohort name. Labels are assigned by position in `dataset_list`
# (first entry -> -1, second -> 0, ...), so the list order must match this mapping.
dataset_id_to_name = {
    -1: "ABCD",
    0: "LR-Typical",
    1: "HR-Typical",
    2: "Atypical",
    3: "Down Syndrome",
    4: "IBIS-ASD",
    5: "ABCD-ASD",
    6: "Conte",
}
dataset_name_to_id = dict((name, idx) for idx, name in dataset_id_to_name.items())
dataset_list = [
    score_norms_abcd,
    score_norms_ibis_typical,
    score_norms_ibis_hr_typical,
    score_norms_ibis_atypical,
    score_norms_ibis_ds,
    score_norms_ibis_asd,
    score_norms_abcd_asd,
    score_norms_conte,
]

# Stack all cohorts and build one integer label per row.
X_data = np.concatenate(dataset_list, axis=0)
y_labels = np.concatenate(
    [[cohort_id] * len(cohort) for cohort_id, cohort in enumerate(dataset_list, start=-1)]
)
assert len(X_data) == len(y_labels)

# Labels <= 0 (ABCD and LR-Typical) are the inlier/reference set; everything else is test data.
inlier_mask = y_labels <= 0
test_mask = y_labels > 0
train_labels = y_labels[inlier_mask]
test_labels = y_labels[test_mask]

# Z-score both sets with the per-feature mean/std of the inliers only.
# NOTE(review): a feature with zero inlier std would produce inf/NaN here.
mu = np.mean(X_data[inlier_mask], axis=0)
std = np.std(X_data[inlier_mask], axis=0)
inlier_data = (X_data[inlier_mask] - mu) / std
test_data = (X_data[test_mask] - mu) / std

# Despite its name, `dssamples` holds the ABCD-ASD rows of the standardised test data.
dssamples = test_data[test_labels == dataset_name_to_id["ABCD-ASD"]]
(X_data.shape, inlier_data.shape, len(dssamples), test_data.shape)

((1254, 20), (412, 20), 161, (842, 20))

In [9]:
# Identifier lists in exactly the same cohort order as `dataset_list`, so that row i of the
# concatenated identifiers corresponds to row i of X_data / y_labels.
identifiers_list = [
    identifiers_abcd,
    identifiers_ibis_typical,
    identifiers_ibis_hr_typical,
    identifiers_ibis_atypical,
    identifiers_ibis_ds,
    identifiers_ibis_asd,
    identifiers_abcd_asd,
    identifiers_conte,
]
# NOTE: the name below is misspelled ("indentifiers") but later cells refer to it as-is.
indentifiers_data = np.concatenate(identifiers_list, axis=0)

# Same inlier/test split as for the data: labels <= 0 are inliers.
inlier_identifiers, test_identifiers = (
    indentifiers_data[y_labels <= 0],
    indentifiers_data[y_labels > 0],
)

## Getting the metadata required to identify and save batches

In [10]:
# Number of concatenated identifiers; it should equal the number of rows in X_data.
indentifiers_data.shape

(1254,)

In [11]:
# Last identifier; it comes from the final entry of identifiers_list (identifiers_conte).
indentifiers_data[-1]

'TWINST0326-1-2'

In [12]:
# Inspect the IBIS-ASD identifiers ('IBIS' followed by a numeric ID).
# NOTE(review): this prints the whole list; a slice such as [:5] would keep the output short.
identifiers_ibis_asd

['IBIS845807',
 'IBIS107524',
 'IBIS903923',
 'IBIS108131',
 'IBIS868281',
 'IBIS110925',
 'IBIS910593',
 'IBIS152174',
 'IBIS919979',
 'IBIS155712',
 'IBIS929204',
 'IBIS160209',
 'IBIS973338',
 'IBIS174982',
 'IBIS983222',
 'IBIS230955',
 'IBIS988197',
 'IBIS235108',
 'IBIS988903',
 'IBIS241033',
 'IBIS291809',
 'IBIS336576',
 'IBIS373485',
 'IBIS389101',
 'IBIS389521',
 'IBIS391729',
 'IBIS396346',
 'IBIS402809',
 'IBIS439857',
 'IBIS447754',
 'IBIS456823',
 'IBIS460612',
 'IBIS463809',
 'IBIS482642',
 'IBIS490878',
 'IBIS502237',
 'IBIS515353',
 'IBIS544847',
 'IBIS552855',
 'IBIS553295',
 'IBIS569748',
 'IBIS570637',
 'IBIS597653',
 'IBIS600014',
 'IBIS603076',
 'IBIS628043',
 'IBIS638999',
 'IBIS641078',
 'IBIS649998',
 'IBIS680028',
 'IBIS688878',
 'IBIS769060',
 'IBIS784198',
 'IBIS792433',
 'IBIS809318',
 'IBIS810826',
 'IBIS816236',
 'IBIS830319',
 'IBIS841812',
 'IBIS930203',
 'IBIS931343',
 'IBIS942002',
 'IBIS943489',
 'IBIS950194',
 'IBIS963992']

In [13]:
# Load the IBIS metadata spreadsheet and index it by "IBIS<CandID>", the same format as
# the IBIS identifier lists. The "CandID" column itself is kept.
# NOTE: rows whose CandID is -1 end up with the index value "IBIS-1".
ibis_metadata = pd.read_csv("spreadsheets/ibis_metadata_v2.csv")
ibis_metadata.index = ibis_metadata["CandID"].map("IBIS{}".format)
ibis_metadata.index.name = "ID"

# Cast to float32; errors="ignore" suppresses the failure for data that cannot be cast
# (judging by the displayed frame, text columns such as 'Sex' stay as they are).
ibis_metadata = ibis_metadata.astype(np.float32, errors="ignore")

In [14]:
# List every column name of the IBIS metadata table.
list(ibis_metadata.columns)

['Identifiers',
 'V24 ADOS_Derived,restricted_repetitive_behavior_total',
 'V36 ADOS_Derived,restricted_repetitive_behavior_total',
 'V37Plus ADOS_Derived,restricted_repetitive_behavior_total',
 'V24 ADOS_Derived,severity_score_lookup',
 'V36 ADOS_Derived,severity_score_lookup',
 'V37Plus ADOS_Derived,severity_score_lookup',
 'V24 ADOS_Derived,social_affect_restricted_repetitive_behavior_total',
 'V36 ADOS_Derived,social_affect_restricted_repetitive_behavior_total',
 'V37Plus ADOS_Derived,social_affect_restricted_repetitive_behavior_total',
 'V24 ADOS_Derived,social_affect_total',
 'V36 ADOS_Derived,social_affect_total',
 'V37Plus ADOS_Derived,social_affect_total',
 'ASD_Latest_DSMIV',
 'ASD_Latest_DSMV',
 'CandID',
 'VSA-Subtype',
 'VSA DAS_SA,PC_ABILITY_SCORE',
 'VSA DAS_SA,matrices_ABILITY_SCORE',
 'VSA DAS_SA,rd_ABILITY_SCORE',
 'VSA DAS_SA,seq_qr_ABILITY_SCORE',
 'VSA DAS_SA,verbal_similarities_ABILITY_SCORE',
 'VSA DAS_SA,word_defn_ABILITY_SCORE',
 'Vineland-adapt_behave_comp_PER

In [15]:
# Display the re-indexed IBIS metadata. In the saved preview, -1 shows up in many
# columns and as the CandID behind the index value 'IBIS-1'
# (presumably a missing-value code — TODO confirm).
ibis_metadata

Unnamed: 0_level_0,Identifiers,"V24 ADOS_Derived,restricted_repetitive_behavior_total","V36 ADOS_Derived,restricted_repetitive_behavior_total","V37Plus ADOS_Derived,restricted_repetitive_behavior_total","V24 ADOS_Derived,severity_score_lookup","V36 ADOS_Derived,severity_score_lookup","V37Plus ADOS_Derived,severity_score_lookup","V24 ADOS_Derived,social_affect_restricted_repetitive_behavior_total","V36 ADOS_Derived,social_affect_restricted_repetitive_behavior_total","V37Plus ADOS_Derived,social_affect_restricted_repetitive_behavior_total",...,CBCL-stress_prob_T_score,CBCL-stress_prob_percentile,CBCL-stress_prob_score,CBCL-thought_prob_T_score,CBCL-thought_prob_percentile,CBCL-thought_prob_score,CBCL-total_problems_T_score,CBCL-total_problems_percentile,CBCL-total_problems_score,Sex
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IBIS389101,PHI0000,3.0,1.0,-1.0,8.0,1.0,-1.0,17.0,2.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,Male
IBIS-1,PHI0002,2.0,6.0,-1.0,8.0,10.0,-1.0,17.0,20.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,-1
IBIS769118,PHI0006,0.0,1.0,-1.0,1.0,1.0,-1.0,3.0,2.0,-1.0,...,50,50,1.0,51,54,1.0,40,16,6.0,Male
IBIS638943,PHI0007,1.0,2.0,-1.0,2.0,2.0,-1.0,4.0,5.0,-1.0,...,74,99,14.0,64,92,5.0,71,99,71.0,Male
IBIS929204,PHI0010,2.0,2.0,-1.0,7.0,6.0,-1.0,16.0,12.0,-1.0,...,57,76,5.0,54,65,2.0,57,76,34.0,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IBIS346176,UNC1052,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,-1
IBIS345986,UNC1053,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,-1
IBIS490549,UNC1054,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,-1
IBIS250626,UNC1055,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1.0,-1


In [16]:
# 'Sex' is stored as text and contains the string '-1' in addition to 'Male'/'Female'
# (presumably a missing-value code — TODO confirm against the data dictionary).
ibis_metadata['Sex'].unique()

array(['Male', '-1', 'Female'], dtype=object)

In [17]:
# Load the VSA demographics spreadsheet and index it by "IBIS<CandID>", the same format
# as the IBIS identifier lists. The CandID column itself is kept.
VSADemographicsMar2023 = pd.read_csv("spreadsheets/VSADemographicsMar2023.csv")
VSADemographicsMar2023.index = VSADemographicsMar2023["VSA demographics,CandID"].map("IBIS{}".format)
VSADemographicsMar2023.index.name = "ID"

# Cast to float32; errors="ignore" suppresses the failure for data that cannot be cast
# (judging by the displayed frame, text columns stay as they are).
VSADemographicsMar2023 = VSADemographicsMar2023.astype(np.float32, errors="ignore")

In [18]:
# Display the VSA demographics table, now indexed by 'IBIS<CandID>'.
VSADemographicsMar2023

Unnamed: 0_level_0,"VSA demographics,CandID",Identifiers,"VSA demographics,ASD_Ever_DSMIV","VSA demographics,ASD_Ever_DSMV","VSA demographics,Age_at_visit_start","VSA demographics,Risk","VSA demographics,Sex","VSA demographics,candidate_race"
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IBIS103831,103831.0,UNC0234,No DSMIV ever administered,No DSMV ever administered,131.0,,Male,white
IBIS105477,105477.0,SEA4003,ASD-,ASD-,136.0,,Female,white
IBIS105784,105784.0,UNC0239,"ASD+ (V24, V36)",No DSMV ever administered,134.0,,Male,
IBIS107524,107524.0,STL0200,"ASD+ (V24, V36, VSA-CVD)","ASD+ (V36, VSA-CVD)",135.0,HR,Male,0
IBIS108131,108131.0,SEA1006,ASD+ (V24),ASD+ (V24),84.0,HR,Male,white
...,...,...,...,...,...,...,...,...
IBIS988903,988903.0,SEA0163,"ASD+ (V24, V36, VSA)","ASD+ (V36, VSA)",103.0,HR,Male,
IBIS989241,989241.0,PHI4005,ASD-,ASD-,87.0,,Male,white
IBIS992063,992063.0,STL0003,ASD-,ASD-,137.0,HR,Female,
IBIS995004,995004.0,STL0249,ASD-,ASD-,128.0,HR,Female,0


In [19]:
# Sex distribution over all rows of the VSA demographics table.
VSADemographicsMar2023["VSA demographics,Sex"].value_counts()

VSA demographics,Sex
Male      253
Female    170
Name: count, dtype: int64

In [20]:
# Count rows per 3-character prefix of 'Identifiers' (UNC / STL / SEA / PHI in this table).
# The vectorised .str accessor matches how the prefix is extracted later in the notebook,
# and it yields NaN for a missing identifier instead of raising a TypeError.
VSADemographicsMar2023["Identifiers"].str[:3].value_counts()


Identifiers
UNC    112
STL    109
SEA    107
PHI     95
Name: count, dtype: int64

In [21]:
# Load the ABCD MRI administration table (scanner manufacturer/model per subject and event).
# NOTE: the source file is located at
#   /.../ABCD/ABCD_5.1_Spreadsheets/core/imaging/mri_y_adm_info.csv
# and is read here from the local "spreadsheets/" folder (presumably a copy — TODO confirm).
abcd_mri_y_adm_info = pd.read_csv("spreadsheets/mri_y_adm_info.csv")

In [22]:
# Display the MRI administration table; a subject can have several rows, one per event
# (baseline and follow-up visits appear in the preview).
abcd_mri_y_adm_info

Unnamed: 0,src_subject_id,eventname,mri_info_visitid,mri_info_manufacturer,mri_info_manufacturersmn,mri_info_deviceserialnumber,mri_info_magneticfieldstrength,mri_info_softwareversion,mri_info_studydate
0,NDAR_INV003RTV85,baseline_year_1_arm_1,S042_INV003RTV85_baseline,SIEMENS,Prisma_fit,HASH96a0c182,3,syngo MR E11,20181001
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,G031_INV005V6D2C_baseline,GE MEDICAL SYSTEMS,DISCOVERY MR750,HASHe3ce02d3,3,27\LX\MR Software release:DV26.0_R01_1725.a,20180422
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,G054_INV007W6H7B_baseline,GE MEDICAL SYSTEMS,DISCOVERY MR750,HASH48f7cbc3,3,25\LX\MR Software release:DV25.0_R02_1549.b,20170224
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,S090_INV00BD7VDC_baseline,SIEMENS,Prisma_fit,HASH65b39280,3,syngo MR E11,20180612
4,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,S021_INV00CY2MDM_2year,SIEMENS,Prisma,HASHd422be27,3,syngo MR E11,20190615
...,...,...,...,...,...,...,...,...,...
22934,NDAR_INVZZZNB0XC,4_year_follow_up_y_arm_1,S011_INVZZZNB0XC_4year,SIEMENS,Prisma,HASH5b0cf1bb,3,syngo MR E11,20210205
22935,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,S011_INVZZZNB0XC_baseline,SIEMENS,Prisma,HASH5b0cf1bb,3,syngo MR E11,20170103
22936,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,P023_INVZZZP87KR_2year,Philips Medical Systems,Ingenia,HASH5ac2b20b,3,5.3.1\5.3.1.2,20190802
22937,NDAR_INVZZZP87KR,4_year_follow_up_y_arm_1,P023_INVZZZP87KR_4year,Philips Medical Systems,Ingenia,HASH5ac2b20b,3,5.6.1\5.6.1.2,20210819


In [23]:
# Sanity check: does any subject have baseline rows from more than one scanner manufacturer?

# Keep only the rows of the baseline visit.
abcd_mri_y_adm_info_baseline = abcd_mri_y_adm_info.loc[
    abcd_mri_y_adm_info['eventname'].eq('baseline_year_1_arm_1')
]

# For each subject, collect the distinct manufacturers recorded at baseline (as a list).
manufacturer_lists = (
    abcd_mri_y_adm_info_baseline
    .groupby('src_subject_id')['mri_info_manufacturer']
    .agg(lambda x: list(pd.unique(x)))
)

# Subjects whose list holds two or more manufacturers.
inconsistent_ids = manufacturer_lists[manufacturer_lists.map(len) > 1]

# Report them; nothing is printed after the header when every subject is consistent.
print("src_subject_id with more than one mri_info_manufacturer for baseline_year_1_arm_1:")
for subject_id, manufacturers in inconsistent_ids.items():
    print(f"{subject_id}: {manufacturers}")

src_subject_id with more than one mri_info_manufacturer for baseline_year_1_arm_1:


In [24]:
# Display the baseline-only subset of the MRI administration table.
abcd_mri_y_adm_info_baseline

Unnamed: 0,src_subject_id,eventname,mri_info_visitid,mri_info_manufacturer,mri_info_manufacturersmn,mri_info_deviceserialnumber,mri_info_magneticfieldstrength,mri_info_softwareversion,mri_info_studydate
0,NDAR_INV003RTV85,baseline_year_1_arm_1,S042_INV003RTV85_baseline,SIEMENS,Prisma_fit,HASH96a0c182,3,syngo MR E11,20181001
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,G031_INV005V6D2C_baseline,GE MEDICAL SYSTEMS,DISCOVERY MR750,HASHe3ce02d3,3,27\LX\MR Software release:DV26.0_R01_1725.a,20180422
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,G054_INV007W6H7B_baseline,GE MEDICAL SYSTEMS,DISCOVERY MR750,HASH48f7cbc3,3,25\LX\MR Software release:DV25.0_R02_1549.b,20170224
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,S090_INV00BD7VDC_baseline,SIEMENS,Prisma_fit,HASH65b39280,3,syngo MR E11,20180612
6,NDAR_INV00CY2MDM,baseline_year_1_arm_1,S021_INV00CY2MDM_baseline,SIEMENS,Prisma,HASHd422be27,3,syngo MR E11,20170822
...,...,...,...,...,...,...,...,...,...
22928,NDAR_INVZZNX6W2P,baseline_year_1_arm_1,S020_INVZZNX6W2P_baseline,SIEMENS,Prisma,HASH11ad4ed5,3,syngo MR E11,20170313
22931,NDAR_INVZZPKBDAC,baseline_year_1_arm_1,S012_INVZZPKBDAC_baseline,SIEMENS,Prisma_fit,HASHe4f6957a,3,syngo MR E11,20180120
22933,NDAR_INVZZZ2ALR6,baseline_year_1_arm_1,G010_INVZZZ2ALR6_baseline,GE MEDICAL SYSTEMS,DISCOVERY MR750,HASH5b2fcf80,3,25\LX\MR Software release:DV25.0_R02_1549.b,20170615
22935,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,S011_INVZZZNB0XC_baseline,SIEMENS,Prisma,HASH5b0cf1bb,3,syngo MR E11,20170103


In [25]:
# Number of baseline rows per scanner manufacturer.
abcd_mri_y_adm_info_baseline["mri_info_manufacturer"].value_counts()

mri_info_manufacturer
SIEMENS                    7273
GE MEDICAL SYSTEMS         2975
Philips Medical Systems    1523
Name: count, dtype: int64

In [26]:
# Number of baseline rows per scanner model name.
abcd_mri_y_adm_info_baseline["mri_info_manufacturersmn"].value_counts()

mri_info_manufacturersmn
Prisma_fit         4213
Prisma             3060
DISCOVERY MR750    2974
Achieva dStream     981
Ingenia             542
Orchestra SDK         1
Name: count, dtype: int64

In [27]:
# Scanner model counts over all events (not only baseline), for comparison with the
# baseline-only counts.
abcd_mri_y_adm_info["mri_info_manufacturersmn"].value_counts()

mri_info_manufacturersmn
Prisma_fit         7538
Prisma             6576
DISCOVERY MR750    5914
Achieva dStream    1743
Ingenia            1040
SIGNA Premier        83
SIGNA UHP            44
Orchestra SDK         1
Name: count, dtype: int64

In [28]:
# Load the ABCD demographics table (source of the demo_sex_v2 column used further down).
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# "spreadsheets/..." paths used for the other tables — the cell fails wherever this
# location is not available.
abcd_p_demo = pd.read_csv("/BEE/Connectome/ABCD/ABCD_5.1_Spreadsheets/core/abcd-general/abcd_p_demo.csv")

In [29]:
# Display the demographics table; a subject has one row per event (baseline plus follow-ups).
abcd_p_demo

Unnamed: 0,src_subject_id,eventname,demoi_p_select_language___1,demo_prim,demo_brthdat_v2,demo_ed_v2,demo_adopt_agex_v2,demo_adopt_agex_v2_bl_dk,demo_sex_v2,demo_gender_id_v2,...,demo_nat_lang_3_yrs_other_p_14,demo_nat_lang_3_p,demo_nat_lang_3_yrs_eng_p___1,demo_nat_lang_3_yrs_eng_p___2,demo_nat_lang_3_yrs_eng_p___3,demo_nat_lang_3_yrs_eng_p___4,demo_nat_lang_3_yrs_eng_p___5,demo_nat_lang_3_yrs_eng_p___6,race_ethnicity,acs_raked_propensity_score
0,NDAR_INV003RTV85,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,1.0,466.092707
1,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,1.0,533.381820
2,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
3,NDAR_INV003RTV85,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,
4,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1.0,10.0,4.0,,,2.0,2.0,...,,,,,,,,,3.0,520.488325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48802,NDAR_INVZZZP87KR,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,2.0,835.604891
48803,NDAR_INVZZZP87KR,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,2.0,842.080358
48804,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
48805,NDAR_INVZZZP87KR,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,


In [30]:
# Keep only the baseline-visit rows of the demographics table.
abcd_p_demo_baseline = abcd_p_demo.loc[abcd_p_demo['eventname'].eq('baseline_year_1_arm_1')]

In [31]:
# Display the baseline-only demographics rows.
abcd_p_demo_baseline

Unnamed: 0,src_subject_id,eventname,demoi_p_select_language___1,demo_prim,demo_brthdat_v2,demo_ed_v2,demo_adopt_agex_v2,demo_adopt_agex_v2_bl_dk,demo_sex_v2,demo_gender_id_v2,...,demo_nat_lang_3_yrs_other_p_14,demo_nat_lang_3_p,demo_nat_lang_3_yrs_eng_p___1,demo_nat_lang_3_yrs_eng_p___2,demo_nat_lang_3_yrs_eng_p___3,demo_nat_lang_3_yrs_eng_p___4,demo_nat_lang_3_yrs_eng_p___5,demo_nat_lang_3_yrs_eng_p___6,race_ethnicity,acs_raked_propensity_score
0,NDAR_INV003RTV85,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,1.0,466.092707
4,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1.0,10.0,4.0,,,2.0,2.0,...,,,,,,,,,3.0,520.488325
8,NDAR_INV007W6H7B,baseline_year_1_arm_1,0,2.0,10.0,4.0,,,1.0,1.0,...,,,,,,,,,1.0,479.185338
11,NDAR_INV00BD7VDC,baseline_year_1_arm_1,0,1.0,9.0,4.0,,,1.0,1.0,...,,,,,,,,,1.0,414.643009
14,NDAR_INV00CY2MDM,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,1.0,1.0,...,,,,,,,,,1.0,1433.061575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48785,NDAR_INVZZNX6W2P,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,1.0,1.0,...,,,,,,,,,1.0,593.431278
48790,NDAR_INVZZPKBDAC,baseline_year_1_arm_1,0,2.0,9.0,4.0,,,2.0,2.0,...,,,,,,,,,1.0,568.508011
48795,NDAR_INVZZZ2ALR6,baseline_year_1_arm_1,0,2.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,5.0,164.792483
48800,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,0,1.0,9.0,3.0,,,2.0,2.0,...,,,,,,,,,3.0,1119.800572


In [32]:
# Counts of the numeric sex codes at baseline; the codes 1/2/3 are mapped to
# "Male"/"Female"/"Other" further down in the notebook.
abcd_p_demo_baseline['demo_sex_v2'].value_counts()

demo_sex_v2
1.0    6188
2.0    5677
3.0       3
Name: count, dtype: int64

In [33]:
# Sizes of the two ASD cohorts, for comparison with the row counts of the tables built next.
len(identifiers_ibis_asd), len(identifiers_abcd_asd)

(65, 161)

In [34]:
# Build the batch tables for the two ASD cohorts:
#   merged_abcd - ABCD-ASD subjects with scanner manufacturer, scanner model and sex
#   vsa_final   - IBIS-ASD subjects with the 3-character identifier prefix and sex
# NOTE: rows follow the order of the spreadsheets, not the order of the identifier lists.
import pandas as pd  # NOTE(review): redundant, pandas is already imported in the first cell

# --- Step 1: Remove underscores from src_subject_id in the ABCD baseline DataFrames ---
# (the spreadsheets use 'NDAR_INV...', the identifier lists use 'NDARINV...')
abcd_mri_y_adm_info_baseline_ = abcd_mri_y_adm_info_baseline.assign(
    src_subject_id=abcd_mri_y_adm_info_baseline['src_subject_id'].str.replace('_', '')
)
abcd_p_demo_baseline_ = abcd_p_demo_baseline.assign(
    src_subject_id=abcd_p_demo_baseline['src_subject_id'].str.replace('_', '')
)

# --- Step 2: Keep only the rows whose cleaned ID is in identifiers_abcd_asd ---
mri_is_asd = abcd_mri_y_adm_info_baseline_['src_subject_id'].isin(identifiers_abcd_asd)
abcd_mri_filtered = abcd_mri_y_adm_info_baseline_[mri_is_asd]

demo_is_asd = abcd_p_demo_baseline_['src_subject_id'].isin(identifiers_abcd_asd)
abcd_p_demo_filtered = abcd_p_demo_baseline_[demo_is_asd]

# --- Step 3: Merge the filtered ABCD tables, select columns, and label demo_sex_v2 ---
# Inner join: only subjects present in both tables are kept.
merged_abcd = pd.merge(abcd_mri_filtered, abcd_p_demo_filtered, on='src_subject_id', how='inner')
merged_abcd = merged_abcd[['src_subject_id', 'mri_info_manufacturer', 'mri_info_manufacturersmn', 'demo_sex_v2']]

# Numeric demo_sex_v2 codes -> labels; codes missing from the mapping become NaN.
sex_mapping = {1: "Male", 2: "Female", 3: "Other"}
merged_abcd['demo_sex_v2'] = merged_abcd['demo_sex_v2'].map(sex_mapping)

# --- Step 4: Build the IBIS-ASD table from VSADemographicsMar2023 ---
# The IDs live in the index, so filter on the index and then turn it into an 'ID' column.
vsa_filtered = (
    VSADemographicsMar2023
    .loc[VSADemographicsMar2023.index.isin(identifiers_ibis_asd)]
    .reset_index()
)

# First three characters of 'Identifiers' (e.g. UNC, STL, SEA, PHI).
vsa_filtered['Identifiers_first3'] = vsa_filtered['Identifiers'].str[:3]

# Keep only the ID, the prefix and the sex column.
vsa_final = vsa_filtered[['ID', 'Identifiers_first3', 'VSA demographics,Sex']]


In [35]:
# Build the same ID / prefix / sex table for the IBIS typical (inlier) cohort,
# i.e. the subjects listed in identifiers_ibis_typical.
# NOTE: this overwrites `vsa_filtered`, which previously held the IBIS-ASD rows.

# The IDs live in the index, so filter on the index and then turn it into an 'ID' column.
vsa_filtered = (
    VSADemographicsMar2023
    .loc[VSADemographicsMar2023.index.isin(identifiers_ibis_typical)]
    .reset_index()
)

# First three characters of 'Identifiers' (e.g. UNC, STL, SEA, PHI).
vsa_filtered['Identifiers_first3'] = vsa_filtered['Identifiers'].str[:3]

# Keep only the ID, the prefix and the sex column.
vsa_final_inliers = vsa_filtered[['ID', 'Identifiers_first3', 'VSA demographics,Sex']]


In [36]:
# Row count should equal the number of IBIS typical identifiers if every one was found.
vsa_final_inliers.shape

(82, 3)

In [37]:
#vsa_final_inliers.to_csv("batches/batches_ibis_inlier_cols_site_sex.csv", index=False)

In [38]:
# Preview the ABCD-ASD batch table (IDs are shown without underscores).
merged_abcd.head(10)

Unnamed: 0,src_subject_id,mri_info_manufacturer,mri_info_manufacturersmn,demo_sex_v2
0,NDARINV0182J779,SIEMENS,Prisma_fit,Female
1,NDARINV03KMHMJJ,SIEMENS,Prisma_fit,Male
2,NDARINV09AUXBBT,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male
3,NDARINV0H2AWWPU,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male
4,NDARINV0N0JE94U,SIEMENS,Prisma_fit,Male
5,NDARINV11THDW9B,Philips Medical Systems,Achieva dStream,Female
6,NDARINV13BCLD41,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male
7,NDARINV13FP25D3,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male
8,NDARINV19GP5HLJ,SIEMENS,Prisma,Male
9,NDARINV1ETGGL9R,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male


In [39]:
# Preview the IBIS-ASD batch table (ID, identifier prefix, sex).
vsa_final.head(10)

Unnamed: 0,ID,Identifiers_first3,"VSA demographics,Sex"
0,IBIS107524,STL,Male
1,IBIS108131,SEA,Male
2,IBIS110925,SEA,Male
3,IBIS152174,SEA,Male
4,IBIS155712,UNC,Male
5,IBIS160209,PHI,Male
6,IBIS174982,UNC,Male
7,IBIS230955,PHI,Male
8,IBIS235108,SEA,Female
9,IBIS241033,UNC,Male


In [40]:
# merged_abcd.to_csv("batches/batches_abcd_asd_cols_scanner_model_sex.csv", index=False)
# vsa_final.to_csv("batches/batches_ibis_asd_cols_site_sex.csv", index=False)

## Batches v2: inliers

In [41]:
# Repeat of the earlier sanity check: does any subject have baseline rows from more than
# one scanner manufacturer? It also re-creates abcd_mri_y_adm_info_baseline for this section.

# Keep only the rows of the baseline visit.
abcd_mri_y_adm_info_baseline = abcd_mri_y_adm_info.loc[
    abcd_mri_y_adm_info['eventname'].eq('baseline_year_1_arm_1')
]

# For each subject, collect the distinct manufacturers recorded at baseline (as a list).
manufacturer_lists = (
    abcd_mri_y_adm_info_baseline
    .groupby('src_subject_id')['mri_info_manufacturer']
    .agg(lambda x: list(pd.unique(x)))
)

# Subjects whose list holds two or more manufacturers.
inconsistent_ids = manufacturer_lists[manufacturer_lists.map(len) > 1]

# Report them; nothing is printed after the header when every subject is consistent.
print("src_subject_id with more than one mri_info_manufacturer for baseline_year_1_arm_1:")
for subject_id, manufacturers in inconsistent_ids.items():
    print(f"{subject_id}: {manufacturers}")

src_subject_id with more than one mri_info_manufacturer for baseline_year_1_arm_1:


In [42]:
# Display the full demographics table again (all events, not only baseline).
abcd_p_demo

Unnamed: 0,src_subject_id,eventname,demoi_p_select_language___1,demo_prim,demo_brthdat_v2,demo_ed_v2,demo_adopt_agex_v2,demo_adopt_agex_v2_bl_dk,demo_sex_v2,demo_gender_id_v2,...,demo_nat_lang_3_yrs_other_p_14,demo_nat_lang_3_p,demo_nat_lang_3_yrs_eng_p___1,demo_nat_lang_3_yrs_eng_p___2,demo_nat_lang_3_yrs_eng_p___3,demo_nat_lang_3_yrs_eng_p___4,demo_nat_lang_3_yrs_eng_p___5,demo_nat_lang_3_yrs_eng_p___6,race_ethnicity,acs_raked_propensity_score
0,NDAR_INV003RTV85,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,1.0,466.092707
1,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,1.0,533.381820
2,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
3,NDAR_INV003RTV85,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,
4,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1.0,10.0,4.0,,,2.0,2.0,...,,,,,,,,,3.0,520.488325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48802,NDAR_INVZZZP87KR,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,2.0,835.604891
48803,NDAR_INVZZZP87KR,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,2.0,842.080358
48804,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
48805,NDAR_INVZZZP87KR,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,


In [43]:
# Check whether the MRI table contains any subject ID starting with 'HC' (the validation
# and test identifier lists contain 'HCD...' IDs). The saved output is an empty frame.
abcd_mri_y_adm_info[
    abcd_mri_y_adm_info['src_subject_id'].str.startswith('HC')]

Unnamed: 0,src_subject_id,eventname,mri_info_visitid,mri_info_manufacturer,mri_info_manufacturersmn,mri_info_deviceserialnumber,mri_info_magneticfieldstrength,mri_info_softwareversion,mri_info_studydate


In [44]:
# Count the validation-split identifiers that start with "HCD".
count_HCD = sum(1 for item in identifiers_abcd_val if item.startswith("HCD"))
print(count_HCD)

37


In [45]:
# Spot-check one ID prefix in the MRI table. The spreadsheet IDs keep the underscore
# ('NDAR_INV...'), so the prefix is written with it.
abcd_mri_y_adm_info[
    abcd_mri_y_adm_info['src_subject_id'].str.startswith('NDAR_INVRR2')]

Unnamed: 0,src_subject_id,eventname,mri_info_visitid,mri_info_manufacturer,mri_info_manufacturersmn,mri_info_deviceserialnumber,mri_info_magneticfieldstrength,mri_info_softwareversion,mri_info_studydate
17823,NDAR_INVRR22WRZJ,2_year_follow_up_y_arm_1,S086_INVRR22WRZJ_2year,SIEMENS,Prisma_fit,HASH311170b9,3,syngo MR E11,20201118
17824,NDAR_INVRR22WRZJ,baseline_year_1_arm_1,S086_INVRR22WRZJ_baseline,SIEMENS,Prisma_fit,HASH311170b9,3,syngo MR E11,20181012


In [46]:
# List the test-split identifiers that start with "HCD".
[item for item in identifiers_abcd_test if item.startswith("HCD")]

['HCD0059235',
 'HCD0124725',
 'HCD0295954',
 'HCD0353538',
 'HCD0515538',
 'HCD0621638',
 'HCD0644044',
 'HCD0684460',
 'HCD0717954',
 'HCD0758867',
 'HCD0772255',
 'HCD0797877',
 'HCD0848969',
 'HCD0869371',
 'HCD1010311',
 'HCD1017729',
 'HCD1071836',
 'HCD1108833',
 'HCD1231832',
 'HCD1239343',
 'HCD1262136',
 'HCD1400728',
 'HCD1410933',
 'HCD1529756',
 'HCD1594767',
 'HCD1742958',
 'HCD1749467',
 'HCD1886275',
 'HCD1900142',
 'HCD2035837',
 'HCD2055237',
 'HCD2229547',
 'HCD2540648',
 'HCD2606652',
 'HCD2633554',
 'HCD2635659',
 'HCD2685169',
 'HCD2711649',
 'HCD2731049',
 'HCD2737869']

In [47]:
# Look for one specific subject in the pooled ABCD identifiers.
# identifiers_abcd stores IDs without underscores ('NDARINV...'), while the spreadsheets
# use 'NDAR_INV...'. Searching for the underscored form can therefore never match, so
# underscores are stripped from both sides before comparing.
_target_id = "NDAR_INVLP523WFQ".replace("_", "")
[item for item in identifiers_abcd if item.replace("_", "").startswith(_target_id)]

[]

In [48]:
# Re-create the baseline-only demographics rows for this section.
abcd_p_demo_baseline = abcd_p_demo.loc[abcd_p_demo['eventname'].eq('baseline_year_1_arm_1')]

In [49]:
# Build the batch table for the pooled ABCD inlier cohort (identifiers_abcd):
# scanner manufacturer, scanner model and sex per subject.
# pandas is already imported as `pd` in the first cell, so it is not re-imported here.

# --- Step 1: Remove underscores from src_subject_id in the ABCD baseline DataFrames ---
# (the spreadsheets use 'NDAR_INV...', the identifier lists use 'NDARINV...')
abcd_mri_y_adm_info_baseline_ = abcd_mri_y_adm_info_baseline.copy()
abcd_mri_y_adm_info_baseline_.loc[:, 'src_subject_id'] = abcd_mri_y_adm_info_baseline_['src_subject_id'].str.replace('_', '')

abcd_p_demo_baseline_ = abcd_p_demo_baseline.copy()
abcd_p_demo_baseline_.loc[:, 'src_subject_id'] = abcd_p_demo_baseline_['src_subject_id'].str.replace('_', '')

# --- Step 2: Filter rows based on the identifiers_abcd list (pooled val + test inliers) ---
abcd_mri_filtered_inl = abcd_mri_y_adm_info_baseline_[
    abcd_mri_y_adm_info_baseline_['src_subject_id'].isin(identifiers_abcd)
]

abcd_p_demo_filtered_inl = abcd_p_demo_baseline_[
    abcd_p_demo_baseline_['src_subject_id'].isin(identifiers_abcd)
]

# --- Step 3: Merge the filtered ABCD tables, select columns, and label demo_sex_v2 ---
# how='outer' keeps subjects that appear in only one of the two tables; the columns coming
# from the other table are NaN for them (the ASD table above uses an inner join instead).
merged2_abcd = pd.merge(abcd_mri_filtered_inl, abcd_p_demo_filtered_inl, on='src_subject_id', how='outer')

# Select only the columns of interest
merged2_abcd = merged2_abcd[['src_subject_id', 'mri_info_manufacturer', 'mri_info_manufacturersmn', 'demo_sex_v2']].copy()

# Numeric demo_sex_v2 codes -> labels; codes missing from the mapping become NaN.
sex_mapping = {1: "Male", 2: "Female", 3: "Other"}
merged2_abcd['demo_sex_v2'] = merged2_abcd['demo_sex_v2'].map(sex_mapping)

# --- Step 4: Report identifiers that found no row in either spreadsheet ---
# Without this, identifiers absent from the ABCD tables (for example the 'HCD...' IDs)
# drop out silently and the table ends up shorter than the cohort.
identifiers_abcd_unmatched = sorted(set(identifiers_abcd) - set(merged2_abcd['src_subject_id']))
print(f"{len(identifiers_abcd_unmatched)} of {len(identifiers_abcd)} identifiers have no row in the ABCD spreadsheets")


In [50]:
#merged2_abcd.to_csv("batches/batches_abcd_inlier_cols_scanner_model_sex.csv", index=False)

In [51]:
# Compare the (rows, columns) of the two filtered frames before they are merged.
abcd_mri_filtered_inl.shape, abcd_p_demo_filtered_inl.shape

((251, 9), (252, 276))

In [52]:
# Preview up to the first 20 rows of the merged val+test table.
merged2_abcd.head(20)

Unnamed: 0,src_subject_id,mri_info_manufacturer,mri_info_manufacturersmn,demo_sex_v2
0,NDARINV021403LF,SIEMENS,Prisma_fit,Male
1,NDARINV08P1JKNE,Philips Medical Systems,Achieva dStream,Male
2,NDARINV0G37YGY1,Philips Medical Systems,Achieva dStream,Male
3,NDARINV0PJ81CA5,SIEMENS,Prisma_fit,Female
4,NDARINV0RB5CTN0,SIEMENS,Prisma_fit,Female
5,NDARINV0UV5WZUN,SIEMENS,Prisma_fit,Male
6,NDARINV0V1TNU11,SIEMENS,Prisma_fit,Male
7,NDARINV0YVKYMJX,GE MEDICAL SYSTEMS,DISCOVERY MR750,Male
8,NDARINV0ZHHWCMZ,SIEMENS,Prisma_fit,Male
9,NDARINV10FDVE0L,SIEMENS,Prisma,Female


In [53]:
# (rows, columns) of the merged val+test table.
merged2_abcd.shape

(252, 4)

In [54]:
# (rows, columns) of merged_abcd, which is built in an earlier cell of the notebook.
merged_abcd.shape

(161, 4)

In [55]:
# IDs that occur both in the ASD identifier list and in the val+test list.
# sorted() rather than list(set(...)): the iteration order of a set of strings
# can differ between kernel sessions, which made the displayed list (and any
# order-dependent use of it) non-reproducible. The result is still a list.
identifiers_abcd_asd_inlier_plus_asd_intersecton = sorted(
    set(identifiers_abcd_asd).intersection(identifiers_abcd)
)
identifiers_abcd_asd_inlier_plus_asd_intersecton

['NDARINVNHL7P38J', 'NDARINVD0KNT1VK']

### ABCD - Train batches

In [56]:
# Read the train-split key file (no header row) and take its first column as the ID list.
identifiers_abcd_train_path = "../asd-preprocessing/identifier_keys/split-keys/abcd-train_keys.txt"
identifiers_abcd_train_df = pd.read_csv(identifiers_abcd_train_path, header=None)
identifiers_abcd_train = identifiers_abcd_train_df.iloc[:, 0].to_list()

In [57]:
# Build the scanner / sex table for the ABCD train subjects (identifiers_abcd_train).

# --- Step 1: Strip underscores from src_subject_id in the ABCD baseline DataFrames ---
# Work on explicit copies so the original baseline frames are left untouched.
abcd_mri_y_adm_info_baseline_ = abcd_mri_y_adm_info_baseline.copy()
# regex=False: '_' is meant literally, independent of the pandas version's default.
abcd_mri_y_adm_info_baseline_.loc[:, 'src_subject_id'] = (
    abcd_mri_y_adm_info_baseline_['src_subject_id'].str.replace('_', '', regex=False)
)

abcd_p_demo_baseline_ = abcd_p_demo_baseline.copy()
abcd_p_demo_baseline_.loc[:, 'src_subject_id'] = (
    abcd_p_demo_baseline_['src_subject_id'].str.replace('_', '', regex=False)
)

# --- Step 2: Keep only rows whose cleaned ID is listed in identifiers_abcd_train ---

abcd_mri_filtered_tr = abcd_mri_y_adm_info_baseline_[
    abcd_mri_y_adm_info_baseline_['src_subject_id'].isin(identifiers_abcd_train)
]

abcd_p_demo_filtered_tr = abcd_p_demo_baseline_[
    abcd_p_demo_baseline_['src_subject_id'].isin(identifiers_abcd_train)
]

# --- Step 3: Merge the filtered ABCD frames, select columns, and label demo_sex_v2 ---
# Outer merge on 'src_subject_id': a subject present in only one of the two
# frames is kept, with NaN in the columns coming from the other frame.
merged3_abcd = pd.merge(abcd_mri_filtered_tr, abcd_p_demo_filtered_tr, on='src_subject_id', how='outer')

# Select only the columns of interest
merged3_abcd = merged3_abcd[['src_subject_id', 'mri_info_manufacturer', 'mri_info_manufacturersmn', 'demo_sex_v2']]

# Map numeric demo_sex_v2 codes to labels; codes not in the mapping (or missing) become NaN.
sex_mapping = {1: "Male", 2: "Female", 3: "Other"}
merged3_abcd['demo_sex_v2'] = merged3_abcd['demo_sex_v2'].map(sex_mapping)


In [58]:
# Combined "manufacturer | model" scanner label for every row of the val+test table.
# astype(str) renders missing values as the text 'nan', so a row without
# scanner information is labelled 'nan | nan'.
merged2_abcd['mri_info_manufacturer_model'] = (
    merged2_abcd['mri_info_manufacturer'].astype(str)
    .str.cat(merged2_abcd['mri_info_manufacturersmn'].astype(str), sep=" | ")
)

# Index the table by subject once, then derive both lookup dictionaries from it.
_merged2_by_subject = merged2_abcd.set_index("src_subject_id")

# src_subject_id -> combined manufacturer/model label
abcd_inlier_scanner_dict = _merged2_by_subject["mri_info_manufacturer_model"].to_dict()

# src_subject_id -> demo_sex_v2 label
abcd_inlier_sex_dict = _merged2_by_subject["demo_sex_v2"].to_dict()

In [59]:
# Combined "manufacturer | model" scanner label for every row of the train table.
# astype(str) renders missing values as the text 'nan' (hence 'nan | nan' labels).
merged3_abcd['mri_info_manufacturer_model'] = (
    merged3_abcd['mri_info_manufacturer'].astype(str) +
    " | " +
    merged3_abcd['mri_info_manufacturersmn'].astype(str)
)

# src_subject_id -> combined manufacturer/model label, for the TRAIN subjects.
abcd_train_scanner_dict = merged3_abcd.set_index("src_subject_id")["mri_info_manufacturer_model"].to_dict()

# src_subject_id -> demo_sex_v2 label, for the TRAIN subjects.
abcd_train_sex_dict = merged3_abcd.set_index("src_subject_id")["demo_sex_v2"].to_dict()

# NOTE(review): this cell has always rebound the "inlier" names, replacing the
# val+test mappings built from merged2_abcd with the train mappings. The
# rebinding is kept so later cells behave exactly as before, but new code
# should use abcd_train_* and the overwrite should be confirmed as intended.
abcd_inlier_scanner_dict = abcd_train_scanner_dict
abcd_inlier_sex_dict = abcd_train_sex_dict

In [60]:
# Split the val+test table back into its validation and test rows by subject ID.
_merged2_subject_ids = merged2_abcd['src_subject_id']
merged2_abcd_val = merged2_abcd.loc[_merged2_subject_ids.isin(identifiers_abcd_val)]
merged2_abcd_test = merged2_abcd.loc[_merged2_subject_ids.isin(identifiers_abcd_test)]
merged2_abcd_val.shape, merged2_abcd_test.shape

((127, 5), (125, 5))

In [61]:
# (rows, columns) of the train table restricted to IDs listed in identifiers_abcd_train.
merged3_abcd.loc[
    merged3_abcd['src_subject_id'].isin(identifiers_abcd_train)
].shape

(976, 5)

In [62]:
# Validation split: scanner-label counts, a separator string, then sex-label counts.
(
    merged2_abcd_val['mri_info_manufacturer_model'].value_counts(),
    "========",
    merged2_abcd_val['demo_sex_v2'].value_counts(),
)


(mri_info_manufacturer_model
 SIEMENS | Prisma_fit                         54
 SIEMENS | Prisma                             51
 GE MEDICAL SYSTEMS | DISCOVERY MR750         18
 Philips Medical Systems | Achieva dStream     2
 Philips Medical Systems | Ingenia             2
 Name: count, dtype: int64,
 demo_sex_v2
 Female    70
 Male      56
 Other      1
 Name: count, dtype: int64)

In [63]:
# Test split: scanner-label counts, a separator string, then sex-label counts.
(
    merged2_abcd_test['mri_info_manufacturer_model'].value_counts(),
    "========",
    merged2_abcd_test['demo_sex_v2'].value_counts(),
)

(mri_info_manufacturer_model
 SIEMENS | Prisma_fit                         50
 SIEMENS | Prisma                             44
 GE MEDICAL SYSTEMS | DISCOVERY MR750         17
 Philips Medical Systems | Achieva dStream     7
 Philips Medical Systems | Ingenia             6
 nan | nan                                     1
 Name: count, dtype: int64,
 demo_sex_v2
 Male      65
 Female    60
 Name: count, dtype: int64)

In [64]:
# Train table: scanner-label counts, a separator string, then sex-label counts.
(
    merged3_abcd['mri_info_manufacturer_model'].value_counts(),
    "========",
    merged3_abcd['demo_sex_v2'].value_counts(),
)

(mri_info_manufacturer_model
 SIEMENS | Prisma_fit                         394
 SIEMENS | Prisma                             331
 GE MEDICAL SYSTEMS | DISCOVERY MR750         144
 Philips Medical Systems | Achieva dStream     66
 Philips Medical Systems | Ingenia             38
 nan | nan                                      3
 Name: count, dtype: int64,
 demo_sex_v2
 Female    533
 Male      443
 Name: count, dtype: int64)

In [65]:
# Load the HCD subject-completeness spreadsheet into a DataFrame.
# NOTE(review): when displayed, the first data row (index 0) holds column
# descriptions rather than a subject — confirm downstream code accounts for it.
hcd_metadata = pd.read_csv("spreadsheets/HCD_LS_2.0_subject_completeness.csv")

In [66]:
# Display the loaded HCD metadata table.
hcd_metadata

Unnamed: 0,src_subject_id,subjectkey,interview_date,interview_age,sex,visit,unrelated_subset,Full_MR_Compl,T1_Count,T2_Count,...,tlbx_socwit01,tlbx_wellbeing01,tpvt01,upps01,vision_tests01,vitals01,wais_iv_part101,wisc_v01,wppsiiv01,ysr01
0,HCA or HCD subject id,Pseudo-Guid,RedCap event registration date when copied to ...,age in months,sex at birth,subject visit/event,Is subject unrelated to others in HCA/HCD?,"Full MR protocol 100% complete? 1=Yes, 2=No",# of T1w scans,# of T2w scans,...,"Any data in tlbx_socwit01? 1=yes, 0=no","Any data in tlbx_wellbeing01? 1=yes, 0=no","Any data in tpvt01? 1=yes, 0=no","Any data in upps01? 1=yes, 0=no","Any data in vision_tests01? 1=yes, 0=no","Any data in vitals01? 1=yes, 0=no","Any data in wais_iv_part101? 1=yes, 0=no","Any data in wisc_v01? 1=yes, 0=no","Any data in wppsiiv01? 1=yes, 0=no","Any data in ysr01? 1=yes, 0=no"
1,HCD0001305,NDAR_INVEE728YNM,10/1/17,143,M,V1,TRUE,1,1,1,...,1,1,1,1,1,1,0,1,0,1
2,HCD0008117,NDAR_INVKH253JC7,4/1/19,192,F,V1,TRUE,1,1,1,...,0,0,0,1,1,1,0,1,0,1
3,HCD0021614,NDAR_INVCY281KJ2,7/1/18,110,F,V1,TRUE,1,1,1,...,1,1,1,1,1,1,0,1,0,0
4,HCD0022919,NDAR_INVXF194BDK,7/1/18,117,F,V1,TRUE,1,1,1,...,1,1,1,1,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648,HCD2982579,NDAR_INVNC447ATF,1/1/19,225,F,V1,TRUE,1,1,1,...,0,1,1,1,1,1,1,0,0,0
649,HCD2987185,NDAR_INVAX000CBL,4/1/18,175,M,V1,TRUE,1,1,1,...,0,0,0,1,1,1,0,1,0,1
650,HCD2990073,NDAR_INVDR796KTJ,7/1/18,163,F,V1,TRUE,1,1,1,...,0,1,1,1,1,1,0,1,0,1
651,HCD2993079,NDAR_INVGG441MM6,10/1/18,194,F,V1,FALSE,1,1,1,...,0,0,0,1,1,1,0,1,0,1


In [67]:
def count_hcp_sex(id_list, metadata_df, prefix="HCD"):
    """Count metadata rows per sex for the IDs in ``id_list`` that start with ``prefix``.

    Parameters
    ----------
    id_list : iterable
        Subject identifiers. Each entry is converted with ``str()`` before the
        prefix check, so non-string entries (e.g. ``None`` or numbers) are
        skipped instead of raising.
    metadata_df : pandas.DataFrame
        Table with a "src_subject_id" column (compared as strings) and a
        "sex" column.
    prefix : str, default "HCD"
        Only identifiers beginning with this prefix are looked up.

    Returns
    -------
    pandas.Series
        ``value_counts()`` of the "sex" column over the matching metadata rows.
        Rows are counted, so a subject appearing on several rows of
        ``metadata_df`` contributes once per row.
    """
    # Keep only the identifiers carrying the requested prefix (default: "HCD").
    wanted_ids = {str(sid) for sid in id_list if str(sid).startswith(prefix)}
    # Select the metadata rows whose subject ID is one of those identifiers.
    matching_rows = metadata_df[metadata_df["src_subject_id"].astype(str).isin(wanted_ids)]
    # Tally the rows by sex.
    return matching_rows["sex"].value_counts()

# Per-split sex counts of the HCD subjects (train, val, test — in that order).
train_sex_counts, val_sex_counts, test_sex_counts = (
    count_hcp_sex(split_ids, hcd_metadata)
    for split_ids in (identifiers_abcd_train, identifiers_abcd_val, identifiers_abcd_test)
)

# Line the three count Series up as columns; a sex value absent from a split
# becomes 0 and all counts are cast to int.
summary_df = (
    pd.concat(
        [train_sex_counts, val_sex_counts, test_sex_counts],
        axis=1,
        keys=["TRAIN", "VAL", "TEST"],
    )
    .fillna(0)
    .astype(int)
)

print(summary_df)

     TRAIN  VAL  TEST
sex                  
F      178   15    28
M      163   22    12
