In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload
from neurocombat_sklearn import CombatModel
from sklearn.impute import SimpleImputer
import afqinsight
from tqdm import tqdm
import afqinsight
import sys
from scipy import stats as spstats
from sklearn.preprocessing import scale


sys.path.insert(0,'../utils')

from read_math_utils_hbn import *
from read_math_utils_abcd import *
from read_math_utils_ping import *


## Load, Harmonize, Combine, and Save HBN data

In [None]:

# Load the HBN tract profiles with AFQ-Insight, using 'wiat_reading_comp' as the target column.
# Loader reference: https://richiehalford.org/AFQ-Insight/api.html#dataset-loader
hbn_afq_data = afqinsight.load_afq_data(
    '.',
    fn_nodes='../data/hbn/nodes_hcp.csv',
    fn_subjects='../data/hbn/subjects_hcp.csv',
    target_cols='wiat_reading_comp',
)

# Unpack the seven returned items into the "_r"-suffixed names used by the later cells.
(X_r, y_r, groups_r, feature_names_r,
 group_names_r, subjects_r, classes_r) = hbn_afq_data



In [None]:
## Get sites for Neuro_ComBat

# The scan site is taken from the sessionID column by stripping the "HBNsite" prefix.
df_sites = pd.read_csv('../data/hbn/combined_tract_profiles.csv',
                       na_values={"NaN","NA","n/a","","\t"})
df_sites = df_sites[["subjectID", "sessionID"]].copy()

# Use the vectorized .str accessor so that a missing sessionID stays NaN instead of
# raising AttributeError (a float NaN has no .replace method).
df_sites["site_id"] = (
    df_sites["sessionID"].str.replace("HBNsite", "", regex=False).astype("category")
)
df_sites = df_sites.drop(columns=["sessionID"]).set_index("subjectID")

# Keep a single row per subjectID (the first one encountered).
df_sites = df_sites[~df_sites.index.duplicated(keep="first")]

# Left-join onto subjects_r so the rows follow the subject order of the loaded AFQ data;
# subjects without a site entry get NaN.
df_sites = pd.DataFrame(index=subjects_r).merge(
    df_sites, how="left", left_index=True, right_index=True
)

# Integer site code; any site not listed here (e.g. SI) maps to NaN.
df_sites["site_idx"] = df_sites["site_id"].map({"RU": 0, "CBIC": 1, "CUNY": 2})

# True for subjects with a mapped site; used by the next cell to get rid of SI participants.
nan_mask = np.logical_not(df_sites['site_idx'].isna())
print(df_sites.shape)
df_sites.head()

In [None]:
# Filter out participants without a mapped scan site (e.g. SI) from df_sites, X_r, y_r and subjects_r.

# Recompute the mask from the current df_sites instead of reusing one built in an
# earlier cell: once this cell has run, the arrays are shorter than that older mask,
# so re-running the cell with a stale mask fails (or silently misaligns subjects_r).
nan_mask = df_sites["site_idx"].notna().to_numpy()

# All four containers must describe the same subjects in the same order.
if not (len(nan_mask) == X_r.shape[0] == len(y_r) == len(subjects_r)):
    raise ValueError(
        "df_sites, X_r, y_r and subjects_r have different lengths; "
        "re-run the loading cells before filtering."
    )

df_sites = df_sites.loc[nan_mask, :]
X_r = X_r[nan_mask, :]
y_r = y_r[nan_mask]
subjects_r = [subject for filt, subject in zip(nan_mask, subjects_r) if filt]


In [None]:
## Impute missing values, then harmonize the tract profiles across scan sites

# Median imputation of missing feature values.
imputer = SimpleImputer(strategy="median")
X_r_imputed = imputer.fit_transform(X_r)

# ComBat is fit with the integer site code as the site variable. The two trailing
# positional arguments are None (presumably the discrete and continuous covariates;
# confirm against the neurocombat_sklearn signature).
site_labels = df_sites[["site_idx"]]
combat_model = CombatModel()
X_r_site_harmonized = combat_model.fit_transform(X_r_imputed, site_labels, None, None)


In [None]:
# Phenotype table; this file was generated using coinsparse: https://github.com/richford/coinsparse
pheno_df = pd.read_csv('../data/hbn/pheno_df.csv')

# Each composite is the row-wise mean of two WIAT standard-score columns.
composite_sources = {
    "wiat_math_comp": ['WIAT_Num_Stnd', 'WIAT_MP_Stnd'],
    "wiat_reading_comp": ['WIAT_Word_Stnd', 'WIAT_Pseudo_Stnd'],
}
for composite_name, subtest_columns in composite_sources.items():
    pheno_df[composite_name] = pheno_df[subtest_columns].mean(axis=1)

# Drop anyone who is missing either composite score (original note: nobody is).
pheno_df = pheno_df.dropna(subset=list(composite_sources))


In [None]:
# build df with harmonized tract profiles and pheno data

# Phenotype columns carried over from pheno_df, which is keyed by 'EID'.
pheno_columns = ['wiat_reading_comp','wiat_math_comp','TOWRE_Total_Scaled','WIAT_Word_Stnd',
                 'WIAT_Num_Stnd','Age','scan_site_id','FSQ_04']
combo_df = combine_target_feature_df(X_r_site_harmonized, subjects_r, pheno_df, 'EID',
                                     pheno_columns, feature_names_r)

# dummy code parental income (FSQ_04) and scan site
ses_dummies = pd.get_dummies(combo_df['FSQ_04']).add_prefix('ses_')
site_dummies = pd.get_dummies(combo_df['scan_site_id']).add_prefix('site_')
combo_df = pd.concat([combo_df, ses_dummies, site_dummies], axis=1)

combo_df['Age'] = pd.to_numeric(combo_df['Age'])
# FSQ_04 is replaced by its ses_* dummies; scan_site_id is kept alongside site_*.
combo_df = combo_df.drop(columns=['FSQ_04'])

combo_df.to_csv('../data/hbn/combo_df.csv', index=False)

# Preview goes last: a notebook cell only renders its final expression, so a
# head() call placed before to_csv() was never displayed.
print(combo_df.shape)
combo_df.head()

## Load, Combine, and Save ABCD data

In [None]:
## Load ABCD diffusion data
# for overview of columns: https://nda.nih.gov/data_structure.html?short_name=abcd_dmdtifp101

# Tab-separated file; skiprows=[1] drops the line directly below the header row.
tract_profiles_1 = pd.read_csv('../data/abcd/abcd_dmdtifp101.txt', sep="\t", skiprows=[1])

# get just fa and md: keep only the first 93 columns of the table
n_leading_columns = 93
tract_profiles_fa_md = tract_profiles_1.iloc[:, :n_leading_columns]

# interesting bundles:
    # FA - arc_l: dmdtifp1_23 arc_r: dmdtifp1_22 MD - arc_l: dmdtifp1_64 arc_r: dmdtifp1_65
    # FA - slf_l: dmdtifp1_21 slf_r: dmdtifp1_20 MD - slf_l: dmdtifp1_63 slf_r: dmdtifp1_62
    # FA - ilf_l: dmdtifp1_14 ilf_r: dmdtifp1_13 MD - ilf_l: dmdtifp1_56 ilf_r: dmdtifp1_55
    # FA - cc: dmdtifp1_19 MD - cc: dmdtifp1_61
# NOTE(review): every other bundle above has MD id = FA id + 42, which would give
# MD arc_l: dmdtifp1_65 and arc_r: dmdtifp1_64 -- confirm against the data dictionary.



In [None]:
## Load ABCD reading data

# Read the tab-separated file (dropping the line below the header) and keep only
# the identifier, visit, reading-score and age columns.
abcd_reading_columns = ['src_subject_id','eventname','nihtbx_reading_agecorrected',
                        'nihtbx_reading_uncorrected','interview_age']
reading = pd.read_csv('../data/abcd/abcd_tbss01.txt', sep="\t", skiprows=[1])[abcd_reading_columns]

# 456 subjects don't have age corrected reading scores; drop those rows
has_corrected_score = reading['nihtbx_reading_agecorrected'].notna()
reading = reading[has_corrected_score]


In [None]:
# Attach the reading scores to the diffusion table, matching on subject and visit.
# Left join: diffusion rows without a reading score are kept with NaN scores.
reading_scores = reading[['src_subject_id','eventname','nihtbx_reading_agecorrected','nihtbx_reading_uncorrected']]
final_df = tract_profiles_fa_md.merge(reading_scores, how='left', on=['src_subject_id','eventname'])

# Label low readers: age-corrected score <= 90. A NaN score compares False, so
# rows without a score stay "other".
final_df['low_r'] = "other"
final_df.loc[final_df['nihtbx_reading_agecorrected']<=90,'low_r'] = "low_r"

# 0/1 indicator built by direct comparison. pd.get_dummies(...)['low_r'] raises
# KeyError when no row is labelled "low_r" and returns booleans on pandas >= 2.0.
final_df['low_r_dummy'] = (final_df['low_r'] == "low_r").astype('uint8')

# Restrict to the baseline visit.
final_df_t1 = final_df[final_df['eventname']=='baseline_year_1_arm_1']

# Give the bundle columns and reading scores readable names.
final_df_t1 = final_df_t1.rename(columns={"dmdtifp1_22":"ARC_R", "dmdtifp1_23": "ARC_L","dmdtifp1_20":'SLF_R',
                            "dmdtifp1_21":'SLF_L',"dmdtifp1_14":'ILF_L',"dmdtifp1_13":'ILF_R','dmdtifp1_19':'CC',
                               'nihtbx_reading_agecorrected':'NIH_TBX_READ_CORR','nihtbx_reading_uncorrected':'NIH_TBX_READ_UNCORR'})


In [None]:
# Columns that are z-scored for outlier removal, in the order they appear in the output.
bundle_columns = ['ARC_L','ARC_R','ILF_L','ILF_R','SLF_L','SLF_R','CC']
score_columns = ['NIH_TBX_READ_UNCORR','NIH_TBX_READ_CORR']
outlier_columns = bundle_columns + score_columns
output_columns = (['src_subject_id','low_r','eventname','interview_age']
                  + bundle_columns + score_columns + ['low_r_dummy'])

# Drop rows missing any z-scored column. NIH_TBX_READ_CORR is included because
# scipy's zscore propagates NaN by default: one missing value would turn the whole
# column's z-scores into NaN and the filter below would then discard every row.
final_df_t1_filt = final_df_t1[output_columns].dropna(subset=outlier_columns, axis=0)

# Keep rows whose value is within 3 standard deviations of the mean on every column.
abs_z = np.abs(spstats.zscore(final_df_t1_filt[outlier_columns]))
final_df_t1_filt = final_df_t1_filt[(abs_z < 3).all(axis=1)]

final_df_t1_filt.to_csv('../data/abcd/abcd_plot_data.csv')

## Load, Combine, and Save PING data

In [None]:
## Load PING diffusion data

# Tab-separated file; skiprows=[1] drops the line directly below the header row.
tract_profiles = pd.read_csv('../data/ping/pdti01.txt', sep="\t", skiprows=[1])

# select age, id, sex, site, and fa/md columns (every column whose name matches the pattern)
ping_column_pattern = r'(dti_fiber_fa|dti_fiber_md|_id|interview_age|sex|site|study)'
tract_profiles = tract_profiles.filter(regex=ping_column_pattern)

# Keep only rows that have a value in dti_fiber_fa_r_fx.
has_fa_value = tract_profiles['dti_fiber_fa_r_fx'].notna()
tract_profiles = tract_profiles[has_fa_value]


In [None]:
## Load PING reading data

reading = pd.read_csv('../data/ping/orrt01.txt', sep="\t", skiprows=[1])

# select the id, reading and interview_age columns
reading = reading.filter(regex=r'(_id|read|interview_age)')

# interview_age / 12, rounded (presumably months converted to whole years; confirm units)
reading['age'] = reading['interview_age'].div(12).round()

# Drop rows whose rounded age is 0 and rows without a reading score.
keep_rows = (reading['age'] != 0.0) & reading['tbx_reading_score'].notna()
reading = reading[keep_rows]



In [None]:
# Within each age, z-score the reading score and flag subjects whose z-score is
# below -1 as low readers.

# Not used below; kept in case later cells reference them.
age_range = reading['age'].unique()
reading3 = reading[reading['age']==3]

low_r_columns = ['src_subject_id','low_r','zscore','tbx_reading_score']

# Collect one frame per age and concatenate once at the end: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop copied the growing frame on
# every iteration. groupby also skips rows whose age is NaN; looping over unique()
# would select zero rows for a NaN age and hand scale() an empty array.
age_frames = []
for _, age_rows in reading.groupby('age', sort=False):
    temp_df = age_rows.reset_index()
    temp_df['zscore'] = scale(temp_df['tbx_reading_score'])
    temp_df['low_r'] = temp_df['zscore'] < -1
    age_frames.append(temp_df[low_r_columns])

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
low_r_df = pd.concat(age_frames) if age_frames else pd.DataFrame(columns=low_r_columns)

In [None]:
# Index the three PING tables by subject id so the next cell can join them on the index.
reading = reading.set_index('src_subject_id')
low_r_df = low_r_df.set_index('src_subject_id')
tract_profiles = tract_profiles.set_index('src_subject_id')


In [None]:
# Join the raw reading score and the within-age z-score onto the diffusion table
# by subject index; subjects without a score keep NaN.
final_df = tract_profiles.merge(reading['tbx_reading_score'],how='left',left_index=True,right_index=True)
final_df = final_df.merge(low_r_df['zscore'],how='left',left_index=True,right_index=True)

# Median split on the raw reading score: the lower half is "low_r", everything
# else (including rows without a score) is "other".
final_df['reading_score_bin'],qbins = pd.qcut(final_df['tbx_reading_score'],q=2,retbins=True,labels=False)
final_df['reading_group'] = 'other'
final_df.loc[final_df.reading_score_bin == 0, 'reading_group'] = "low_r"

# Age-normed label: z-score below -1 within the subject's age group.
final_df['low_r'] = "other"
final_df.loc[final_df['zscore']<-1,'low_r'] = "low_r"

# Age tertiles. These are binned on interview_age; the column was previously
# computed from tbx_reading_score, which made "age_bin" a reading-score tertile.
final_df['age_bin'],qbins = pd.qcut(final_df['interview_age'],q=3,retbins=True,labels=False)

final_df.to_csv('../data/ping/final_df_ping.csv')