# Imports

In [240]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy
import scipy.io

In [241]:
# Shell escape: print the directory the notebook kernel is running in.
!pwd

/Users/hyroh/Desktop/FTD_Research/mBIN/FTD_JupyterNotebook/Load_Dataset


### Helper Functions

In [242]:
# Restore `loadData_hf` from IPython's %store (presumably saved by another notebook --
# TODO confirm) and put it first on the module search path, so that the helper module
# imported below is resolved from that directory.
%store -r loadData_hf
sys.path.insert(0, loadData_hf)
import findPathCoM

# Set Directory Paths

In [243]:
# All three paths are restored with `%store -r`; they are not defined in this notebook.
# NOTE(review): presumably they must be stored from another notebook first -- confirm.

# Location of the data folder (input files are read from here)
%store -r dataDir

# Directory path where Data will be saved to (CSV / pickle outputs of this notebook)
%store -r path_dataDir

# Only used to load the FTDGeneralData_20221114.mat file --> Saved as NetworkDataGeneral
%store -r baseDir

# Loading the preconstructed atlas data

In [244]:
# Load the preconstructed atlas data from the .mat file.
# `scipy.io` must be imported explicitly (see the import cell): a bare `import scipy`
# does not expose the `io` submodule on older SciPy releases.
atlas_mat_path = os.path.join(baseDir, 'NetworkAnalysisGeneral', 'FTDGeneralData_20221114.mat')
NetworkDataGeneral = scipy.io.loadmat(atlas_mat_path)

# [1] Loading Pathology Dataset - %AO

In [245]:
# new_pathT: ex-vivo histopathology quantification (%AO per pathology region)
pathology_xlsx = os.path.join(dataDir, 'NewFTDData', 'FTLD Library 4-25-23 update.xlsx')
new_pathT = pd.read_excel(pathology_xlsx)

### Format the Pathology Data - %AO to desired format

#### Divide each INDDID into {GM, WM} and {L, R} - 22 Regions (They are alphabetically Ordered)

In [246]:
# One row per (INDDID, hemisphere {L, R}, tissue {GM, WM}) -- at most 4 rows per INDDID --
# and one column per Region holding the summed AvgPercentAO.
# The aggregation is named by the string 'sum': passing np.sum is deprecated in recent
# pandas, which recommends the string alias to keep the same behaviour.
pathT_WMGM = pd.pivot_table(
    new_pathT,
    values='AvgPercentAO',
    index=['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly',
           'Tau1_TDP2', 'Hemisphere_by_slide', 'AnalysisRegion'],
    columns=['Region'],
    aggfunc='sum',
)

In [247]:
# Turn the pivot's MultiIndex levels into ordinary columns.
# reset_index() does this in memory, so values are no longer re-parsed from CSV text.
# The isinstance guard keeps the cell safe to re-run once the index is already flat.
if isinstance(pathT_WMGM.index, pd.MultiIndex):
    # The CSV is still exported for anything that reads this file from disk.
    pathT_WMGM.to_csv(os.path.join(dataDir, 'NewFTDData', 'new_pathT(GMWM).csv'))
    # rename_axis drops the leftover 'Region' label on the column axis so the frame
    # looks like the one obtained from reading the CSV back.
    pathT_WMGM = pathT_WMGM.reset_index().rename_axis(columns=None)

#### Divide the pathT into GM and WM 

In [248]:
# Columns shared by the L and R rows of one case within a tissue type; the two
# hemisphere rows are joined on these.
case_keys = ['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly', 'Tau1_TDP2', 'AnalysisRegion']
# Per-hemisphere marker columns produced by the merge suffixes; removed after merging.
hemisphere_cols = ['Hemisphere_by_slide_L', 'Hemisphere_by_slide_R']

# Split by tissue type. Each subset still has separate rows for {L, R}.
pathT_WMGM_type = pathT_WMGM.groupby('AnalysisRegion')
pathT_GM_LR = pathT_WMGM_type.get_group('GM')
pathT_WM_LR = pathT_WMGM_type.get_group('WM')

# GM: put the L and R rows of each case side by side (region columns get _L / _R).
pathT_GM_LR_type = pathT_GM_LR.groupby('Hemisphere_by_slide')
pathT_GM_L, pathT_GM_R = (pathT_GM_LR_type.get_group(side) for side in ('L', 'R'))
pathT_GM = pathT_GM_L.merge(pathT_GM_R, how='outer', on=case_keys, suffixes=('_L', '_R'))
pathT_GM = pathT_GM.drop(columns=hemisphere_cols)

# WM: same treatment.
pathT_WM_LR_type = pathT_WM_LR.groupby('Hemisphere_by_slide')
pathT_WM_L, pathT_WM_R = (pathT_WM_LR_type.get_group(side) for side in ('L', 'R'))
pathT_WM = pathT_WM_L.merge(pathT_WM_R, how='outer', on=case_keys, suffixes=('_L', '_R'))
pathT_WM = pathT_WM.drop(columns=hemisphere_cols)

# [2] Mapping Pathology Regions to Atlas regions

In [249]:
# Lookup table: atlas region names -> atlas labels (index)
pathLUT = pd.read_csv(os.path.join(dataDir, 'schaefer_path_20210719_20220328.csv'))

# Lookup table: pathology region names -> atlas region names
AtlasToPathLUT = pd.read_excel(
    os.path.join(dataDir, 'NewFTDData', 'PathToAtlasLUT_5_10_2023(mePFC_PFC_Ignored).xlsx')
)

# 'CoM' entry of the Schaefer400x7 atlas inside the loaded .mat structure
schaefer_CoM = NetworkDataGeneral['NetworkDataGeneral'][0, 0]['Schaefer400x7']['CoM'][0, 0]

# Map each pathology region to its atlas regions (index 1~400) via AtlasToPathLUT.
# Returns, both in the (unordered) row order of AtlasToPathLUT:
#   - the CoM of each pathology region (a single pathology region matches several
#     atlas regions, so their mean is taken)
#   - the list of atlas region indices corresponding to each pathology region
pathCoMunordered, pathToAtlasIndexunordered = findPathCoM.findPathCoM(
    pathLUT, AtlasToPathLUT, schaefer_CoM
)

In [250]:
# Pathology regions that can be mapped to the 3D atlas (a subset of the 22), sorted:
# ['ANG', 'ATC', 'HIP', 'IFC', 'M1', 'MFC', 'OFC', 'PC', 'S1', 'SMTC', 'SPC', 'V1', 'aCING', 'aINS', 'aITC', 'dlPFC', 'iPFC', 'mPFC', 'pCING', 'pSTC']
mappable_region_names = AtlasToPathLUT["PathSpreadSheetNames"].values
pathNames_3D_Map = np.sort(mappable_region_names)

# sn - number of areas we are able to map to the 3D atlas
sn = pathNames_3D_Map.shape[0]

In [251]:
# Re-order the CoM and atlas-index outputs to follow pathNames_3D_Map, i.e. the same
# region order as the %AO columns of the pathology tables (left to right).
pathCoM = np.empty((sn, 3, 2))  # one pathology region corresponds to multiple atlas regions
pathToAtlasIndex = [[None, None] for _ in range(sn)]

for s, region_name in enumerate(pathNames_3D_Map):
    # Row label of this region in AtlasToPathLUT; used to index the unordered outputs
    lut_rows = AtlasToPathLUT[AtlasToPathLUT.PathSpreadSheetNames == region_name]
    idx = lut_rows.index[0]
    pathCoM[s, :, :] = pathCoMunordered[idx, :, :]
    pathToAtlasIndex[s] = pathToAtlasIndexunordered[idx]

In [252]:
# Drop the region columns of pathT_GM / pathT_WM that cannot be mapped to the 3D atlas.
# The first 5 columns are case metadata; a region column is named '<Region>_<L|R>',
# so the part before the first '_' is checked against pathNames_3D_Map.
n_meta_cols = 5

unmapped_GM_cols = [col for col in pathT_GM.columns[n_meta_cols:]
                    if col.split("_")[0] not in pathNames_3D_Map]
pathT_GM = pathT_GM.drop(columns=unmapped_GM_cols)

unmapped_WM_cols = [col for col in pathT_WM.columns[n_meta_cols:]
                    if col.split("_")[0] not in pathNames_3D_Map]
pathT_WM = pathT_WM.drop(columns=unmapped_WM_cols)

# [3] TAU and TDP Divide (GM) + Log %AO

### Get index of rows that are TAU and TDP

In [258]:
# Boolean row masks over pathT_GM: Tau1_TDP2 == 1 marks tau cases, == 2 marks TDP cases
tau_tdp_code = pathT_GM['Tau1_TDP2']
FTD_TAUIndx = tau_tdp_code.eq(1)
FTD_TDPIndx = tau_tdp_code.eq(2)

### Compute Log %AO of Pathology Values

In [259]:
# Log %AO of the GM region columns (everything after the 5 metadata columns).
# A small offset (0.00015) is added before the log; the masked log followed by
# .filled(np.nan) leaves NaN wherever the value is missing.
# (An earlier variant scaled the values by 0.01 before adding the offset.)
gm_percentAO = pathT_GM.iloc[:, 5:].values
pathData = np.ma.log(gm_percentAO + 0.00015).filled(np.nan)

In [260]:
# # NO LOG!!
# pathData = pathT_GM.iloc[:, 5:].values

### Divide Pathology Data into TAU and TDP

In [261]:
# Rows of the Log %AO matrix belonging to tau and to TDP cases --> Type: ndarray
path_TAU, path_TDP = (pathData[case_mask, :] for case_mask in (FTD_TAUIndx, FTD_TDPIndx))

# Get only bvFTD

In [262]:
# INDDIDs of the tau rows, in the same row order as path_TAU
path_id_TAU = pathT_GM.loc[FTD_TAUIndx, 'INDDID'].values

In [263]:
# INDDIDs of the TDP rows, in the same row order as path_TDP
path_id_TDP = pathT_GM.loc[FTD_TDPIndx, 'INDDID'].values

In [264]:
# Persist the INDDID arrays of the tau and TDP cases.
# The `with` statement closes each file when its block exits, so no explicit
# f.close() is needed.

# path_id_TAU
with open(os.path.join(path_dataDir, 'path_id_TAU.pkl'), 'wb') as f:
    pickle.dump(path_id_TAU, f)

# path_id_TDP
with open(os.path.join(path_dataDir, 'path_id_TDP.pkl'), 'wb') as f:
    pickle.dump(path_id_TDP, f)

## FTLD Autopsy MRI

In [201]:
# FTLD autopsy / MRI spreadsheet (provides INDDID, BVFTD and ClinicalDx1 used below)
autopsy_mri_xlsx = os.path.join(dataDir, 'NewFTDData', 'FTLD Autopsy MRI (2023.02.10 14.31).xlsx')
ftd_autopsyT = pd.read_excel(autopsy_mri_xlsx)

In [202]:
# Distinct INDDIDs present in the autopsy/MRI table
autopsy_id = np.unique(ftd_autopsyT['INDDID'])

In [203]:
# Mask over path_id_TAU: True where the tau case also appears in the autopsy/MRI table
tau_aut_over = np.isin(path_id_TAU, autopsy_id)
n_tau_overlap, n_tau_cases = tau_aut_over.sum(), len(path_id_TAU)
print(f"Number of overlap (Path-Tau): {n_tau_overlap}/{n_tau_cases}")

Number of overlap (Path-Tau): 33/76


In [204]:
# Mask over path_id_TDP: True where the TDP case also appears in the autopsy/MRI table
tdp_aut_over = np.isin(path_id_TDP, autopsy_id)
n_tdp_overlap, n_tdp_cases = tdp_aut_over.sum(), len(path_id_TDP)
print(f"Number of overlap (Path-Tdp): {n_tdp_overlap}/{n_tdp_cases}")

Number of overlap (Path-Tdp): 45/103


## Tau

In [232]:
# Autopsy/MRI rows of the tau cases found in both tables, reduced to the columns of
# interest, de-duplicated and sorted by INDDID.
bvftd_tau_cols = ['INDDID', 'BVFTD', 'ClinicalDx1']
tau_rows_in_autopsy = ftd_autopsyT['INDDID'].isin(path_id_TAU[tau_aut_over])
bvftd_tau_df = (
    ftd_autopsyT.loc[tau_rows_in_autopsy, bvftd_tau_cols]
    .drop_duplicates(subset=bvftd_tau_cols)
    .sort_values(by='INDDID')
)

In [206]:
# INDDIDs of the tau cases flagged as bvFTD (BVFTD == 1.0)
bvftd_tau_ids = bvftd_tau_df.loc[bvftd_tau_df['BVFTD'] == 1.0, 'INDDID'].values

In [207]:
# Display the INDDIDs of the bvFTD tau cases.
bvftd_tau_ids

array([101483., 105223., 105564., 106309., 106814., 107516., 107677.,
       108026., 113113., 115001., 116591., 116607., 118410., 118780.])

## TDP

In [208]:
# Autopsy/MRI rows of the TDP cases found in both tables, reduced to the columns of
# interest, de-duplicated and sorted by INDDID.
bvftd_tdp_cols = ['INDDID', 'BVFTD', 'ClinicalDx1']
tdp_rows_in_autopsy = ftd_autopsyT['INDDID'].isin(path_id_TDP[tdp_aut_over])
bvftd_tdp_df = (
    ftd_autopsyT.loc[tdp_rows_in_autopsy, bvftd_tdp_cols]
    .drop_duplicates(subset=bvftd_tdp_cols)
    .sort_values(by='INDDID')
)


In [209]:
# INDDIDs of the TDP cases flagged as bvFTD (BVFTD == 1.0)
bvftd_tdp_ids = bvftd_tdp_df.loc[bvftd_tdp_df['BVFTD'] == 1.0, 'INDDID'].values

## Extract from path_TAU & path_TDP

In [211]:
# Log %AO rows of the tau cases that are bvFTD
is_bvftd_tau = np.isin(path_id_TAU, bvftd_tau_ids)
path_TAU_bvftd = path_TAU[is_bvftd_tau, :]

In [212]:
# Log %AO rows of the TDP cases that are bvFTD
is_bvftd_tdp = np.isin(path_id_TDP, bvftd_tdp_ids)
path_TDP_bvftd = path_TDP[is_bvftd_tdp, :]

In [239]:
# Display the bvFTD tau subset.
# NOTE(review): this echoes the whole array; `path_TAU_bvftd.shape` would be a lighter check.
path_TAU_bvftd

array([[ 1.47547609,         nan,         nan,         nan,  1.7547269 ,
         1.45372713,  1.62945517,         nan,         nan,  1.37028582,
                nan, -2.49047379,  1.60606623,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan],
       [ 0.04453809,         nan,         nan,  1.26940723,  0.9903587 ,
                nan,         nan,         nan,         nan,         nan,
        -2.39470038, -5.98151422,  2.01072542,  1.09941133,  2.94181185,
         1.92304842,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,  2.2140527 ,         nan,
                nan,  3.1835878 ,         nan,    

In [279]:
# Scratch check: GM rows (raw %AO, before the log) whose INDDID is among the bvFTD tau cases.
temp = pathT_GM[pathT_GM['INDDID'].isin(bvftd_tau_ids)]
temp

Unnamed: 0,INDDID,FullAutopsyID,AutopsyIDNumOnly,Tau1_TDP2,AnalysisRegion,ANG_L,ATC_L,HIP_L,IFC_L,M1_L,...,SPC_R,V1_R,aCING_R,aINS_R,aITC_R,dlPFC_R,iPFC_R,mPFC_R,pCING_R,pSTC_R
18,105223.0,2015-009,2015-009,1.0,GM,4.372967,,,,5.781718,...,,,,,,,,,,
26,106309.0,2013-018,2013-018,1.0,GM,1.045395,,,3.558592,2.69205,...,0.65196,,10.205983,0.784469,,13.8204,,,,
30,106814.0,2014-145,2014-145,1.0,GM,0.193507,17.0527,,,7.543718,...,,,,,,,4.430519,,,
33,107516.0,2008-251,2008-251,1.0,GM,31.035758,,,,,...,6.004969,,,7.602073,17.79451,,,,,
36,108026.0,2013-223,2013-223,1.0,GM,4.646629,,,,0.049291,...,,,,,,,,,,
73,113113.0,2006-027,2006-027,1.0,GM,0.620913,,,,,...,,,,,,,,,,
80,115001.0,2013-071,2013-071,1.0,GM,0.07807,,14.068073,,,...,,0.029527,15.876919,,,,,,1.338171,
85,116591.0,2014-212,2014-212,1.0,GM,0.733029,,,2.12378,1.301797,...,1.738992,,6.62736,1.059671,,1.63756,1.343453,,,
87,116607.0,2014-052,2014-052,1.0,GM,,,,0.955997,,...,0.483558,0.004063,0.259858,1.216361,,1.615359,0.736226,,,
97,118410.0,2017-109,2017-109,1.0,GM,3.07746,,,,0.131322,...,,,,,,,,,,


In [272]:
# Scratch check: same masked log transform as used for pathData, applied to `temp`.
temp1 = np.ma.log(temp.iloc[:, 5:].values + 0.00015).filled(np.nan)

# Get bvFTD + Matching MR

In [215]:
# MRI Thickness value for All Subjects - schaefer400x7
thicknessAllraw = pd.read_csv(os.path.join(dataDir, 'NewFTDData', 
                                           'invivoPathCohort_quantsSubSesSchaefer400_tian12.csv'), dtype={'id': str})

In [216]:
# Cohort table merged onto the thickness data below ('INDDID' is kept as a string)
cohort_xls = os.path.join(dataDir, 'NewFTDData', 'InvivoPathCohort_03172023.xls')
thicknessPathLUT = pd.read_excel(cohort_xls, dtype={'INDDID': str})

In [217]:
# Attach the cohort information to every thickness row (string id <-> string INDDID).
# The inner join only loses INDDID 108783x09 from thicknessAllraw (849 rows lost).
thicknessAll = thicknessAllraw.merge(thicknessPathLUT, how='inner',
                                     left_on='id', right_on='INDDID')

In [218]:
# Split the merged thickness table by the 'Group' column
thickness_path_type = thicknessAll.groupby('Group')

thicknessHC = thickness_path_type.get_group('HC')           # healthy controls
thicknessPatientTAU = thickness_path_type.get_group('tau')  # patients (TAU)
thicknessPatientTDP = thickness_path_type.get_group('tdp')  # patients (TDP)

# Distinct subject IDs per group, cast from string to float64 so they can be
# compared with the numeric INDDIDs of the pathology tables
thick_id_HC = np.unique(thicknessHC.INDDID).astype('float64')
thick_id_TAU = np.unique(thicknessPatientTAU.INDDID).astype('float64')
thick_id_TDP = np.unique(thicknessPatientTDP.INDDID).astype('float64')

In [219]:
# Number of distinct subjects in each MRI group
n_HC, n_TAU, n_TDP = len(thick_id_HC), len(thick_id_TAU), len(thick_id_TDP)
print(f"# of subjects in HC: {n_HC}")
print(f"# of subjects in Tau: {n_TAU}")
print(f"# of subjects in TDP: {n_TDP}")

# of subjects in HC: 54
# of subjects in Tau: 26
# of subjects in TDP: 30


In [220]:
# How many MRI tau subjects also have pathology data
n_tau_mr_in_path = np.isin(thick_id_TAU, path_id_TAU).sum()
print(f"Number of overlap (Tau - Path vs MR): {n_tau_mr_in_path}/{len(thick_id_TAU)}")

Number of overlap (Tau - Path vs MR): 13/26


In [221]:
# How many MRI TDP subjects also have pathology data
n_tdp_mr_in_path = np.isin(thick_id_TDP, path_id_TDP).sum()
print(f"Number of overlap (TDP - Path vs MR): {n_tdp_mr_in_path}/{len(thick_id_TDP)}")

Number of overlap (TDP - Path vs MR): 21/30


In [222]:
# Log %AO rows of the tau cases that also have MRI thickness data
tau_has_mr = np.isin(path_id_TAU, thick_id_TAU)
path_TAU_MR = path_TAU[tau_has_mr, :]

In [223]:
# Log %AO rows of the TDP cases that also have MRI thickness data
tdp_has_mr = np.isin(path_id_TDP, thick_id_TDP)
path_TDP_MR = path_TDP[tdp_has_mr, :]

In [224]:
# (cases, region columns) of the tau subset with MRI
path_TAU_MR.shape

(13, 40)

In [225]:
# (cases, region columns) of the TDP subset with MRI
path_TDP_MR.shape

(21, 40)

# Data Summary

In [226]:
# Count distinct INDDIDs in the combined table and in the GM-only / WM-only subsets
n_ids_all = len(pd.unique(pathT_WMGM['INDDID']))
n_ids_GM = len(pd.unique(pathT_GM_LR['INDDID']))
n_ids_WM = len(pd.unique(pathT_WM_LR['INDDID']))

print("Total Unique INNDID in whole dataset")
print(n_ids_all)
print("Unique INDDID in GM")
print(n_ids_GM)
print("Unique INDDID in WM")
print(n_ids_WM)

Total Unique INNDID in whole dataset
179
Unique INDDID in GM
179
Unique INDDID in WM
179


# Save the Dataset and Variables

#### Save pathT GM/WM to csv

In [227]:
# Write the GM and WM pathology tables to CSV (row index not included)
for tissue_table, csv_name in ((pathT_GM, 'new_pathT(GM).csv'),
                               (pathT_WM, 'new_pathT(WM).csv')):
    tissue_table.to_csv(os.path.join(path_dataDir, csv_name), index=False)

#### Save sn

In [228]:
# Persist `sn` (number of mappable pathology regions).
# The `with` statement closes the file on exit; no explicit f.close() is needed.
with open(os.path.join(path_dataDir, 'sn.pkl'), 'wb') as f:
    pickle.dump(sn, f)

#### Save pathCoM, pathToAtlasIndex

In [229]:
# Persist the ordered CoM array and the ordered atlas-index list.
# Each `with` statement closes its file on exit; no explicit f.close() is needed.
with open(os.path.join(path_dataDir, 'pathCoM.pkl'), 'wb') as f:
    pickle.dump(pathCoM, f)

with open(os.path.join(path_dataDir, 'pathToAtlasIndex.pkl'), 'wb') as f:
    pickle.dump(pathToAtlasIndex, f)

#### Save TAU and TDP Pathology Data (Log %AO)

In [27]:
# path_TAU
with open(os.path.join(path_dataDir, 'path_TAU.pkl'), 'wb') as f:
    pickle.dump(path_TAU, f)
f.close()

# path_TDP
with open(os.path.join(path_dataDir, 'path_TDP.pkl'), 'wb') as f:
    pickle.dump(path_TDP, f)
f.close()

## Save path_TAU_bvftd, path_TDP_bvftd

In [231]:
# Persist the bvFTD-only subsets under their own file names.
# Writing them to 'path_TAU.pkl' / 'path_TDP.pkl' would overwrite the full-cohort
# arrays stored under those names (and no longer match path_id_TAU.pkl /
# path_id_TDP.pkl) when the notebook is run top to bottom.
# Code that needs the bvFTD subset should load these *_bvftd.pkl files.
# Each `with` statement closes its file on exit; no explicit f.close() is needed.

# path_TAU_bvftd
with open(os.path.join(path_dataDir, 'path_TAU_bvftd.pkl'), 'wb') as f:
    pickle.dump(path_TAU_bvftd, f)

# path_TDP_bvftd
with open(os.path.join(path_dataDir, 'path_TDP_bvftd.pkl'), 'wb') as f:
    pickle.dump(path_TDP_bvftd, f)

## Save path_TAU_MR, path_TDP_MR

In [142]:
# Persist the subsets with matching MRI under their own file names.
# Writing them to 'path_TAU.pkl' / 'path_TDP.pkl' would overwrite the full-cohort
# arrays stored under those names (and no longer match path_id_TAU.pkl /
# path_id_TDP.pkl) when the notebook is run top to bottom.
# Code that needs the MRI-matched subset should load these *_MR.pkl files.
# Each `with` statement closes its file on exit; no explicit f.close() is needed.

# path_TAU_MR
with open(os.path.join(path_dataDir, 'path_TAU_MR.pkl'), 'wb') as f:
    pickle.dump(path_TAU_MR, f)

# path_TDP_MR
with open(os.path.join(path_dataDir, 'path_TDP_MR.pkl'), 'wb') as f:
    pickle.dump(path_TDP_MR, f)