# Imports

In [284]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy
import scipy.io

In [285]:
# Print the current working directory via a shell escape
!pwd

/Users/hyroh/Desktop/FTD_Research/mBIN/FTD_JupyterNotebook/Load_Dataset


### Helper Functions

In [286]:
# Restore `loadData_hf` from IPython's %store; it is used below as a directory on the import path
%store -r loadData_hf
# Put that directory first on sys.path so the helper module can be imported
sys.path.insert(0, loadData_hf)
# Imported here (not in the top import cell) because it depends on the sys.path change above
import findPathCoM

# Set Directory Paths

In [287]:
# Location of the data folder (restored from IPython's %store)
%store -r dataDir

# Directory path where Data will be saved to (restored from IPython's %store)
%store -r path_dataDir

# Base directory; only used to load the FTDGeneralData_20221114.mat file --> Saved as NetworkDataGeneral
%store -r baseDir

# Loading the preconstructed atlas data

In [288]:
# Load the preconstructed Atlas data from the .mat file under baseDir.
# NOTE: this needs `import scipy.io` in the import cell; a bare `import scipy`
# does not guarantee that the `scipy.io` submodule is loaded on every SciPy version.
NetworkDataGeneral = scipy.io.loadmat(
    os.path.join(baseDir, 'NetworkAnalysisGeneral', 'FTDGeneralData_20221114.mat')
)

# [1] Loading Pathology Dataset - %AO

In [289]:
# new_pathT: ex-vivo histopathology quantification (%AO per pathology region)
new_pathT_file = os.path.join(dataDir, 'NewFTDData', 'FTLD Library 4-25-23 update.xlsx')
new_pathT = pd.read_excel(new_pathT_file)

### Format the Pathology Data - %AO to desired format

#### Divide each INDDID into {GM, WM} and {L, R} - 22 Regions (They are alphabetically Ordered)

In [290]:
# One row per (INDDID, tissue type {GM, WM}, hemisphere {L, R}) -> at most 4 rows per INDDID.
# Each distinct `Region` value becomes a column holding the summed AvgPercentAO.
# The aggregation is passed as the string 'sum' rather than `np.sum`: recent pandas
# versions warn that a NumPy callable will stop being mapped to the built-in groupby sum.
pathT_WMGM = pd.pivot_table(
    new_pathT,
    values='AvgPercentAO',
    index=['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly',
           'Tau1_TDP2', 'Hemisphere_by_slide', 'AnalysisRegion'],
    columns=['Region'],
    aggfunc='sum',
)

In [291]:
# Keep writing the pivoted table to disk exactly as before, in case anything reads this CSV.
pathT_WMGM.to_csv(os.path.join(dataDir, 'NewFTDData', 'new_pathT(GMWM).csv'))

# Turn the MultiIndex levels into ordinary columns directly in memory instead of
# re-reading the CSV just written. This avoids a disk round trip and the text
# re-parsing of values. The leftover 'Region' label on the column axis is cleared
# so the frame looks like a freshly read flat table.
pathT_WMGM = pathT_WMGM.reset_index().rename_axis(columns=None)

#### Divide the pathT into GM and WM 

In [292]:
# Columns shared by the left- and right-hemisphere rows of the same case and tissue type
hemi_join_keys = ['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly', 'Tau1_TDP2', 'AnalysisRegion']
# Hemisphere label columns left over after the merge; redundant once columns carry _L/_R suffixes
hemi_label_cols = ['Hemisphere_by_slide_L', 'Hemisphere_by_slide_R']

pathT_WMGM_type = pathT_WMGM.groupby('AnalysisRegion')

# Grey matter: still two separate rows per case, one for each of {L, R}
pathT_GM_LR = pathT_WMGM_type.get_group('GM')
pathT_GM_LR_type = pathT_GM_LR.groupby('Hemisphere_by_slide')
pathT_GM_L = pathT_GM_LR_type.get_group('L')
pathT_GM_R = pathT_GM_LR_type.get_group('R')

# White matter: same layout as grey matter
pathT_WM_LR = pathT_WMGM_type.get_group('WM')
pathT_WM_LR_type = pathT_WM_LR.groupby('Hemisphere_by_slide')
pathT_WM_L = pathT_WM_LR_type.get_group('L')
pathT_WM_R = pathT_WM_LR_type.get_group('R')

# Fold the {L, R} rows into a single row per case (outer join keeps one-sided cases),
# then drop the hemisphere label columns
pathT_GM = (
    pathT_GM_L
    .merge(pathT_GM_R, how='outer', on=hemi_join_keys, suffixes=('_L', '_R'))
    .drop(columns=hemi_label_cols)
)
pathT_WM = (
    pathT_WM_L
    .merge(pathT_WM_R, how='outer', on=hemi_join_keys, suffixes=('_L', '_R'))
    .drop(columns=hemi_label_cols)
)

# [2] Mapping Pathology Regions to Atlas regions

In [293]:
# Lookup table matching Atlas region names to Atlas labels (index)
pathLUT = pd.read_csv(os.path.join(dataDir, 'schaefer_path_20210719_20220328.csv'))

# Lookup table matching Pathology region names to Atlas region names
atlas_to_path_file = os.path.join(dataDir, 'NewFTDData', 'PathToAtlasLUT_5_10_2023(mePFC_PFC_Ignored).xlsx')
AtlasToPathLUT = pd.read_excel(atlas_to_path_file)

# Centre-of-mass entry of the Schaefer400x7 atlas inside the loaded .mat structure
schaefer_CoM = NetworkDataGeneral['NetworkDataGeneral'][0, 0]['Schaefer400x7']['CoM'][0, 0]

# Using AtlasToPathLUT, match the Pathology regions to Atlas regions (index 1~400).
# Returns the CoM of each Pathology region (a single Pathology region matches multiple
# Atlas regions, therefore the mean value is taken) and the list of Atlas region indices
# corresponding to each Pathology region. Both outputs are unordered.
pathCoMunordered, pathToAtlasIndexunordered = findPathCoM.findPathCoM(
    pathLUT, AtlasToPathLUT, schaefer_CoM
)

In [294]:
# All pathology regions that can be mapped to the 3D Atlas (out of 22), sorted with np.sort:
# ['ANG', 'ATC', 'HIP', 'IFC', 'M1', 'MFC', 'OFC', 'PC', 'S1', 'SMTC', 'SPC', 'V1', 'aCING', 'aINS', 'aITC', 'dlPFC', 'iPFC', 'mPFC', 'pCING', 'pSTC']
mappable_region_names = AtlasToPathLUT["PathSpreadSheetNames"].values
pathNames_3D_Map = np.sort(mappable_region_names)

# sn - number of regions that can be mapped to the 3D Atlas
sn = len(pathNames_3D_Map)

In [295]:
# Reorder the CoM so that it follows the region order of the Pathology Dataset - %AO (columns)
pathCoM = np.empty((sn, 3, 2))  # One path region corresponds to multiple atlas regions
pathToAtlasIndex = [[None, None] for _ in range(sn)]

for s, region_name in enumerate(pathNames_3D_Map):
    # Row label of this region in the lookup table; also used as the position in the unordered outputs
    name_matches = AtlasToPathLUT.PathSpreadSheetNames == region_name
    idx = AtlasToPathLUT.loc[name_matches].index[0]
    pathCoM[s, :, :] = pathCoMunordered[idx, :, :]
    pathToAtlasIndex[s] = pathToAtlasIndexunordered[idx]

# pathCoM and pathToAtlasIndex now follow pathNames_3D_Map (= same region ordering as the PathT Dataset columns, left to right)

In [296]:
# Drop columns in pathT_GM / pathT_WM whose region cannot be mapped to the 3D Atlas.
# The first 5 columns are skipped; the region name is the part of the column label before the first "_".
gm_unmapped_cols = [col for col in pathT_GM.columns.values[5:]
                    if col.split("_")[0] not in pathNames_3D_Map]
wm_unmapped_cols = [col for col in pathT_WM.columns.values[5:]
                    if col.split("_")[0] not in pathNames_3D_Map]

pathT_GM = pathT_GM.drop(columns=gm_unmapped_cols)
pathT_WM = pathT_WM.drop(columns=wm_unmapped_cols)

# [3] TAU and TDP Divide (GM) + Log %AO

### Get index of rows that are TAU and TDP

In [300]:
# Boolean row masks over pathT_GM: Tau1_TDP2 == 1 selects tau cases, == 2 selects TDP cases
tau_tdp_code = pathT_GM['Tau1_TDP2']
FTD_TAUIndx = tau_tdp_code.eq(1)  # False or True
FTD_TDPIndx = tau_tdp_code.eq(2)  # False or True

### Compute Log %AO of Pathology Values

In [301]:
# Log %AO of every region column of pathT_GM (everything after the first 5 columns).
# A masked log is used so that entries outside the log domain, as well as NaN inputs,
# come out as NaN once the mask is filled.
# Earlier variant, kept for reference:
#pathData = np.ma.log(0.01 * pathT.iloc[:, 5:].values + 0.00015).filled(np.nan)
gm_percent_ao = pathT_GM.iloc[:, 5:].values
# NOTE(review): the 0.00015 offset presumably keeps zero-valued %AO inside the log domain - confirm
pathData = np.ma.log(gm_percent_ao + 0.00015).filled(np.nan)

In [302]:
# # NO LOG!!
# pathData = pathT_GM.iloc[:, 5:].values

### Divide Pathology Data into TAU and TDP

In [331]:
# Split the log %AO matrix by pathology type using the row masks --> Type: ndarray
tau_rows = FTD_TAUIndx.to_numpy()
tdp_rows = FTD_TDPIndx.to_numpy()
path_TAU = pathData[tau_rows, :]
path_TDP = pathData[tdp_rows, :]

In [356]:
# Number of non-missing values in each column of path_TAU
pd.DataFrame(path_TAU).count()

0     32
1      5
2     19
3     11
4     35
5     42
6     36
7      2
8      0
9     38
10    16
11    38
12    30
13    13
14    10
15    10
16     8
17     0
18     5
19     1
20    24
21     4
22    17
23    11
24    28
25    31
26    33
27     2
28     0
29    26
30    17
31    30
32    24
33    10
34     9
35    11
36     9
37     0
38     4
39     0
dtype: int64

# Get only bvFTD

In [332]:
# INDDIDs of the tau rows, in the same row order as path_TAU
path_id_TAU = pathT_GM.loc[FTD_TAUIndx, 'INDDID'].values

In [333]:
# INDDIDs of the TDP rows, in the same row order as path_TDP
path_id_TDP = pathT_GM.loc[FTD_TDPIndx, 'INDDID'].values

In [334]:
# Persist the subject IDs of each pathology group.
# The `with` statement already closes the file on exit, so no explicit close() is needed.

# path_id_TAU
with open(os.path.join(path_dataDir, 'path_id_TAU.pkl'), 'wb') as f:
    pickle.dump(path_id_TAU, f)

# path_id_TDP
with open(os.path.join(path_dataDir, 'path_id_TDP.pkl'), 'wb') as f:
    pickle.dump(path_id_TDP, f)

## FTLD Autopsy MRI

In [335]:
# FTLD autopsy MRI table; the cells below read its INDDID, BVFTD and ClinicalDx1 columns
ftd_autopsy_file = os.path.join(dataDir, 'NewFTDData', 'FTLD Autopsy MRI (2023.02.10 14.31).xlsx')
ftd_autopsyT = pd.read_excel(ftd_autopsy_file)

In [None]:
# Sorted unique subject IDs present in the autopsy MRI table
autopsy_id = np.unique(ftd_autopsyT['INDDID'])

In [None]:
# Boolean mask over path_id_TAU: True where the subject also appears in the autopsy MRI table
tau_aut_over = np.isin(path_id_TAU, autopsy_id)
print(f"Number of overlap (Path-Tau): {tau_aut_over.sum()}/{len(path_id_TAU)}")

In [336]:
# Boolean mask over path_id_TDP: True where the subject also appears in the autopsy MRI table
tdp_aut_over = np.isin(path_id_TDP, autopsy_id)
print(f"Number of overlap (Path-Tdp): {tdp_aut_over.sum()}/{len(path_id_TDP)}")

Number of overlap (Path-Tdp): 45/103


## Tau

In [337]:
# Autopsy-table rows for the tau subjects that overlap with the pathology data:
# keep the ID, bvFTD flag and clinical diagnosis, de-duplicate, and sort by subject ID
tau_overlap_ids = path_id_TAU[tau_aut_over]
bvftd_tau_df = (
    ftd_autopsyT.loc[ftd_autopsyT['INDDID'].isin(tau_overlap_ids), ['INDDID', 'BVFTD', 'ClinicalDx1']]
    .drop_duplicates(subset=['INDDID', 'BVFTD', 'ClinicalDx1'])
    .sort_values(by='INDDID')
)

In [338]:
# IDs of the overlapping tau subjects whose BVFTD flag equals 1.0
bvftd_tau_ids = bvftd_tau_df.loc[bvftd_tau_df['BVFTD'].eq(1.0), 'INDDID'].values

In [339]:
# Display the bvFTD tau subject IDs
bvftd_tau_ids

array([101483., 105223., 105564., 106309., 106814., 107516., 107677.,
       108026., 113113., 115001., 116591., 116607., 118410., 118780.])

## TDP

In [340]:
# Autopsy-table rows for the TDP subjects that overlap with the pathology data:
# keep the ID, bvFTD flag and clinical diagnosis, de-duplicate, and sort by subject ID
tdp_overlap_ids = path_id_TDP[tdp_aut_over]
bvftd_tdp_df = (
    ftd_autopsyT.loc[ftd_autopsyT['INDDID'].isin(tdp_overlap_ids), ['INDDID', 'BVFTD', 'ClinicalDx1']]
    .drop_duplicates(subset=['INDDID', 'BVFTD', 'ClinicalDx1'])
    .sort_values(by='INDDID')
)


In [341]:
# IDs of the overlapping TDP subjects whose BVFTD flag equals 1.0
bvftd_tdp_ids = bvftd_tdp_df.loc[bvftd_tdp_df['BVFTD'].eq(1.0), 'INDDID'].values

## Extract from path_TAU & path_TDP

In [342]:
# Rows of path_TAU whose subject ID is in the bvFTD tau list
tau_is_bvftd = np.isin(path_id_TAU, bvftd_tau_ids)
path_TAU_bvftd = path_TAU[tau_is_bvftd, :]

In [343]:
# Rows of path_TDP whose subject ID is in the bvFTD TDP list
tdp_is_bvftd = np.isin(path_id_TDP, bvftd_tdp_ids)
path_TDP_bvftd = path_TDP[tdp_is_bvftd, :]

In [351]:
# Display the bvFTD tau log %AO matrix as a table
pd.DataFrame(path_TAU_bvftd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,1.475476,,,,1.754727,1.453727,1.629455,,,1.370286,...,,,,,,,,,,
1,0.044538,,,1.269407,0.990359,,,,,,...,-0.427541,,2.322989,-0.242557,,2.626157,,,,
2,-1.641666,2.836317,,,2.020735,2.858188,,,,3.137147,...,,,,,,,1.488551,,,
3,3.435145,,,,,3.234939,2.610981,,,3.000968,...,1.792612,,,2.028441,2.878898,,,,,
4,1.536174,,,,-3.006985,2.529972,-0.338204,,,2.177931,...,,,,,,,,,,
5,-0.476323,,,,,0.055455,,,,,...,,,,,,,,,,
6,-2.548235,,2.643919,,,,,-4.75868,,-3.43236,...,,-3.51739,2.764876,,,,,,0.291416,
7,-0.310366,,,0.753268,0.263861,-0.076769,-1.162587,,,-1.616114,...,0.553392,,1.891229,0.0581,,0.493299,0.295355,,,
8,,,,-0.044843,,,,,,,...,-0.726273,-5.469577,-1.347043,0.195987,,0.47965,-0.306014,,,
9,1.124153,,,,-2.028964,3.710752,3.981187,,,2.382988,...,,,,,,,,,,


In [358]:
# Non-missing values per column of the bvFTD tau matrix, sorted ascending
pd.DataFrame(path_TAU_bvftd).count().sort_values()

19     0
37     0
28     0
17     0
8      0
39     0
2      1
7      1
21     1
38     1
27     1
16     2
22     2
1      2
34     2
14     3
18     3
15     4
35     4
36     4
20     4
33     4
23     4
24     4
3      4
13     5
10     5
31     5
30     5
6      6
25     6
29     6
32     7
4      7
26     8
5      8
9      8
12     9
0      9
11    10
dtype: int64

In [354]:
# Display the bvFTD TDP log %AO matrix as a table
pd.DataFrame(path_TDP_bvftd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,,-2.961612,-1.955562,,,-1.837594,-1.388054,,,-1.789275,...,-2.897147,-5.98482,-1.404683,,-4.765171,-0.927689,-2.119068,,-2.616611,
1,-1.886635,,,,-0.825122,-4.480676,-1.353718,,,-3.803916,...,,,,,,,,,,
2,-1.515425,,,,-1.518871,-1.758736,-1.138626,,,-1.584745,...,-2.140614,,,-3.07024,-2.417488,,,,,
3,,,,,-3.479224,-2.545405,-0.910632,,,-0.624872,...,-3.046289,-5.654992,-1.955933,-3.362761,-4.743107,-2.520341,-2.189108,,,
4,,-1.74548,,,,-4.041463,-3.5431,,,,...,-4.306063,-3.502483,-2.570595,,-1.65112,-3.795679,-3.521075,,,
5,,,,,,-2.493096,-0.100926,,,-0.554325,...,-1.008979,-2.950991,-0.636926,-1.170919,-1.13519,,,,-1.114767,
6,,,,-1.377323,,-1.363034,-1.294659,,,-2.77314,...,-1.863685,-3.892493,-2.227762,-1.852078,,-2.399624,-2.118922,,-1.882998,
7,,,,,,,-4.116429,,,,...,-4.108009,-4.426134,-3.608949,-3.78385,-6.155054,,,,,
8,-0.881029,,,,-2.313828,,-0.093272,,,-1.395619,...,-2.565466,,,-0.254344,,,,,,
9,-4.274369,-5.233113,,-4.750595,-2.145413,-4.533884,-3.989856,,,-6.015973,...,-5.217551,,-4.98375,-8.804875,,-5.178411,-4.647098,,-6.237365,


In [359]:
# Non-missing values per column of the bvFTD TDP matrix, sorted ascending
pd.DataFrame(path_TDP_bvftd).count().sort_values()

19     0
37     0
28     0
27     0
17     0
8      0
7      0
39     0
2      3
3      5
22     5
23     6
1      6
21     6
18     6
38     7
4      8
0      8
11     9
34    10
13    11
16    11
14    12
36    12
24    12
35    12
15    13
20    14
33    14
9     15
31    15
32    16
10    16
12    16
30    16
5     16
29    17
25    19
6     19
26    20
dtype: int64

# Get bvFTD + Matching MR

In [215]:
# MRI thickness values for all subjects - schaefer400x7; the `id` column is read as text
thickness_csv = os.path.join(dataDir, 'NewFTDData',
                             'invivoPathCohort_quantsSubSesSchaefer400_tian12.csv')
thicknessAllraw = pd.read_csv(thickness_csv, dtype={'id': str})

In [216]:
# Cohort lookup table joined to the thickness data below; INDDID is read as text so it can be matched against `id`
thickness_lut_file = os.path.join(dataDir, 'NewFTDData', 'InvivoPathCohort_03172023.xls')
thicknessPathLUT = pd.read_excel(thickness_lut_file, dtype={'INDDID': str})

In [217]:
# Inner join: keep only thickness rows whose `id` has a matching INDDID in the lookup table
thicknessAll = thicknessAllraw.merge(
    thicknessPathLUT, how='inner', left_on='id', right_on='INDDID'
)

# We only lose INDDID 108783x09 in the thicknessAllraw (849 rows lost)

In [218]:
thickness_path_type = thicknessAll.groupby('Group')

# MRI thickness values per cohort: Healthy Control, Patient (TAU), Patient (TDP)
thicknessHC, thicknessPatientTAU, thicknessPatientTDP = (
    thickness_path_type.get_group(group_label) for group_label in ('HC', 'tau', 'tdp')
)

# Unique subject IDs per cohort, converted from text to float64
thick_id_HC, thick_id_TAU, thick_id_TDP = (
    np.unique(cohort.INDDID).astype('float64')
    for cohort in (thicknessHC, thicknessPatientTAU, thicknessPatientTDP)
)

In [219]:
# Report how many unique subjects each MRI cohort contains
print(f"# of subjects in HC: {len(thick_id_HC)}")
print(f"# of subjects in Tau: {len(thick_id_TAU)}")
print(f"# of subjects in TDP: {len(thick_id_TDP)}")

# of subjects in HC: 54
# of subjects in Tau: 26
# of subjects in TDP: 30


In [220]:
# Count the MRI tau subjects that also have pathology data
print(f"Number of overlap (Tau - Path vs MR): {np.isin(thick_id_TAU, path_id_TAU).sum()}/{len(thick_id_TAU)}")

Number of overlap (Tau - Path vs MR): 13/26


In [221]:
# Count the MRI TDP subjects that also have pathology data
print(f"Number of overlap (TDP - Path vs MR): {np.isin(thick_id_TDP, path_id_TDP).sum()}/{len(thick_id_TDP)}")

Number of overlap (TDP - Path vs MR): 21/30


In [222]:
# Rows of path_TAU for subjects that also have MRI thickness data
tau_has_mr = np.isin(path_id_TAU, thick_id_TAU)
path_TAU_MR = path_TAU[tau_has_mr, :]

In [223]:
# Rows of path_TDP for subjects that also have MRI thickness data
tdp_has_mr = np.isin(path_id_TDP, thick_id_TDP)
path_TDP_MR = path_TDP[tdp_has_mr, :]

In [224]:
# (rows, columns) of the MRI-matched tau matrix
path_TAU_MR.shape

(13, 40)

In [225]:
# (rows, columns) of the MRI-matched TDP matrix
path_TDP_MR.shape

(21, 40)

# Data Summary

In [226]:
# Distinct-subject counts for the whole table and for the GM / WM subsets
# (dropna=False so a missing ID would still be counted, as with len(pd.unique(...)))
n_subjects_all = pathT_WMGM['INDDID'].nunique(dropna=False)
n_subjects_gm = pathT_GM_LR['INDDID'].nunique(dropna=False)
n_subjects_wm = pathT_WM_LR['INDDID'].nunique(dropna=False)

print("Total Unique INNDID in whole dataset")
print(n_subjects_all)
print("Unique INDDID in GM")
print(n_subjects_gm)
print("Unique INDDID in WM")
print(n_subjects_wm)

Total Unique INNDID in whole dataset
179
Unique INDDID in GM
179
Unique INDDID in WM
179


# Save the Dataset and Variables

#### Save pathT GM/WM to csv

In [227]:
# Write the GM and WM pathology tables to the output directory, without the row index
gm_csv_path = os.path.join(path_dataDir, 'new_pathT(GM).csv')
wm_csv_path = os.path.join(path_dataDir, 'new_pathT(WM).csv')
pathT_GM.to_csv(gm_csv_path, index=False)
pathT_WM.to_csv(wm_csv_path, index=False)

#### Save sn

In [228]:
# Persist sn (number of pathology regions mappable to the 3D Atlas).
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open(os.path.join(path_dataDir, 'sn.pkl'), 'wb') as f:
    pickle.dump(sn, f)

#### Save pathCoM, pathToAtlasIndex

In [229]:
# Persist the ordered centre-of-mass array and the ordered pathology -> atlas index list.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open(os.path.join(path_dataDir, 'pathCoM.pkl'), 'wb') as f:
    pickle.dump(pathCoM, f)

with open(os.path.join(path_dataDir, 'pathToAtlasIndex.pkl'), 'wb') as f:
    pickle.dump(pathToAtlasIndex, f)

#### Save TAU and TDP Pathology Data (Log %AO)

In [360]:
# Persist the full-cohort log %AO matrices; their rows line up with path_id_TAU / path_id_TDP.
# The `with` statement closes each file on exit, so no explicit close() is needed.

# path_TAU
with open(os.path.join(path_dataDir, 'path_TAU.pkl'), 'wb') as f:
    pickle.dump(path_TAU, f)

# path_TDP
with open(os.path.join(path_dataDir, 'path_TDP.pkl'), 'wb') as f:
    pickle.dump(path_TDP, f)

## Save path_TAU_bvftd, path_TDP_bvftd

In [231]:
# Save the bvFTD-only subsets under their own file names.
# Previously these were dumped into 'path_TAU.pkl' / 'path_TDP.pkl', which silently
# overwrote the full-cohort matrices written by the preceding save cell and left them
# out of step with path_id_TAU.pkl / path_id_TDP.pkl on a top-to-bottom run.
# Downstream code that wants the bvFTD-only data should load these files explicitly.

# path_TAU_bvftd
with open(os.path.join(path_dataDir, 'path_TAU_bvftd.pkl'), 'wb') as f:
    pickle.dump(path_TAU_bvftd, f)

# path_TDP_bvftd
with open(os.path.join(path_dataDir, 'path_TDP_bvftd.pkl'), 'wb') as f:
    pickle.dump(path_TDP_bvftd, f)

## Save path_TAU_MR, path_TDP_MR

In [142]:
# Save the MRI-matched subsets under their own file names.
# Previously these were dumped into 'path_TAU.pkl' / 'path_TDP.pkl', so after a
# top-to-bottom run those files held the MRI-matched subset instead of the full
# cohort and no longer lined up with path_id_TAU.pkl / path_id_TDP.pkl.
# Downstream code that wants the MRI-matched data should load these files explicitly.

# path_TAU_MR
with open(os.path.join(path_dataDir, 'path_TAU_MR.pkl'), 'wb') as f:
    pickle.dump(path_TAU_MR, f)

# path_TDP_MR
with open(os.path.join(path_dataDir, 'path_TDP_MR.pkl'), 'wb') as f:
    pickle.dump(path_TDP_MR, f)