# Imports

In [1]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy
import scipy.io

In [2]:
# Show the current working directory of the notebook (shell escape)
!pwd

/Users/hyung/Research23_Network_Analysis/mBIN/FTD_JupyterNotebook/Load_Dataset


### Helper Functions

In [3]:
# Restore `loadData_hf`, a path value previously persisted with IPython's %store
%store -r loadData_hf
# Put that directory first on the module search path so the import below can resolve
sys.path.insert(0, loadData_hf)
# Helper module providing findPathCoM(); must be imported after the sys.path change above
# (presumably located in loadData_hf - verify)
import findPathCoM

# Set Directory Paths

In [4]:
# Location of the data folder: the input spreadsheets and look-up tables are read from here
%store -r dataDir

# Directory path where the output data (CSV / pickle files) will be saved to
%store -r path_dataDir

# Only used to load the FTDGeneralData_20221114.mat file --> Saved as NetworkDataGeneral
%store -r baseDir

# Loading the preconstructed atlas data

In [5]:
# Load the preconstructed atlas data (MATLAB .mat file) located under baseDir.
# `scipy.io` is a submodule and must be imported explicitly (`import scipy.io` in the
# imports cell); a bare `import scipy` does not guarantee it is loaded on every SciPy version.
atlas_mat_path = os.path.join(baseDir, 'NetworkAnalysisGeneral', 'FTDGeneralData_20221114.mat')
NetworkDataGeneral = scipy.io.loadmat(atlas_mat_path)

# [1] Loading Pathology Dataset - %AO

In [6]:
# new_pathT: ex-vivo histopathology quantification, i.e. %AO per pathology region.
# The INDDID and Tau1_TDP2 columns are read as str.
ftld_library_xlsx = os.path.join(dataDir, 'NewFTDData', 'FTLD Library 4-25-23 update.xlsx')
id_column_dtypes = {'INDDID': str, 'Tau1_TDP2': str}
new_pathT = pd.read_excel(ftld_library_xlsx, dtype=id_column_dtypes)

### Format the Pathology Data - %AO to desired format

#### Divide each INDDID into {GM, WM} and {L, R} - 22 Regions (They are alphabetically Ordered)

In [7]:
# Pivot the long-format table so that every Region becomes a column holding the summed
# AvgPercentAO. Rows are keyed by the subject/autopsy identifiers plus hemisphere
# {L, R} and tissue type {GM, WM}, i.e. a maximum of 4 rows per INDDID.
# aggfunc is passed as the string 'sum': passing the callable np.sum is deprecated in
# recent pandas (FutureWarning), and the string keeps the current groupby-sum behaviour.
pivot_index = ['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly',
               'Tau1_TDP2', 'Hemisphere_by_slide', 'AnalysisRegion']
pathT_WMGM = pd.pivot_table(new_pathT, values='AvgPercentAO',
                            index=pivot_index,
                            columns=['Region'], aggfunc='sum')

In [8]:
# Flatten the pivot's MultiIndex into ordinary columns by writing the table to CSV and
# reading it straight back.
# NOTE(review): read_csv re-infers column dtypes, so identifier columns that were loaded
# as str can come back numeric; the later `Tau1_TDP2 == 1` / `== 2` comparisons appear to
# rely on this - confirm before replacing the round trip with reset_index().
# NOTE(review): re-running this cell on the already-flat frame would also write the row
# index into the CSV (likely surfacing as an extra unnamed column) - verify.
gmwm_csv_path = os.path.join(dataDir, 'NewFTDData', 'new_pathT(GMWM).csv')
pathT_WMGM.to_csv(gmwm_csv_path)
pathT_WMGM = pd.read_csv(gmwm_csv_path)

#### Divide the pathT into GM and WM 

In [9]:
# Columns identifying one subject / tissue-type row once the hemisphere is factored out
_MERGE_KEYS = ['INDDID', 'FullAutopsyID', 'AutopsyIDNumOnly', 'Tau1_TDP2', 'AnalysisRegion']


def _merge_hemispheres(tissue_LR):
    """Combine the separate L and R rows of one tissue type into a single row.

    Parameters
    ----------
    tissue_LR : pandas.DataFrame
        Rows of a single AnalysisRegion group. Must contain a
        'Hemisphere_by_slide' column with 'L' / 'R' values and every column
        listed in _MERGE_KEYS.

    Returns
    -------
    pandas.DataFrame
        Outer join of the 'L' rows with the 'R' rows on _MERGE_KEYS. Overlapping
        non-key columns receive the suffixes '_L' / '_R'; the two suffixed
        hemisphere columns are dropped from the result.

    Raises
    ------
    KeyError
        If the frame has no 'L' group or no 'R' group (raised by get_group).
    """
    by_hemisphere = tissue_LR.groupby('Hemisphere_by_slide')
    merged = pd.merge(by_hemisphere.get_group('L'), by_hemisphere.get_group('R'),
                      on=_MERGE_KEYS, how='outer', suffixes=('_L', '_R'))
    # The hemisphere is now encoded in the column suffixes, so these are redundant
    return merged.drop(columns=['Hemisphere_by_slide_L', 'Hemisphere_by_slide_R'])


pathT_WMGM_type = pathT_WMGM.groupby('AnalysisRegion')

# Each of these still contains 2 separate rows for {L, R}
pathT_GM_LR = pathT_WMGM_type.get_group('GM')
pathT_WM_LR = pathT_WMGM_type.get_group('WM')

# Combine the 2 rows for {L, R} into a single row, separately for GM and WM
pathT_GM = _merge_hemispheres(pathT_GM_LR)
pathT_WM = _merge_hemispheres(pathT_WM_LR)

# [2] Mapping Pathology Regions to Atlas regions

In [11]:
# Look-up table matching Atlas region names to Atlas labels (index)
pathLUT = pd.read_csv(os.path.join(dataDir, 'schaefer_path_20210719_20220328.csv'))

# Look-up table matching Pathology region names to Atlas region names
atlas_to_path_xlsx = os.path.join(dataDir, 'NewFTDData',
                                  'PathToAtlasLUT_5_10_2023(mePFC_PFC_Ignored).xlsx')
AtlasToPathLUT = pd.read_excel(atlas_to_path_xlsx)

# 'CoM' entry of the Schaefer400x7 atlas inside the loaded .mat structure
schaefer400x7_CoM = NetworkDataGeneral['NetworkDataGeneral'][0, 0]['Schaefer400x7']['CoM'][0, 0]

# Using AtlasToPathLUT, match the Pathology regions to Atlas regions (index 1~400).
# Returns:
#   - the CoM for each Pathology region (a single Pathology region matches multiple
#     Atlas regions, therefore the mean value is taken). These are unordered.
#   - the list of Atlas region indices corresponding to each Pathology region.
#     These are unordered.
pathCoMunordered, pathToAtlasIndexunordered = findPathCoM.findPathCoM(
    pathLUT, AtlasToPathLUT, schaefer400x7_CoM)

In [12]:
# All pathology regions that can be mapped to the 3D Atlas (out of 22), alphabetically sorted:
# ['ANG', 'ATC', 'HIP', 'IFC', 'M1', 'MFC', 'OFC', 'PC', 'S1', 'SMTC', 'SPC', 'V1', 'aCING', 'aINS', 'aITC', 'dlPFC', 'iPFC', 'mPFC', 'pCING', 'pSTC']
mappable_region_names = AtlasToPathLUT["PathSpreadSheetNames"].values
pathNames_3D_Map = np.sort(mappable_region_names)

# sn - the number of regions that can be mapped to the 3D Atlas
sn = len(pathNames_3D_Map)

In [13]:
# Reorder the CoM values and atlas indices so that they follow pathNames_3D_Map, i.e. the
# same region order as the columns (left to right) of the Pathology Dataset - %AO.
pathCoM = np.empty((sn, 3, 2))  # one path region corresponds to multiple atlas regions
pathToAtlasIndex = [[None, None] for _ in range(sn)]

for target_pos, region_name in enumerate(pathNames_3D_Map):
    # Row of the LUT whose pathology name equals this region; its index selects the
    # matching entry of the unordered results
    lut_match = AtlasToPathLUT.loc[AtlasToPathLUT.PathSpreadSheetNames == region_name]
    source_pos = lut_match.index[0]
    pathCoM[target_pos, :, :] = pathCoMunordered[source_pos, :, :]
    pathToAtlasIndex[target_pos] = pathToAtlasIndexunordered[source_pos]

# pathCoM and pathToAtlasIndex are now ordered like pathNames_3D_Map

In [14]:
# Drop the columns of pathT_GM / pathT_WM that cannot be mapped to the 3D Atlas.
# The first 5 columns are identifier columns and are always kept; the remaining columns
# are named '<region>_<L|R>', so the text before the first underscore is compared
# against pathNames_3D_Map.
gm_unmappable = [col for col in pathT_GM.columns.values[5:]
                 if col.split("_")[0] not in pathNames_3D_Map]
pathT_GM = pathT_GM.drop(columns=gm_unmappable)

wm_unmappable = [col for col in pathT_WM.columns.values[5:]
                 if col.split("_")[0] not in pathNames_3D_Map]
pathT_WM = pathT_WM.drop(columns=wm_unmappable)

# [3] TAU and TDP Divide (GM) + Log %AO

### Get index of rows that are TAU and TDP

In [16]:
# Boolean row masks over pathT_GM: True where Tau1_TDP2 equals 1 (tau) or 2 (TDP)
tau_tdp_code = pathT_GM['Tau1_TDP2']
FTD_TAUIndx = tau_tdp_code.eq(1)
FTD_TDPIndx = tau_tdp_code.eq(2)

In [24]:
# INDDID values of the rows flagged as tau
pathT_GM.loc[FTD_TAUIndx, 'INDDID'].values

array([100551.  , 101068.  , 101105.  , 102149.  , 103032.  , 105000.  ,
       105223.  , 105358.  , 105492.  , 105961.  , 106297.  , 106309.  ,
       106814.  , 106840.  , 107516.  , 107663.  , 107969.  , 108026.  ,
       108077.  , 108196.  , 108508.  , 109115.  , 109176.  , 109299.  ,
       109759.  , 110181.  , 110306.  , 110745.  , 110914.  , 110917.  ,
       111231.  , 111527.  , 111530.  , 112570.  , 113113.  , 113909.  ,
       113938.  , 115001.  , 115592.  , 116504.  , 116591.  , 116607.  ,
       118011.  , 118410.  , 118780.  , 119113.  , 119140.  , 119359.  ,
       119413.  , 120298.  , 122143.  , 101407.  , 101483.  , 103121.  ,
       103782.  , 104281.  , 104937.  , 105564.  , 107187.  , 107429.  ,
       107667.  , 107677.  , 109048.  , 111005.  , 111853.  , 112514.  ,
       112764.  , 114348.  , 114762.  , 114762.02, 115327.  , 116275.  ,
       116401.  , 116409.  , 117566.  , 118575.  ])

In [26]:
# INDDID values of the rows flagged as TDP
pathT_GM.loc[FTD_TDPIndx, 'INDDID'].values

array([100071.  , 100686.  , 102792.  , 103282.  , 103601.  , 103640.  ,
       103703.  , 103714.  , 104156.  , 104613.  , 104659.  , 104862.  ,
       105247.  , 105686.  , 105769.  , 106335.  , 106461.  , 106641.  ,
       107031.  , 108276.  , 108344.  , 108783.  , 108783.09, 109058.  ,
       109073.  , 109206.  , 109476.  , 110338.  , 110361.  , 110445.  ,
       110581.  , 110658.  , 110705.  , 111077.  , 112202.  , 112273.  ,
       112298.  , 112780.  , 112974.  , 113867.  , 114076.  , 114395.  ,
       114753.  , 116521.  , 116569.  , 116598.  , 116748.  , 117589.  ,
       117630.  , 117637.  , 117663.  , 117753.  , 118190.  , 118234.  ,
       118430.  , 118694.  , 118762.  , 118914.  , 118952.  , 119454.  ,
       119610.  , 119768.  , 120720.  , 120950.  , 121078.  , 121199.  ,
       121261.  , 100096.  , 101045.  , 101272.  , 101525.  , 101778.  ,
       103568.  , 104094.  , 106955.  , 107204.01, 107519.  , 107636.  ,
       108542.  , 108790.  , 108930.  , 109050.  , 

### Compute Log %AO of Pathology Values

In [None]:
# Log %AO of the region columns of pathT_GM (everything after the 5 identifier columns).
# A small offset (0.00015) is added before taking the log; np.ma.log masks the entries for
# which the log is not defined (e.g. NaN input) and .filled(np.nan) converts them to NaN.
# An earlier variant scaled the values first:
#   np.ma.log(0.01 * pathT.iloc[:, 5:].values + 0.00015).filled(np.nan)
gm_percent_ao = pathT_GM.iloc[:, 5:].values
pathData = np.ma.log(gm_percent_ao + 0.00015).filled(np.nan)

In [None]:
# # NO LOG!!
# pathData = pathT_GM.iloc[:, 5:].values

### Divide Pathology Data into TAU and TDP

In [None]:
# Row-wise split of the log %AO matrix into the FTD tau and TDP cases (ndarray each)
path_TAU, path_TDP = (pathData[row_mask.values, :]
                      for row_mask in (FTD_TAUIndx, FTD_TDPIndx))

# Data Summary

In [None]:
# Print the number of distinct INDDID values in the whole table and per tissue type
inddid_sources = [
    ("Total Unique INNDID in whole dataset", pathT_WMGM),
    ("Unique INDDID in GM", pathT_GM_LR),
    ("Unique INDDID in WM", pathT_WM_LR),
]
for label, frame in inddid_sources:
    print(label)
    print(len(pd.unique(frame['INDDID'])))

In [None]:
# Distinct INDDID values present in the GM rows
pathT_GM_LR['INDDID'].unique()

# Save the Dataset and Variables

#### Save pathT GM/WM to csv

In [None]:
# Write the GM and WM tables to CSV in path_dataDir, without the row index
for csv_name, table in (('new_pathT(GM).csv', pathT_GM),
                        ('new_pathT(WM).csv', pathT_WM)):
    table.to_csv(os.path.join(path_dataDir, csv_name), index=False)

#### Save sn

In [None]:
# Pickle sn into path_dataDir. The `with` block closes the file on exit, so no
# explicit close() call is needed afterwards.
with open(os.path.join(path_dataDir, 'sn.pkl'), 'wb') as sn_file:
    pickle.dump(sn, sn_file)

#### Save pathCoM, pathToAtlasIndex

In [None]:
# Pickle pathCoM and pathToAtlasIndex into path_dataDir. Each `with` block closes its
# file on exit, so no explicit close() calls are needed afterwards.
with open(os.path.join(path_dataDir, 'pathCoM.pkl'), 'wb') as com_file:
    pickle.dump(pathCoM, com_file)

with open(os.path.join(path_dataDir, 'pathToAtlasIndex.pkl'), 'wb') as atlas_index_file:
    pickle.dump(pathToAtlasIndex, atlas_index_file)

#### Save TAU and TDP Pathology Data (Log %AO)

In [None]:
# Pickle the TAU and TDP pathology arrays into path_dataDir. Each `with` block closes
# its file on exit, so no explicit close() calls are needed afterwards.

# path_TAU
with open(os.path.join(path_dataDir, 'path_TAU.pkl'), 'wb') as tau_file:
    pickle.dump(path_TAU, tau_file)

# path_TDP
with open(os.path.join(path_dataDir, 'path_TDP.pkl'), 'wb') as tdp_file:
    pickle.dump(path_TDP, tdp_file)