In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import datetime as dt

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Default size (in inches) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (20.0, 10.0)

# Let pandas display long and wide frames without truncating them.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Silence every warning category and print numpy floats without
# scientific notation.
import warnings
warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [None]:
#create file paths for required data (some from TBIFreeze_20190215, individual questions queried by RCH)

# All files live under one project folder. Set the PESKIND_TBI_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original lab location is used, so the default paths are unchanged.
DATA_DIR = os.environ.get(
    'PESKIND_TBI_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI',
).rstrip('/\\')  # tolerate a trailing separator in the override

#path for AUDITC individual questions
AUDITC_path = DATA_DIR + '/AUDITC/auditDataSet_Oct2018_smaller.csv'

#path for excel sheet of multi measurements (multiple entries for each participant)
path_multi = DATA_DIR + '/Individual_sheets/Multi_measurements.xlsx'

#path for excel sheet of single measurements (single entry for each participant)
path_single = DATA_DIR + '/Individual_sheets/Single_measurements.xlsx'

#path for excel sheet of TBI measurements (single entry for each participant)
path_TBI = DATA_DIR + '/Individual_sheets/TBI_symptoms.xlsx'

#path for excel sheet of FDG-PET measurements
path_PET = DATA_DIR + '/Individual_sheets/FDG_PET.xlsx'

In [None]:
#we are going to deal with path_multi first
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
data_mult = pd.read_excel(path_multi)
print('Data shape all groups:\n', data_mult.shape, '\n')

#select only TBIID C and T (control and TBI)
# na=False treats a missing TBIID as "no match" instead of putting NaN in the mask
data_mult = data_mult[data_mult['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_mult.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than inside print() (which would print the literal "None")
print('Data types:')
data_mult.info()
data_mult.head()

In [None]:
# create new data frame containing only the first visit record (based on 'MeasureDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)

participants = data_mult['TBIID'].unique()

# Collect one small frame per participant and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
first_visit_frames = []
for part in participants:
    part_rows = data_mult[data_mult['TBIID'] == part]
    # Series.min() skips missing dates, so a blank date cannot mask the
    # participant's earliest recorded visit
    min_date = part_rows['MeasureDate'].min()
    # assign() returns a new frame (no SettingWithCopyWarning); the first
    # character of the ID is the group letter
    first_visit_frames.append(
        part_rows[part_rows['MeasureDate'] == min_date].assign(Group=part[0])
    )

# ignore_index renumbers the rows 0..n-1, replacing the separate reset_index step
if first_visit_frames:
    first_visit_data_mult = pd.concat(first_visit_frames, ignore_index=True)
else:
    first_visit_data_mult = pd.DataFrame()

print(len(participants))
print(first_visit_data_mult.shape)
first_visit_data_mult.head()

In [None]:
#now deal with path_single
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
data_single = pd.read_excel(path_single)
print('Data shape:\n', data_single.shape, '\n')

#select only TBIID C and T (control and TBI)
# na=False treats a missing TBIID as "no match" instead of putting NaN in the mask
data_single = data_single[data_single['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape:\n', data_single.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than inside print() (which would print the literal "None")
print('Data types:')
data_single.info()
data_single.head()

In [None]:
#create new data frame containing only the first visit record (based on 'ScreenDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)

participants = data_single['TBIID'].unique()

# Collect one small frame per participant and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
first_visit_frames = []
for part in participants:
    part_rows = data_single[data_single['TBIID'] == part]
    # Series.min() skips missing dates, so a blank date cannot mask the
    # participant's earliest recorded visit
    min_date = part_rows['ScreenDate'].min()
    # assign() returns a new frame (no SettingWithCopyWarning); the first
    # character of the ID is the group letter
    first_visit_frames.append(
        part_rows[part_rows['ScreenDate'] == min_date].assign(Group=part[0])
    )

# ignore_index renumbers the rows 0..n-1, replacing the separate reset_index step
if first_visit_frames:
    first_visit_data_single = pd.concat(first_visit_frames, ignore_index=True)
else:
    first_visit_data_single = pd.DataFrame()

print(len(participants))
print(first_visit_data_single.shape)
first_visit_data_single.head()

In [None]:
#now deal with path_TBI
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
data_TBI = pd.read_excel(path_TBI)
print('Data shape all groups:\n', data_TBI.shape, '\n')

#select only TBIID C and T (control and TBI)
# na=False treats a missing TBIID as "no match" instead of putting NaN in the mask
data_TBI = data_TBI[data_TBI['TBIID'].str.match(r'[CT]\d\d', na=False)]
# report the shape of the TBI table itself (this line used to print data_mult.shape)
print('Data shape only deployed controls and mTBI groups:\n', data_TBI.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than inside print() (which would print the literal "None")
print('Data types:')
data_TBI.info()
data_TBI.head()

In [None]:
#create new data frame containing only the first visit record (based on 'FormDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)

participants = data_TBI['TBIID'].unique()

# Collect one small frame per participant and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
first_visit_frames = []
for part in participants:
    part_rows = data_TBI[data_TBI['TBIID'] == part]
    # Series.min() skips missing dates, so a blank date cannot mask the
    # participant's earliest recorded visit
    min_date = part_rows['FormDate'].min()
    # assign() returns a new frame (no SettingWithCopyWarning); the first
    # character of the ID is the group letter
    first_visit_frames.append(
        part_rows[part_rows['FormDate'] == min_date].assign(Group=part[0])
    )

# ignore_index renumbers the rows 0..n-1, replacing the separate reset_index step
if first_visit_frames:
    first_visit_data_TBI = pd.concat(first_visit_frames, ignore_index=True)
else:
    first_visit_data_TBI = pd.DataFrame()

print(len(participants))
print(first_visit_data_TBI.shape)
first_visit_data_TBI.head()

In [None]:
#now deal with path_PET
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
data_PET = pd.read_excel(path_PET)
print('Data shape all groups:\n', data_PET.shape, '\n')

#select only TBIID C and T (control and TBI)
# na=False treats a missing TBIID as "no match" instead of putting NaN in the mask
data_PET = data_PET[data_PET['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_PET.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than inside print() (which would print the literal "None")
print('Data types:')
data_PET.info()
data_PET.head()

In [None]:
#create new data frame containing only the first visit record (based on 'ScreenDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)

participants = data_PET['TBIID'].unique()

# Collect one small frame per participant and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
first_visit_frames = []
for part in participants:
    part_rows = data_PET[data_PET['TBIID'] == part]
    # Series.min() skips missing dates, so a blank date cannot mask the
    # participant's earliest recorded visit
    min_date = part_rows['ScreenDate'].min()
    # assign() returns a new frame (no SettingWithCopyWarning); the first
    # character of the ID is the group letter
    first_visit_frames.append(
        part_rows[part_rows['ScreenDate'] == min_date].assign(Group=part[0])
    )

# ignore_index renumbers the rows 0..n-1, replacing the separate reset_index step
if first_visit_frames:
    first_visit_data_PET = pd.concat(first_visit_frames, ignore_index=True)
else:
    first_visit_data_PET = pd.DataFrame()

print(len(participants))
print(first_visit_data_PET.shape)
first_visit_data_PET.head()

In [None]:
#now deal with AUDITC path - only one date
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
data_AUDITC = pd.read_csv(AUDITC_path)
print('Data shape all groups:\n', data_AUDITC.shape, '\n')

#select only TBIID C and T (control and TBI)
# na=False treats a missing TBIID as "no match" instead of putting NaN in the mask
data_AUDITC = data_AUDITC[data_AUDITC['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_AUDITC.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than inside print() (which would print the literal "None")
print('Data types:')
data_AUDITC.info()
data_AUDITC.head()

In [None]:
#check length of data tables (AUDITC has more entries)
# one row count per table that goes into the merge; the PET line was
# previously mislabelled as "TBI"
print('The length of the multi data table is: ', len(first_visit_data_mult))
print('The length of the single data table is: ', len(first_visit_data_single))
print('The length of the TBI data table is: ', len(first_visit_data_TBI))
print('The length of the PET data table is: ', len(first_visit_data_PET))
print('The length of the AUDITC data table is: ', len(data_AUDITC))

In [None]:
#join the data sets on the common key 'TBIID' - the inner joins keep only
#participants that are present in every data set
# validate='one_to_one' makes pandas raise if a TBIID occurs more than once on
# either side of a merge.
# NOTE(review): '_mult' is the left-hand suffix at every step, so a column name
# that first clashes at a later step is tagged '_mult' even when it did not come
# from the multi table - confirm the resulting column names are as intended.
merge_data_first = first_visit_data_mult
for right_table, right_suffix in (
    (first_visit_data_single, '_single'),
    (first_visit_data_TBI, '_TBI'),
    (first_visit_data_PET, '_PET'),
    (data_AUDITC, '_AUDITC'),
):
    merge_data_first = pd.merge(
        merge_data_first,
        right_table,
        how='inner',
        on='TBIID',
        suffixes=('_mult', right_suffix),
        validate='one_to_one',
    )
    # shape after each join, to spot where participants are lost
    print(merge_data_first.shape)
merge_data_first.head()

In [None]:
#confirm the columns of all five data tables were merged correctly
# the PET line was previously mislabelled as "TBI"
print('The column length of the multi data table is: ', len(first_visit_data_mult.columns))
print('The column length of the single data table is: ', len(first_visit_data_single.columns))
print('The column length of the TBI data table is: ', len(first_visit_data_TBI.columns))
print('The column length of the PET data table is: ', len(first_visit_data_PET.columns))
print('The column length of the AUDITC data table is: ', len(data_AUDITC.columns))
# the join key 'TBIID' appears in all five tables but only once in the merged
# frame, hence the -4
print('The columns of all tables add to: ', (len(first_visit_data_mult.columns) + len(first_visit_data_single.columns) + len(first_visit_data_TBI.columns) + len(first_visit_data_PET.columns) + len(data_AUDITC.columns) -4))
print('The column length of the merge data table is: ', len(merge_data_first.columns))

In [None]:
# -999 and the string 'None' both encode a missing value in the source files;
# turn them into NaN so pandas treats them as missing
missing_markers = {-999.0: np.nan, 'None': np.nan}
merge_data_first = merge_data_first.replace(missing_markers)

In [None]:
# Columns of the merged table that are kept for the analysis. Names ending in
# '_mult', '_PET' or '_AUDITC' carry the suffixes added by the merges above.
# Selecting with a list raises a KeyError if any name is absent from
# merge_data_first.
# NOTE(review): 'BISMrtAns' breaks the 'BISMtr...' pattern of its neighbours -
# confirm it matches the actual column name in the source sheet.
dep_vars = ['Group_PET', 'DOB', 'ScreenAge', 'Education', 'servconn',
       'cestotal', 'MnthSncBlst', 'AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot', 'PSQItot_AUDITC', 'PCLTot_AUDITC',
 'CAPSTotal_AUDITC', 'PTSD_YN', 'LECTotal', 'NSITot_AUDITC', 'QKOIorA_AUDITC',
 'QKOExpMil_AUDITC', 'QKOAllMil_AUDITC', 'QKOLife_AUDITC', 'QBlstExp_AUDITC',
 'QBEACRM_AUDITC', 'QBEIorA_AUDITC', 'DA', 'DOPA', 'NE', 'tbiDizzy', 'tbiBalan', 'tbiCoord',
       'tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight', 'tbiHear',
       'tbiNoise', 'tbiTingl', 'tbiTstsml', 'tbiAppet', 'tbiConc',
       'tbiForget', 'tbiDecis', 'tbiSlow', 'tbiEnergy', 'tbiSleep',
       'tbiAnx', 'tbiSad', 'tbiIrrit', 'tbiOverw', 'tbiDisin', 'tbiWithd',
       'tbiRing', 'tbiMoods', 'tbiFight', 'tbiSpch', 'NSITot_mult', 'TBITot', 'WCSTPrs', 'WCSTPrsT', 'WCSTCL',
       'WCSTCLP', 'WCSTCLPT', 'WCSTCat', 'WCSTFail', 'Inhibit_Mean', 'Shift_Mean',
       'Emotional Control_Mean', 'Self Monitor_Mean', 'Initiate_Mean',
       'Working Memory_Mean', 'Plan/Organize_Mean', 'FrTotBe', 'FrTotBeT', 'FrTotBeAns', 'FrTotBeQ',
       'FrTotAf', 'FrTotAfT', 'FrTotAfAns', 'FrTotAfQ', 'BISAtt', 'BISAttAns', 'BISAttTotQ', 'BISMtr', 'BISMrtAns',
       'BISMtrTotQ', 'BISNonpl', 'BISNonplAns', 'BISNonplTotQ', 'BISTot',
       'BISAns', 'BISTotQ', 'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE', 'hrslp', 'PSQI1hr', 'PSQI1min',
       'PSQI2', 'PSQI3hr', 'PSQI3min', 'PSQI5a', 'PSQI5b', 'PSQI5c',
       'PSQI5d', 'PSQI5e', 'PSQI5f', 'PSQI5g', 'PSQI5h', 'PSQI5i',
       'PSQI5j', 'PSQI5jco', 'PSQI6', 'PSQI7', 'PSQI8', 'PSQI9', 'PSQIc1',
       'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7',
       'PSQItot_mult', 'PCL1', 'PCL2', 'PCL3', 'PCL4', 'PCL5', 'PCL6', 'PCL7',
       'PCL8', 'PCL9', 'PCL10', 'PCL11', 'PCL12', 'PCL13', 'PCL14',
       'PCL15', 'PCL16', 'PCL17', 'PCLTot_mult', 
       'capsCrtA', 'CAPSTotal_mult', 'PHQ1', 'PHQ2', 'PHQ3', 'PHQ4',
       'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9', 'PHQTot', 'auditc',
       'BNI1Im', 'BNI2Im', 'BNI3Im', 'BNI4Im', 'BNI5Im', 'BNI6Im',
       'BNI7Im', 'BNI8Im', 'BNI9Im', 'BNI10Im', 'BNI11', 'BNITotIm',
       'LEC1', 'LEC2', 'LEC3', 'LEC4', 'LEC5', 'LEC6', 'LEC7', 'LEC8',
       'LEC9', 'LEC10', 'LEC11', 'LEC12', 'LEC13', 'LEC14', 'LEC15',
       'LEC16', 'LEC17', 'LEC18', 'LEC19', 'LEC20', 'Insula_l', 'Insula_r', 'Cingulum_Ant_l',
       'Cingulum_Ant_r', 'Cingulum_Mid_l', 'Cingulum_Mid_r',
       'Cingulum_Post_l', 'Cingulum_Post_r', 
       'Amygdala_l', 'Amygdala_r', 'CaudateNucl_l',
       'CaudateNucl_r', 'Putamen_l', 'Putamen_r', 'Pallidum_l',
       'Pallidum_r', 'Midbrain', 'Pons']
# restrict the merged table to the columns listed above
merge_data_first_dep = merge_data_first[dep_vars]
print(merge_data_first_dep.shape)

In [None]:
#explore missing data
print(merge_data_first_dep.shape)
# histogram of the number of missing values per column
missing_per_column = merge_data_first_dep.isna().sum()
missing_per_column.plot(kind = 'hist')
plt.show()
#keep only the columns that have at least 100 non-missing values
# NOTE(review): this equals "drop columns with more than 50 missing values"
# only if the table has 150 rows - TODO confirm the intended cut-off
merge_first_clean = merge_data_first_dep.dropna(axis = 1, thresh = 100)
print(merge_first_clean.shape)
remaining_missing = merge_first_clean.isna().sum()
print(remaining_missing.sort_values(ascending=False))

In [None]:
#confirm data sets match
# group once and reuse the grouped object for every statistic below
by_group = merge_first_clean.groupby('Group_PET')
print(by_group['auditc'].mean())
print(by_group['AUDITtot'].mean())
#visualize AUDIT_C data by group
AUDIT_C_names = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot']
for param in AUDIT_C_names:
    fig, ax = plt.subplots(figsize=(10,10))
    # bar = group mean, error bar = standard error of the mean
    by_group[param].mean().plot(kind='bar', yerr=by_group[param].sem(), ax=ax)
    # pass the name itself; a one-element list would be rendered as "['AUDIT1']"
    ax.set_ylabel(param)
    #plt.savefig(str(param + '.png'))
    plt.show()

In [None]:
# Variables that the following cells plot by group, correlate and pair-plot.
# Note: the grouping column 'Group_PET' itself is not part of this list.
all_dep = ['ScreenAge', 'Education', 'servconn',
       'cestotal', 'MnthSncBlst', 'AUDIT1', 'AUDIT2', 'AUDIT3',
       'AUDITtot', 'PSQItot_AUDITC', 'PCLTot_AUDITC', 'CAPSTotal_AUDITC',
       'PTSD_YN', 'LECTotal', 'NSITot_AUDITC', 
       'NSITot_mult', 'TBITot', 'QKOIorA_AUDITC',
       'QKOExpMil_AUDITC', 'QKOAllMil_AUDITC', 'QKOLife_AUDITC',
       'QBlstExp_AUDITC', 'QBEIorA_AUDITC', 
       'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit', 'tbiOverw', 'tbiDisin',
       'tbiWithd', 'tbiMoods', 'tbiFight', 'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE', 'hrslp',
       'PSQItot_mult', 'PCLTot_mult', 'capsCrtA',
       'CAPSTotal_mult', 'PHQTot', 'Amygdala_l', 'Amygdala_r',
       'Pallidum_l', 'Pallidum_r', 'Midbrain']
# the three AUDIT-C item columns plus their total (used as the y variables of the pair plot)
AUDIT_C_names = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot']

In [None]:
# One bar chart per variable: group mean with the standard error of the mean
# as the error bar.
by_group = merge_first_clean.groupby('Group_PET')  # group once, reuse per variable
for param in all_dep:
    fig, ax = plt.subplots(figsize=(10,10))
    by_group[param].mean().plot(kind='bar', yerr=by_group[param].sem(), ax=ax)
    # pass the name itself; a one-element list would be rendered as "['ScreenAge']"
    ax.set_ylabel(param)
    #plt.savefig(str(param + '.png'))
    plt.show()

In [None]:
# Per-group correlation matrices of the selected variables, shown as one heatmap.
# 'Group_PET' is not in all_dep, so it has to be selected explicitly here;
# grouping on all_dep alone raises a KeyError.
corr = merge_first_clean[['Group_PET'] + all_dep].groupby('Group_PET').corr()
fig, ax = plt.subplots(figsize=(40, 40))
# center=0 puts the midpoint of the colour map at zero correlation
sns.heatmap(corr, center=0, ax=ax)

In [None]:
# Regression scatter plots of every variable in all_dep (x) against the
# AUDIT-C scores (y), restricted to the rows whose Group_PET is 'T'.
tbi_rows = merge_first_clean[merge_first_clean['Group_PET'] == 'T']
sns.pairplot(tbi_rows, x_vars=all_dep, y_vars=AUDIT_C_names, kind='reg', dropna=True)