In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import datetime as dt

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [None]:
#folder that holds the three excel sheets
#set the PESKIND_TBI_DATA_DIR environment variable to read them from another location (e.g. on another machine);
#when it is not set, the original lab folder below is used, so the three paths keep their original values
DATA_DIR = os.environ.get(
    'PESKIND_TBI_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/Individual_sheets',
).rstrip('/\\')  #tolerate a trailing slash in the override

#path for excel sheet of multi measurements (multiple entries for each participant)
path_multi = DATA_DIR + '/Multi_measurements.xlsx'

#path for excel sheet of single measurements (single entry for each participant)
path_single = DATA_DIR + '/Single_measurements.xlsx'

#path for excel sheet of TBI measurements (single entry for each participant)
path_TBI = DATA_DIR + '/TBI_symptoms.xlsx'

In [None]:
#we are going to deal with path_multi first
#pd.read_excel already returns a DataFrame, so it is used as-is
data_mult = pd.read_excel(path_multi)
print('Data shape all groups:\n', data_mult.shape, '\n')

#select only TBIID C and T (control and TBI): IDs that start with 'C' or 'T' followed by two digits
#na=False counts a missing TBIID as "no match" instead of putting NaN into the boolean mask
data_mult = data_mult[data_mult['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_mult.shape, '\n')

#info() prints its own report and returns None, so it is called on its own line instead of inside print()
print('Data types:')
data_mult.info()
print()
data_mult.head()

In [None]:
#create two new data frames: 1 of the first visit record and 1 of the last visit record (based on 'MeasureDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)
participants = data_mult['TBIID'].unique()

#collect one small frame per participant and concatenate once at the end
#(DataFrame.append was removed in pandas 2.0 and re-copies the growing frame on every call)
last_visit_frames = []
first_visit_frames = []

for part in participants:
    part_rows = data_mult[data_mult['TBIID'] == part]

    #Series.max()/min() skip missing dates, so one blank 'MeasureDate' does not hide the participant's other visits
    max_date = part_rows['MeasureDate'].max()
    min_date = part_rows['MeasureDate'].min()

    #assign() returns a new frame, so 'Group' is never written onto a slice of data_mult
    #the group letter is the first character of the TBIID; rows tied on the date are all kept
    last_visit_frames.append(part_rows[part_rows['MeasureDate'] == max_date].assign(Group=part[0]))
    first_visit_frames.append(part_rows[part_rows['MeasureDate'] == min_date].assign(Group=part[0]))

#pd.concat raises on an empty list, so fall back to an empty frame when no participant is present
#ignore_index=True renumbers the rows from 0 (same result as reset_index(drop=True))
last_visit_data_mult = pd.concat(last_visit_frames, ignore_index=True) if last_visit_frames else pd.DataFrame()
first_visit_data_mult = pd.concat(first_visit_frames, ignore_index=True) if first_visit_frames else pd.DataFrame()

print(len(participants))
print(first_visit_data_mult.shape)
print(last_visit_data_mult.shape)
last_visit_data_mult.head()

In [None]:
#now deal with path_single
#pd.read_excel already returns a DataFrame, so it is used as-is
data_single = pd.read_excel(path_single)
print('Data shape:\n', data_single.shape, '\n')

#select only TBIID C and T (control and TBI): IDs that start with 'C' or 'T' followed by two digits
#na=False counts a missing TBIID as "no match" instead of putting NaN into the boolean mask
data_single = data_single[data_single['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape:\n', data_single.shape, '\n')

#info() prints its own report and returns None, so it is called on its own line instead of inside print()
print('Data types:')
data_single.info()
print()
data_single.head()

In [None]:
#create two new data frames: 1 of the first visit record and 1 of the last visit record (based on 'ScreenDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)
participants = data_single['TBIID'].unique()

#collect one small frame per participant and concatenate once at the end
#(DataFrame.append was removed in pandas 2.0 and re-copies the growing frame on every call)
last_visit_frames = []
first_visit_frames = []

for part in participants:
    part_rows = data_single[data_single['TBIID'] == part]

    #Series.max()/min() skip missing dates, so one blank 'ScreenDate' does not hide the participant's other records
    max_date = part_rows['ScreenDate'].max()
    min_date = part_rows['ScreenDate'].min()

    #assign() returns a new frame, so 'Group' is never written onto a slice of data_single
    #the group letter is the first character of the TBIID; rows tied on the date are all kept
    last_visit_frames.append(part_rows[part_rows['ScreenDate'] == max_date].assign(Group=part[0]))
    first_visit_frames.append(part_rows[part_rows['ScreenDate'] == min_date].assign(Group=part[0]))

#pd.concat raises on an empty list, so fall back to an empty frame when no participant is present
#ignore_index=True renumbers the rows from 0 (same result as reset_index(drop=True))
last_visit_data_single = pd.concat(last_visit_frames, ignore_index=True) if last_visit_frames else pd.DataFrame()
first_visit_data_single = pd.concat(first_visit_frames, ignore_index=True) if first_visit_frames else pd.DataFrame()

print(len(participants))
print(first_visit_data_single.shape)
print(last_visit_data_single.shape)
last_visit_data_single.head()

In [None]:
#now deal with path_TBI
#pd.read_excel already returns a DataFrame, so it is used as-is
data_TBI = pd.read_excel(path_TBI)
print('Data shape all groups:\n', data_TBI.shape, '\n')

#select only TBIID C and T (control and TBI): IDs that start with 'C' or 'T' followed by two digits
#na=False counts a missing TBIID as "no match" instead of putting NaN into the boolean mask
data_TBI = data_TBI[data_TBI['TBIID'].str.match(r'[CT]\d\d', na=False)]
#report the shape of the TBI table itself (not of data_mult) after filtering
print('Data shape only deployed controls and mTBI groups:\n', data_TBI.shape, '\n')

#info() prints its own report and returns None, so it is called on its own line instead of inside print()
print('Data types:')
data_TBI.info()
print()
data_TBI.head()

In [None]:
#create two new data frames: 1 of the first visit record and 1 of the last visit record (based on 'FormDate' column)
#add new column with group ID (C = deployed controls, T = blast mTBI)
participants = data_TBI['TBIID'].unique()

#collect one small frame per participant and concatenate once at the end
#(DataFrame.append was removed in pandas 2.0 and re-copies the growing frame on every call)
last_visit_frames = []
first_visit_frames = []

for part in participants:
    part_rows = data_TBI[data_TBI['TBIID'] == part]

    #Series.max()/min() skip missing dates, so one blank 'FormDate' does not hide the participant's other records
    max_date = part_rows['FormDate'].max()
    min_date = part_rows['FormDate'].min()

    #assign() returns a new frame, so 'Group' is never written onto a slice of data_TBI
    #the group letter is the first character of the TBIID; rows tied on the date are all kept
    last_visit_frames.append(part_rows[part_rows['FormDate'] == max_date].assign(Group=part[0]))
    first_visit_frames.append(part_rows[part_rows['FormDate'] == min_date].assign(Group=part[0]))

#pd.concat raises on an empty list, so fall back to an empty frame when no participant is present
#ignore_index=True renumbers the rows from 0 (same result as reset_index(drop=True))
last_visit_data_TBI = pd.concat(last_visit_frames, ignore_index=True) if last_visit_frames else pd.DataFrame()
first_visit_data_TBI = pd.concat(first_visit_frames, ignore_index=True) if first_visit_frames else pd.DataFrame()

print(len(participants))
print(first_visit_data_TBI.shape)
print(last_visit_data_TBI.shape)
last_visit_data_TBI.head()

In [None]:
#confirm the three last-visit data tables share a common key at TBIID
print('The length of the multi data table is: ', len(last_visit_data_mult))
print('The length of the single data table is: ', len(last_visit_data_single))
print('The length of the TBI data table is: ', len(last_visit_data_TBI))

#count how many TBIIDs of the single table also occur in the other table
#isin() works for tables of different length or row order, and sum() counts the True values
#(len() of a boolean comparison would only report how many rows were compared, not how many matched)
single_ids = last_visit_data_single['TBIID']
#single vs multi
print('The number of matching keys is: ', single_ids.isin(last_visit_data_mult['TBIID']).sum())
#single vs TBI
print('The number of matching keys is: ', single_ids.isin(last_visit_data_TBI['TBIID']).sum())

In [None]:
#join the three last-visit data sets on the common key 'TBIID'
#validate='one_to_one' makes pandas raise if a TBIID is not unique on either side of a merge
merge_opts = dict(how='inner', on='TBIID', suffixes=('_mult', '_single'), validate='one_to_one')

#step 1: multi + single
merge_data_last = last_visit_data_mult.merge(last_visit_data_single, **merge_opts)
print(merge_data_last.shape)

#step 2: add the TBI table
#NOTE(review): the same suffixes are reused here, so a column name shared by the step-1 result and the TBI table
#is labelled '_mult' (left side) / '_single' (TBI side) -- confirm that labelling is intended
merge_data_last = merge_data_last.merge(last_visit_data_TBI, **merge_opts)
print(merge_data_last.shape)
merge_data_last.head()

In [None]:
#join the three first-visit data sets on the common key 'TBIID'
#validate='one_to_one' makes pandas raise if a TBIID is not unique on either side of a merge
merge_opts = dict(how='inner', on='TBIID', suffixes=('_mult', '_single'), validate='one_to_one')

#step 1: multi + single
merge_data_first = first_visit_data_mult.merge(first_visit_data_single, **merge_opts)
print(merge_data_first.shape)

#step 2: add the TBI table
#NOTE(review): the same suffixes are reused here, so a column name shared by the step-1 result and the TBI table
#is labelled '_mult' (left side) / '_single' (TBI side) -- confirm that labelling is intended
merge_data_first = merge_data_first.merge(first_visit_data_TBI, **merge_opts)
print(merge_data_first.shape)
merge_data_first.head()

In [None]:
#confirm the columns of the three last-visit data tables were merged correctly
n_cols_mult = len(last_visit_data_mult.columns)
n_cols_single = len(last_visit_data_single.columns)
n_cols_TBI = len(last_visit_data_TBI.columns)

print('The column length of the multi data table is: ', n_cols_mult)
print('The column length of the single data table is: ', n_cols_single)
print('The column length of the TBI data table is: ', n_cols_TBI)
#the key 'TBIID' is in all three tables but only once in the merged table, hence the -2
print('The columns of all tables add to: ', (n_cols_mult + n_cols_single + n_cols_TBI - 2))
print('The column length of the merge data table is: ', len(merge_data_last.columns))

In [None]:
#confirm the columns of the three first-visit data tables were merged correctly
n_cols_mult = len(first_visit_data_mult.columns)
n_cols_single = len(first_visit_data_single.columns)
n_cols_TBI = len(first_visit_data_TBI.columns)

print('The column length of the multi data table is: ', n_cols_mult)
print('The column length of the single data table is: ', n_cols_single)
print('The column length of the TBI data table is: ', n_cols_TBI)
#the key 'TBIID' is in all three tables but only once in the merged table, hence the -2
print('The columns of all tables add to: ', (n_cols_mult + n_cols_single + n_cols_TBI - 2))
print('The column length of the merge data table is: ', len(merge_data_first.columns))

In [None]:
#mean 'auditc' per group ('Group_single'), first for the first-visit table and then for the last-visit table
#-999 is the missing-value code in these sheets; it is blanked out here so it is not averaged in as a real score
for visit_data in (merge_data_first, merge_data_last):
    auditc_scores = visit_data['auditc'].replace(-999, np.nan)
    #mean() skips NaN, so only recorded scores contribute to each group's mean
    print(auditc_scores.groupby(visit_data['Group_single']).mean())

In [None]:
#select columns of interest (from the first-visit merge) and save to new df
#the names are listed in their original order; the line grouping below only follows shared name prefixes
columns_of_interest = [
    #identifiers, group labels and dates
    'TBIID', 'Group_single', 'Group_mult', 'ScreenDate_single', 'MeasureDate', 'MnthSncBlst',
    #'tbi...' columns
    'tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight',
    'tbiHear', 'tbiNoise', 'tbiTingl', 'tbiTstsml', 'tbiAppet', 'tbiConc', 'tbiForget',
    'tbiDecis', 'tbiSlow', 'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
    'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods', 'tbiFight', 'tbiSpch',
    #totals
    'NSITot', 'TBITot',
    #'PSQI...' columns
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQItot',
    #further totals / scores
    'PCLTot', 'CAPSTotal', 'PHQTot', 'auditc', 'BNITotIm',
    #'LEC...' columns
    'LEC1', 'LEC2', 'LEC3', 'LEC4', 'LEC5', 'LEC6', 'LEC7', 'LEC8', 'LEC9', 'LEC10',
    'LEC11', 'LEC12', 'LEC13', 'LEC14', 'LEC15', 'LEC16', 'LEC17', 'LEC18', 'LEC19', 'LEC20',
    #participant descriptors
    'Race', 'Hispanic', 'ScreenAge', 'Education', 'cestotal', 'RivTot',
    #'..._Mean' columns
    'Inhibit_Mean', 'Shift_Mean', 'Emotional Control_Mean', 'Self Monitor_Mean',
    'Initiate_Mean', 'Working Memory_Mean', 'Plan/Organize_Mean',
    #'QL...' columns
    'QLCog', 'QLCogS', 'QLBIaAns', 'QLBIaTotQ',
    'QLSelf', 'QLSelfS', 'QLBIbAns', 'QLBIbTotQ',
    'QLDaLi', 'QLDaLiS', 'QLBIcAns', 'QLBIcTotQ',
    'QLSoc', 'QLSocS', 'QLBIdAns', 'QLBIdTotQ',
    'QLEmot', 'QLEmotS', 'QLBIeAns', 'QLBIeTotQ',
    'QLPhys', 'QLPhysS', 'QLBIfAns', 'QLBIfTotQ',
    #weight, blood pressure and heart rate columns
    'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE',
    #'Qol...' columns
    'QolJob', 'QolDySlp', 'QolPpLon', 'QolRead', 'QolFrnd', 'QolEfTsk', 'QolEmCtl',
    'QolLsay', 'QolConfd', 'QolPshDo', 'QolTense', 'QolLtDwn', 'QolMxPpl', 'QolWorn',
    'QolLow', 'QolResp', 'QolAvdMx', 'QolBurdn', 'QolForgt', 'QolPlan', 'QolIrrit',
    'Qol2Tird', 'QolForce', 'QolAwake', 'QolMemry',
    #remaining 'Q...' columns
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QDenTrx', 'QNoRep',
    'QFrstBEMo', 'QFrstBEDay', 'QFrstBEYr', 'QRecBEMo', 'QRecBEDay', 'QRecBEYr', 'QNBRHT',
    'APOEGen',
]
merge_short = merge_data_first[columns_of_interest]

In [None]:
#select columns of interest, including the 'BIS...' columns, (from the first-visit merge) and save to new df
#the names are listed in their original order; the line grouping below only follows shared name prefixes
columns_of_interest_BIS = [
    #identifiers, group labels and dates
    'TBIID', 'Group_single', 'Group_mult', 'ScreenDate_single', 'MeasureDate', 'MnthSncBlst',
    #'tbi...' columns
    'tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight',
    'tbiHear', 'tbiNoise', 'tbiTingl', 'tbiTstsml', 'tbiAppet', 'tbiConc', 'tbiForget',
    'tbiDecis', 'tbiSlow', 'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
    'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods', 'tbiFight', 'tbiSpch',
    #totals
    'NSITot', 'TBITot',
    #'PSQI...' columns
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQItot',
    #further totals / scores
    'PCLTot', 'CAPSTotal', 'PHQTot', 'auditc', 'BNITotIm',
    #'LEC...' columns
    'LEC1', 'LEC2', 'LEC3', 'LEC4', 'LEC5', 'LEC6', 'LEC7', 'LEC8', 'LEC9', 'LEC10',
    'LEC11', 'LEC12', 'LEC13', 'LEC14', 'LEC15', 'LEC16', 'LEC17', 'LEC18', 'LEC19', 'LEC20',
    #'BIS...' columns
    #NOTE(review): 'BISMrtAns' is spelled 'Mrt' while its neighbours use 'Mtr' -- confirm it matches the sheet's header
    'BISAtt', 'BISAttAns', 'BISAttTotQ',
    'BISMtr', 'BISMrtAns', 'BISMtrTotQ',
    'BISNonpl', 'BISNonplAns', 'BISNonplTotQ',
    'BISTot', 'BISAns', 'BISTotQ',
    #participant descriptors
    'Race', 'Hispanic', 'ScreenAge', 'Education', 'cestotal', 'RivTot',
    #'..._Mean' columns
    'Inhibit_Mean', 'Shift_Mean', 'Emotional Control_Mean', 'Self Monitor_Mean',
    'Initiate_Mean', 'Working Memory_Mean', 'Plan/Organize_Mean',
    #'QL...' columns
    'QLCog', 'QLCogS', 'QLBIaAns', 'QLBIaTotQ',
    'QLSelf', 'QLSelfS', 'QLBIbAns', 'QLBIbTotQ',
    'QLDaLi', 'QLDaLiS', 'QLBIcAns', 'QLBIcTotQ',
    'QLSoc', 'QLSocS', 'QLBIdAns', 'QLBIdTotQ',
    'QLEmot', 'QLEmotS', 'QLBIeAns', 'QLBIeTotQ',
    'QLPhys', 'QLPhysS', 'QLBIfAns', 'QLBIfTotQ',
    #weight, blood pressure and heart rate columns
    'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE',
    #'Qol...' columns
    'QolJob', 'QolDySlp', 'QolPpLon', 'QolRead', 'QolFrnd', 'QolEfTsk', 'QolEmCtl',
    'QolLsay', 'QolConfd', 'QolPshDo', 'QolTense', 'QolLtDwn', 'QolMxPpl', 'QolWorn',
    'QolLow', 'QolResp', 'QolAvdMx', 'QolBurdn', 'QolForgt', 'QolPlan', 'QolIrrit',
    'Qol2Tird', 'QolForce', 'QolAwake', 'QolMemry',
    #remaining 'Q...' columns
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QDenTrx', 'QNoRep',
    'QFrstBEMo', 'QFrstBEDay', 'QFrstBEYr', 'QRecBEMo', 'QRecBEDay', 'QRecBEYr', 'QNBRHT',
    'APOEGen',
]
merge_short_BIS = merge_data_first[columns_of_interest_BIS]

In [None]:
#-999 is a missing value so replace it with NaN
#np.nan is used instead of None: it is pandas' missing marker for numeric data, so the affected columns stay numeric
#(None can leave them as object columns, depending on the pandas version)
merge_short = merge_short.replace(-999, np.nan)
merge_short.head()

In [None]:
#keep only the rows whose group label is 'T'
is_group_T = merge_short['Group_single'] == 'T'
merge_short_TBI = merge_short[is_group_T]

In [None]:
#examine missing values in each column, listed from fewest to most missing
print(merge_short_TBI.shape)
missing_per_column = merge_short_TBI.isnull().sum()
missing_per_column.sort_values()

In [None]:
#keep only the columns of merge_short that have at least 60 non-missing values
#NOTE(review): merge_short_TBI is split off from merge_short in an earlier cell, so this drop does not reach it -- confirm that is intended
merge_short = merge_short.dropna(axis=1, thresh=60)
print(merge_short.shape)

In [None]:
#fill every remaining missing value with 0
#NOTE(review): this also turns unanswered items into a score of 0 -- confirm that is the intended imputation
merge_short_TBI = merge_short_TBI.fillna(0)

#remove the row with index label 87
#errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError because the row is already gone
#NOTE(review): 87 is a positional label left over from the merge -- presumably one specific participant; selecting by TBIID would be more robust
merge_short_TBI = merge_short_TBI.drop(index=87, errors='ignore')
merge_short_TBI.head()

In [None]:
#column plotted on the y axis of the pair plot
AUDIT_C = ['auditc']

#columns plotted on the x axis of the pair plot, built from prefix-based groups (order is unchanged)
dep_var = (
    ['MnthSncBlst']
    #'tbi...' columns
    + ['tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight',
       'tbiHear', 'tbiNoise', 'tbiTingl', 'tbiTstsml', 'tbiAppet', 'tbiConc', 'tbiForget',
       'tbiDecis', 'tbiSlow', 'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
       'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods', 'tbiFight', 'tbiSpch']
    #totals
    + ['NSITot', 'TBITot']
    #'PSQI...' columns
    + ['PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQItot']
    #further totals
    + ['PCLTot', 'CAPSTotal', 'PHQTot']
    #'LEC...' columns
    + ['LEC1', 'LEC2', 'LEC3', 'LEC4', 'LEC5', 'LEC6', 'LEC7', 'LEC8', 'LEC9', 'LEC10',
       'LEC11', 'LEC12', 'LEC13', 'LEC14', 'LEC15', 'LEC16', 'LEC17', 'LEC18', 'LEC19', 'LEC20']
    #participant descriptors, weight, blood pressure and heart rate columns
    + ['ScreenAge', 'Education', 'cestotal', 'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE']
    #'Q...' columns
    + ['QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QDenTrx', 'QNoRep']
)

In [None]:
#regression plots of 'auditc' (y axis) against each column in dep_var (x axis), one panel per column
sns.pairplot(
    data=merge_short_TBI,
    y_vars=AUDIT_C,
    x_vars=dep_var,
    kind='reg',
)

In [None]:
#columns whose pairwise correlations are shown in the heatmap ('auditc' last)
corr_columns = [
    'MnthSncBlst',
    'NSITot', 'TBITot',
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQItot',
    'PCLTot', 'CAPSTotal', 'PHQTot',
    'ScreenAge', 'Education', 'cestotal',
    'WEIGHT', 'BPSYS', 'BPDIAS', 'HRATE',
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QDenTrx', 'QNoRep',
    'auditc',
]
corr = merge_short_TBI[corr_columns].corr()

#annotated heatmap of the correlation matrix, colour scale centred on 0, drawn on the axes created here
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, center=0, ax=ax)