In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp
import missingno as msno

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

#visualizing results
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline
import seaborn as sns
sns.set_context('poster')

# Let wide/long frames render essentially untruncated when inspected in the notebook.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
# None means "do not truncate cell contents". The legacy sentinel -1 was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/seaborn/numpy.
import warnings; warnings.simplefilter('ignore')
# Print numpy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Location of the merged CSV read by the next cell.
# Set the PESKIND_TBI_DATA_PATH environment variable to point at the file on
# another machine; when it is unset the original lab path is used.
data_path = os.environ.get(
    'PESKIND_TBI_DATA_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/data_final.csv',
)

In [None]:
#read in csv containing data from all surveys and all visitseqs
# (read_csv already returns a DataFrame, so no re-wrapping is needed)
data = pd.read_csv(data_path)

#numeric group code: 'C' -> 0, 'T' -> 1; replace() leaves any other value as it is
data['group_num'] = data['Group'].replace({'C': 0, 'T': 1})

print('Original data shape:\n', data.shape, '\n')
#DataFrame.info() prints its own report and returns None, so it is called
#directly; wrapping it in print() added a stray "None" line to the output
data.info()
data.head(1)

In [None]:
#create new column with year of screen date
#takes the text before the first '-' of each ScreenDate_x value and converts it to int
#(builtin int is used because the np.int alias was removed in NumPy 1.24)
data['screenyear'] = [int(x.split('-')[0]) for x in data['ScreenDate_x']]

## Value counts by group and visit seq

In [None]:
#number of rows per Group value within each VisitSeq
rows_by_visit = data.groupby('VisitSeq')
rows_by_visit['Group'].value_counts()

In [None]:
#restrict the data to the visit sequence numbers listed in visit_keep
visit_keep = [1, 2]
in_kept_visit = data['VisitSeq'].isin(visit_keep)
data_short = data.loc[in_kept_visit]

In [None]:
#columns carried forward into data_short, in the order they should appear
columns_to_keep = [
    'Status_x', 'VisitSeq', 'Group', 'group_num', 'TBIID', 'EntityID',
    'DOB', 'screenyear', 'GType', 'Race', 'Hispanic', 'Handedness',
    'ScreenAge_x', 'Education', 'Marital', 'APOEGen', 'servconn', 'cestotal', 'MnthSncBlst',
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QBEACRM', 'Q5plus2',
    'QEDist_sum', 'QEDist_mean', 'QEDist_min',
    #the 22 columns from 'ACT18T' through 'SDW90TS' are the ones later listed as poi
    'ACT18T', 'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT', 'LetFAST', 'W3LNSSS',
    'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT', 'CvmtDelT',
    'MTPMTPer', 'MT15Per', 'MTTCPer',
    'rffacct', 'rffdst', 'rffdat', 'rffsst', 'rffsat', 'rffspdt',
    'SDW90TS', 'TrailAT', 'TrailBT', 'WCSTPrsT',
    'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3',
    'DOPA', 'DA', 'DOPAC', 'NE', 'DHPG',
    'da_dopa_ratio', 'dopac_da_ratio', 'ne_dopa_ratio', 'dhpg_ne_ratio',
    'HEIGHT', 'WEIGHT', 'BMI', 'BPSYS', 'BPDIAS', 'HRATE',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'BGlucose', 'BNa', 'BUN', 'BCreat', 'BOsmo', 'UOsmo', 'USG', 'UNa', 'BK',
    'TotalChol', 'LDL', 'HDL', 'Trig',
    'CSFPROTEIN_x', 'CSFGLUCOSE', 'CSFRBCS',
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone',
    'DvpHA', 'DvpHAAct', 'DvpHASlp', 'DvpHAMd', 'DvpHAStr',
    'DvpBP', 'DvpBPAct', 'DvpBPslp', 'DvpBPMd', 'DvpBPStr',
    'BISAtt', 'BISMtr', 'BISNonpl', 'BISTot',
    'capsCrtA', 'CAPSTotal',
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQITot',
    'PCLTot', 'PCL_reexp', 'PCL_avoid', 'PCL_numb', 'PCL_hyper',
    'PHQTot', 'PHQ_psych', 'PHQ_somatic',
    'NSITot', 'TBITot',
    'NSI_vestibular', 'NSI_somatosensory', 'NSI_cognitive', 'NSI_affective',
    #NOTE(review): 'NIS_ERP_vestsom' breaks the 'NSI_' prefix pattern - confirm it matches the CSV header
    'NSI_ERP_affective', 'NIS_ERP_vestsom',
    'Amygdala_l', 'Amygdala_r', 'Pallidum_l', 'Pallidum_r', 'Midbrain',
    'Plasma1_bFGF', 'Plasma1_CRP', 'Plasma1_Eotaxin', 'Plasma1_Eotaxin3',
    'Plasma1_Flt1', 'Plasma1_ICAM1', 'Plasma1_IFNγ', 'Plasma1_IL10',
    'Plasma1_IL12_IL23p40', 'Plasma1_IL12p70', 'Plasma1_IL15',
    'Plasma1_IL16', 'Plasma1_IL17A', 'Plasma1_IL1α', 'Plasma1_IL6',
    'Plasma1_IL7', 'Plasma1_IL8', 'Plasma1_IP10', 'Plasma1_MCP1',
    'Plasma1_MCP4', 'Plasma1_MDC', 'Plasma1_MIP1α', 'Plasma1_MIP1β',
    'Plasma1_PlGF', 'Plasma1_SAA', 'Plasma1_TARC', 'Plasma1_Tie2',
    'Plasma1_TNFα', 'Plasma1_TNFβ', 'Plasma1_VCAM1', 'Plasma1_VEGF',
    'Plasma1_VEGFC', 'Plasma1_VEGFD',
]

#keep only those columns (raises KeyError if any of them is missing)
data_short = data_short.loc[:, columns_to_keep]
print(data_short.shape)
data_short.head()

In [None]:
#treat the numeric codes -20.0 and -920.0 as missing: map each of them to NaN in every column
missing_code_map = dict.fromkeys([-20.0, -920.0], np.nan)
data_short = data_short.replace(missing_code_map)
data_short.describe()

## Explore missing data across all TBIIDs

In [None]:
#parameters of interest (poi): the score columns examined in the rest of the notebook
poi = [
    'ACT18T', 'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT', 'LetFAST', 'W3LNSSS',
    'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT', 'CvmtDelT',
    'MTPMTPer', 'MT15Per', 'MTTCPer',
    'rffacct', 'rffdst', 'rffdat', 'rffsst', 'rffsat', 'rffspdt',
    'SDW90TS',
]

#identifying columns first, then the poi columns in the same order as above
poi_id_columns = ['VisitSeq', 'Group', 'group_num', 'TBIID', 'screenyear', 'APOEGen']
data_poi = data_short[poi_id_columns + poi]

#non-missing count of every remaining column for each VisitSeq/Group combination
data_poi.groupby(['VisitSeq', 'Group']).count().reset_index()

In [None]:
#non-missing count of each poi column per VisitSeq/Group, then averaged across the poi columns
poi_counts_by_group = data_poi.groupby(['VisitSeq', 'Group'])[poi].count()
poi_counts_by_group.mean(axis=1).reset_index(name='count')

In [None]:
#total number of non-missing poi values for each VisitSeq/Group/TBIID combination
poi_counts_by_subject = data_poi.groupby(['VisitSeq', 'Group', 'TBIID'])[poi].count()
poi_counts_by_subject.sum(axis=1).reset_index(name='count')

In [None]:
#missingness matrix for the VisitSeq == 1 rows
msno.matrix(data_poi.loc[data_poi['VisitSeq'] == 1])

In [None]:
#missingno bar chart for the VisitSeq == 1 rows
msno.bar(data_poi.loc[data_poi['VisitSeq'] == 1])

In [None]:
#missingno heatmap for the VisitSeq == 1 rows
msno.heatmap(data_poi.loc[data_poi['VisitSeq'] == 1])

In [None]:
#missingness matrix for the VisitSeq == 2 rows
msno.matrix(data_poi.loc[data_poi['VisitSeq'] == 2])

In [None]:
#number of non-missing TBIID entries for each VisitSeq/screenyear combination
visit_year_count = (
    data_poi
    .groupby(['VisitSeq', 'screenyear'])['TBIID']
    .count()
    .reset_index(name='TBIID_count')
)
#number of non-missing values of each poi column for the same combinations
visit_year_count_poi = (
    data_poi
    .groupby(['VisitSeq', 'screenyear'])[poi]
    .count()
    .reset_index()
)
#fraction responding per poi: poi counts divided row by row by the TBIID count,
#with the two grouping keys put back as the leading columns
visit_year_perc_poi = (
    visit_year_count_poi[poi]
    .div(visit_year_count['TBIID_count'].values, axis=0)
    .assign(VisitSeq=visit_year_count_poi['VisitSeq'],
            screenyear=visit_year_count_poi['screenyear'])
    [['VisitSeq', 'screenyear'] + poi]
)

visit_year_perc_poi

In [None]:
#heatmap of the pairwise correlations between the columns of visit_year_perc_poi
#(the frame still carries VisitSeq and screenyear, so corr() is not limited to the poi columns)
plt.figure(figsize=(13, 13))
perc_poi_corr = visit_year_perc_poi.corr()
sns.heatmap(perc_poi_corr)

## Explore missing data for TBIIDs that had a VisitSeq 2

In [None]:
#TBIIDs that have a VisitSeq == 2 row
has_visit2 = data_poi['VisitSeq'] == 2
TBIID_v2 = data_poi.loc[has_visit2, 'TBIID'].values

#keep every row (whatever its VisitSeq) that belongs to one of those TBIIDs
data_poi_v2check = data_poi.loc[data_poi['TBIID'].isin(TBIID_v2)]
v2check_group_counts = data_poi_v2check.groupby('VisitSeq')['Group'].value_counts()
print(v2check_group_counts)
print(data_poi_v2check.shape)
data_poi_v2check.head()

In [None]:
#missingness matrix of the VisitSeq == 1 rows for participants that also have a visit 2
msno.matrix(data_poi_v2check.loc[data_poi_v2check['VisitSeq'] == 1])

In [None]:
#overlay the visit-1 AnimalsT distributions of group C and group T
#(each curve is labeled so the two groups can be told apart in the figure)
animals_visit1_rows = data_poi[data_poi['VisitSeq'] == 1]
for animals_group in ('C', 'T'):
    sns.distplot(
        animals_visit1_rows.loc[animals_visit1_rows['Group'] == animals_group, 'AnimalsT'].dropna(),
        label=animals_group,
    )
plt.legend(title='Group');

## Explore IIV data

In [None]:
#one figure per poi column: visit-1 distributions of group C and group T overlaid
#the visit-1 filter does not depend on the column, so it is computed once
poi_visit1_rows = data_poi[data_poi['VisitSeq'] == 1]
for var in poi:
    print(var)
    for poi_group in ('C', 'T'):
        sns.distplot(
            poi_visit1_rows.loc[poi_visit1_rows['Group'] == poi_group, var].dropna(),
            label=poi_group,
        )
    #label the curves so the two groups can be told apart
    plt.legend(title='Group')
    plt.show()

In [None]:
#reshape to long ("tidy") form: one row per (participant row, IIV parameter) pair
meta_params = ['VisitSeq', 'Group', 'TBIID', 'screenyear']
IIV_parmas = [
    'ACT18T', 'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT', 'LetFAST', 'W3LNSSS',
    'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT', 'CvmtDelT',
    'MTPMTPer', 'MT15Per', 'MTTCPer',
    'rffacct', 'rffdst', 'rffdat', 'rffsst', 'rffsat', 'rffspdt',
    'SDW90TS',
]
#melt keeps meta_params as identifiers and stacks IIV_parmas into 'variable'/'value' columns
data_poi_tidy = data_poi.melt(id_vars=meta_params, value_vars=IIV_parmas)
print(data_poi_tidy.shape)
data_poi_tidy.head()

In [None]:
#split violin per parameter for the VisitSeq == 1 rows, one half per Group
sns.catplot(
    data=data_poi_tidy.loc[data_poi_tidy['VisitSeq'] == 1],
    x="variable", y="value", hue="Group",
    kind="violin", split=True, inner="stick",
    palette="pastel", height=15, aspect=3,
)

In [None]:
#split violin per parameter for the VisitSeq == 2 rows, one half per Group
sns.catplot(
    data=data_poi_tidy.loc[data_poi_tidy['VisitSeq'] == 2],
    x="variable", y="value", hue="Group",
    kind="violin", split=True, inner="stick",
    palette="pastel", height=15, aspect=3,
)

## z-score

In [None]:
# center and scale the data (RobustScaler with its default settings)
scaler = RobustScaler()

meta_col = ['VisitSeq', 'Group', 'group_num', 'TBIID', 'screenyear', 'APOEGen']
poi_col = ['ACT18T',
       'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT', 'LetFAST',
       'W3LNSSS', 'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT',
       'CvmtDelT', 'MTPMTPer', 'MT15Per', 'MTTCPer', 'rffacct', 'rffdst',
       'rffdat', 'rffsst', 'rffsat', 'rffspdt', 'SDW90TS']

#visit 1 first
data_poi_v1 = data_poi[data_poi['VisitSeq'] == 1]
#scale data; fit_transform returns a bare array, so it is relabeled with
#poi_col - the same list that selected the input columns - rather than a
#separately maintained list that could drift out of sync
data_poi_scaled = scaler.fit_transform(data_poi_v1[poi_col])
data_poi_scaled = pd.DataFrame(data=data_poi_scaled, columns=poi_col)
#reset_index() gives the metadata the same 0..n-1 index as the scaled frame so
#the column-wise concat lines the rows up; the previous index is kept as an
#'index' column
data_poi_scaled = pd.concat([data_poi_v1[meta_col].reset_index(), data_poi_scaled], ignore_index=False, axis=1)

#compute std across the scaled poi columns for each row
data_poi_scaled['std'] = data_poi_scaled[poi_col].std(axis=1)
#compute maximum discrepancy (max minus min of the scaled poi columns) for each row
data_poi_scaled['md'] = data_poi_scaled[poi_col].max(axis=1) - data_poi_scaled[poi_col].min(axis=1)

print(data_poi_scaled.shape)
data_poi_scaled.head(1)

In [None]:
#bar plot of the row-wise 'std' by Group, with ci=68 for the error bars
sns.catplot(data=data_poi_scaled, x="Group", y="std", kind="bar", ci=68)

In [None]:
#bar plot of the row-wise maximum discrepancy 'md' by Group, with ci=68 for the error bars
sns.catplot(data=data_poi_scaled, x="Group", y="md", kind="bar", ci=68)

In [None]:
def _standardize_group(group_rows, scaler, meta_col, poi_col):
    """Scale one group's poi columns and attach its metadata and row-wise std.

    Parameters
    ----------
    group_rows : DataFrame
        Rows of a single group; must contain the meta_col and poi_col columns.
    scaler : object
        Scaler exposing fit_transform; it is refit on this group's poi_col values.
    meta_col, poi_col : list of str
        Metadata column names and the names of the columns to scale.

    Returns
    -------
    DataFrame
        reset_index() of the metadata (previous index kept as an 'index'
        column), followed by the scaled poi_col columns and a 'std' column
        holding the standard deviation across the scaled poi columns per row.
    """
    # fit_transform returns a bare array; label it with the columns that were scaled
    scaled = pd.DataFrame(data=scaler.fit_transform(group_rows[poi_col]), columns=poi_col)
    # both pieces carry a 0..n-1 index here, so the column-wise concat pairs rows up
    combined = pd.concat([group_rows[meta_col].reset_index(), scaled], ignore_index=False, axis=1)
    combined['std'] = combined[poi_col].std(axis=1)
    return combined


# center and scale the data
scaler = StandardScaler()

meta_col = ['VisitSeq', 'Group', 'group_num', 'TBIID', 'screenyear', 'APOEGen']
poi_col = ['ACT18T',
       'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT', 'LetFAST',
       'W3LNSSS', 'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT',
       'CvmtDelT', 'MTPMTPer', 'MT15Per', 'MTTCPer', 'rffacct', 'rffdst',
       'rffdat', 'rffsst', 'rffsat', 'rffspdt', 'SDW90TS']

#visit 1 first
data_poi_v1 = data_poi[data_poi['VisitSeq'] == 1]

#standardize C and T separately: the scaler is refit on each group's own rows
data_poi_C = data_poi_v1[data_poi_v1['Group'] ==  'C']
print(data_poi_C.shape)
data_poi_C_scaled = _standardize_group(data_poi_C, scaler, meta_col, poi_col)
print(data_poi_C_scaled.shape)

data_poi_T = data_poi_v1[data_poi_v1['Group'] ==  'T']
print(data_poi_T.shape)
data_poi_T_scaled = _standardize_group(data_poi_T, scaler, meta_col, poi_col)
print(data_poi_T_scaled.shape)

#stack the two groups back together; this replaces the earlier data_poi_scaled
#and the new frame has no 'md' column
data_poi_scaled = pd.concat([data_poi_C_scaled, data_poi_T_scaled], ignore_index=True, axis=0)

data_poi_scaled.describe()

In [None]:
#bar plot of the row-wise 'std' by Group, with ci=68 for the error bars
sns.catplot(data=data_poi_scaled, x="Group", y="std", kind="bar", ci=68)

In [None]:
#outliers for auditc and QBlstExp were identified (outlier = >3 SD from the mean); the removal code below is commented out
#data = data[data["TBIID"] != 'C010']
#data = data[data["TBIID"] != 'T080']