In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import datetime as dt
import string

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#global seaborn appearance for every figure in this notebook:
#"poster" scaling context and the "ticks" axes style
sns.set_context("poster")
sns.set_style("ticks")
#import yellowbrick as yb

#pandas display limits: let long/wide frames render without truncation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 15000)
#None = never truncate cell contents; the older -1 spelling has been deprecated
#since pandas 1.0 and is rejected by pandas 2.x
pd.set_option('display.max_colwidth', None)

#NOTE(review): this silences every warning, including pandas deprecation notices;
#consider narrowing the filter once the notebook runs cleanly
import warnings; warnings.simplefilter('ignore')
#print floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
#location of the frozen TBI workbook (adjacent string literals are joined into one path)
path_data = (
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/'
    'PeskindTBI/freeze_191003/TBIFreeze_20191003_python.xlsx'
)

#open the workbook once so later cells can parse individual sheets from it,
#then list the sheet names it contains
whole_file = pd.ExcelFile(path_data)
print(whole_file.sheet_names, '\n')

In [None]:
#original file has multiple sheets, one per data set type (e.g. single visit, imaging, labs)
#only interested in deployed controls ('C') and mTBI ('T')
#each participant can have multiple visits for each visit sequence (1-3 visit sequences per participant)
#we are currently interested in data only from the first visit of each visit sequence

#each data set uses a different column name for visit date; map sheet name -> date column name
sheet_dic = {'Single_Record_Measures': 'ScreenDate', 
             'Multi_Record_Measures': 'MeasureDate', 
             'TBI_Symptom_Q': 'NSIFormDate',  
             'DTI_Kleinhans_Lab': 'ScreenDate', 
             'FDG-PET': 'ScreenDate',
             'Test_Dates': 'CogPreTestDate',
             'labs_1': 'CLabDate',
             'labs_2': 'CLabDate'}

#sentinel codes that are converted to NaN in every sheet
missing_codes = {-999.0: np.nan, 999: np.nan, 'None': np.nan}


def _first_visit_rows(visit_rows, date_col):
    """Return the earliest-dated row(s) of each participant within one visit sequence.

    Parameters
    ----------
    visit_rows : pd.DataFrame
        Rows belonging to a single visit sequence; must contain 'TBIID' and `date_col`.
    date_col : str
        Name of the column that holds the visit date for the current sheet.

    Returns
    -------
    pd.DataFrame
        For each participant (in order of first appearance) every row whose date
        equals that participant's minimum date. Rows tied on the minimum date are
        all kept. A participant whose dates are all missing contributes no rows.
    """
    earliest = []
    for _, part_rows in visit_rows.groupby('TBIID', sort=False):
        #Series.min() skips missing dates, so a single blank date no longer
        #makes the minimum missing and drops the whole participant
        min_date = part_rows[date_col].min()
        picked = part_rows[part_rows[date_col] == min_date]
        if len(picked) > 0:
            earliest.append(picked)
    if not earliest:
        return visit_rows.iloc[0:0]
    #one concat instead of growing a frame row-block by row-block
    #(DataFrame.append was removed in pandas 2.0)
    return pd.concat(earliest)


first_visits = pd.DataFrame()

for sheet in whole_file.sheet_names:

    #a sheet without a configured date column cannot be reduced to first visits
    if sheet not in sheet_dic:
        print('Sheet skipped (no date column listed in sheet_dic):\n', sheet, '\n')
        continue

    print('Sheet being processed:\n', sheet)
    date_col = sheet_dic[sheet]

    #create intermediate dataframe
    data_int = whole_file.parse(sheet)

    #select only TBIID C and T (control and TBI); na=False treats a missing TBIID as "no match"
    data_int = data_int[data_int['TBIID'].str.match(r'[CT]\d\d', na=False)]
    print('Data shape only deployed controls and mTBI groups:\n', data_int.shape)

    #meta data
    visit_sequences = data_int['VisitSeq'].unique()
    print('Number of visit sequences:\n', len(visit_sequences))

    #meta data
    participants_total = data_int['TBIID'].unique()
    print('Number of participants:\n', len(participants_total), '\n')

    #for each visit sequence keep only each participant's first visit
    visit_frames = []
    for visit in visit_sequences:
        print('Visit sequence being processed:\n', visit)

        full_visit_seq_data = data_int[data_int['VisitSeq'] == visit]
        visit_data_int = _first_visit_rows(full_visit_seq_data, date_col).reset_index(drop=True)
        if len(visit_data_int) > 0:
            visit_frames.append(visit_data_int)

    #stack the visit sequences and clean up missing values
    if visit_frames:
        sheet_data_int = pd.concat(visit_frames).replace(missing_codes)
    else:
        #keep the sheet's columns so the merge below still finds its keys
        sheet_data_int = data_int.iloc[0:0]

    #combine into one final df that contains all first visits for each sequence
    #NOTE(review): a sheet with several rows per (TBIID, VisitSeq) multiplies rows in this
    #left merge, and column names shared between sheets receive _x/_y suffixes
    if first_visits.shape[0] < 1:
        first_visits = sheet_data_int
    else:
        first_visits = pd.merge(first_visits, sheet_data_int, how='left', on=['TBIID', 'VisitSeq'], sort=False)
    #the shape printed here is that of the combined table so far
    print('Visit data shape for current sheet:\n', first_visits.shape, '\n')

#clean up duplicates and add group column (first character of TBIID)
first_visits = first_visits.drop_duplicates().assign(Group=lambda df: df['TBIID'].str[0])

print('Final shape of first visit data:\n', first_visits.shape)
first_visits.head()

In [None]:
#read in data from RH pull and combine with df
#NOTE(review): this cell overwrites first_visits; re-running it without re-running the
#cells above merges data_RH a second time
path_RH = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/RHdatapull/RHpull.csv'

data_RH = pd.read_csv(path_RH)
print('Data shape all groups:\n', data_RH.shape)

#select only TBIID C and T (control and TBI); na=False treats a missing TBIID as
#"no match" instead of failing on a mask that contains NaN
data_RH = data_RH[data_RH['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_RH.shape)

#clean up missing values
data_RH = data_RH.replace({-999.0: np.nan, 'None': np.nan})

#change DA = 0 to np.nan (presumably so dopac_da_ratio becomes NaN rather than inf)
data_RH['DA'] = data_RH['DA'].replace({0: np.nan})

#ratio columns as (new column, numerator, denominator)
#NOTE(review): zeros in DOPA or NE are not converted to NaN and would give inf ratios — confirm
ratio_specs = [('da_dopa_ratio', 'DA', 'DOPA'),
               ('dopac_da_ratio', 'DOPAC', 'DA'),
               ('ne_dopa_ratio', 'NE', 'DOPA'),
               ('dhpg_ne_ratio', 'DHPG', 'NE')]
for ratio_name, numerator, denominator in ratio_specs:
    data_RH[ratio_name] = data_RH[numerator] / data_RH[denominator]

#add to df containing first visit info
first_visits = pd.merge(first_visits, data_RH, how='left', on=['TBIID', 'VisitSeq'], sort=False)

print('Final shape of first visit data:\n', first_visits.shape)
first_visits.head()

In [None]:
#MSD multiplex measurements on blood and CSF: load, prune sparse rows/columns,
#then join onto the first-visit table
path_MESO = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/freeze_191003/MSD_for_Abbie.xlsx'

data_MESO = pd.read_excel(path_MESO)
print('Data shape all groups:\n', data_MESO.shape)

#keep only rows holding at least 3 non-missing values (drops participants that have no data)
data_MESO = data_MESO.dropna(axis=0, thresh=3)
print('Data shape all groups:\n', data_MESO.shape)

#keep only columns holding at least 100 non-missing values
#(many missing values suggest the MSD sensitivity is not sufficient for that analyte)
data_MESO = data_MESO.dropna(axis=1, thresh=100)
print('Data shape all groups:\n', data_MESO.shape)

#left join keeps every first-visit row and attaches MSD values where keys match
first_visits = first_visits.merge(data_MESO, how='left', on=['TBIID', 'VisitSeq'], sort=False)

print('Final shape of first visit data:\n', first_visits.shape)
first_visits.head()

In [None]:
#distance-from-blast summaries (worst 5)
#NOTE(review): the label slice takes every column positioned from QEDist1 through
#QEDist5 — confirm no unrelated column sits between them
qe_dist = first_visits.loc[:, 'QEDist1':'QEDist5']
#min_count=1: a row with no recorded distance gets NaN instead of a misleading 0,
#consistent with the mean and min columns below
first_visits['QEDist_sum'] = qe_dist.sum(axis=1, min_count=1)
first_visits['QEDist_mean'] = qe_dist.mean(axis=1)
first_visits['QEDist_min'] = qe_dist.min(axis=1)

#item lists for NSI 4-factor scoring approach
NSI_comp_vestibular = ['tbiDizzy', 'tbiBalan', 'tbiCoord']
NSI_comp_somatosensory = ['tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight', 'tbiNoise', 'tbiTingl', 'tbiTstsml']
NSI_comp_cognitive = ['tbiConc', 'tbiForget', 'tbiDecis', 'tbiSlow']
NSI_comp_affective = ['tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit', 'tbiOverw']
NSI_comp_ERP_affective = ['tbiDisin', 'tbiWithd', 'tbiMoods', 'tbiFight']
#the 'NIS' spelling is kept as-is: the resulting column name 'NIS_ERP_vestsom'
#is referenced with that spelling where columns are selected for export
NIS_comp_ERP_vestsom = ['tbiRing', 'tbiSpch']

#item lists for PCL subscores, 4 factor model from King et al., 1998
PCL_reexp = ['PCL1', 'PCL2', 'PCL3', 'PCL4', 'PCL5']
PCL_avoid = ['PCL6', 'PCL7']
PCL_numb = ['PCL8', 'PCL9', 'PCL10', 'PCL11', 'PCL12']
PCL_hyper = ['PCL13', 'PCL14', 'PCL15', 'PCL16', 'PCL17']

#item lists for PHQ-9 subscores, 2 factor model
PHQ_psych = ['PHQ1', 'PHQ2', 'PHQ6', 'PHQ9']
PHQ_somatic = ['PHQ3', 'PHQ4', 'PHQ5', 'PHQ7', 'PHQ8']

#(new column, item columns): each subscore is the row-wise mean of its items,
#ignoring missing items; the list order fixes the order of the new columns
subscale_specs = [('NSI_vestibular', NSI_comp_vestibular),
                  ('NSI_somatosensory', NSI_comp_somatosensory),
                  ('NSI_cognitive', NSI_comp_cognitive),
                  ('NSI_affective', NSI_comp_affective),
                  ('NSI_ERP_affective', NSI_comp_ERP_affective),
                  ('NIS_ERP_vestsom', NIS_comp_ERP_vestsom),
                  ('PCL_reexp', PCL_reexp),
                  ('PCL_avoid', PCL_avoid),
                  ('PCL_numb', PCL_numb),
                  ('PCL_hyper', PCL_hyper),
                  ('PHQ_psych', PHQ_psych),
                  ('PHQ_somatic', PHQ_somatic)]
for score_name, item_cols in subscale_specs:
    first_visits[score_name] = first_visits.loc[:, item_cols].mean(axis=1)

print(first_visits.shape)
first_visits.head(1)

In [None]:
#columns carried into the trimmed analysis table, in output order
#(section labels are inferred from the column names — confirm against the data dictionary)
columns_to_keep = [
    #identifiers, demographics, service
    'Status_x', 'VisitSeq', 'Group', 'TBIID', 'EntityID', 'DOB', 'GType', 'Race', 'Hispanic', 'Handedness',
    'ScreenAge', 'Education', 'Marital', 'APOEGen', 'servconn', 'cestotal', 'MnthSncBlst',
    #blast / head-injury exposure, including the derived distance summaries
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QBEACRM', 'Q5plus2',
    'QEDist_sum', 'QEDist_mean', 'QEDist_min',
    #alcohol use
    'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3',
    #RH pull analytes and derived ratios
    'DOPA', 'DA', 'DOPAC', 'NE', 'DHPG', 'da_dopa_ratio', 'dopac_da_ratio', 'ne_dopa_ratio', 'dhpg_ne_ratio',
    #vitals and medical conditions
    'HEIGHT', 'WEIGHT', 'BMI', 'BPSYS', 'BPDIAS', 'HRATE',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    #blood / urine / CSF labs
    'BGlucose', 'BNa', 'BUN', 'BCreat', 'BOsmo', 'UOsmo', 'USG', 'UNa', 'BK',
    'TotalChol', 'LDL', 'HDL', 'Trig',
    'CSFPROTEIN_x', 'CSFGLUCOSE', 'CSFRBCS',
    #medical history and diagnoses
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone',
    'DvpHA', 'DvpHAAct', 'DvpHASlp', 'DvpHAMd', 'DvpHAStr', 'DvpBP',
    'DvpBPAct', 'DvpBPslp', 'DvpBPMd', 'DvpBPStr',
    #questionnaire totals and subscores
    'BISAtt', 'BISMtr', 'BISNonpl', 'BISTot',
    'capsCrtA', 'CAPSTotal',
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PSQItot',
    'PCLTot', 'PCL_reexp', 'PCL_avoid', 'PCL_numb', 'PCL_hyper',
    'PHQTot', 'PHQ_psych', 'PHQ_somatic',
    'NSITot', 'TBITot', 'NSI_vestibular', 'NSI_somatosensory', 'NSI_cognitive', 'NSI_affective', 'NSI_ERP_affective', 'NIS_ERP_vestsom',
    #pre-visit state
    'PreSleep', 'PreCaff', 'PreETOH', 'PreNic', 'PreTHC',
    #imaging regions
    'Amygdala_l', 'Amygdala_r', 'Pallidum_l', 'Pallidum_r', 'Midbrain',
    #plasma multiplex analytes
    'Plasma1_bFGF', 'Plasma1_CRP', 'Plasma1_Eotaxin', 'Plasma1_Eotaxin3', 'Plasma1_Flt1', 'Plasma1_ICAM1', 'Plasma1_IFNγ', 'Plasma1_IL10',
    'Plasma1_IL12_IL23p40', 'Plasma1_IL12p70', 'Plasma1_IL15',
    'Plasma1_IL16', 'Plasma1_IL17A', 'Plasma1_IL1α', 'Plasma1_IL6',
    'Plasma1_IL7', 'Plasma1_IL8', 'Plasma1_IP10', 'Plasma1_MCP1',
    'Plasma1_MCP4', 'Plasma1_MDC', 'Plasma1_MIP1α', 'Plasma1_MIP1β',
    'Plasma1_PlGF', 'Plasma1_SAA', 'Plasma1_TARC', 'Plasma1_Tie2',
    'Plasma1_TNFα', 'Plasma1_TNFβ', 'Plasma1_VCAM1', 'Plasma1_VEGF',
    'Plasma1_VEGFC', 'Plasma1_VEGFD',
]

#select exactly those columns; a name missing from first_visits raises KeyError
first_visits_short = first_visits.loc[:, columns_to_keep]
print(first_visits_short.shape)

In [None]:
#write the trimmed table, row index included, to the current working directory
first_visits_short.to_csv('first_visits_short.csv', index=True)

In [None]:
#per-column summary: name, number of non-missing values, and the distinct non-missing values
for param in first_visits_short.columns:
    observed = first_visits_short[param].dropna()
    print(param)
    print(observed.count())
    try:
        print(observed.unique(), '\n')
    except Exception as err:
        #best effort: report why the distinct values could not be listed instead of
        #silently skipping; unlike a bare except, this lets KeyboardInterrupt through
        print('unique values unavailable:', repr(err), '\n')