In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import datetime as dt
import string

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("poster")
sns.set_style("ticks")
#import yellowbrick as yb

# Notebook-wide display configuration: show wide/long frames without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 15000)
# None means "do not truncate cell contents". The former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

# NOTE: this silences *all* warnings for the rest of the session (including
# pandas deprecation and SettingWithCopy warnings).
import warnings
warnings.simplefilter('ignore')

# print small floats as plain decimals instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
# Excel workbook holding the frozen data export; one sheet per data set type
# (e.g. single visit, imaging etc).
path_data = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/freeze_191003/TBIFreeze_20191003_python.xlsx'

# Scope of this extraction:
#   * only deployed controls ('C') and mTBI ('T') participants are of interest
#   * a participant has 1-3 visit sequences, each of which can hold several visits
#   * only the first visit of every visit sequence is kept

# accumulator for the merged first-visit rows of all sheets
first_visits = pd.DataFrame()

whole_file = pd.ExcelFile(path_data)
print(whole_file.sheet_names, '\n')

# Every sheet stores its visit date under a different column name;
# this maps sheet name -> name of that sheet's date column.
sheet_dic = {
    'Single_Record_Measures': 'ScreenDate',
    'Multi_Record_Measures': 'MeasureDate',
    'TBI_Symptom_Q': 'NSIFormDate',
    'Test_Dates': 'CogPreTestDate',
    'Clinical_Labs': 'CLabDate',
    'DTI_Kleinhans_Lab': 'ScreenDate',
    'FDG-PET': 'ScreenDate',
}

# For every sheet: keep C/T participants, pick the earliest-dated row(s) of each
# participant within each visit sequence, then left-merge the sheet onto
# first_visits (the first non-empty sheet seeds first_visits).
for sheet in whole_file.sheet_names:

    print('Sheet being processed:\n', sheet)

    # name of this sheet's visit-date column (KeyError if the sheet is not listed in sheet_dic)
    date_col = sheet_dic[sheet]

    #create intermediate dataframe
    data_int = pd.read_excel(whole_file, sheet)

    #select only TBIID C and T (control and TBI)
    # na=False: rows with a missing TBIID are excluded instead of producing a NaN mask,
    # which cannot be used for boolean indexing
    data_int = data_int[data_int['TBIID'].str.match(r'[CT]\d\d', na=False)]
    print('Data shape only deployed controls and mTBI groups:\n', data_int.shape)

    visit_sequences = data_int['VisitSeq'].unique()
    print('Number of visit sequences:\n', len(visit_sequences))

    participants_total = data_int['TBIID'].unique()
    print('Number of participants:\n', len(participants_total), '\n')

    # one cleaned frame per visit sequence; concatenated once after the loop
    # (DataFrame.append no longer exists in pandas >= 2.0 and re-copies the frame on every call)
    visit_frames = []

    #for each visit
    for visit in visit_sequences:

        #get visit data
        full_visit_seq_data = data_int[data_int['VisitSeq'] == visit]

        #loop through participants and for each one find and save the first visit of that visit sequence
        participant_frames = []
        for part in full_visit_seq_data['TBIID'].unique():
            part_rows = full_visit_seq_data[full_visit_seq_data['TBIID'] == part]
            min_date = part_rows[date_col].values.min()
            # all rows sharing the earliest date are kept (there may be more than one)
            participant_frames.append(part_rows[part_rows[date_col] == min_date])

        if not participant_frames:
            # no rows for this visit sequence value (e.g. a missing VisitSeq)
            continue

        #reset indexes and clean up missing values
        visit_data_int = pd.concat(participant_frames).reset_index(drop=True)
        visit_data_int = visit_data_int.replace({-999.0: np.nan, 999: np.nan, 'None': np.nan})

        if not visit_data_int.empty:
            visit_frames.append(visit_data_int)

    # all first visits of this sheet; an empty frame that still carries the sheet's
    # columns when nothing was selected
    if visit_frames:
        sheet_data_int = pd.concat(visit_frames)
    else:
        sheet_data_int = data_int.iloc[0:0]

    #combine into one final df that contains all first visit for each sequence
    # how='left': only (TBIID, VisitSeq) pairs already in first_visits are retained
    if first_visits.shape[0] < 1:
        first_visits = sheet_data_int
    else:
        first_visits = pd.merge(first_visits, sheet_data_int, how='left', on=['TBIID', 'VisitSeq'], sort=False)
    print('Visit data shape for current sheet:\n', first_visits.shape, '\n')

# Drop fully identical rows, then derive the group label from the first
# character of the participant ID.
first_visits = (
    first_visits
    .drop_duplicates()
    .assign(Group=lambda frame: frame['TBIID'].map(lambda tbiid: tbiid[0]))
)

print('Final shape of first visit data:\n', first_visits.shape)
first_visits.head()

In [None]:
#read in data from RH pull and combine with df
# NOTE: this cell overwrites first_visits. Re-running it without re-running the
# extraction cell merges the RH columns a second time (pandas then adds _x/_y suffixes).
path_RH = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/RHdatapull/RHpull.csv'

data_RH = pd.read_csv(path_RH)
print('Data shape all groups:\n', data_RH.shape)

#select only TBIID C and T (control and TBI)
# na=False: rows with a missing TBIID are excluded instead of producing a NaN mask,
# which cannot be used for boolean indexing
data_RH = data_RH[data_RH['TBIID'].str.match(r'[CT]\d\d', na=False)]
print('Data shape only deployed controls and mTBI groups:\n', data_RH.shape)

#clean up missing values
data_RH = data_RH.replace({-999.0: np.nan, 'None': np.nan})

#change DA = 0 to np.nan, add ratio cals
# (only DA is blanked; a zero DOPA or NE denominator still yields inf/NaN below)
data_RH['DA'] = data_RH['DA'].replace({0: np.nan})
data_RH['da_dopa_ratio'] = data_RH['DA'] / data_RH['DOPA']
data_RH['dopac_da_ratio'] = data_RH['DOPAC'] / data_RH['DA']
data_RH['ne_dopa_ratio'] = data_RH['NE'] / data_RH['DOPA']
data_RH['dhpg_ne_ratio'] = data_RH['DHPG'] / data_RH['NE']

#add to df containing first visit info
first_visits = pd.merge(first_visits, data_RH, how='left', on=['TBIID', 'VisitSeq'], sort=False)

print('Final shape of first visit data:\n', first_visits.shape)
first_visits.head()

In [None]:
# Columns carried over from first_visits into the reduced analysis table.
# The order here becomes the column order of that table. Section labels below
# only describe the name prefixes; they are not taken from a data dictionary.
columns_to_keep = [
    # identifiers and participant descriptors
    'Status_x', 'VisitSeq', 'Group', 'TBIID', 'EntityID', 'DOB', 'GType',
    'Race', 'Hispanic', 'Handedness', 'ScreenAge', 'Education', 'Marital',
    'servconn',
    # cestotal and Exp* columns
    'cestotal',
    'ExpPB', 'ExpMark1', 'ExpAntiM', 'ExpStim', 'ExpOthrS', 'ExpDEET',
    'ExpTick', 'ExpPCollr', 'ExpPStrp', 'ExpPEnvi', 'ExpToxic', 'ExpPaint',
    'ExpXsVib', 'ExpHStrk', 'ExpRadar', 'ExpIonRa', 'ExpYCake', 'ExpVhicl',
    'ExpUrRnd', 'ExpDtOrd', 'ExpGasM', 'ExpMOPP', 'ExpRadBg', 'ExpAN',
    'ExpNG', 'ExpTNT', 'ExpPETN', 'ExpRDX', 'ExpNC', 'ExpANFO', 'ExpCompB',
    'ExpOctol', 'ExpPntlt', 'ExpDynmt', 'ExpOthrX',
    # Q* columns (QEDist1..QEDist5 are aggregated in a later step)
    'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife', 'QBlstExp', 'QBEACRM',
    'Q5plus2', 'MnthSncBlst',
    'QEDist1', 'QEDist2', 'QEDist3', 'QEDist4', 'QEDist5',
    # AUDIT columns
    'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3',
    # RH pull measures and the ratios derived from them
    'DOPA', 'DA', 'DOPAC', 'NE', 'DHPG',
    'da_dopa_ratio', 'dopac_da_ratio', 'ne_dopa_ratio', 'dhpg_ne_ratio',
    # upper-case body measure / condition columns, B*/U*/CSF* columns
    'HEIGHT', 'HEIGDEC', 'WEIGHT', 'BMI', 'BPSYS', 'BPDIAS', 'HRATE',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'BGlucose', 'BNa', 'BUN', 'BCreat', 'UNa', 'BK',
    'CSFPROTEIN_x', 'CSFGLUCOSE', 'CSFRBCS',
    # MHx*, SC* and Dvp* columns
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung',
    'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone',
    'DvpHA', 'DvpHAAct', 'DvpHASlp', 'DvpHAMd', 'DvpHAStr',
    'DvpBP', 'DvpBPAct', 'DvpBPslp', 'DvpBPMd', 'DvpBPStr',
    # BIS* columns
    'BISAtt', 'BISMtr', 'BISNonpl', 'BISTot',
    # capsCrtA, b*/c*/d*/f* items, caps* items and CAPSTotal
    # NOTE(review): 'c11i' is not listed although the neighbouring items come
    # in f/i pairs - confirm the omission is intentional.
    'capsCrtA',
    'b1f', 'b1i', 'b2f', 'b2i', 'b3f', 'b3i', 'b4f', 'b4i', 'b5f', 'b5i',
    'c6f', 'c6i', 'c7f', 'c7i', 'c8f', 'c8i', 'c9f', 'c9i', 'c10f', 'c10i',
    'c11f', 'c12f', 'c12i',
    'd13f', 'd13i', 'd14f', 'd14i', 'd15f', 'd15i', 'd16f', 'd16i', 'd17f',
    'd17i',
    'f20', 'f21', 'f22',
    'caps23', 'caps24', 'caps25', 'caps26f', 'caps26i', 'caps27f', 'caps27i',
    'caps28f', 'caps28i', 'caps29f', 'caps29i', 'caps30f', 'caps30i',
    'CAPSTotal',
    # hrslp and PSQI* columns
    'hrslp',
    'PSQI1hr', 'PSQI1min', 'PSQI2', 'PSQI3hr', 'PSQI3min',
    'PSQI5a', 'PSQI5b', 'PSQI5c', 'PSQI5d', 'PSQI5e', 'PSQI5f', 'PSQI5g',
    'PSQI5h', 'PSQI5i', 'PSQI5j', 'PSQI5jco',
    'PSQI6', 'PSQI7', 'PSQI8', 'PSQI9',
    'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7',
    'PSQItot',
    # PCL* columns
    'PCL1', 'PCL2', 'PCL3', 'PCL4', 'PCL5', 'PCL6', 'PCL7', 'PCL8', 'PCL9',
    'PCL10', 'PCL11', 'PCL12', 'PCL13', 'PCL14', 'PCL15', 'PCL16', 'PCL17',
    'PCLTot',
    # PHQ* columns
    'PHQ1', 'PHQ2', 'PHQ3', 'PHQ4', 'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9',
    'PHQTot',
    # tbi* symptom items and totals
    'tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus', 'tbiVision',
    'tbiLight', 'tbiHear', 'tbiNoise', 'tbiTingl', 'tbiTstsml', 'tbiAppet',
    'tbiConc', 'tbiForget', 'tbiDecis', 'tbiSlow', 'tbiEnergy', 'tbiSleep',
    'tbiAnx', 'tbiSad', 'tbiIrrit', 'tbiOverw', 'tbiDisin', 'tbiWithd',
    'tbiRing', 'tbiMoods', 'tbiFight', 'tbiSpch',
    'NSITot', 'TBITot',
    # Pre* columns
    'PreSleep', 'PreCaff', 'PreETOH', 'PreNic', 'PreTHC',
]

# Reduce first_visits to the selected columns. The explicit copy makes the
# column assignments below write to an independent frame rather than to a
# slice of first_visits (avoids SettingWithCopy behaviour).
first_visits_short = first_visits[columns_to_keep].copy()
print(first_visits_short.shape)

# distance-from-blast columns (worst 5)
QEDist_cols = ['QEDist1', 'QEDist2', 'QEDist3', 'QEDist4', 'QEDist5']

#create new column for sum of distance from blast (worst 5)
# min_count=1: a row with no distance values at all gets NaN instead of 0,
# consistent with the mean and min columns below
first_visits_short['QEDist_sum'] = first_visits_short[QEDist_cols].sum(axis=1, min_count=1)
#create new column for mean of distance from blast (worst 5)
first_visits_short['QEDist_mean'] = first_visits_short[QEDist_cols].mean(axis=1)
#create new column for min of distance from blast (worst 5)
first_visits_short['QEDist_min'] = first_visits_short[QEDist_cols].min(axis=1)

#create new columns for NSI 4-factor scoring approach
# ('tbiHear' and 'tbiAppet' are not assigned to any component here)
NSI_comp_vestibular = ['tbiDizzy', 'tbiBalan', 'tbiCoord']
NSI_comp_somatosensory = ['tbiHeada', 'tbiNaus', 'tbiVision', 'tbiLight', 'tbiNoise', 'tbiTingl', 'tbiTstsml']
NSI_comp_cognitive = ['tbiConc', 'tbiForget', 'tbiDecis', 'tbiSlow']
NSI_comp_affective = ['tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit', 'tbiOverw']
NSI_comp_ERP_affective = ['tbiDisin', 'tbiWithd', 'tbiMoods', 'tbiFight']
# spelled "NIS" (not "NSI") in both the variable and the output column; kept
# unchanged so existing references to the column name keep working
NIS_comp_ERP_vestsom = ['tbiRing', 'tbiSpch']

# each component score is the row-wise mean of its items (missing items are skipped)
for component_name, component_items in [
    ('NSI_comp_vestibular', NSI_comp_vestibular),
    ('NSI_comp_somatosensory', NSI_comp_somatosensory),
    ('NSI_comp_cognitive', NSI_comp_cognitive),
    ('NSI_comp_affective', NSI_comp_affective),
    ('NSI_comp_ERP_affective', NSI_comp_ERP_affective),
    ('NIS_comp_ERP_vestsom', NIS_comp_ERP_vestsom),
]:
    first_visits_short[component_name] = first_visits_short[component_items].mean(axis=1)

first_visits_short.head(1)

In [None]:
# Exclude the two participants identified as outliers on auditc and QBlstExp
# (outlier = more than 3 SD from the mean). The IDs are fixed here; the
# outlier check itself is not computed in this cell.
outlier_ids = ['C010', 'T080']
first_visits_short = first_visits_short[~first_visits_short['TBIID'].isin(outlier_ids)]

In [None]:
# Save the reduced first-visit table to the current working directory
# (the row index is written too, since index=True is the to_csv default).
first_visits_short.to_csv(path_or_buf='first_visits_short.csv')

In [None]:
# Quick data audit: for every column print its name, the number of non-missing
# values and the distinct non-missing values.
for param in first_visits_short.columns.values:
    non_missing = first_visits_short[param].dropna()
    print(param)
    print(non_missing.count())
    try:
        print(non_missing.unique(), '\n')
    except Exception as err:
        # keep the audit going, but report the failure instead of hiding it
        # (a bare except would also swallow KeyboardInterrupt)
        print('could not list unique values:', err, '\n')

In [None]:
# AUDIT columns: the auditc column followed by the three numbered AUDIT items
AUDITC_cols = ['auditc'] + ['AUDIT' + str(item) for item in (1, 2, 3)]

In [None]:
#get AUDIT counts and percents for chi squared analysis and viz
# NOTE(review): the original cell read from `first_final`, which is never
# defined in the visible notebook (NameError on Restart & Run All).
# first_visits_short is assumed to be the intended frame - confirm.


def _audit_counts_and_percents(frame, column):
    """Return (counts, fractions) of each `column` value within each Group.

    counts    : number of rows per (Group, value) pair
    fractions : counts divided by the number of non-missing `column`
                values in the same Group
    """
    by_group = frame.groupby('Group')[column]
    counts = by_group.value_counts()
    return counts, counts / by_group.count()


AUDIT_tot_counts, AUDIT_tot_perc = _audit_counts_and_percents(first_visits_short, 'auditc')
AUDIT_1_counts, AUDIT_1_perc = _audit_counts_and_percents(first_visits_short, 'AUDIT1')
AUDIT_2_counts, AUDIT_2_perc = _audit_counts_and_percents(first_visits_short, 'AUDIT2')
AUDIT_3_counts, AUDIT_3_perc = _audit_counts_and_percents(first_visits_short, 'AUDIT3')

AUDIT_tot_counts.to_csv('AUDIT_tot_counts.csv')
AUDIT_1_counts.to_csv('AUDIT_1_counts.csv')
AUDIT_2_counts.to_csv('AUDIT_2_counts.csv')
AUDIT_3_counts.to_csv('AUDIT_3_counts.csv')

AUDIT_tot_perc.to_csv('AUDIT_tot_perc.csv')
AUDIT_1_perc.to_csv('AUDIT_1_perc.csv')
AUDIT_2_perc.to_csv('AUDIT_2_perc.csv')
AUDIT_3_perc.to_csv('AUDIT_3_perc.csv')