In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp
import missingno as msno

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

#visualizing results
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline
import seaborn as sns
# Apply seaborn's 'poster' plotting context.
sns.set_context('poster')

# Raise pandas' display limits so large frames are shown without truncation.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
# None means "no column-width limit". The former -1 sentinel was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Silence all warnings and make numpy print floats without scientific notation.
import warnings
warnings.simplefilter(action='ignore')
np.set_printoptions(suppress=True)

In [None]:
# Location of the Excel workbook read by this notebook. Set the TBI_DATA_PATH
# environment variable to point at a copy elsewhere; when it is unset the
# original hard-coded path is used unchanged.
data_path = os.environ.get(
    'TBI_DATA_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/Methods Paper DB_Deidentified for Abbie_5.5.20.xlsx',
)

In [None]:
#read in the Excel workbook containing data from all surveys and all visitseqs
# read_excel already returns a DataFrame, so no re-wrapping is needed
data = pd.read_excel(data_path)

print('Original data shape:\n', data.shape, '\n')
# info() prints its own report and returns None, so it is not wrapped in print()
data.info()
data.head(1)

## Value counts by group and visit seq

In [None]:
# Tally of Group values within each VisitSeq.
data.groupby(by='VisitSeq')['Group'].value_counts()

In [None]:
# Keep only the rows whose VisitSeq is one of the visits listed here.
visit_keep = [1, 2]
data_short = data.loc[data['VisitSeq'].isin(visit_keep)]

In [None]:
#turn the numeric codes used in place of nan into real nan's
data_short = data_short.replace(to_replace=[-20.0, -920.0, -999.0], value=np.nan)
print(data_short.shape)
data_short.describe()

## Explore visit1 vs visit2

In [None]:
#get TBIIDs of participants with a second visit and use to filter
TBIID_v2 = data_short.loc[data_short['VisitSeq'] == 2, 'ID_CODE'].values

#add note for participant if came for visit 2 (1 = has a visit-2 row, 0 = does not)
# isin() performs one vectorised membership test instead of scanning the whole
# TBIID_v2 array once per row.
data_short['Visit2'] = data_short['ID_CODE'].isin(TBIID_v2).astype('int64')

In [None]:
# Parameters of interest (poi) plotted in this cell.
poi = [
    'ACT18T', 'AnimalsT', 'BVTotReT', 'CVT15FRT', 'LetFAST', 'W3LNST',
    'rffspdt', 'SDW90TS', 'TrailAT', 'TrailBT', 'WCSTPrsT',
]

# One bar-chart grid per parameter: VisitSeq on x, bars coloured by Group,
# one panel per Visit2 value.
for param in poi:
    sns.catplot(data=data_short, x='VisitSeq', y=param, hue='Group', col='Visit2', kind='bar')
    plt.show()

## Explore missing across all TBIID

In [None]:
# Parameters of interest (poi).
poi = [
    'ACT18T', 'AnimalsT', 'BVTotReT', 'CVT15FRT', 'LetFAST', 'W3LNST',
    'rffspdt', 'SDW90TS', 'TrailAT', 'TrailBT', 'WCSTPrsT',
]

# Identifier / grouping columns followed by the poi columns.
data_poi = data_short[['ID_CODE', 'VisitSeq', 'Visit2', 'Group'] + poi]

# Non-null count of every remaining column per VisitSeq/Group combination.
data_poi.groupby(by=['VisitSeq', 'Group']).count().reset_index()

### Percent response

In [None]:
# Mean non-null count across the poi columns for each VisitSeq/Group cell.
data_poi_counts = (
    data_poi
    .groupby(['VisitSeq', 'Group'])[poi]
    .count()
    .mean(axis=1)
    .reset_index(name='count_mean')
)
# Divide by the number of non-null ID_CODE entries in the same cell.
data_poi_counts['count_mean_perc'] = data_poi_counts['count_mean'].div(
    data_poi.groupby(['VisitSeq', 'Group'])['ID_CODE'].count().reset_index(name='count_n')['count_n']
)
data_poi_counts

### Participant count summary with nans vs without

In [None]:
# Group tallies per visit, first on all rows and then on rows with no nan at all.
print(
    'Counts including nans: \n',
    data_poi.groupby('VisitSeq')['Group'].value_counts(),
    '\n',
)
data_poi_nonan = data_poi.dropna(axis='index')
print(
    'Counts excluding nans: \n',
    data_poi_nonan.groupby('VisitSeq')['Group'].value_counts(),
)

### Participant counts by TBIID

In [None]:
print('Total poi: ', len(poi))
# Number of non-null poi values for every VisitSeq/Group/ID_CODE row group.
(
    data_poi
    .groupby(['VisitSeq', 'Group', 'ID_CODE'])[poi]
    .count()
    .sum(axis=1)
    .reset_index(name='count')
)

### Missingno viz VisitSeq 1

In [None]:
# Missingness matrix for the VisitSeq 1 rows.
msno.matrix(data_poi.loc[data_poi['VisitSeq'] == 1])

In [None]:
# Missingness bar chart for the VisitSeq 1 rows.
msno.bar(data_poi.loc[data_poi['VisitSeq'] == 1])

In [None]:
# Missingness heatmap for the VisitSeq 1 rows.
msno.heatmap(data_poi.loc[data_poi['VisitSeq'] == 1])

### Missingno viz VisitSeq 2

In [None]:
# Missingness matrix for the VisitSeq 2 rows.
msno.matrix(data_poi.loc[data_poi['VisitSeq'] == 2])

## Explore missing across TBIID that had visitseq 2

In [None]:
#ID_CODEs of everyone who has a VisitSeq 2 row
TBIID_v2 = data_poi.loc[data_poi['VisitSeq'] == 2, 'ID_CODE'].values

#keep every row (from either visit) that belongs to one of those ID_CODEs
data_poi_v2check = data_poi.loc[data_poi['ID_CODE'].isin(TBIID_v2)]
print(data_poi_v2check.groupby('VisitSeq')['Group'].value_counts())
print(data_poi_v2check.shape)
data_poi_v2check.head()

In [None]:
# Missingness matrix for the VisitSeq 1 rows of participants who also have a visit 2.
msno.matrix(data_poi_v2check.loc[data_poi_v2check['VisitSeq'] == 1])

## Fill visit seq 1 missing with data from visit seq 2

In [None]:
#find ID_CODEs where there is more data in visit seq2 than seq1
max_data_v12 = []
#find IDs that had second visit and get count of answers for each visit
#(count = number of non-null cells in the row group, summed over all non-key columns)
v12_count = data_poi[data_poi['ID_CODE'].isin(TBIID_v2)].groupby(['ID_CODE', 'VisitSeq']).count().sum(axis=1).reset_index(name='count')
#answer count keyed by (ID_CODE, VisitSeq) so the loop below is a plain lookup
answers_by_visit = dict(zip(zip(v12_count['ID_CODE'], v12_count['VisitSeq']), v12_count['count']))

#choose one visit per participant and collect the matching rows
picked_rows = []
for partic in data_poi['ID_CODE'].unique():
    n_v1 = answers_by_visit.get((partic, 1))
    n_v2 = answers_by_visit.get((partic, 2))
    #use visit 2 when it exists and visit 1 is either absent or has fewer answers;
    #otherwise (no visit 2, or visit 1 has at least as many answers) use visit 1
    use_v2 = n_v2 is not None and (n_v1 is None or n_v1 < n_v2)
    keep_visit = 2 if use_v2 else 1
    picked_rows.append(data_poi[(data_poi['VisitSeq'] == keep_visit) & (data_poi['ID_CODE'] == partic)])

#DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on
#every call, so the pieces are concatenated once instead
if picked_rows:
    data_short_comb = pd.concat(picked_rows, ignore_index=True)
else:
    #no participants at all: keep the expected columns so the lines below still run
    data_short_comb = pd.DataFrame(columns=data_poi.columns)

print(len(data_poi['ID_CODE'].unique()))
print(len(data_short_comb['ID_CODE'].unique()))
data_short_comb

In [None]:
# Drop every row that still has a nan, then tally what is left per Group.
data_short_comb_nonan = data_short_comb.dropna(axis='index')
print(
    'Counts excluding nans: \n',
    data_short_comb_nonan['Group'].value_counts(),
)

## Explore IIV data

### Vis potential group differences in distribution across poi's

In [None]:
# For each poi, overlay the visit-1 distributions of Group 0 and Group 1.
for var in poi:
    print(var)
    for group_code in (0, 1):
        in_group_v1 = (data_poi['VisitSeq'] == 1) & (data_poi['Group'] == group_code)
        sns.distplot(data_poi[in_group_v1][var].dropna())
    plt.show()

### Tidy df with poi type as new column

In [None]:
#create tidy df with IIV params melt
# data_poi carries the identifier as 'ID_CODE'; it has no 'ID' column, and
# pd.melt raises KeyError for id_vars that are absent.
meta_params = ['VisitSeq', 'Group', 'ID_CODE']
IIV_parmas = ['tommtot1', 'tommtot2', 'tommret',
       'ACT18T', 'AnimalsT', 'BVTotReT', 'DkefInSS', 'CVT15FRT',
       'CvmtHitT', 'CvmtFaT', 'CvmtDprT', 'CvmtTotT', 'CvmtDelT',
       'LetFAST', 'W3LNSSS', 'MTPMTPer', 'MT15Per', 'MTTCPer', 'rffdst',
       'rffdat', 'rffsst', 'rffsat', 'rffspdt', 'rffacct', 'SDW90TS',
       'TrailAT', 'TrailBT', 'WCSTPrsT']
# pd.melt also raises KeyError for value_vars that are absent, and data_poi
# holds only some of the names above. Melt the ones that are present and
# report the rest rather than failing.
IIV_present = [col for col in IIV_parmas if col in data_poi.columns]
IIV_absent = [col for col in IIV_parmas if col not in data_poi.columns]
if IIV_absent:
    print('IIV params not in data_poi (skipped): ', IIV_absent)
data_poi_tidy = pd.melt(data_poi, id_vars=meta_params, value_vars=IIV_present)
print(data_poi_tidy.shape)
data_poi_tidy.head()

### Violin plot viz

In [None]:
# Split violins (one half per Group) of every melted variable, VisitSeq 1 rows only.
sns.catplot(
    data=data_poi_tidy.loc[data_poi_tidy['VisitSeq'] == 1],
    x="variable", y="value", hue="Group",
    kind="violin", split=True, inner="stick",
    palette="pastel", height=15, aspect=3,
)

In [None]:
# Split violins (one half per Group) of every melted variable, VisitSeq 2 rows only.
sns.catplot(
    data=data_poi_tidy.loc[data_poi_tidy['VisitSeq'] == 2],
    x="variable", y="value", hue="Group",
    kind="violin", split=True, inner="stick",
    palette="pastel", height=15, aspect=3,
)

## z-score

### RobustScaler including nans visit seq 1

In [None]:
# center and scale the data
scaler = RobustScaler()

meta_col = ['VisitSeq', 'Group', 'ID_CODE']
poi_col = ['ACT18T', 'AnimalsT', 'BVTotReT',
       'CVT15FRT', 'LetFAST', 'W3LNST', 'rffspdt', 'SDW90TS', 'TrailAT',
       'TrailBT', 'WCSTPrsT']

#visit 1 first
data_poi_v1 = data_poi[data_poi['VisitSeq'] == 1]
#scale data
data_poi_scaled = scaler.fit_transform(data_poi_v1[poi_col])
# label the scaled array with poi_col, the same list that selected the columns,
# so the labels cannot drift from the data if poi is changed elsewhere
data_poi_scaled = pd.DataFrame(data=data_poi_scaled, columns=poi_col)
# reset_index() puts the metadata on the same 0..n-1 index as the scaled frame
# and keeps the original row labels in an 'index' column
data_poi_scaled = pd.concat([data_poi_v1[meta_col].reset_index(), data_poi_scaled], axis=1)

#compute std per TBIID (row-wise across the scaled poi columns)
data_poi_scaled['std'] = data_poi_scaled[poi_col].std(axis=1)
#compute maximum discrepancy per TBIID (row-wise max minus row-wise min)
data_poi_scaled['md'] = data_poi_scaled[poi_col].max(axis=1) - data_poi_scaled[poi_col].min(axis=1)

print(data_poi_scaled.shape)
data_poi_scaled.head(1)

In [None]:
# Mean row-wise std per Group, drawn with ci=68.
sns.catplot(data=data_poi_scaled, x="Group", y="std", kind="bar", ci=68)

In [None]:
# Mean maximum discrepancy per Group, drawn with ci=68.
sns.catplot(data=data_poi_scaled, x="Group", y="md", kind="bar", ci=68)

### IIV corrs

In [None]:
# Correlation of every numeric column with the two IIV measures.
# Non-numeric columns are dropped explicitly: older pandas discarded them
# silently inside corr(), whereas pandas 2.x raises on them.
# NOTE(review): ID_CODE is presumably a string column - confirm.
data_poi_scaled.select_dtypes(include=['number', 'bool']).corr()[['std', 'md']]

In [None]:
# copy() so the new columns are written to an independent frame, not a slice
data_short_v1 = data_short[data_short['VisitSeq'] == 1].copy()
# data_poi_scaled sits on a fresh 0..n-1 index, while data_short_v1 keeps the
# row labels of data_short. Assigning the Series directly would align on those
# mismatched labels and put values on the wrong rows (or leave nan).
# Both frames hold the VisitSeq 1 rows in the same order, so assign by position.
assert len(data_short_v1) == len(data_poi_scaled), 'visit-1 frames differ in length'
data_short_v1['std'] = data_poi_scaled['std'].values
data_short_v1['md'] = data_poi_scaled['md'].values
data_short_v1.head()

In [None]:
# Correlation of every numeric column with the row-wise std.
# Non-numeric columns are dropped explicitly: older pandas discarded them
# silently inside corr(), whereas pandas 2.x raises on them.
data_short_v1.select_dtypes(include=['number', 'bool']).corr()['std']

### RobustScaler no nans visit seq 1

In [None]:
# center and scale the data
scaler = RobustScaler()

#visit 1 first, keeping only rows with no nan in any column
data_poi_v1_nonan = data_poi[data_poi['VisitSeq'] == 1].dropna(axis=0)
#scale data
data_poi_scaled_nonan = scaler.fit_transform(data_poi_v1_nonan[poi_col])
# label the scaled array with poi_col, the same list that selected the columns,
# so the labels cannot drift from the data if poi is changed elsewhere
data_poi_scaled_nonan = pd.DataFrame(data=data_poi_scaled_nonan, columns=poi_col)
# reset_index() puts the metadata on the same 0..n-1 index as the scaled frame
# and keeps the original row labels in an 'index' column
data_poi_scaled_nonan = pd.concat([data_poi_v1_nonan[meta_col].reset_index(), data_poi_scaled_nonan], axis=1)

#compute std per TBIID (row-wise across the scaled poi columns)
data_poi_scaled_nonan['std'] = data_poi_scaled_nonan[poi_col].std(axis=1)
#compute maximum discrepancy per TBIID (row-wise max minus row-wise min)
data_poi_scaled_nonan['md'] = data_poi_scaled_nonan[poi_col].max(axis=1) - data_poi_scaled_nonan[poi_col].min(axis=1)

print(data_poi_scaled_nonan.shape)
data_poi_scaled_nonan.head(1)

In [None]:
# Mean row-wise std per Group for the complete-case rows, drawn with ci=68.
sns.catplot(data=data_poi_scaled_nonan, x="Group", y="std", kind="bar", ci=68)

In [None]:
# Mean maximum discrepancy per Group for the complete-case rows, drawn with ci=68.
sns.catplot(data=data_poi_scaled_nonan, x="Group", y="md", kind="bar", ci=68)

In [None]:
#determined outliers for auditc and QBlstExp (outlier = >3 SD from mean) and remove
#data = data[data["TBIID"] != 'C010']
#data = data[data["TBIID"] != 'T080']