In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("poster")
sns.set_style("ticks")
#import yellowbrick as yb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import scipy.cluster.hierarchy as shc

from kmodes.kmodes import KModes

In [None]:
# Location of the survey export that the rest of the notebook loads.
# Set the TBI_DATA_PATH environment variable to run the notebook on another
# machine; when it is not set, the original lab path below is used unchanged.
data_path = os.environ.get(
    'TBI_DATA_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/first_visits_short.csv',
)

In [None]:
#read in csv containing data from all surveys and all visitseqs
#(read_csv already returns a DataFrame; the first CSV column is used as the index)
data = pd.read_csv(data_path, index_col=0)

print('Original data shape:\n', data.shape, '\n')
#info() prints its own report and returns None, so wrapping it in print() only added a stray "None" line
data.info()
data.head(1)

In [None]:
#create graphing function to deal with different types of variables
def _group_proportions(data, param, group, hue=None):
    """Return the share of each `param` answer within every group as a tidy frame.

    Rows are grouped by `group`, and additionally by `hue` when one is given so
    that the result keeps a `hue` column that can be used for faceting.  The
    share is stored in a column named 'count', which is the name the plotting
    calls in `graph_data` use for the y axis.
    """
    keys = group if hue is None else [group, hue]
    shares = data.groupby(keys)[param].value_counts(normalize=True)
    return shares.reset_index(name='count')


def graph_data(data, param_names, cat_vars, auditc_vars, group, order, hue=None):
    """Draw one seaborn bar chart per column listed in `param_names`.

    The chart type depends on which list the column name appears in:

    * in `cat_vars`: rows whose answer is 9 ("not known") are dropped, then the
      share of each remaining answer is plotted per `group`;
    * in `auditc_vars`: the share of each answer is plotted per `group`
      (no answers are dropped);
    * otherwise: the column is plotted directly against `group`, with the
      bars in the sequence given by `order`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding `group`, every column in `param_names`, and `hue` if set.
    param_names : iterable of str
        Columns to plot; each name is printed before its chart.
    cat_vars, auditc_vars : container of str
        Column names that get the two proportion-style charts described above.
    group : str
        Column that defines the groups being compared.
    order : list
        Order of the `group` levels; only used for the third chart type.
    hue : str, optional
        Extra column to split by.  Proportion charts draw one facet (`col`)
        per level; the third chart type uses it as the bar colour.

    Returns
    -------
    None.  A column that cannot be plotted is reported on stdout and skipped,
    so one bad column never stops the remaining charts.
    """
    for param in param_names:
        print(param)

        try:
            if param in cat_vars:
                #drop response = 9: not known
                known_answers = data[data[param] != 9]
                sns.catplot(x=param, y='count', data=_group_proportions(known_answers, param, group, hue),
                            hue=group, kind='bar', height=5, aspect=4, col=hue)
            elif param in auditc_vars:
                sns.catplot(x=param, y='count', data=_group_proportions(data, param, group, hue),
                            hue=group, kind='bar', height=5, aspect=4, col=hue)
            else:
                sns.catplot(x=group, y=param, data=data, kind='bar', height=5, aspect=4, order=order, hue=hue)

            plt.show()

            print('\n')

        except Exception as exc:
            #best effort: say which column was skipped and why instead of hiding the failure
            print('  skipped {}: {}: {}'.format(param, type(exc).__name__, exc))

In [None]:
#columns that graph_data should plot as per-group answer proportions (code 9 is dropped there)
cat_vars = [
    'Status_x', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital',
    'ExpPB', 'ExpMark1', 'ExpAntiM', 'ExpStim', 'ExpOthrS', 'ExpDEET', 'ExpTick', 'ExpPCollr',
    'ExpPStrp', 'ExpPEnvi', 'ExpToxic', 'ExpPaint', 'ExpXsVib', 'ExpHStrk', 'ExpRadar', 'ExpIonRa',
    'ExpYCake', 'ExpVhicl', 'ExpUrRnd', 'ExpDtOrd', 'ExpGasM', 'ExpMOPP', 'ExpRadBg', 'ExpAN',
    'ExpNG', 'ExpTNT', 'ExpPETN', 'ExpRDX', 'ExpNC', 'ExpANFO', 'ExpCompB', 'ExpOctol',
    'ExpPntlt', 'ExpDynmt', 'ExpOthrX',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA',
]

#AUDIT-C total and its three questions get their own proportion plots (no code dropped)
auditc_vars = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#only the first two columns of the table are plotted here
param_names = data.columns[:2].tolist()

#compare the two values of the Group column, C first
group = 'Group'
order = ["C", "T"]

graph_data(data, param_names, cat_vars, auditc_vars, group, order, hue=None)

## AUDIT-C clustering with C and T from first visit sequence

In [None]:
#perform clustering on individual auditc questions (3 total questions) from visit sequence 1
data_first = data.loc[data['VisitSeq'] == 1]

#identifier columns become the index so that only the three answers remain as features
id_cols = ['VisitSeq', 'Group', 'TBIID']
question_cols = ['AUDIT1', 'AUDIT2', 'AUDIT3']
data_allgroups_auditqs = data_first[id_cols + question_cols].set_index(id_cols)
print(data_allgroups_auditqs.shape)

#participants missing any of the three answers are removed; shape is printed before and after
data_allgroups_auditqs = data_allgroups_auditqs.dropna(axis=0)
print(data_allgroups_auditqs.shape)
data_allgroups_auditqs.head()

In [None]:
# center and scale the data
# NOTE(review): the clustering, silhouette, clustermap and t-SNE cells visible in this
# notebook are all fitted on the unscaled data_allgroups_auditqs frame, not on this
# scaled array - confirm that is intended.
scaler = StandardScaler().fit(data_allgroups_auditqs)

data_allgroups_auditqs_scaled = scaler.transform(data_allgroups_auditqs)

In [None]:
#viz dendrogram to find if three clusters makes sense
plt.figure(figsize=(10, 7))
plt.title("AUDIT-C 3 Question Dendrogram")
plt.xlabel("Participants")
plt.ylabel("Distance (dissimilarity)")

#Ward linkage on the three AUDIT answers, then draw the tree on the figure created above
ward_linkage = shc.linkage(data_allgroups_auditqs, method='ward')
dend = shc.dendrogram(
    ward_linkage,
    leaf_font_size=8,
    show_leaf_counts=True,
    distance_sort='ascending',
)

In [None]:
#pick cluster number based on silhouette coefficient
k_range = range(2,7)
scores = []
for k in k_range:
    #fit() returns the estimator itself, so the fitted model can be bound in one step
    km_ss = KMeans(random_state=10, n_clusters=k).fit(data_allgroups_auditqs)
    silhouette = silhouette_score(data_allgroups_auditqs, km_ss.labels_)
    scores.append(silhouette)

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.title('AUDIT-C Questions K means clustering')
plt.ylabel('Silhouette Coefficient')

In [None]:
#pick cluster number based on minimizing sse
k_range = range(2,7)
scores = []
for k in k_range:
    #inertia_ is read from the fitted model and plotted as SSE below
    km_ss = KMeans(random_state=10, n_clusters=k).fit(data_allgroups_auditqs)
    scores.append(km_ss.inertia_)

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.title('AUDIT-C Questions K means clustering')
plt.ylabel('SSE')

In [None]:
#viz clustering with heat map of AUDIT-C answers
#(hierarchical clustering with ward linkage on euclidean distances, drawn in a blue colour map)
sns.clustermap(
    data_allgroups_auditqs,
    method="ward",
    metric="euclidean",
    standard_scale=1,
    cmap="Blues",
)

In [None]:
#choose k=3 clusters and fit data
km_3 = KMeans(random_state=1, n_clusters=3).fit(data_allgroups_auditqs)

#new df for cluster info: the index levels (VisitSeq, Group, TBIID) become ordinary columns again
data_allgroups_auditqs_clusters = data_allgroups_auditqs.reset_index()

#one text label per row, in the same row order the model was fitted on
cluster_names = ['cluster_{}'.format(label) for label in km_3.labels_]
data_allgroups_auditqs_clusters['kmeans_cluster'] = cluster_names
data_allgroups_auditqs_clusters.head(1)

In [None]:
#merge with orig df so each TBIID entry has its corr cluster assignment added
#(joined on TBIID only, so every row of a participant receives that participant's cluster)
cluster_by_participant = data_allgroups_auditqs_clusters[['TBIID', 'kmeans_cluster']]
data_clusters = data.merge(cluster_by_participant, how='outer', on=['TBIID'])

print(data_clusters.shape)
print(data_clusters['VisitSeq'].value_counts())
data_clusters.head(1)

In [None]:
#viz cluster grouping and auditc answers - seems to be low(1), high freq(2), and binge clusters(0)
#NOTE: the x-axis order uses the raw 'cluster_N' labels, so this cell has to run before the
#cell that renames them to binge/low/freq
auditc_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#the first-visit rows are the same for every plot, so select them once
data_clusters_first_visit = data_clusters[data_clusters['VisitSeq'] == 1]

for variable in auditc_cols:
    try:
        plt.figure(figsize=(7,7))
        g = sns.barplot(x='kmeans_cluster', y=variable, data=data_clusters_first_visit, ci=68, hue='Group', palette="rocket", order=["cluster_0", "cluster_1", "cluster_2"])
        plt.show()
    except Exception as exc:
        #keep going with the next variable, but drop the blank figure and say what went wrong
        plt.close()
        print('Could not plot {}: {}: {}'.format(variable, type(exc).__name__, exc))

In [None]:
#rename clusters for easier viz understanding
cluster_display_names = {'cluster_0':'binge', 'cluster_1':'low', 'cluster_2':'freq'}
data_clusters.replace({'kmeans_cluster': cluster_display_names}, inplace=True)

#look at counts in each cluster for each group (is there a difference in cluster patterns between groups?)
first_visit_rows = data_clusters[data_clusters['VisitSeq'] == 1]
cluster_counts_by_group = (
    first_visit_rows
    .groupby(['Group'])['kmeans_cluster']
    .value_counts()
    .reset_index(name='count')
)
cluster_counts_by_group.sort_values(['Group', 'kmeans_cluster'])

In [None]:
#embed the three AUDIT-C answers in two t-SNE dimensions and attach them to the cluster table
tsne = TSNE(random_state=1234, perplexity=50, n_components=2)
tsne_features = tsne.fit_transform(data_allgroups_auditqs)

print(tsne_features.shape)

#reuse the cluster table's row index so the two frames line up when placed side by side
tsne_df = pd.DataFrame(
    tsne_features,
    index=data_allgroups_auditqs_clusters.index,
    columns=['tsne_0', 'tsne_1'],
)
data_allgroups_auditqs_clusters_TSNE = pd.concat(
    [data_allgroups_auditqs_clusters, tsne_df], axis=1
).replace({'kmeans_cluster': {'cluster_0':'binge', 'cluster_1':'low', 'cluster_2':'freq'}})
data_allgroups_auditqs_clusters_TSNE.head()

In [None]:
#scatter of the two t-SNE dimensions, coloured by k-means cluster
plt.figure(figsize=(15,10))
sns.scatterplot(
    data=data_allgroups_auditqs_clusters_TSNE,
    x='tsne_0',
    y='tsne_1',
    hue='kmeans_cluster',
)

In [None]:
#select only visit 1
data_clusters_visit1 = data_clusters[data_clusters['VisitSeq'] == 1]
data_clusters_visit1.to_csv('data_auditc_clustering_visit1.csv')

#one grouping by cluster is shared by all the per-cluster tables below
by_cluster = data_clusters_visit1.groupby(['kmeans_cluster'])

#get value counts for AUDIT-C for each cluster (not split by group)
AUDIT_tot_counts_cluster = by_cluster['auditc'].value_counts()
AUDIT_1_counts_cluster = by_cluster['AUDIT1'].value_counts()
AUDIT_2_counts_cluster = by_cluster['AUDIT2'].value_counts()
AUDIT_3_counts_cluster = by_cluster['AUDIT3'].value_counts()

#same tables as a fraction of the non-missing answers in each cluster
AUDIT_tot_perc_cluster = AUDIT_tot_counts_cluster / by_cluster['auditc'].count()
AUDIT_1_perc_cluster = AUDIT_1_counts_cluster / by_cluster['AUDIT1'].count()
AUDIT_2_perc_cluster = AUDIT_2_counts_cluster / by_cluster['AUDIT2'].count()
AUDIT_3_perc_cluster = AUDIT_3_counts_cluster / by_cluster['AUDIT3'].count()

#get value counts by group for each cluster
clusters_by_group = data_clusters_visit1.groupby('Group')['kmeans_cluster']
AUDIT_cluster_counts = clusters_by_group.value_counts()
AUDIT_cluster_perc = AUDIT_cluster_counts / clusters_by_group.count()

#write every table to a csv named after its variable
for csv_name, table in [
    ('AUDIT_tot_counts_cluster.csv', AUDIT_tot_counts_cluster),
    ('AUDIT_1_counts_cluster.csv', AUDIT_1_counts_cluster),
    ('AUDIT_2_counts_cluster.csv', AUDIT_2_counts_cluster),
    ('AUDIT_3_counts_cluster.csv', AUDIT_3_counts_cluster),
    ('AUDIT_tot_perc_cluster.csv', AUDIT_tot_perc_cluster),
    ('AUDIT_1_perc_cluster.csv', AUDIT_1_perc_cluster),
    ('AUDIT_2_perc_cluster.csv', AUDIT_2_perc_cluster),
    ('AUDIT_3_perc_cluster.csv', AUDIT_3_perc_cluster),
    ('AUDIT_cluster_counts.csv', AUDIT_cluster_counts),
    ('AUDIT_cluster_perc.csv', AUDIT_cluster_perc),
]:
    table.to_csv(csv_name)

### TBI-only additional analysis and visualization

In [None]:
#select just TBI participants and exclude visit '88' for add'l analysis
in_tbi_group = data_clusters['Group'] == 'T'
not_visit_88 = data_clusters['VisitSeq'] != 88
data_clusters_TBI = data_clusters[in_tbi_group & not_visit_88]

In [None]:
#columns that graph_data should plot as per-group answer proportions (code 9 is dropped there)
cat_vars = [
    'Status_x', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital',
    'ExpPB', 'ExpMark1', 'ExpAntiM', 'ExpStim', 'ExpOthrS', 'ExpDEET', 'ExpTick', 'ExpPCollr',
    'ExpPStrp', 'ExpPEnvi', 'ExpToxic', 'ExpPaint', 'ExpXsVib', 'ExpHStrk', 'ExpRadar', 'ExpIonRa',
    'ExpYCake', 'ExpVhicl', 'ExpUrRnd', 'ExpDtOrd', 'ExpGasM', 'ExpMOPP', 'ExpRadBg', 'ExpAN',
    'ExpNG', 'ExpTNT', 'ExpPETN', 'ExpRDX', 'ExpNC', 'ExpANFO', 'ExpCompB', 'ExpOctol',
    'ExpPntlt', 'ExpDynmt', 'ExpOthrX',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA',
]

#AUDIT-C total and its three questions get their own proportion plots (no code dropped)
auditc_vars = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#every column of the TBI table is plotted
param_names = data_clusters_TBI.columns.tolist()

#compare the renamed drinking clusters, in this display order
group = 'kmeans_cluster'
order = ["low", "freq", "binge"]

#how does drinking cluster affect params? (first visit sequence only)
tbi_first_visit = data_clusters_TBI[data_clusters_TBI['VisitSeq']==1]
graph_data(tbi_first_visit, param_names, cat_vars, auditc_vars, group, order, hue=None)

In [None]:
#does drinking cluster affect follow-up visits
#rows per (visit sequence, cluster) pair, printed first and then drawn as a count plot
rows_per_visit_cluster = data_clusters_TBI.groupby(['VisitSeq', 'kmeans_cluster'])['kmeans_cluster'].count()
print(rows_per_visit_cluster)
sns.countplot(
    data=data_clusters_TBI,
    x='VisitSeq',
    hue='kmeans_cluster',
    palette="rocket",
)

In [None]:
#exclude 3rd visit seq
#(.copy() gives an independent frame, so adding the 'visit2' column below never
# writes into a slice of data_clusters_TBI)
data_clusters_TBI_v12 = data_clusters_TBI[data_clusters_TBI['VisitSeq'] != 3].copy()
#get TBIIDs of participants with a second visit and use to filter
TBIID_v2 = data_clusters_TBI_v12[data_clusters_TBI_v12['VisitSeq'] == 2]['TBIID'].values

#create new column for if came to second visitseq - want to see if params at visitseq 1 map on to whether they came back again
#(one vectorised membership test replaces the per-row scan of the ID array; 1 = has a visit-2 row, 0 = does not)
has_visit2 = data_clusters_TBI_v12['TBIID'].isin(TBIID_v2)
data_clusters_TBI_v12['visit2'] = has_visit2.astype('int64')

#create new df with only participants who have a visit-2 row (all of their remaining rows are kept)
data_clusters_TBI_v12_only = data_clusters_TBI_v12[has_visit2]
print(data_clusters_TBI_v12_only.shape)
data_clusters_TBI_v12_only.head()

In [None]:
#columns that graph_data should plot as per-group answer proportions (code 9 is dropped there)
cat_vars = [
    'Status_x', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital',
    'ExpPB', 'ExpMark1', 'ExpAntiM', 'ExpStim', 'ExpOthrS', 'ExpDEET', 'ExpTick', 'ExpPCollr',
    'ExpPStrp', 'ExpPEnvi', 'ExpToxic', 'ExpPaint', 'ExpXsVib', 'ExpHStrk', 'ExpRadar', 'ExpIonRa',
    'ExpYCake', 'ExpVhicl', 'ExpUrRnd', 'ExpDtOrd', 'ExpGasM', 'ExpMOPP', 'ExpRadBg', 'ExpAN',
    'ExpNG', 'ExpTNT', 'ExpPETN', 'ExpRDX', 'ExpNC', 'ExpANFO', 'ExpCompB', 'ExpOctol',
    'ExpPntlt', 'ExpDynmt', 'ExpOthrX',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA',
]

#AUDIT-C total and its three questions get their own proportion plots (no code dropped)
auditc_vars = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#only the first fifteen columns are plotted here
param_names = data_clusters_TBI_v12_only.columns[:15].tolist()

#compare visit sequence 1 against visit sequence 2
group = 'VisitSeq'
order = [1,2]

graph_data(data_clusters_TBI_v12_only, param_names, cat_vars, auditc_vars, group, order, hue=None)

In [None]:
#viz how data params change across drinking pattern groups and visit sequences (1 vs 2)
#NOTE(review): the last name, 'NIS_comp_ERP_vestsom', breaks the 'NSI_comp_' prefix used by
#its neighbours - confirm it matches the real column name
variables = ['Race', 'Hispanic', 'Handedness', 'ScreenAge',
       'Education', 'Marital', 'servconn', 'cestotal', 'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3',
       'HEIGHT', 'HEIGDEC', 'WEIGHT', 'BMI', 'BPSYS',
       'BPDIAS', 'HRATE', 'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF',
       'THYROID', 'PITHistTBIID', 'BGlucose', 'BNa', 'BUN', 'BCreat',
       'BOsmo', 'UOsmo', 'USG', 'UNa', 'BK', 'TotalChol', 'LDL', 'HDL',
       'Trig', 'CSFPROTEIN_x', 'CSFGLUCOSE', 'CSFRBCS', 'MHxPain',
       'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung',
       'MHxApnea', 'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone',
       'capsCrtA', 'CAPSTotal', 'hrslp', 
       'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6',
       'PSQIc7', 'PSQItot', 'PCL1', 'PCL2', 'PCL3', 'PCL4', 'PCL5',
       'PCL6', 'PCL7', 'PCL8', 'PCL9', 'PCL10', 'PCL11', 'PCL12', 'PCL13',
       'PCL14', 'PCL15', 'PCL16', 'PCL17', 'PCLTot', 'PHQ1', 'PHQ2',
       'PHQ3', 'PHQ4', 'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9', 'PHQTot',
       'tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus',
       'tbiVision', 'tbiLight', 'tbiHear', 'tbiNoise', 'tbiTingl',
       'tbiTstsml', 'tbiAppet', 'tbiConc', 'tbiForget', 'tbiDecis',
       'tbiSlow', 'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
       'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods',
       'tbiFight', 'tbiSpch', 'NSITot', 'TBITot', 'PreSleep', 'PreCaff',
       'PreETOH', 'PreNic', 'PreTHC', 
       'NSI_comp_vestibular', 'NSI_comp_somatosensory',
       'NSI_comp_cognitive', 'NSI_comp_affective',
       'NSI_comp_ERP_affective', 'NIS_comp_ERP_vestsom']
#only the first ten variables are plotted; widen the slice to see more
for variable in variables[0:10]:
    try:
        plt.figure(figsize=(7,5))
        sns.lineplot(x='VisitSeq', y=variable, data=data_clusters_TBI_v12_only, hue='kmeans_cluster', ci=None, palette="rocket")
        #plt.savefig(str(variable + '_plot.png'))
        plt.show()
    except Exception as exc:
        #keep going with the next variable, but drop the blank figure and say what went wrong
        plt.close()
        print('Could not plot {}: {}: {}'.format(variable, type(exc).__name__, exc))

In [None]:
#viz how data params change across drinking pattern groups and whether they came back for second visit
#NOTE(review): the last name, 'NIS_comp_ERP_vestsom', breaks the 'NSI_comp_' prefix used by
#its neighbours - confirm it matches the real column name
variables = ['Race', 'Hispanic', 'Handedness', 'ScreenAge',
       'Education', 'Marital', 'servconn', 'cestotal', 'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3',
       'HEIGHT', 'HEIGDEC', 'WEIGHT', 'BMI', 'BPSYS',
       'BPDIAS', 'HRATE', 'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF',
       'THYROID', 'PITHistTBIID', 'BGlucose', 'BNa', 'BUN', 'BCreat',
       'BOsmo', 'UOsmo', 'USG', 'UNa', 'BK', 'TotalChol', 'LDL', 'HDL',
       'Trig', 'CSFPROTEIN_x', 'CSFGLUCOSE', 'CSFRBCS', 'MHxPain',
       'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung',
       'MHxApnea', 'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone',
       'capsCrtA', 'CAPSTotal', 'hrslp', 
       'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5', 'PSQIc6',
       'PSQIc7', 'PSQItot', 'PCL1', 'PCL2', 'PCL3', 'PCL4', 'PCL5',
       'PCL6', 'PCL7', 'PCL8', 'PCL9', 'PCL10', 'PCL11', 'PCL12', 'PCL13',
       'PCL14', 'PCL15', 'PCL16', 'PCL17', 'PCLTot', 'PHQ1', 'PHQ2',
       'PHQ3', 'PHQ4', 'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9', 'PHQTot',
       'tbiDizzy', 'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus',
       'tbiVision', 'tbiLight', 'tbiHear', 'tbiNoise', 'tbiTingl',
       'tbiTstsml', 'tbiAppet', 'tbiConc', 'tbiForget', 'tbiDecis',
       'tbiSlow', 'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
       'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods',
       'tbiFight', 'tbiSpch', 'NSITot', 'TBITot', 'PreSleep', 'PreCaff',
       'PreETOH', 'PreNic', 'PreTHC', 
       'NSI_comp_vestibular', 'NSI_comp_somatosensory',
       'NSI_comp_cognitive', 'NSI_comp_affective',
       'NSI_comp_ERP_affective', 'NIS_comp_ERP_vestsom']

#first-visit rows only; 'visit2' marks whether the participant also has a visit-2 row
data_viz = data_clusters_TBI_v12[data_clusters_TBI_v12['VisitSeq']==1]
#only the first fifteen variables are plotted; widen the slice to see more
for variable in variables[0:15]:
    try:
        plt.figure(figsize=(7,5))
        sns.barplot(x='kmeans_cluster', y=variable, data=data_viz, hue='visit2', ci=68, palette="rocket")
        #plt.savefig(str(variable + '_plot.png'))
        plt.show()
    except Exception as exc:
        #keep going with the next variable, but drop the blank figure and say what went wrong
        plt.close()
        print('Could not plot {}: {}: {}'.format(variable, type(exc).__name__, exc))

In [None]:
#columns that graph_data should plot as per-group answer proportions (code 9 is dropped there);
#this version of the list also contains 'kmeans_cluster' and 'visit2'
cat_vars = [
    'Status_x', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital',
    'ExpPB', 'ExpMark1', 'ExpAntiM', 'ExpStim', 'ExpOthrS', 'ExpDEET', 'ExpTick', 'ExpPCollr',
    'ExpPStrp', 'ExpPEnvi', 'ExpToxic', 'ExpPaint', 'ExpXsVib', 'ExpHStrk', 'ExpRadar', 'ExpIonRa',
    'ExpYCake', 'ExpVhicl', 'ExpUrRnd', 'ExpDtOrd', 'ExpGasM', 'ExpMOPP', 'ExpRadBg', 'ExpAN',
    'ExpNG', 'ExpTNT', 'ExpPETN', 'ExpRDX', 'ExpNC', 'ExpANFO', 'ExpCompB', 'ExpOctol',
    'ExpPntlt', 'ExpDynmt', 'ExpOthrX',
    'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
    'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea',
    'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA',
    'kmeans_cluster', 'visit2',
]

#AUDIT-C total and its three questions get their own proportion plots (no code dropped)
auditc_vars = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#every column of the TBI table is plotted
param_names = data_clusters_TBI.columns.tolist()

#compare the renamed drinking clusters, split further by the visit-2 flag
group = 'kmeans_cluster'
order = ["low", "freq", "binge"]
hue = 'visit2'

#viz if params differ between partic who came back for a second visit or not (first-visit rows only)
tbi_v12_first_visit = data_clusters_TBI_v12[data_clusters_TBI_v12['VisitSeq']==1]
graph_data(tbi_v12_first_visit, param_names, cat_vars, auditc_vars, group, order, hue=hue)

In [None]:
#compute outliers
#Mask values outside 1.5 * IQR of the quartiles with NaN, separately for each
#(parameter, kmeans_cluster) column of the unstacked table.
#NOTE(review): set_index(..., inplace=True) changes data_clusters_TBI itself, so TBIID, Group and
#kmeans_cluster stop being ordinary columns for any cell run afterwards (e.g. list(data_clusters_TBI)),
#and re-running this cell will presumably fail because those columns are already in the index - confirm
data_clusters_TBI.set_index(['TBIID', 'Group', 'kmeans_cluster'], inplace=True)
#level=-1 is the last index level set above (kmeans_cluster), which is moved into the columns
#NOTE(review): presumably requires each (TBIID, Group, kmeans_cluster) combination to occur only once;
#verify for participants with more than one visit row
data_clusters_TBI_unstack = data_clusters_TBI.unstack(level = -1)
#not the last expression of the cell, so this preview is not displayed
data_clusters_TBI_unstack.head()

#compute quartiles, IQRs, and bounds for each column of the unstacked table
quartile_1 = data_clusters_TBI_unstack.quantile(0.25)
quartile_3 = data_clusters_TBI_unstack.quantile(0.75)
iqr = quartile_3 - quartile_1
lower_bound = quartile_1 - (iqr * 1.5)
upper_bound = quartile_3 + (iqr * 1.5)

#use bounds to exclude any data points outside of the bounds (outliers will be replaced with NaN)
#despite its name, `outliers` holds the values that lie WITHIN the bounds
outliers = data_clusters_TBI_unstack[(data_clusters_TBI_unstack <= upper_bound) & (data_clusters_TBI_unstack >= lower_bound)]
#stack to return dataframe to original orientation
df_no_outliers = outliers.stack()
#turn the index levels back into ordinary columns
df_no_outliers.reset_index(inplace=True)
df_no_outliers.head()

In [None]:
#write the outlier-masked table to 'df_no_outliers.csv' (relative path)
df_no_outliers.to_csv('df_no_outliers.csv')

In [None]:
#viz cluster grouping and auditc answers within this smaller data set of participants who came back for visitseq2
auditc_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#the first-visit rows are the same for every plot, so select them once
returning_first_visit = data_clusters_TBI_v12_only[data_clusters_TBI_v12_only['VisitSeq'] == 1]

for variable in auditc_cols:
    try:
        plt.figure(figsize=(7,7))
        g = sns.barplot(x='kmeans_cluster', y=variable, data=returning_first_visit, ci=68, hue='Group', palette="rocket", order=["low", "freq", "binge"])
        plt.show()
    except Exception as exc:
        #keep going with the next variable, but drop the blank figure and say what went wrong
        plt.close()
        print('Could not plot {}: {}: {}'.format(variable, type(exc).__name__, exc))