In [None]:
#getting and working with data
import os
import re

import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # on older SciPy releases `import scipy` alone does not load sp.stats

#visualizing results
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline
import seaborn as sns
sns.set_context('poster')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from kmodes.kmodes import KModes

from sklearn.mixture import GaussianMixture

import scipy.cluster.hierarchy as shc

from minisom import MiniSom

In [None]:
# Root folder of the PeskindTBI project data. Set the PESKINDTBI_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original workstation location is used, so existing runs are unchanged.
DATA_DIR = os.environ.get(
    'PESKINDTBI_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI',
).rstrip('/\\')

# first-visit survey table (csv) and the praz data workbook (xlsx)
data_path = DATA_DIR + '/final_data/first_visits_short.csv'
praz_path = DATA_DIR + '/praz_data.xlsx'

In [None]:
#read in csv containing data from all surveys and all visitseqs
#(the first csv column is used as the row index; read_csv already returns a DataFrame)
data = pd.read_csv(data_path, index_col=0)

#numeric copy of Group: 'C' -> 0, 'T' -> 1; any other Group value is left unchanged
data['group_num'] = data['Group'].replace({'C': 0, 'T': 1})

print('Original data shape:\n', data.shape, '\n')
#DataFrame.info() prints its report itself and returns None, so it must not be wrapped in print()
data.info()
data.head(1)

In [None]:
#restrict the data to the groups listed here; group sizes are printed before and after
TBIID_to_keep = ['C', 'T']
group_sizes_before = data['Group'].value_counts()
print(group_sizes_before)
keep_mask = data['Group'].isin(TBIID_to_keep)
data = data[keep_mask]
group_sizes_after = data['Group'].value_counts()
print(group_sizes_after)

In [None]:
#outliers on auditc and QBlstExp were defined as > 3 SD from the mean.
#Only participant T080 is removed; the C010 exclusion below is switched off.
#data = data[data["TBIID"] != 'C010']
is_T080 = data["TBIID"] == 'T080'
data = data[~is_T080]

In [None]:
#create bins
#BMI is binned into 5-unit classes labelled by their upper edge (20, 25, 30, 35);
#anything above 35 falls in the open-ended top class labelled 40.
#Intervals are right-closed - (-inf, 20], (20, 25], (25, 30], (30, 35], (35, inf) -
#which reproduces the first-match behaviour of the former if/elif chain.
#Missing BMI values stay NaN; the resulting column is always float.
BMI_EDGES = [-np.inf, 20, 25, 30, 35, np.inf]
BMI_LABELS = [20, 25, 30, 35, 40]

BMI_bin = pd.cut(data['BMI'], bins=BMI_EDGES, labels=BMI_LABELS, right=True).astype(float)

#sanity check: one bin value per row of data
print(len(BMI_bin))
print(data.shape)
#assign by position (as the original list did) rather than by index label
data['BMI_bin'] = BMI_bin.to_numpy()

In [None]:
#lipid ratios derived from the cholesterol panel columns (TotalChol, LDL, HDL, Trig)
hdl = data['HDL']
ldl = data['LDL']
data['Total_HDL_ratio'] = data['TotalChol'].div(hdl)
data['HDL_LDL_ratio'] = hdl.div(ldl)
data['LDL_HDL_ratio'] = ldl.div(hdl)
data['Tri_HDL_ratio'] = data['Trig'].div(hdl)

In [None]:
#create variable lists depending on data type for graphing purposes
#Each list holds column names of `data`; the cells below loop over them to run
#per-variable statistics and plots.

#presumably the continuous (numeric) survey / assay measures - going by the `_cont` suffix
#NOTE(review): 'NIS_ERP_vestsom' may be a typo for 'NSI_ERP_vestsom' - confirm against the csv header
data_cont = ['ScreenAge', 'Education', 'servconn', 'cestotal',
       'MnthSncBlst', 'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife',
       'QBlstExp', 'ACRM_count', 'QBEACRM', 'Q5plus2', 'QEDist_sum', 'QEDist_mean',
       'QEDist_min', 'DOPA', 'DA', 'DOPAC', 'NE', 'DHPG', 'da_dopa_ratio', 'dopac_da_ratio',
       'ne_dopa_ratio', 'dhpg_ne_ratio', 
       'CAPSTotal', 'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5',
       'PSQIc6', 'PSQIc7', 'PSQItot', 'PCLTot', 'PCL_reexp', 'PCL_avoid',
       'PCL_numb', 'PCL_hyper', 'PHQTot', 'PHQ_psych', 'PHQ_somatic',
       'NSITot', 'TBITot', 'NSI_vestibular', 'NSI_somatosensory',
       'NSI_cognitive', 'NSI_affective', 'NSI_ERP_affective',
       'NIS_ERP_vestsom', 'DvpHA', 'DvpHAAct', 'DvpHASlp',
       'DvpHAMd', 'DvpHAStr', 'DvpBP', 'DvpBPAct', 'DvpBPslp', 'DvpBPMd',
       'DvpBPStr', 'BISAtt', 'BISMtr', 'BISNonpl', 'BISTot',
             'PreSleep', 'PreCaff', 'PreETOH', 'PreNic', 'PreTHC']

#health measures compared between groups in the t-test / bar-plot loop below;
#includes the four *_ratio columns that are computed from the cholesterol panel
health_cont = ['HEIGHT', 'WEIGHT', 'BMI',
       'BPSYS', 'BPDIAS', 'HRATE', 'BGlucose', 'BNa', 'BUN', 'BCreat', 'UNa',
       'BK', 'TotalChol', 'LDL', 'HDL', 'Trig', 'Total_HDL_ratio', 'HDL_LDL_ratio', 'LDL_HDL_ratio', 'Tri_HDL_ratio', 'CSFPROTEIN_x',
       'CSFGLUCOSE', 'CSFRBCS']

#presumably the categorical variables - going by the `_cat` suffix
data_cat = ['Status_x', 'VisitSeq', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital', 'APOEGen', 
           'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
           'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea', 
            'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA']

#AUDIT columns: the auditc score column and the three AUDIT item columns
data_audit = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']


In [None]:
#focus on first visit
#filter first, then copy: only the visit-1 rows are duplicated and data_v1 is an
#independent frame rather than a slice taken from a throw-away full copy
data_v1 = data[data['VisitSeq'] == 1].copy()

In [None]:
#for every health measure: independent-samples t-test between group C and group T
#(equal variances assumed, missing values dropped per group), then a bar plot of
#the group means with 68% confidence-interval error bars
for param in health_cont:
    print(param)
    
    try:
        #t-test
        C = data_v1.loc[data_v1['Group'] == 'C', param].dropna()
        T = data_v1.loc[data_v1['Group'] == 'T', param].dropna()
        print(sp.stats.ttest_ind(C, T, axis=0, equal_var=True))
        
        g = sns.catplot(x='Group', y=param, kind='bar', data=data_v1, ci=68, height=5, aspect=4)
        plt.show()
        
        print('\n')
        
    except Exception as err:
        #carry on with the remaining measures, but report why this one produced no output
        #(a bare `except: pass` would hide e.g. a missing column or a non-numeric dtype)
        print('Skipped', param, '->', repr(err), '\n')

In [None]:
#number of visit-1 participants per BMI bin within each group
#(counts for a chi squared analysis of group differences)
bmi_bin_by_group = data_v1.groupby(['Group'])['BMI_bin']
BMI_bin_counts = bmi_bin_by_group.value_counts().reset_index(name='count')
BMI_bin_counts.sort_values(by=['Group', 'BMI_bin'])

In [None]:
#percentage of each group that falls in each BMI bin (rows with a missing BMI_bin are
#left out of both numerator and denominator).
#NOTE: this overwrites the raw counts stored in BMI_bin_counts by the previous cell, and
#the value column keeps the name 'count' for compatibility although it holds percentages.
BMI_bin_counts = (
    data_v1.groupby(['Group'])['BMI_bin']
    .value_counts(normalize=True)
    .mul(100)
    .reset_index(name='count')
)
BMI_bin_counts.sort_values(['Group', 'BMI_bin'])

In [None]:
#columns carried into the correlation analysis; group_num is included so that
#the table can be split by group below
sig_params = [
    'BMI', 'HRATE', 'BGlucose', 'CSFPROTEIN_x',
    'MnthSncBlst', 'ScreenAge', 'servconn', 'cestotal', 'ACRM_count', 'QKOIorA', 'QBlstExp',
    'PCLTot', 'PHQTot', 'PSQItot', 'NSITot', 'TBITot', 'auditc', 'group_num',
]

data_sig_health = data_v1[sig_params]

#pairwise correlations: whole sample first, then group_num == 0 and group_num == 1 separately
data_corr = data_sig_health.corr()
data_corr_0, data_corr_1 = (
    data_sig_health[data_sig_health['group_num'] == group_code].corr()
    for group_code in (0, 1)
)

In [None]:
# Generate a mask for the upper triangle
# (the np.bool alias was removed in NumPy 1.24; the builtin bool is the supported dtype)
mask = np.triu(np.ones_like(data_corr, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw one heatmap per correlation matrix with the mask and correct aspect ratio.
# All three matrices come from the same columns, so the same mask fits each of them.
corr_matrices = [
    ('All participants', data_corr),
    ('group_num == 0', data_corr_0),
    ('group_num == 1', data_corr_1),
]
for corr_title, corr_matrix in corr_matrices:
    f, ax = plt.subplots(figsize=(13, 9))
    sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    # title each figure so the three otherwise identical-looking plots can be told apart
    ax.set_title(corr_title)
    plt.show()

In [None]:
#write the correlation input table to the working directory (row index not written)
data_sig_health.to_csv(path_or_buf='data_sig_health.csv', index=False)

# clustering

In [None]:
#perform clustering on individual health measurement questions from visit sequence 1
#filter first, then copy, so data_first is an independent frame
data_first = data[data['VisitSeq'] == 1].copy()
health_cluster_cols = ['Group', 'TBIID', 'BMI', 'BPSYS', 'BPDIAS', 'HRATE', 'BGlucose', 'BNa', 'BUN', 'BCreat']
data_allgroups_health = data_first[health_cluster_cols]

#keep complete cases only; shapes are printed before and after to show how many rows drop out.
#dropna returns a new frame - calling it with inplace=True on a column slice is what
#pandas flags with SettingWithCopyWarning
print(data_allgroups_health.shape)
data_allgroups_health = data_allgroups_health.dropna(axis=0)
print(data_allgroups_health.shape)

#save ids in cluster analysis and create new df
#(all visit-1 columns for the participants that enter the clustering, in data_first row order)
cluster_TBIID = data_allgroups_health['TBIID'].values
data_first_cluster = data_first[data_first['TBIID'].isin(cluster_TBIID)].copy()

#move the identifiers into the index so that only the numeric measures remain as columns
data_allgroups_health = data_allgroups_health.set_index(['TBIID', 'Group'])
data_allgroups_health.head(1)

In [None]:
# center and scale the data: every health measure is standardised before clustering
# (commented-out alternatives kept for reference)
#scaler = MinMaxScaler()
#scaler = RobustScaler()
scaler = StandardScaler()

scaler.fit(data_allgroups_health)
health_scaled = scaler.transform(data_allgroups_health)

In [None]:
#pick cluster number for K-means on the scaled health measures.
#For k = 2..9 three diagnostics are collected: inertia (for an elbow plot),
#the silhouette coefficient and the Davies-Bouldin score.
k_range = range(2,10)

sil_scores = []
db_scores = []
#name kept from the original cell; the values are KMeans.inertia_, i.e. the
#within-cluster sum of squared distances, not a mean squared error
mse_scores = []

for k in k_range:
    km_ss = KMeans(n_clusters=k, random_state=39)
    km_ss.fit(health_scaled)
    
    mse_scores.append(km_ss.inertia_)
    sil_scores.append(silhouette_score(health_scaled, km_ss.labels_))
    db_scores.append(davies_bouldin_score(health_scaled, km_ss.labels_))
    
# plot the results: one figure per diagnostic.
# The titles name the health measures; the previous 'AUDIT-C Questions' title was a
# copy-paste leftover and did not describe the data being clustered here.
kmeans_diagnostics = [
    (mse_scores, 'Inertia (within-cluster sum of squares)'),
    (sil_scores, 'Silhouette Coefficient'),
    (db_scores, 'Davies Bouldin Score'),
]
for scores, score_label in kmeans_diagnostics:
    plt.plot(k_range, scores)
    plt.title('Health measures K means clustering')
    plt.xlabel('Number of clusters')
    plt.ylabel(score_label)
    plt.show()

In [None]:
#same scan with Gaussian mixture models on the scaled health measures:
#for 2..9 components collect the average log-likelihood, AIC and BIC
k_range = range(2,10)

gm_score=[]
gm_aic=[]
gm_bic= []

for k in k_range:
    gm = GaussianMixture(n_components=k, random_state=39)
    gm.fit(health_scaled)
    
    gm_score.append(gm.score(health_scaled))
    gm_aic.append(gm.aic(health_scaled))
    gm_bic.append(gm.bic(health_scaled))

# plot the results: one figure per criterion.
# The titles name the health measures; the previous 'AUDIT-C Questions' title was a
# copy-paste leftover and did not describe the data being modelled here.
gmm_diagnostics = [
    (gm_score, 'Mean log-likelihood per sample'),
    (gm_aic, 'AIC'),
    (gm_bic, 'BIC'),
]
for scores, score_label in gmm_diagnostics:
    plt.plot(k_range, scores)
    plt.title('Health measures GMM')
    plt.xlabel('Number of clusters')
    plt.ylabel(score_label)
    plt.show()

In [None]:
#choose k=2 clusters and fit data
km_2 = KMeans(n_clusters=2,random_state=39)
km_2.fit(health_scaled)

#one 'cluster_<n>' label per row of health_scaled, in row order
kmeans_labels = ["cluster_" + str(label) for label in km_2.labels_]

#new df for cluster info
#bring TBIID / Group back as ordinary columns; the guard keeps the cell safe to re-run
#(a second reset_index would otherwise add a spurious 'index' column)
if 'TBIID' not in data_allgroups_health.columns:
    data_allgroups_health = data_allgroups_health.reset_index()
data_allgroups_health['kmeans_cluster'] = kmeans_labels

#the labels are attached to data_first_cluster by position, so both frames must list
#the same participants in the same order - fail loudly instead of mislabelling rows
if not np.array_equal(data_first_cluster['TBIID'].values, data_allgroups_health['TBIID'].values):
    raise ValueError('data_first_cluster and data_allgroups_health are not row-aligned on TBIID; '
                     'cluster labels cannot be assigned by position')
#assign() builds a new frame, so no column is written into a slice of data_first
data_first_cluster = data_first_cluster.assign(kmeans_cluster=kmeans_labels)
data_allgroups_health.head(1)

In [None]:
#viz cluster grouping: mean of every clustered health measure per K-means cluster
#(68% confidence-interval error bars)
health_cols = ['BMI', 'BPSYS', 'BPDIAS', 'HRATE', 'BGlucose', 'BNa', 'BUN', 'BCreat']
#take the bar order from the clusters that actually exist, so a change of k needs no edit
#here and no empty slot is drawn for a cluster that was never fitted
cluster_order = sorted(data_allgroups_health['kmeans_cluster'].unique())
for variable in health_cols:
    try:
        plt.figure(figsize=(7,5))
        g = sns.barplot(x='kmeans_cluster', y=variable, data=data_allgroups_health, ci=68, palette="rocket", order=cluster_order)
        plt.show()
    except Exception as err:
        #drop the half-built figure and say which variable failed instead of hiding it
        plt.close()
        print('Skipped', variable, '->', repr(err))

In [None]:
#how many participants of each group ended up in each K-means cluster
clusters_by_group = data_allgroups_health.groupby(['Group'])['kmeans_cluster']
clusters_by_group.value_counts()

In [None]:
#viz cluster grouping against variables that were NOT used for clustering:
#mean per K-means cluster, split by Group (68% confidence-interval error bars)
health_cols = ['MnthSncBlst', 'ScreenAge', 'servconn', 'cestotal', 'ACRM_count', 'QKOIorA', 'QBlstExp',
              'PCLTot', 'PHQTot', 'PSQItot', 'NSITot', 'TBITot', 'auditc']
#take the bar order from the clusters that actually exist, so a change of k needs no edit
#here and no empty slot is drawn for a cluster that was never fitted
cluster_order = sorted(data_first_cluster['kmeans_cluster'].unique())
for variable in health_cols:
    try:
        plt.figure(figsize=(7,5))
        g = sns.barplot(x='kmeans_cluster', y=variable, data=data_first_cluster, ci=68, palette="rocket", hue='Group', order=cluster_order)
        plt.show()
    except Exception as err:
        #drop the half-built figure and say which variable failed instead of hiding it
        plt.close()
        print('Skipped', variable, '->', repr(err))

In [None]:
#exclude 3rd visit seq
#(.copy() so the new column below is written to an independent frame, not a slice of data)
visits = [1,2]
data_v12 = data[data['VisitSeq'].isin(visits)].copy()
#get TBIIDs of participants with a second visit and use to filter
TBIID_v2 = data_v12.loc[data_v12['VisitSeq'] == 2, 'TBIID'].values

#create new column for if came to second visitseq - want to see if params at visitseq 1 map on to whether they came back again
#vectorised membership test: 1 if the participant has a visit-2 row, else 0
data_v12['visit2'] = data_v12['TBIID'].isin(TBIID_v2).astype(int)

#create new df with only participants who came to both visits 1 and 2
data_v12_only = data_v12[data_v12['TBIID'].isin(TBIID_v2)]
print(data_v12_only.shape)
#rows per visit and group; size() replaces value_counts() on the column that is
#already a grouping key, which only repeated the key as an extra index level
print(data_v12_only.groupby(['VisitSeq', 'Group']).size())
data_v12_only.head()