The goal of this analysis script is to load a file of de-identified alcohol data (selected by the user and read as CSV) and analyze it.

Import modules

In [None]:
import pandas as pd
import numpy as np
import re
import os
from scipy import stats

# visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raise pandas' display limits so wide/long tables are not truncated in cell output
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

Open dialog box and prompt user to select file containing the data 

In [None]:
# Ask the user for the data file with a native file-open dialog.
from tkinter import Tk, filedialog

root = Tk()
root.withdraw()  # hide the empty main window; only the dialog is needed
# The chosen file is read with pd.read_csv, so the dialog filters on CSV files
# (with an "all files" fallback) instead of JPEG images.
root.filename = filedialog.askopenfilename(
    initialdir = "/",
    title = "Select file",
    filetypes = (("CSV files", "*.csv"), ("all files", "*.*")),
)
root.destroy()  # close the Tk window; the root.filename attribute stays readable afterwards
print(root.filename)

Create a pandas DataFrame from the selected file (read as CSV)

In [None]:
# Read the selected file; read_csv already returns a DataFrame
data = pd.read_csv(root.filename)
print(data.shape)
# Exclude the STBI, VTBI and NDC groups from the analysis
df = data[~data['Group'].isin(['STBI', 'VTBI', 'NDC'])]
print(df.shape)
# Each step returns a new frame, so nothing is written into the filtered slice
# (in-place edits on a slice trigger pandas' SettingWithCopyWarning).
df = (
    df.assign(Number = np.arange(len(df.index)))   # running row number, used as an index level
      .replace(to_replace = 'TBI', value = 'mTBI')   # relabel any cell equal to 'TBI' as 'mTBI'
      .set_index(['Number', 'Group'])
)
df.head()

In [None]:
# Map the source column codes to readable names (each source column listed once)
column_names = {
    'PSQItot': 'Sleep_tot',
    'LECTotal': 'LifeEvents_tot',
    'QKOIorA': 'KnockOut_dep',
    'QKOExpMil': 'KnockOut_mil',
    'QKOAllMil': 'KnockOut_anymil',
    'QKOLife': 'KnockOut_life',
    'QBlstExp': 'Blast_mil',
    'QBEIorA': 'Blast_dep',
}
# Rebind instead of inplace=True so re-running the cell never acts on a half-modified frame
df = df.rename(columns = column_names)

In [None]:
# Descriptive statistics of every column, computed separately for each group
group_summary = df.groupby(["Group"]).describe()
group_summary

We will use the interquartile range (IQR) to identify outliers within each group. An outlier is defined as any data point more than 1.5 IQRs below the first quartile or more than 1.5 IQRs above the third quartile.

In [None]:
# Move the innermost index level into the columns so every (column, group) pair
# becomes its own column; outliers can then be computed for each group individually
unstack = df.unstack(-1)
unstack.head(n = 5)

In [None]:
# Quartiles, IQR and the 1.5 * IQR fences for each parameter within each group
quartile_1, quartile_3 = (unstack.quantile(q) for q in (0.25, 0.75))
iqr = quartile_3 - quartile_1
fence_width = iqr * 1.5
lower_bound = quartile_1 - fence_width
upper_bound = quartile_3 + fence_width
lower_bound.head()

In [None]:
# Mask every value outside [lower_bound, upper_bound] with NaN.
# Note: despite its name, `outliers` holds the values that were kept.
within_bounds = (unstack >= lower_bound) & (unstack <= upper_bound)
outliers = unstack.where(within_bounds)
# Stack the group level back into the rows to restore the original orientation,
# then turn the index into columns and discard the running row number
df_no_outliers = outliers.stack().reset_index().drop(columns = 'Number')
print(df_no_outliers.shape)
df_no_outliers.head()

In [None]:
# Column dtypes and non-null counts after outlier removal
df_no_outliers.info()

In [None]:
# Number of non-missing values per column, smallest first
non_null_counts = df_no_outliers.count(axis = 0).sort_values()
print(non_null_counts)

In [None]:
# AUDIT-C dataframe: the first five columns of df_no_outliers
# NOTE(review): the selection is positional; presumably Group plus the four AUDIT columns. Confirm the column order.
audit_columns = df_no_outliers.columns[:5]
audit = pd.DataFrame(data = df_no_outliers[audit_columns])
print(audit.shape)
audit_non_null = audit.count(axis = 0).sort_values()
print(audit_non_null)
audit.head()

In [None]:
# Proportion of each score within each group, per AUDIT-C item (for visualization)
audit_by_group = audit.groupby(['Group'])
AUDIT1 = audit_by_group.AUDIT1.value_counts(normalize = True)
AUDIT2 = audit_by_group.AUDIT2.value_counts(normalize = True)
AUDIT3 = audit_by_group.AUDIT3.value_counts(normalize = True)
AUDITtot = audit_by_group.AUDITtot.value_counts(normalize = True)
# keys= names the columns explicitly. pandas 2.0+ names every normalized value_counts
# result 'proportion', which would otherwise give four identically named columns.
AUDIT_counts_norm = pd.concat([AUDIT1, AUDIT2, AUDIT3, AUDITtot], axis = 1,
                              keys = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot'])
# A (group, score) pair that does not occur for an item has no entry after the concat; use 0
AUDIT_counts_norm = AUDIT_counts_norm.fillna(value = 0)
# Name both index levels so reset_index() yields 'Group' and 'Score' columns directly
AUDIT_counts_norm = AUDIT_counts_norm.rename_axis(index = ['Group', 'Score']).reset_index()
# Alias (not a copy): refers to the same flat table as AUDIT_counts_norm
AUDIT_index_norm = AUDIT_counts_norm
AUDIT_counts_norm

In [None]:
# Raw (non-normalized) count of each score within each group, per AUDIT-C item (for viz and stats)
audit_by_group = audit.groupby(['Group'])
AUDIT1 = audit_by_group.AUDIT1.value_counts()
AUDIT2 = audit_by_group.AUDIT2.value_counts()
AUDIT3 = audit_by_group.AUDIT3.value_counts()
AUDITtot = audit_by_group.AUDITtot.value_counts()
# keys= names the columns explicitly. pandas 2.0+ names every value_counts result
# 'count', which would otherwise give four identically named columns.
AUDIT_counts = pd.concat([AUDIT1, AUDIT2, AUDIT3, AUDITtot], axis = 1,
                         keys = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot'])
# A (group, score) pair that does not occur for an item has no entry after the concat; use 0
AUDIT_counts = AUDIT_counts.fillna(value = 0)
# Name both index levels so reset_index() yields 'Group' and 'Score' columns directly
AUDIT_counts = AUDIT_counts.rename_axis(index = ['Group', 'Score']).reset_index()
# Alias (not a copy): refers to the same flat table, with 'Group' and 'Score' as columns
AUDIT_index = AUDIT_counts
AUDIT_counts.head(30)

In [None]:
# Color list passed as the seaborn `palette` in the bar plots
colors = ['blue', 'red']

In [None]:
#add extra rows to dataframe for graphing and stats
def extra_levels(data, i, j, group = 'DC'):
    """Append ``j`` zero-count rows for consecutive scores starting at ``i``.

    Parameters
    ----------
    data : pandas.DataFrame
        Count table to pad; it is not modified.
    i : int
        Score of the first padding row; later rows use ``i + 1``, ``i + 2``, ...
    j : int
        Number of padding rows to append. With ``j <= 0`` ``data`` is returned as is.
    group : str, default 'DC'
        Value written to the 'Group' column of the padding rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the padding rows, with a fresh 0..n-1 index.
        Padding rows hold 0 in 'AUDIT1', 'AUDIT2', 'AUDIT3' and 'AUDITtot'.
    """
    if j <= 0:
        return data
    # Build all padding rows at once with numeric columns, so the count columns
    # of the result keep a numeric dtype instead of degrading to object.
    zeros = [0] * j
    padding = pd.DataFrame({
        'Group': [group] * j,
        'Score': [i + offset for offset in range(j)],
        'AUDIT1': zeros,
        'AUDIT2': zeros,
        'AUDIT3': zeros,
        'AUDITtot': zeros,
    })
    return pd.concat([data, padding], axis = 0, ignore_index = True)

In [None]:
# Pad the raw-count table with three zero-count rows (scores 8, 9 and 10)
AUDIT_chi_data = extra_levels(data = AUDIT_index, i = 8, j = 3)
AUDIT_chi_data.shape

In [None]:
# Subsets of the normalized table: scores below 5 and scores below 3
norm_score = AUDIT_counts_norm['Score']
AUDIT_counts_4 = AUDIT_counts_norm[norm_score < 5]
AUDIT_counts_2 = AUDIT_counts_norm[norm_score < 3]
AUDIT_counts_2

In [None]:
# Bar chart of the AUDIT-C total-score distribution, one bar per group at each score
fig = plt.figure(figsize=(25, 20))
ax = sns.barplot(x = 'Score', y = 'AUDITtot', hue = 'Group', data = AUDIT_counts_norm, alpha = 0.9, palette = colors)
ax.set_title('AUDIT-C: Total score (normalized)', fontsize = 50)
# AUDIT_counts_norm comes from value_counts(normalize=True), so the bars are
# within-group proportions, not counts
ax.set_ylabel('Proportion', fontsize = 60)
ax.set_xlabel('Score', fontsize = 60)
plt.yticks(fontsize = 30)
plt.xticks(fontsize = 30)
ax.legend(loc = 'upper right', fontsize = 30)
# optional annotation: plt.text(1, 1, 'p < .085 ', fontsize=30)
fig.savefig('AUDIT-C_total.jpeg')

In [None]:
# Chi-square test on the padded AUDIT-C total-score counts: one table row per group
total_counts = AUDIT_chi_data.AUDITtot
is_mtbi = AUDIT_chi_data.Group == 'mTBI'
is_dc = AUDIT_chi_data.Group == 'DC'
stats.chi2_contingency([total_counts[is_mtbi], total_counts[is_dc]])

In [None]:
# One-sided Mann-Whitney U test: is AUDITtot greater in mTBI than in DC?
mtbi_scores = audit.loc[audit.Group == 'mTBI', 'AUDITtot'].dropna(axis = 0)
dc_scores = audit.loc[audit.Group == 'DC', 'AUDITtot'].dropna(axis = 0)
stats.mannwhitneyu(mtbi_scores, dc_scores, alternative = 'greater')

In [None]:
# Bar chart of the AUDIT-C question 1 score distribution (scores below 5), by group
fig = plt.figure(figsize=(25, 20))
ax = sns.barplot(x = 'Score', y = 'AUDIT1', hue = 'Group', data = AUDIT_counts_4, alpha = 0.9, palette = colors)
ax.set_title('Q1: How often do you have a drink containing alcohol?', fontsize = 50)
# AUDIT_counts_4 is a subset of the normalized table, so the bars are
# within-group proportions, not counts
ax.set_ylabel('Proportion', fontsize = 60)
ax.set_xlabel('Score', fontsize = 60)
plt.yticks(fontsize = 30)
plt.xticks(fontsize = 30)
ax.legend(loc = 'upper right', fontsize = 30)
# optional annotation: plt.text(3.2, 32.6, 'p < .045 ', fontsize=30)
fig.savefig('AUDIT-C_q1.jpeg')

In [None]:
# Chi-square test for question 1 (scores below 5). chi2_contingency expects observed
# frequencies, so the table is built from the raw counts in AUDIT_counts rather than
# from the normalized proportions in AUDIT_counts_4.
q1_counts = AUDIT_counts[AUDIT_counts['Score'] < 5]
stats.chi2_contingency([q1_counts.AUDIT1[q1_counts.Group == 'mTBI'], q1_counts.AUDIT1[q1_counts.Group == 'DC']])

In [None]:
# One-sided Mann-Whitney U test: is AUDIT1 greater in mTBI than in DC?
mtbi_scores = audit.loc[audit.Group == 'mTBI', 'AUDIT1'].dropna(axis = 0)
dc_scores = audit.loc[audit.Group == 'DC', 'AUDIT1'].dropna(axis = 0)
stats.mannwhitneyu(mtbi_scores, dc_scores, alternative = 'greater')

In [None]:
# Bar chart of the AUDIT-C question 2 score distribution (scores below 5), by group
fig = plt.figure(figsize=(25, 20))
ax = sns.barplot(x = 'Score', y = 'AUDIT2', hue = 'Group', data = AUDIT_counts_4, alpha = 0.9, palette = colors)
ax.set_title('Q2: How many drinks containing alcohol \n do you have on a typical day when you are drinking?', fontsize = 50)
# AUDIT_counts_4 is a subset of the normalized table, so the bars are
# within-group proportions, not counts
ax.set_ylabel('Proportion', fontsize = 60)
ax.set_xlabel('Score', fontsize = 60)
plt.yticks(fontsize = 30)
plt.xticks(fontsize = 30)
ax.legend(loc = 'upper right', fontsize = 30)
# optional annotation: plt.text(3.2, 67, 'p < .043 ', fontsize=30)
fig.savefig('AUDIT-C_q2.jpeg')

In [None]:
# Chi-square test for question 2 (scores below 5). chi2_contingency expects observed
# frequencies, so the table is built from the raw counts in AUDIT_counts rather than
# from the normalized proportions in AUDIT_counts_4.
q2_counts = AUDIT_counts[AUDIT_counts['Score'] < 5]
stats.chi2_contingency([q2_counts.AUDIT2[q2_counts.Group == 'mTBI'], q2_counts.AUDIT2[q2_counts.Group == 'DC']])

In [None]:
# One-sided Mann-Whitney U test: is AUDIT2 greater in mTBI than in DC?
mtbi_scores = audit.loc[audit.Group == 'mTBI', 'AUDIT2'].dropna(axis = 0)
dc_scores = audit.loc[audit.Group == 'DC', 'AUDIT2'].dropna(axis = 0)
stats.mannwhitneyu(mtbi_scores, dc_scores, alternative = 'greater')

In [None]:
# Bar chart of the AUDIT-C question 3 score distribution (scores below 3), by group
fig = plt.figure(figsize=(25, 20))
ax = sns.barplot(x = 'Score', y = 'AUDIT3', hue = 'Group', data = AUDIT_counts_2, alpha = 0.9, palette = colors)
ax.set_title('Q3: How often do you have six or more drinks on one occasion?', fontsize = 50)
# AUDIT_counts_2 is a subset of the normalized table, so the bars are
# within-group proportions, not counts
ax.set_ylabel('Proportion', fontsize = 60)
ax.set_xlabel('Score', fontsize = 60)
plt.yticks(fontsize = 30)
plt.xticks(fontsize = 30)
ax.legend(loc = 'upper right', fontsize = 30)
# optional annotation: plt.text(1.75, 42, 'p > .05 ', fontsize=30)
fig.savefig('AUDIT-C_q3.jpeg')

In [None]:
# Chi-square test for question 3 (scores below 3). chi2_contingency expects observed
# frequencies, so the table is built from the raw counts in AUDIT_counts rather than
# from the normalized proportions in AUDIT_counts_2.
q3_counts = AUDIT_counts[AUDIT_counts['Score'] < 3]
stats.chi2_contingency([q3_counts.AUDIT3[q3_counts.Group == 'mTBI'], q3_counts.AUDIT3[q3_counts.Group == 'DC']])

In [None]:
# One-sided Mann-Whitney U test: is AUDIT3 greater in mTBI than in DC?
mtbi_scores = audit.loc[audit.Group == 'mTBI', 'AUDIT3'].dropna(axis = 0)
dc_scores = audit.loc[audit.Group == 'DC', 'AUDIT3'].dropna(axis = 0)
stats.mannwhitneyu(mtbi_scores, dc_scores, alternative = 'greater')

In [None]:
#do AUDIT-C scores correlate differently across groups?
# Pairwise regression plots of the audit columns, colored by group; rows with any missing value are dropped first
pp = sns.pairplot(audit.dropna(axis = 0), hue = 'Group', kind = "reg", dropna = True)
# Set y limits on the first-column panels of rows 0-2 and the x limit on the row-3 panel
pp.axes[0,0].set_ylim([0,5])
pp.axes[1,0].set_ylim([0,5])
pp.axes[2,0].set_ylim([0,3])
pp.axes[3,0].set_xlim([0,11])
# Set x limits on the first-row panels
# NOTE(review): axes[0,0] and axes[3,0] are both in column 0; if the grid shares x axes per column, this [0,4] limit replaces the [0,11] set above. Confirm.
pp.axes[0,0].set_xlim([0,4])
pp.axes[0,1].set_xlim([0,4])
pp.axes[0,2].set_xlim([0,2])
pp.axes[0,3].set_xlim([0,10])

In [None]:
# Show the column names available in df_no_outliers
df_no_outliers.columns.values

In [None]:
# Columns examined in the correlation / comparison steps, in display order
df_dep_var = [
    # AUDIT-C items and total
    'AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot',
    # questionnaire totals
    'Sleep_tot', 'PHQTot', 'PCLTot', 'CAPSTotal', 'PTSD_YN', 'LifeEvents_tot', 'NSITot',
    # knock-out and blast columns
    'KnockOut_dep', 'KnockOut_mil', 'KnockOut_anymil', 'KnockOut_life',
    'Blast_mil', 'QBEACRM', 'Blast_dep',
    # remaining measures
    'DA', 'DOPA', 'NE', 'LPHRate', 'LPBPSys', 'LPBPDias',
]

In [None]:
# Do AUDIT-C scores correlate with the other measures differently across groups?
audit_items = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot']
sns.pairplot(df_no_outliers, hue = 'Group', kind = "reg", x_vars = df_dep_var, y_vars = audit_items, dropna = True)

In [None]:
# Do AUDIT-C scores correlate with the other measures within the mTBI group?
audit_items = ['AUDIT1', 'AUDIT2', 'AUDIT3', 'AUDITtot']
mtbi_rows = df_no_outliers[df_no_outliers.Group == 'mTBI']
sns.pairplot(mtbi_rows, kind = "reg", x_vars = df_dep_var, y_vars = audit_items, dropna = True)

In [None]:
#within the blast group, what parameters correlate?
mtbi_rows = df_no_outliers[df_no_outliers['Group'] == 'mTBI']
# Correlate the numeric columns only: the text 'Group' column makes
# DataFrame.corr() raise on pandas 2.0+, where non-numeric columns are no longer dropped silently
corr = mtbi_rows.select_dtypes(include = 'number').corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Subset holding the group label and the three CSF measurement columns
monoamine_columns = ['Group', 'DA', 'DOPA', 'NE']
df_monoamines = df_no_outliers[monoamine_columns]

In [None]:
# Shape and per-column non-null counts before removing incomplete rows
print(df_monoamines.shape)
print(df_monoamines.count(axis = 0).sort_values())
# Rebind instead of dropping in place: df_monoamines is a column slice of
# df_no_outliers, and inplace edits on a slice trigger pandas' SettingWithCopyWarning
df_monoamines = df_monoamines.dropna(axis = 0)
print(df_monoamines.shape)
df_monoamines.head()

In [None]:
# Bar plot of CSF DA by group
fig = plt.figure(figsize=(10, 10))
ax = sns.barplot(data = df_no_outliers, x = "Group", y = "DA", alpha = 0.9, palette = colors)
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel(' ', fontsize = 40)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DA_bar.jpeg')

In [None]:
# Independent-samples t-test of DA: DC vs mTBI
da_dc = df_monoamines.loc[df_monoamines.Group == 'DC', 'DA']
da_mtbi = df_monoamines.loc[df_monoamines.Group == 'mTBI', 'DA']
stats.ttest_ind(da_dc, da_mtbi)

In [None]:
# Bar plot of DOPA by group
sns.barplot(data = df_no_outliers, x = "Group", y = "DOPA")

In [None]:
# Independent-samples t-test of DOPA: DC vs mTBI
dopa_dc = df_monoamines.loc[df_monoamines.Group == 'DC', 'DOPA']
dopa_mtbi = df_monoamines.loc[df_monoamines.Group == 'mTBI', 'DOPA']
stats.ttest_ind(dopa_dc, dopa_mtbi)

In [None]:
# Bar plot of NE by group
sns.barplot(data = df_no_outliers, x = "Group", y = "NE")

In [None]:
# Independent-samples t-test of NE: DC vs mTBI
ne_dc = df_monoamines.loc[df_monoamines.Group == 'DC', 'NE']
ne_mtbi = df_monoamines.loc[df_monoamines.Group == 'mTBI', 'NE']
stats.ttest_ind(ne_dc, ne_mtbi)

In [None]:
# Keep only the rows with a positive DA value (this also drops rows where DA is missing)
has_da = df_no_outliers.DA > 0
df_DA = df_no_outliers[has_da]

In [None]:
# Does CSF DA correlate with the other measures differently across groups?
sns.pairplot(df_DA, hue = 'Group', kind = "reg", x_vars = df_dep_var, y_vars = ['DA'], dropna = True)

In [None]:
# Correlation matrix computed separately within each group, drawn as one annotated heatmap
corr = df_DA.groupby(["Group"]).corr()
fig, ax = plt.subplots(figsize = (20, 20))
sns.heatmap(corr, annot = True, ax = ax)

In [None]:
# Does CSF DA correlate with the other measures within the mTBI group?
mtbi_da = df_DA[df_DA.Group == 'mTBI']
sns.pairplot(mtbi_da, kind = "reg", x_vars = df_dep_var, y_vars = ['DA'], dropna = True)

In [None]:
#create pearson correlation statistic function
def da_pearson(data, group, param):
    """Pearson correlation between ``param`` and ``DA`` within one group.

    A row is used only when its ``Group`` equals ``group``, its ``param``
    value is greater than 0 (which also excludes missing ``param`` values)
    and its ``DA`` value is not missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Group', 'DA' and ``param``.
    group : str
        Group label to select.
    param : str
        Name of the column to correlate with 'DA'.

    Returns
    -------
    tuple
        ``(r_value, p_value)`` from ``scipy.stats.pearsonr``; ``(nan, nan)``
        when fewer than two usable rows remain, since a correlation cannot
        be computed from them.
    """
    # Build the row filter once; a missing DA would otherwise turn the result into NaN
    usable = (data.Group == group) & (data[param] > 0) & data['DA'].notna()
    selected = data[usable]
    if len(selected) < 2:
        return float('nan'), float('nan')
    r_value, p_value = stats.pearsonr(selected[param], selected['DA'])
    return r_value, p_value

In [None]:
# Correlate DA with every parameter in the mTBI group; maps parameter name -> (r, p)
scores = {param: da_pearson(df_DA, 'mTBI', param) for param in df_dep_var}

In [None]:
# Display the da_pearson result stored for each parameter
scores

In [None]:
# Regression plot of DA against Blast_dep, mTBI group only
fig = plt.figure(figsize=(10, 10))
mtbi_da = df_DA[df_DA.Group == 'mTBI']
ax = sns.regplot(data = mtbi_da, x = 'Blast_dep', y = 'DA', color = 'r')
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel('Blast exposures ', fontsize = 40)
ax.text(30, 27, 'p < .05', fontsize = 20)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DAvsBlast.jpeg')

In [None]:
# Regression plot of DA against AUDITtot, mTBI group only
fig = plt.figure(figsize=(10, 10))
mtbi_da = df_DA[df_DA.Group == 'mTBI']
ax = sns.regplot(data = mtbi_da, x = 'AUDITtot', y = 'DA', color = 'r')
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel('AUDIT-C Total', fontsize = 40)
ax.text(9, 16, 'p > .05', fontsize = 20)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DAvsAUDITtot.jpeg')

In [None]:
# Regression plot of DA against PCLTot, mTBI group only
fig = plt.figure(figsize=(10, 10))
mtbi_da = df_DA[df_DA.Group == 'mTBI']
ax = sns.regplot(data = mtbi_da, x = 'PCLTot', y = 'DA', color = 'r')
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel('PCL Total', fontsize = 40)
ax.text(70, 20, 'p < .01', fontsize = 20)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DAvsPCLtot.jpeg')

In [None]:
# Regression plot of DA against NSITot, mTBI group only
fig = plt.figure(figsize=(10, 10))
mtbi_da = df_DA[df_DA.Group == 'mTBI']
ax = sns.regplot(data = mtbi_da, x = 'NSITot', y = 'DA', color = 'r')
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel('NSI Total', fontsize = 40)
ax.text(60, 20, 'p < .06', fontsize = 20)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DAvsNSItot.jpeg')

In [None]:
# Regression plot of DA against PHQTot, mTBI group only
fig = plt.figure(figsize=(10, 10))
mtbi_da = df_DA[df_DA.Group == 'mTBI']
ax = sns.regplot(data = mtbi_da, x = 'PHQTot', y = 'DA', color = 'r')
ax.set_ylabel('CSF DA (pg/ml)', fontsize = 40)
ax.set_xlabel('PHQ9 Total', fontsize = 40)
ax.text(23, 20, 'p < .05', fontsize = 20)
plt.yticks(fontsize = 25)
plt.xticks(fontsize = 25)
fig.savefig('DAvsPHQtot.jpeg')

In [None]:
# One bar chart per parameter: group means with the standard error of the mean as error bars
param_names = list(df_dep_var)
for param in param_names:
    by_group = df_no_outliers.groupby("Group")[param]   # group once, reuse for mean and SEM
    by_group.mean().plot(kind = 'bar', yerr = by_group.sem())
    # Pass the name itself; a one-element list would be rendered as "['name']"
    plt.ylabel(param)
    plt.show()

In [None]:
# Export the AUDIT_counts table to an Excel workbook.
# The context manager saves and closes the file on exit; ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('AUDIT_counts.xlsx', engine='xlsxwriter') as writer:
    AUDIT_counts.to_excel(writer, sheet_name='Sheet1')

In [None]:
# Export ST_data to an Excel workbook.
# NOTE(review): ST_data is not defined in any cell visible in this notebook; it must be
# created before this cell runs, otherwise a NameError is raised on a fresh kernel.
# The context manager saves and closes the file on exit; ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('ST.xlsx', engine='xlsxwriter') as writer:
    ST_data.to_excel(writer)