In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Widen pandas' display limits so wide survey frames print without truncation.
for display_option, limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(display_option, limit)

# Silence every warning category from here on.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings.
import warnings
warnings.simplefilter('ignore')

# Print numpy floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Location of the merged survey CSV. Set the PESKIND_TBI_DATA environment
# variable to read a copy stored elsewhere; when it is unset the original
# lab path is used, so existing runs behave as before.
data_path = os.environ.get(
    'PESKIND_TBI_DATA',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/merge_data_first.csv',
)

In [None]:
# Read the CSV containing data from all surveys.
# pd.read_csv already returns a DataFrame, so it is not re-wrapped.
data = pd.read_csv(data_path)

print('Original data shape:\n', data.shape, '\n')
# DataFrame.info() prints its own summary and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
data.info()
data.head(1)

In [None]:
# Index the rows by the TBIID and Group columns. The guard keeps this cell
# re-runnable: once those columns have moved into the index, calling
# set_index on the same names again would raise KeyError.
if list(data.index.names) != ['TBIID', 'Group']:
    data = data.set_index(['TBIID', 'Group'])

## AUDIT-C clustering with C and T

In [None]:
# Select the three AUDIT item columns and keep only rows where all are present.
auditc_items = ['AUDIT1', 'AUDIT2', 'AUDIT3']
auditc_all_rows = data[auditc_items]
print(auditc_all_rows.shape)

# Build an independent frame rather than calling dropna(inplace=True) on a
# column slice of `data`: a later cell adds a column to this frame, and
# mutating a slice in place is the chained-assignment pattern pandas warns
# about (the warning is hidden by the global warnings filter).
data_allgroups_auditqs = auditc_all_rows.dropna(axis=0).copy()
print(data_allgroups_auditqs.shape)
data_allgroups_auditqs.head()

In [None]:
# Center and scale the three AUDIT items.
# The columns are selected explicitly so this cell still works when re-run
# after the string-valued 'kmeans_cluster' column has been added to the frame.
scaler = StandardScaler()

data_allgroups_auditqs_scaled = scaler.fit_transform(
    data_allgroups_auditqs[['AUDIT1', 'AUDIT2', 'AUDIT3']]
)

In [None]:
# Silhouette coefficient of a k-means fit for every k from 2 through 9.
k_range = range(2, 10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, random_state=1).fit(data_allgroups_auditqs_scaled)
    scores.append(silhouette_score(data_allgroups_auditqs_scaled, km_ss.labels_))

# Plot the score against the number of clusters on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set_title('data_allgroups_auditqs_scaled clustering')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette Coefficient')

In [None]:
# Fit the 3-cluster k-means model and tag every row with "cluster_<label>".
km_3_blast = KMeans(n_clusters=3, random_state=1).fit(data_allgroups_auditqs_scaled)
data_allgroups_auditqs['kmeans_cluster'] = [
    "cluster_{}".format(label) for label in km_3_blast.labels_
]
data_allgroups_auditqs.head(1)

In [None]:
# Count how many rows of each Group fall into each k-means cluster.
clusters_by_group = data_allgroups_auditqs.groupby('Group')['kmeans_cluster']
clusters_by_group.value_counts()

In [None]:
# Add the cluster column to the initial df.
# Both frames carry the same (TBIID, Group) index, so the labels are aligned
# on that index in one step; rows that were dropped for missing AUDIT items
# have no label and receive NaN. This replaces a per-participant loop that
# rebuilt a boolean mask on every iteration and relied on a bare `except:`
# to fill the missing rows, which also hid any genuine error.
# NOTE(review): assumes (TBIID, Group) pairs are unique in the index — confirm.
participants = data.index.get_level_values('TBIID')  # name kept in case later cells use it

data['kmeans_cluster_AUDITC'] = data_allgroups_auditqs['kmeans_cluster'].reindex(data.index)
data.head()

In [None]:
#plt.style.use('seaborn-white')
auditc_cols = ['AUDITtot', 'AUDIT1', 'AUDIT2', 'AUDIT3']

# Move the index levels back into columns so 'Group' can be used as the hue.
# Guarded so that re-running the cell does not reset an already-flat index
# a second time.
if 'Group' in data.index.names:
    data = data.reset_index()

# One bar chart per AUDIT measure: cluster on the x-axis, split by Group.
# NOTE(review): newer seaborn releases replace `ci=` with `errorbar=` — confirm
# against the installed version before changing it.
for variable in auditc_cols:
    fig = plt.figure(figsize=(7,7))
    try:
        g = sns.barplot(x='kmeans_cluster_AUDITC', y=variable, data=data, ci=68, hue='Group', palette="rocket", order=["cluster_0", "cluster_1", "cluster_2"])
        plt.show()
    except Exception as err:
        # Report the failure instead of discarding it silently, and close the
        # half-built figure so it is not left open. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt stop the loop.
        print('Skipping {}: {!r}'.format(variable, err))
        plt.close(fig)

In [None]:
# Write the frame, including the cluster label column, to disk.
# NOTE(review): the row index is written as the first CSV column — confirm
# downstream readers expect that.
clustered_csv = 'data_auditc_clustering.csv'
data.to_csv(clustered_csv)

In [None]:
# Tabulate how often each AUDIT value occurs within every Group x cluster
# combination, then write each table to its own CSV.
by_group_and_cluster = data.groupby(['Group', 'kmeans_cluster_AUDITC'])

AUDIT_tot_counts_cluster = by_group_and_cluster['AUDITtot'].value_counts()
AUDIT_1_counts_cluster = by_group_and_cluster['AUDIT1'].value_counts()
AUDIT_2_counts_cluster = by_group_and_cluster['AUDIT2'].value_counts()
AUDIT_3_counts_cluster = by_group_and_cluster['AUDIT3'].value_counts()

for counts, csv_name in (
    (AUDIT_tot_counts_cluster, 'AUDIT_tot_counts_cluster.csv'),
    (AUDIT_1_counts_cluster, 'AUDIT_1_counts_cluster.csv'),
    (AUDIT_2_counts_cluster, 'AUDIT_2_counts_cluster.csv'),
    (AUDIT_3_counts_cluster, 'AUDIT_3_counts_cluster.csv'),
):
    counts.to_csv(csv_name)

In [None]:
# Column names to plot against the AUDIT-C cluster assignment; the loop after
# this list draws one bar chart per entry with that entry on the y-axis.
# Note that the final entry, 'kmeans_cluster_AUDITC', is also the column that
# loop uses for the x-axis.
variables_long = ['GType', 'Race', 'Hispanic', 'Handedness',
       'ScreenAge', 'Education', 'PsyEduc', 'cestotal', 'NSITot',
       'TBITot', 'CAPSTotal', 'PCLTot', 'PTSD_YN', 'PSQItot', 'PHQTot',
       'BNITotIm', 'BISTot', 'AUDITtot', 'AUDIT1', 'AUDIT2', 'AUDIT3',
       'dopac_da_ratio', 'DA', 'DOPA', 'NE', 'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife',
       'QBlstExp', 'QBEACRM', 'MnthSncBlst', 'QEDist_sum', 'QEDist_mean',
       'DvpHA', 'DvpHAAct', 'DvpHASlp', 'DvpHAMd', 'DvpHAStr', 'DvpBP',
       'DvpBPAct', 'DvpBPslp', 'DvpBPMd', 'DvpBPStr', 'tbiDizzy',
       'tbiBalan', 'tbiCoord', 'tbiHeada', 'tbiNaus', 'tbiVision',
       'tbiLight', 'tbiHear', 'tbiNoise', 'tbiTingl', 'tbiTstsml',
       'tbiAppet', 'tbiConc', 'tbiForget', 'tbiDecis', 'tbiSlow',
       'tbiEnergy', 'tbiSleep', 'tbiAnx', 'tbiSad', 'tbiIrrit',
       'tbiOverw', 'tbiDisin', 'tbiWithd', 'tbiRing', 'tbiMoods',
       'tbiFight', 'tbiSpch', 'hrslp', 'PSQI1hr', 'PSQI1min', 'PSQI2',
       'PSQI3hr', 'PSQI3min', 'PSQI5a', 'PSQI5b', 'PSQI5c', 'PSQI5d',
       'PSQI5e', 'PSQI5f', 'PSQI5g', 'PSQI5h', 'PSQI5i', 'PSQI5j',
       'PSQI6', 'PSQI7', 'PSQI8', 'PSQI9', 'PSQIc1', 'PSQIc2', 'PSQIc3',
       'PSQIc4', 'PSQIc5', 'PSQIc6', 'PSQIc7', 'PCL1', 'PCL2', 'PCL3',
       'PCL4', 'PCL5', 'PCL6', 'PCL7', 'PCL8', 'PCL9', 'PCL10', 'PCL11',
       'PCL12', 'PCL13', 'PCL14', 'PCL15', 'PCL16', 'PCL17', 'PHQ1',
       'PHQ2', 'PHQ3', 'PHQ4', 'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9',
       'BNI1Im', 'BNI2Im', 'BNI3Im', 'BNI4Im', 'BNI5Im', 'BNI6Im',
       'BNI7Im', 'BNI8Im', 'BNI9Im', 'BNI10Im', 'BNI11', 'BISAtt',
       'BISMtr', 'BISNonpl', 'PETType', 'Frontal_Mid_l', 'Frontal_Mid_r',
       'Insula_l', 'Insula_r', 'Cingulum_Ant_l', 'Cingulum_Ant_r',
       'Amygdala_l', 'Amygdala_r', 'CaudateNucl_l', 'CaudateNucl_r',
       'Putamen_l', 'Putamen_r', 'Pallidum_l', 'Pallidum_r', 'Medulla',
       'Midbrain', 'Pons', 'kmeans_cluster_AUDITC']

# Draw and save one bar chart per variable: cluster on the x-axis, split by
# Group. Each chart is written to '<variable>_plot.png' in the working directory.
for variable in variables_long:
    fig = plt.figure(figsize=(7,5))
    try:
        sns.barplot(x='kmeans_cluster_AUDITC', y=variable, data=data, ci=68, hue='Group', palette="rocket", order=["cluster_0", "cluster_1", "cluster_2"])
        plt.savefig(variable + '_plot.png')
        plt.show()
    except Exception as err:
        # Some entries cannot be plotted; say which ones and why instead of
        # dropping them silently, and close the half-built figure so it is not
        # left open. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt stop the loop.
        print('Skipping {}: {!r}'.format(variable, err))
        plt.close(fig)

In [None]:
# Embed the scaled variables in two dimensions with t-SNE (fixed random_state).
# NOTE(review): data_TBI_variables and data_TBI_variables_scaled are not
# defined in any cell visible above — confirm the cell that builds them runs
# before this one.
tsne = TSNE(n_components=2, random_state=1234)
tsne_features = tsne.fit_transform(data_TBI_variables_scaled)

print(tsne_features.shape)
tsne_df = pd.DataFrame(data = tsne_features, columns = ['tsne_0', 'tsne_1'], index = data_TBI_variables.index)
# Drop any tsne_* columns left by an earlier run before attaching the new
# ones, so re-running this cell does not create duplicate column names.
data_TBI_variables = pd.concat(
    [data_TBI_variables.drop(columns=tsne_df.columns, errors='ignore'), tsne_df],
    axis = 1,
)

In [None]:
# Project the scaled variables onto two components with a linear-kernel PCA.
# NOTE(review): data_TBI_variables and data_TBI_variables_scaled are not
# defined in any cell visible above — confirm the cell that builds them runs
# before this one.
pca_ss = KernelPCA(n_components=2, kernel='linear', random_state=1)
kernel_pca_linear = pca_ss.fit_transform(data_TBI_variables_scaled)

print(kernel_pca_linear.shape)

kernel_pca_linear_df = pd.DataFrame(data = kernel_pca_linear, columns = ['pca_linear_0', 'pca_linear_1'], index = data_TBI_variables.index)
# Drop any pca_linear_* columns left by an earlier run before attaching the
# new ones, so re-running this cell does not create duplicate column names.
data_TBI_variables = pd.concat(
    [data_TBI_variables.drop(columns=kernel_pca_linear_df.columns, errors='ignore'), kernel_pca_linear_df],
    axis = 1,
)

In [None]:
# Project the scaled variables onto two components with an RBF-kernel PCA.
# NOTE(review): data_TBI_variables and data_TBI_variables_scaled are not
# defined in any cell visible above — confirm the cell that builds them runs
# before this one.
pca_ss = KernelPCA(n_components=2, kernel='rbf', random_state=1)
kernel_pca_rbf = pca_ss.fit_transform(data_TBI_variables_scaled)

print(kernel_pca_rbf.shape)

kernel_pca_rbf_df = pd.DataFrame(data = kernel_pca_rbf, columns = ['pca_rbf_0', 'pca_rbf_1'], index = data_TBI_variables.index)
# Drop any pca_rbf_* columns left by an earlier run before attaching the new
# ones: with duplicated names, selecting 'pca_rbf_0' would return two columns
# instead of one.
data_TBI_variables = pd.concat(
    [data_TBI_variables.drop(columns=kernel_pca_rbf_df.columns, errors='ignore'), kernel_pca_rbf_df],
    axis = 1,
)

In [None]:
# One scatter plot of the two RBF kernel-PCA components per variable, with the
# points colored by that variable.
# NOTE(review): `variables` is not defined in the cells visible above (only
# `variables_long` is) — confirm where it comes from.
for variable in variables:
    fig, ax = plt.subplots(figsize=(7,7))
    sns.scatterplot(
        x = 'pca_rbf_0',
        y = 'pca_rbf_1',
        data = data_TBI_variables,
        hue=variable,
        ax=ax,
    )
    plt.show()
#plt.title("Projection of the data on 2 components + ground truth labels")