## AUDIT-C clustering with C and T from first visit sequence

Are there within and/or between group differences across the AUDIT-C questions? Use kmeans cluster analysis to explore.

- Question 1: How often do you have a drink containing alcohol? 
    - Response categories: “never”, “monthly or less”, “2 to 4 times/month”, “2 to 3 times/week”, or “4 or more times/week”
- Question 2: How many drinks do you have on a typical day when you are drinking?
    - Response categories: “1 or 2”, “3 or 4”, “5 or 6”, “7 to 9”, or “10 or more”
- Question 3: How often do you have 5 or more drinks on one occasion?
    - Response categories: “never”, “less than monthly”, “monthly”, “2-3 times”, or “4 or more times”

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp

#visualizing results
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline
import seaborn as sns
sns.set_context('poster')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans

import scipy.cluster.hierarchy as shc

In [None]:
#location of the CSV loaded by the following cell; this is an absolute, machine-specific path,
#so it has to be edited before the notebook is run on another computer
data_path = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/data_final_earliest.csv'

In [None]:
#load the CSV with data from all surveys and all visitseqs (its first column becomes the index)
data = pd.DataFrame(data=pd.read_csv(data_path, index_col=0))

#quick sanity check: dimensions, column summary and the first row
print('Original data shape:\n', data.shape, '\n')
print(data.info())
data.head(1)

In [None]:
#participants previously identified as outliers on auditc and QBlstExp (outlier = >3 SD from mean);
#drop all of their rows in a single pass
outlier_ids = ['C010', 'T080']
data = data[~data["TBIID"].isin(outlier_ids)]

### create data set

In [None]:
#clustering input: total AUDIT-C score plus the 3 individual questions from visit sequence 1,
#indexed by visit, group and participant id
data_first = data.loc[data['VisitSeq'] == 1]
id_cols = ['VisitSeq', 'Group', 'TBIID']
audit_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']
data_allgroups_auditqs = data_first[id_cols + audit_cols].set_index(id_cols)
print(data_allgroups_auditqs.shape)
#discard participants with a missing value in any of the four columns; print the shape before and after
data_allgroups_auditqs = data_allgroups_auditqs.dropna(axis=0)
print(data_allgroups_auditqs.shape)
data_allgroups_auditqs.head()

### scale

In [None]:
#the three individual AUDIT-C question scores are the clustering features
AUDIT_raw = data_allgroups_auditqs[['AUDIT1', 'AUDIT2', 'AUDIT3']]

#center and scale each question (RobustScaler() is the alternative scaler that can be swapped in here)
scaler = StandardScaler()
AUDIT_raw_scaled = scaler.fit(AUDIT_raw).transform(AUDIT_raw)

### determine optimal k cluster number

In [None]:
#pick cluster number based on silhouette coefficient (other internal metrics are plotted for comparison)
#NOTE: the scaled features get their own name here; rebinding `data` to this array would replace the
#DataFrame loaded at the top of the notebook and break any earlier cell that is re-run afterwards
features = AUDIT_raw_scaled
k_range = range(2,7)

base_scores = []
sil_scores = []
db_scores = []
ch_scores = []
mse_scores = []

for k in k_range:
    km_ss = KMeans(n_clusters=k, random_state=39)
    km_ss.fit(features)
    labels = km_ss.labels_

    #negated KMeans.score and inertia_ are both recorded, as in the original analysis
    base_scores.append(-km_ss.score(features))
    sil_scores.append(metrics.silhouette_score(features, labels))
    db_scores.append(metrics.davies_bouldin_score(features, labels))
    ch_scores.append(metrics.calinski_harabasz_score(features, labels))
    mse_scores.append(km_ss.inertia_)

#plot the results: one curve per metric against the number of clusters
metric_curves = [
    ('Base score', base_scores),
    ('Silhouette Coefficient', sil_scores),
    ('Davies Bouldin Score', db_scores),
    ('Calinski Harabasz Score', ch_scores),
    ('Inertia', mse_scores),
]
for metric_label, metric_values in metric_curves:
    plt.plot(k_range, metric_values)
    plt.title('AUDIT-C Questions K means clustering')
    plt.xlabel('Number of clusters')
    plt.ylabel(metric_label)
    plt.show()

#### Dendrogram to determine cluster number

In [None]:
#Ward-linkage dendrogram of the scaled answers, to check whether three clusters makes sense
ward_links = shc.linkage(AUDIT_raw_scaled, method='ward')

plt.figure(figsize=(10, 7))
plt.title("AUDIT-C 3 Question Dendrogram")
plt.ylabel("Distance (dissimilarity)")
plt.xlabel("Participants")
dend = shc.dendrogram(
    ward_links,
    distance_sort='ascending',
    show_leaf_counts=True,
    leaf_font_size=8,
)

In [None]:
#clustered heat map of the scaled AUDIT-C answers (Ward linkage on euclidean distances)
sns.clustermap(
    AUDIT_raw_scaled,
    method="ward",
    metric="euclidean",
    standard_scale=1,
    cmap="Blues",
)

#### kmeans determine stability

In [None]:
def kmeans_stability(data, k, baseline_seed=39, seeds=range(1, 99, 3)):
    """Score how much k-means labels change when only the random seed changes.

    A baseline ``KMeans(n_clusters=k, random_state=baseline_seed)`` is fit on
    ``data``. One further model is then fit on the same ``data`` for every
    entry of ``seeds``, and its labels are compared with the baseline labels.
    The data are never resampled; only the initialisation seed varies.

    Parameters
    ----------
    data : array-like
        Feature matrix passed unchanged to ``KMeans.fit``.
    k : int
        Number of clusters for every model.
    baseline_seed : int, default 39
        ``random_state`` of the reference clustering.
    seeds : iterable of int, default ``range(1, 99, 3)``
        ``random_state`` values of the models compared with the reference.

    Returns
    -------
    dict
        Keys ``'homogeneity_score'``, ``'completeness_score'``,
        ``'v_measure_score'``, ``'adjusted_rand_score'`` and
        ``'adjusted_mutual_info_score'``; each value is a list holding one
        score per seed, in the order the seeds were iterated.
    """
    #metric name -> sklearn function; every function is called as (baseline labels, seed labels)
    scorers = {
        'homogeneity_score': metrics.homogeneity_score,
        'completeness_score': metrics.completeness_score,
        'v_measure_score': metrics.v_measure_score,
        'adjusted_rand_score': metrics.adjusted_rand_score,
        'adjusted_mutual_info_score': metrics.adjusted_mutual_info_score,
    }

    #reference clustering that every other run is compared against
    baseline_labels = KMeans(n_clusters=k, random_state=baseline_seed).fit(data).labels_

    scores = {name: [] for name in scorers}
    for seed in seeds:
        seed_labels = KMeans(n_clusters=k, random_state=seed).fit(data).labels_
        for name, scorer in scorers.items():
            scores[name].append(scorer(baseline_labels, seed_labels))

    return scores

In [None]:
#average every agreement metric over the seeds, once per candidate k
scores_k2, scores_k3, scores_k4, scores_k5 = (
    pd.DataFrame.from_dict(kmeans_stability(AUDIT_raw_scaled, n_clusters)).mean()
    for n_clusters in (2, 3, 4, 5)
)

#one row per k, one column per metric
pd.DataFrame(
    data=[scores_k2, scores_k3, scores_k4, scores_k5],
    index=['k=2', 'k=3', 'k=4', 'k=5'],
)

### Kmeans clustering on AUDIT_raw_scaled with k=3

In [None]:
#final model: k=3 clusters fit on the scaled AUDIT-C questions
km_3 = KMeans(n_clusters=3,random_state=99).fit(AUDIT_raw_scaled)

#new df for cluster info: the AUDIT table with its index levels turned back into columns,
#plus one 'cluster_<n>' label per row (rows are in the same order as the scaled features)
data_allgroups_auditqs_clusters = data_allgroups_auditqs.reset_index()
data_allgroups_auditqs_clusters['kmeans_cluster'] = [
    "cluster_{}".format(label) for label in km_3.labels_
]
print(data_allgroups_auditqs_clusters.shape)
data_allgroups_auditqs_clusters.head(1)

In [None]:
#attach the cluster assignment to every visit-1 record by joining on TBIID (outer join)
cluster_lookup = data_allgroups_auditqs_clusters[['TBIID', 'kmeans_cluster']]
data_clusters = data_first.merge(cluster_lookup, on=['TBIID'], how='outer')

#size of the merged table and number of participants per cluster
print(data_clusters.shape)
print(data_clusters['kmeans_cluster'].value_counts())
data_clusters.head(1)

In [None]:
#viz cluster grouping and auditc answers - seems to be low(1), high freq(2), and binge clusters(0)
#NOTE(review): the rename applied in a later cell maps cluster_0->'Often', cluster_1->'Heavy', cluster_2->'Low',
#which does not match the description above - confirm which labelling is current
auditc_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

#the visit-1 subset is the same for every plot, so select it once
first_visit_rows = data_clusters[data_clusters['VisitSeq'] == 1]

for variable in auditc_cols:
    fig = plt.figure(figsize=(7,7))
    try:
        sns.barplot(x='kmeans_cluster', y=variable, data=first_visit_rows, ci=68, palette="rocket", order=["cluster_0", "cluster_1", "cluster_2"])
    except Exception as err:
        #stay best-effort (keep plotting the remaining variables) but report the failure instead of
        #hiding it, and discard the empty figure; KeyboardInterrupt is no longer swallowed
        plt.close(fig)
        print('Could not plot', variable, '-', repr(err))
    else:
        plt.show()

In [None]:
#give the k-means labels descriptive names for easier viz understanding
cluster_names = {'cluster_0':'Often', 'cluster_1':'Heavy','cluster_2':'Low',}
data_clusters.replace({'kmeans_cluster': cluster_names}, inplace=True)

#number of visit-1 participants (non-missing TBIID) in each cluster
first_visit = data_clusters[data_clusters['VisitSeq'] == 1]
first_visit.groupby(['kmeans_cluster'])['TBIID'].count().reset_index(name='count').sort_values(['kmeans_cluster'])

In [None]:
#fraction of each group's clustered visit-1 participants that falls in each cluster
clusters_by_group = data_clusters[data_clusters['VisitSeq'] == 1].groupby(['Group'])['kmeans_cluster']
cluster_share = clusters_by_group.value_counts() / clusters_by_group.count()
cluster_share.reset_index(name='percent').sort_values(['kmeans_cluster', 'Group'])

In [None]:
#two-dimensional t-SNE embedding of the scaled AUDIT-C questions
tsne = TSNE(n_components=2, perplexity=50, random_state=1234)
tsne_features = tsne.fit_transform(AUDIT_raw_scaled)
print(tsne_features.shape)

#put the two embedding coordinates next to the cluster table (same row index) and apply the descriptive cluster names
tsne_df = pd.DataFrame(
    data=tsne_features,
    columns=['tsne_0', 'tsne_1'],
    index=data_allgroups_auditqs_clusters.index,
)
data_allgroups_auditqs_clusters_TSNE = pd.concat(
    [data_allgroups_auditqs_clusters, tsne_df], axis=1
).replace({'kmeans_cluster': {'cluster_0':'Often', 'cluster_1':'Heavy','cluster_2':'Low',}})
data_allgroups_auditqs_clusters_TSNE.head()

In [None]:
#scatter of the t-SNE coordinates coloured by k-means cluster, legend placed outside the axes
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(
    x='tsne_0',
    y='tsne_1',
    hue='kmeans_cluster',
    data=data_allgroups_auditqs_clusters_TSNE,
    ax=ax,
)
ax.legend(loc='center left', bbox_to_anchor=(1, .95))

### Save cluster data

In [None]:
#preview the first rows of the merged cluster table before it is written to disk
data_clusters.head()

In [None]:
#keep visit 1 rows only, then write the table (index included) to data_clusters.csv
is_first_visit = data_clusters['VisitSeq'] == 1
data_clusters = data_clusters.loc[is_first_visit]
data_clusters.to_csv('data_clusters.csv')

In [None]:
#AUDIT_cluster_counts is only assigned in the final cell, so referencing it here raises a NameError when
#the notebook is run top to bottom; build the same group-by-cluster count table directly for display
data_clusters.groupby('Group')['kmeans_cluster'].value_counts().reset_index(name='count').sort_values(['Group', 'kmeans_cluster'])

In [None]:
#get AUDIT counts and percents for chi squared analysis and viz
def _group_counts(column):
    """Tally each value of `column` within each Group, sorted by Group then value."""
    tallies = data_allgroups_auditqs_clusters.groupby('Group')[column].value_counts()
    return tallies.reset_index(name='count').sort_values(['Group', column])


def _group_perc(column):
    """Per-Group value counts of `column` divided by that Group's number of non-missing answers."""
    answers = data_allgroups_auditqs_clusters.groupby('Group')[column]
    share = answers.value_counts() / answers.count()
    return share.reset_index(name='perc').sort_values(['Group', column])


#counts per group: total AUDIT-C score, then each individual question
AUDIT_tot_counts = _group_counts('auditc')
AUDIT_1_counts = _group_counts('AUDIT1')
AUDIT_2_counts = _group_counts('AUDIT2')
AUDIT_3_counts = _group_counts('AUDIT3')
for csv_name, table in (
    ('AUDIT_tot_counts.csv', AUDIT_tot_counts),
    ('AUDIT_1_counts.csv', AUDIT_1_counts),
    ('AUDIT_2_counts.csv', AUDIT_2_counts),
    ('AUDIT_3_counts.csv', AUDIT_3_counts),
):
    table.to_csv(csv_name)

#same tables expressed as within-group fractions
AUDIT_tot_perc = _group_perc('auditc')
AUDIT_1_perc = _group_perc('AUDIT1')
AUDIT_2_perc = _group_perc('AUDIT2')
AUDIT_3_perc = _group_perc('AUDIT3')
for csv_name, table in (
    ('AUDIT_tot_perc.csv', AUDIT_tot_perc),
    ('AUDIT_1_perc.csv', AUDIT_1_perc),
    ('AUDIT_2_perc.csv', AUDIT_2_perc),
    ('AUDIT_3_perc.csv', AUDIT_3_perc),
):
    table.to_csv(csv_name)

In [None]:
#get value counts for AUDIT-C for each cluster (not split by group)
def _cluster_counts(column):
    """Tally each value of `column` within each k-means cluster, sorted by cluster then value."""
    tallies = data_clusters.groupby(['kmeans_cluster'])[column].value_counts()
    return tallies.reset_index(name='count').sort_values(['kmeans_cluster', column])


def _cluster_perc(column):
    """Per-cluster value counts of `column` divided by that cluster's number of non-missing answers."""
    answers = data_clusters.groupby(['kmeans_cluster'])[column]
    share = answers.value_counts() / answers.count()
    return share.reset_index(name='perc').sort_values(['kmeans_cluster', column])


AUDIT_tot_counts_cluster = _cluster_counts('auditc')
AUDIT_1_counts_cluster = _cluster_counts('AUDIT1')
AUDIT_2_counts_cluster = _cluster_counts('AUDIT2')
AUDIT_3_counts_cluster = _cluster_counts('AUDIT3')
for csv_name, table in (
    ('AUDIT_tot_counts_cluster.csv', AUDIT_tot_counts_cluster),
    ('AUDIT_1_counts_cluster.csv', AUDIT_1_counts_cluster),
    ('AUDIT_2_counts_cluster.csv', AUDIT_2_counts_cluster),
    ('AUDIT_3_counts_cluster.csv', AUDIT_3_counts_cluster),
):
    table.to_csv(csv_name)

#same tables expressed as within-cluster fractions
AUDIT_tot_perc_cluster = _cluster_perc('auditc')
AUDIT_1_perc_cluster = _cluster_perc('AUDIT1')
AUDIT_2_perc_cluster = _cluster_perc('AUDIT2')
AUDIT_3_perc_cluster = _cluster_perc('AUDIT3')
for csv_name, table in (
    ('AUDIT_tot_perc_cluster.csv', AUDIT_tot_perc_cluster),
    ('AUDIT_1_perc_cluster.csv', AUDIT_1_perc_cluster),
    ('AUDIT_2_perc_cluster.csv', AUDIT_2_perc_cluster),
    ('AUDIT_3_perc_cluster.csv', AUDIT_3_perc_cluster),
):
    table.to_csv(csv_name)

#get value counts by group for each cluster, as counts and as within-group fractions
clusters_per_group = data_clusters.groupby('Group')['kmeans_cluster']
AUDIT_cluster_counts = clusters_per_group.value_counts().reset_index(name='count').sort_values(['Group', 'kmeans_cluster'])
AUDIT_cluster_perc = (clusters_per_group.value_counts() / clusters_per_group.count()).reset_index(name='perc').sort_values(['Group', 'kmeans_cluster'])
AUDIT_cluster_counts.to_csv('AUDIT_cluster_counts.csv')
AUDIT_cluster_perc.to_csv('AUDIT_cluster_perc.csv')