In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import scipy as sp

#visualizing results
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline
import seaborn as sns
# Use seaborn's 'poster' plotting context for every figure in the notebook.
sns.set_context('poster')

# Raise pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE(review): this silences every warning for the whole session, deprecation warnings included.
import warnings; warnings.simplefilter('ignore')
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn_extra.cluster import KMedoids
from kmodes.kmodes import KModes

import scipy.cluster.hierarchy as shc

from minisom import MiniSom

In [None]:
# Location of the first-visit export. Set the PESKIND_TBI_DATA environment variable to
# run the notebook on another machine; without it the original workstation path is used.
data_path = os.environ.get(
    'PESKIND_TBI_DATA',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/PeskindTBI/final_data/first_visits_short.csv',
)

In [None]:
#read in csv containing data from all surveys and all visitseqs
# read_csv already returns a DataFrame, so it is used directly without re-wrapping
data = pd.read_csv(data_path, index_col=0)

print('Original data shape:\n', data.shape, '\n')
# info() prints its own summary and returns None, so it is not wrapped in print()
data.info()
data.head(1)

In [None]:
# Drop the participants identified as outliers on auditc and QBlstExp
# (outlier = more than 3 SD from the mean).
outlier_ids = ['C010', 'T080']
data = data[~data["TBIID"].isin(outlier_ids)]

## AUDIT-C clustering with C and T from first visit sequence

- AUDIT-C answers are categorical, so they do not readily fit common clustering methods, most of which assume continuous data
    - although some argue that, because AUDIT-C responses refer to numbers (number of drinks etc.), they can be treated as numerical
- will compare three ways of expressing AUDIT-C data
    - raw scores
    - one hot encoded
    - transformed into continuous data by taking the value or the mean of the value of the response categories (Letourneau 2018)
    
    
- Question 1: How often do you have a drink containing alcohol?
    - Response categories: never, monthly or less, 2 to 4 times/month, 2 to 3 times/week, 4 or more times/week
- Question 2: How many drinks do you have on a typical day when you are drinking?
    - Response categories: “1 or 2”, “3 or 4”, “5 or 6”, “7 to 9”, or “10 or more”
- Question 3: How often do you have 5 or more drinks on one occasion?
    - Response categories: “never”, “less than monthly”, “monthly”, “2-3 times”, or “4 or more times”

In [None]:
# AUDIT-C total and the three item scores from visit sequence 1, indexed by
# visit / group / participant so that only the scores remain as columns.
id_cols = ['VisitSeq', 'Group', 'TBIID']
score_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

data_first = data.loc[data['VisitSeq'] == 1]
data_allgroups_auditqs = data_first[id_cols + score_cols].set_index(id_cols)
print(data_allgroups_auditqs.shape)
# drop participants with any missing AUDIT-C value; shape is printed before and after
data_allgroups_auditqs = data_allgroups_auditqs.dropna(axis=0)
print(data_allgroups_auditqs.shape)
data_allgroups_auditqs.head()

In [None]:
# Recode each item's response code as a number on a continuous scale
# (the approach the markdown above attributes to Letourneau 2018).
transf_maps = {
    'AUDIT1': {0: 0, 1: 0.5, 2: 3, 3: 10, 4: 16},
    'AUDIT2': {0: 1.5, 1: 3.5, 2: 5.5, 3: 8, 4: 10},
    'AUDIT3': {0: 0, 1: 0.5, 2: 1, 3: 2.5, 4: 4},
}
for question, recode in transf_maps.items():
    data_allgroups_auditqs[question + '_transf'] = data_allgroups_auditqs[question].replace(recode)

data_allgroups_auditqs.head()

In [None]:
# One indicator column per observed response level of each AUDIT-C question
dummy_AUDIT1, dummy_AUDIT2, dummy_AUDIT3 = (
    pd.get_dummies(data_allgroups_auditqs[question], prefix=question, prefix_sep='_')
    for question in ('AUDIT1', 'AUDIT2', 'AUDIT3')
)

# append the indicator columns to the existing score columns
data_allgroups_auditqs = pd.concat(
    [data_allgroups_auditqs, dummy_AUDIT1, dummy_AUDIT2, dummy_AUDIT3],
    axis=1,
)
data_allgroups_auditqs.head()

In [None]:
#make three data sets: raw item scores, transformed scores, one-hot indicators
questions = ['AUDIT1', 'AUDIT2', 'AUDIT3']
AUDIT_raw = data_allgroups_auditqs[questions]
AUDIT_transf = data_allgroups_auditqs[[q + '_transf' for q in questions]]
# indicator columns are selected by name ('AUDIT1_0.0' ... 'AUDIT3_4.0');
# NOTE(review): this assumes every level 0-4 occurs for every question — confirm
onehot_cols = [f'{q}_{level}.0' for q in questions for level in range(5)]
AUDIT_onehot = data_allgroups_auditqs[onehot_cols]

#create dic to save cluster evaluation metrics
cluster_dic = {}

# center and scale the data (the one scaler object is refit for each data set in turn)
scaler = StandardScaler()

AUDIT_raw_scaled, AUDIT_transf_scaled, AUDIT_onehot_scaled = (
    scaler.fit_transform(frame) for frame in (AUDIT_raw, AUDIT_transf, AUDIT_onehot)
)

#### Kmeans

In [None]:
#pick cluster number based on silhouette coefficient
k_range = range(2,10)
sil_scores = []
mse_scores = []  # stores KMeans inertia_, which is plotted below under the label 'SSE'
for n_clusters in k_range:
    km_ss = KMeans(n_clusters=n_clusters, random_state=39).fit(AUDIT_transf_scaled)
    sil_scores.append(silhouette_score(AUDIT_transf_scaled, km_ss.labels_))
    mse_scores.append(km_ss.inertia_)

# plot the results: one figure per metric, silhouette first
for metric_values, metric_label in ((sil_scores, 'Silhouette Coefficient'), (mse_scores, 'SSE')):
    plt.plot(k_range, metric_values)
    plt.title('AUDIT-C Questions K means clustering')
    plt.xlabel('Number of clusters')
    plt.ylabel(metric_label)
    plt.show()

#### K-medoids

In [None]:
#elbow plot: K-medoids inertia on the raw item scores for each candidate cluster number
k_range = range(2,10)
scores = []
for k in k_range:
    kmed_ss = KMedoids(n_clusters=k, random_state=39)
    # only the fitted model's inertia_ is needed, so fit() is used and no labels are requested
    kmed_ss.fit(AUDIT_raw)
    scores.append(kmed_ss.inertia_)

# plot the results
plt.plot(k_range, scores)
plt.title('AUDIT-C Questions K medoids clustering')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

#### Kmodes

In [None]:
#pick cluster number based on silhouette coefficient
k_range = range(2,10)
scores = []
for k in k_range:
    kmode_ss = KModes(n_clusters=k, random_state=39)
    kmode_ss.fit(AUDIT_raw)
    # K-modes groups participants by mismatching answers, so the partition is scored
    # with Hamming distance (share of items that differ) instead of silhouette_score's
    # Euclidean default, which would treat the answer codes as continuous.
    scores.append(silhouette_score(AUDIT_raw, kmode_ss.labels_, metric='hamming'))

# plot the results
plt.plot(k_range, scores)
plt.title('AUDIT-C Questions K Modes clustering')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.show()

In [None]:
# Turn the saved cluster-evaluation metrics into a table with one row per dictionary key.
# NOTE(review): no visible cell adds entries to cluster_dic (the only write is the
# commented-out line below), so this table is presumably empty — confirm.
#cluster_dic['kmode_onehot_silcoef'] = scores[1:6]
cluster_params_df = pd.DataFrame.from_dict(cluster_dic, orient='index')
cluster_params_df

### Focus on k-means with the transformed, scaled data set

In [None]:
#viz dendrogram to find if three clusters makes sense
# Ward linkage on the scaled, transformed item scores
ward_linkage = shc.linkage(AUDIT_transf_scaled, method='ward')

plt.figure(figsize=(10, 7))
plt.title("AUDIT-C 3 Question Dendrogram")
plt.xlabel("Participants")
plt.ylabel("Distance (dissimilarity)")
dend = shc.dendrogram(
    ward_linkage,
    distance_sort='ascending',
    show_leaf_counts=True,
    leaf_font_size=8,
)

In [None]:
#viz clustering with heat map of AUDIT-C answers
# standard_scale=1 additionally rescales each column before drawing
clustermap_kwargs = dict(method="ward", metric="euclidean", standard_scale=1, cmap="Blues")
sns.clustermap(AUDIT_transf_scaled, **clustermap_kwargs)

In [None]:
#choose k=4 clusters and fit data
km_4 = KMeans(n_clusters=4,random_state=10).fit(AUDIT_transf_scaled)

#new df for cluster info: index levels become columns again, plus one label per participant
data_allgroups_auditqs_clusters = data_allgroups_auditqs.reset_index()
data_allgroups_auditqs_clusters['kmeans_cluster'] = [f"cluster_{label}" for label in km_4.labels_]
data_allgroups_auditqs_clusters.head(1)

In [None]:
#merge with orig df so each TBIID entry has its corr cluster assignment added
cluster_lookup = data_allgroups_auditqs_clusters[['TBIID', 'kmeans_cluster']]
data_clusters = data.merge(cluster_lookup, on=['TBIID'], how='outer')
print(data_clusters.shape)
print(data_clusters['VisitSeq'].value_counts())
data_clusters.head(1)

In [None]:
#viz cluster grouping and auditc answers (visit 1 only), one bar plot per AUDIT-C variable
# NOTE(review): an earlier note here read the clusters as low(1), high freq(2) and binge(0);
# that does not match the names assigned by the rename cell that follows — confirm the
# interpretation against these plots.
auditc_cols = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']
cluster_order = ["cluster_0", "cluster_1", "cluster_2", "cluster_3"]
first_visit = data_clusters[data_clusters['VisitSeq'] == 1]
for variable in auditc_cols:
    plt.figure(figsize=(7,7))
    try:
        g = sns.barplot(x='kmeans_cluster', y=variable, data=first_visit, ci=68, palette="rocket", order=cluster_order)
    except Exception as err:
        # report the failure instead of silently skipping the variable
        print(f'Could not plot {variable}: {err!r}')
        plt.close()
        continue
    plt.show()

In [None]:
#rename clusters for easier viz understanding
cluster_names = {'cluster_0':'low', 'cluster_1':'heavy', 'cluster_2':'freq', 'cluster_3':'binge'}
data_clusters['kmeans_cluster'] = data_clusters['kmeans_cluster'].replace(cluster_names)

#look at counts in each cluster for each group (is there a difference in cluster patterns between groups?)
first_visit_counts = (
    data_clusters[data_clusters['VisitSeq'] == 1]
    .groupby(['Group'])['kmeans_cluster']
    .value_counts()
    .reset_index(name='count')
    .sort_values(['Group', 'kmeans_cluster'])
)
first_visit_counts

In [None]:
# 2-D t-SNE embedding of the scaled, transformed AUDIT-C items, labelled by k-means cluster
tsne = TSNE(n_components=2, perplexity=50, random_state=1234)
tsne_features = tsne.fit_transform(AUDIT_transf_scaled)

print(tsne_features.shape)
tsne_df = pd.DataFrame(data = tsne_features, columns = ['tsne_0', 'tsne_1'], index = data_allgroups_auditqs_clusters.index)
data_allgroups_auditqs_clusters_TSNE = pd.concat([data_allgroups_auditqs_clusters, tsne_df], axis = 1)
# Name the clusters exactly as data_clusters does (cluster_0 -> low, cluster_1 -> heavy,
# cluster_2 -> freq, cluster_3 -> binge) so this plot's legend agrees with the other
# figures and exported tables.
data_allgroups_auditqs_clusters_TSNE['kmeans_cluster'] = data_allgroups_auditqs_clusters_TSNE['kmeans_cluster'].replace(
    {'cluster_0':'low', 'cluster_1':'heavy', 'cluster_2':'freq', 'cluster_3':'binge'}
)
data_allgroups_auditqs_clusters_TSNE.head()

In [None]:
# t-SNE coordinates coloured by cluster; legend is anchored outside the axes on the right
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=data_allgroups_auditqs_clusters_TSNE, x='tsne_0', y='tsne_1', hue='kmeans_cluster', ax=ax)
ax.legend(loc='center left', bbox_to_anchor=(1, .95))

### Try SOM

In [None]:
# visit-1 rows only, then the number of rows in each named cluster
is_first_visit = data_clusters['VisitSeq'] == 1
data_clusters_v1 = data_clusters[is_first_visit]
data_clusters_v1['kmeans_cluster'].value_counts()

In [None]:
# Participant IDs, taken from the frame that the clustering inputs were built from
participants = data_allgroups_auditqs.reset_index(['TBIID'])['TBIID'].values
categories = data_clusters_v1['kmeans_cluster']

# one colour per named drinking cluster
category_color = {'low': 'darkgreen',
                  'freq': 'limegreen',
                  'heavy': 'darkorange',
                  'binge': 'crimson'}

# Look each participant's cluster up by TBIID instead of pairing the two sequences by
# position: data_clusters_v1 comes out of an outer merge, so its row order and length
# are not guaranteed to match `participants`.
cluster_by_participant = dict(zip(data_clusters_v1['TBIID'], categories))
colors_dict = {p: category_color[cluster_by_participant[p]] for p in participants}


In [None]:
# Train a size x size self-organising map on the scaled, transformed AUDIT-C items
X = AUDIT_transf_scaled
size = 15
n_features = X.shape[1]
som = MiniSom(size, size, n_features,
              sigma=1.5, neighborhood_function='gaussian',
              random_seed=39)

# initialise the weights from the data's principal components, then train on random samples
som.pca_weights_init(X)
som.train_random(X, 5000, verbose=True)

In [None]:
# Write every participant ID at the map position it was assigned to, coloured by cluster
som_map = som.labels_map(X, participants)

plt.figure(figsize=(10, 5))
for (col, row), members in som_map.items():
    members = list(members)
    x = col + .1
    y = row - .3
    # spread the IDs that share a position vertically so they do not overprint
    for rank, participant in enumerate(members, start=1):
        off_set = rank/len(members) - 0.05
        plt.text(x, y+off_set, participant, color=colors_dict[participant], fontsize=10)
# faint distance map as background
plt.pcolor(som.distance_map().T, cmap='gray_r', alpha=.2)
ticks = np.arange(size+1)
plt.xticks(ticks)
plt.yticks(ticks)
plt.grid()

legend_elements = [
    Patch(facecolor=colour, edgecolor='w', label=cluster_name)
    for cluster_name, colour in category_color.items()
]
plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, .95))
plt.show()

In [None]:
# One heat map per input feature showing that feature's weight at every map position
W = som.get_weights()
feature_names = ['AUDIT1_transf', 'AUDIT2_transf','AUDIT3_transf']
ticks = np.arange(size+1)

plt.figure(figsize=(10, 10))
for idx, name in enumerate(feature_names):
    plt.subplot(3, 3, idx+1)
    plt.title(name)
    plt.pcolor(W[:,:,idx].T, cmap='coolwarm')
    plt.xticks(ticks)
    plt.yticks(ticks)
plt.tight_layout()
plt.show()

In [None]:
# For every map position, draw a square coloured by the feature with the largest weight.
# W (from som.get_weights()) supplies the grid shape, so no private attribute is read.
n_rows, n_cols = W.shape[:2]
plt.figure(figsize=(8, 8))
for i in range(n_rows):
    for j in range(n_cols):
        feature = np.argmax(W[i, j, :])
        # the marker is given once (square), with no connecting line
        plt.plot([j+.5], [i+.5], marker='s', linestyle='None',
                 color='C'+str(feature), markersize=24)

legend_elements = [Patch(facecolor='C'+str(i),
                         edgecolor='w',
                         label=f) for i, f in enumerate(feature_names)]

plt.legend(handles=legend_elements,
           loc='center left',
           bbox_to_anchor=(1, .95))

plt.xlim([0, size])
plt.ylim([0, size])
plt.show()

### TBI-only additional analysis and viz

In [None]:
#select just TBI participants and exclude visit '88' for add'l analysis
visit_seq = [1,2]
is_tbi = data_clusters['Group'] == 'T'
in_kept_visits = data_clusters['VisitSeq'].isin(visit_seq)
data_clusters_TBI = data_clusters[is_tbi & in_kept_visits]
data_clusters_TBI.to_csv('data_clusters_TBI.csv')

In [None]:
#viz for first visit seq: mean of every continuous variable per drinking cluster (TBI only)

data_viz = data_clusters_TBI[data_clusters_TBI['VisitSeq'] == 1]

# 'auditc' is listed once so that it is not plotted twice
cont_vars = ['servconn', 'cestotal', 'MnthSncBlst', 'QKOIorA', 'QKOExpMil', 'QKOAllMil', 'QKOLife',
       'QBlstExp', 'QBEACRM', 'Q5plus2', 'QEDist_sum', 'QEDist_mean',
       'QEDist_min', 'auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3', 'DOPA', 'DA',
       'DOPAC', 'NE', 'DHPG', 'da_dopa_ratio', 'dopac_da_ratio',
       'ne_dopa_ratio', 'dhpg_ne_ratio', 
       'BMI', 'BPSYS', 'BPDIAS', 'HRATE', 'BGlucose', 'BNa', 'BUN', 'BCreat', 'BOsmo',
       'UOsmo', 'USG', 'UNa', 'BK', 'TotalChol', 'LDL', 'HDL', 'Trig', 'CSFPROTEIN_x',
       'CSFGLUCOSE', 'CSFRBCS',  
       'CAPSTotal', 'PSQIc1', 'PSQIc2', 'PSQIc3', 'PSQIc4', 'PSQIc5',
       'PSQIc6', 'PSQIc7', 'PSQItot', 'PCLTot', 'PCL_reexp', 'PCL_avoid',
       'PCL_numb', 'PCL_hyper', 'PHQTot', 'PHQ_psych', 'PHQ_somatic',
       'NSITot', 'TBITot', 'NSI_vestibular', 'NSI_somatosensory',
       'NSI_cognitive', 'NSI_affective', 'NSI_ERP_affective',
       'NIS_ERP_vestsom', 
            'Amygdala_l', 'Amygdala_r',
       'Pallidum_l', 'Pallidum_r', 'Midbrain',
            'Plasma1_bFGF',
       'Plasma1_CRP', 'Plasma1_Eotaxin', 'Plasma1_Eotaxin3',
       'Plasma1_Flt1', 'Plasma1_ICAM1', 'Plasma1_IFNγ', 'Plasma1_IL10',
       'Plasma1_IL12_IL23p40', 'Plasma1_IL12p70', 'Plasma1_IL15',
       'Plasma1_IL16', 'Plasma1_IL17A', 'Plasma1_IL1α', 'Plasma1_IL6',
       'Plasma1_IL7', 'Plasma1_IL8', 'Plasma1_IP10', 'Plasma1_MCP1',
       'Plasma1_MCP4', 'Plasma1_MDC', 'Plasma1_MIP1α', 'Plasma1_MIP1β',
       'Plasma1_PlGF', 'Plasma1_SAA', 'Plasma1_TARC', 'Plasma1_Tie2',
       'Plasma1_TNFα', 'Plasma1_TNFβ', 'Plasma1_VCAM1', 'Plasma1_VEGF',
       'Plasma1_VEGFC', 'Plasma1_VEGFD']
    
for param in cont_vars:
    print(param)

    try:
        g = sns.catplot(x='kmeans_cluster', y=param, kind='bar', data=data_viz, ci=68, height=5, aspect=4)
    except Exception as err:
        # keep going, but say which variable could not be plotted and why
        # (e.g. a column name that does not exist in the data)
        print(f'  skipped {param}: {err!r}\n')
        continue
    plt.show()

    print('\n')

In [None]:
#viz for first visit seq: share of each response value within every drinking cluster (TBI only)

data_cat = ['Status_x', 'VisitSeq', 'GType', 'Race', 'Hispanic', 'Handedness', 'Marital', 'APOEGen', 
           'HYPERTEN', 'HYPERCHO', 'DIABETES', 'B12DEF', 'THYROID',
           'MHxPain', 'MHxHA', 'MHxHtn', 'MHxCard', 'MHxGI', 'MHxNeuro', 'MHxLung', 'MHxApnea', 
            'SCPTSD', 'SCMDD', 'SCPD', 'SCGAD', 'SCNone', 'capsCrtA']

data_audit = ['auditc', 'AUDIT1', 'AUDIT2', 'AUDIT3']

data_viz = data_clusters_TBI[data_clusters_TBI['VisitSeq'] == 1]


def _plot_cluster_shares(frame, param, exclude=None):
    """Bar-plot, for each k-means cluster, the fraction of rows taking each value of `param`.

    Parameters
    ----------
    frame : DataFrame with a 'kmeans_cluster' column and a `param` column.
    param : name of the column whose value distribution is plotted.
    exclude : optional value of `param` whose rows are dropped before counting.

    Any failure (for example a missing column) is reported and the variable is
    skipped, so one bad column does not stop the surrounding loop.
    """
    try:
        rows = frame if exclude is None else frame[frame[param] != exclude]
        # fraction of each value within its cluster (missing values are not counted)
        data_int = (rows.groupby('kmeans_cluster')[param]
                        .value_counts(normalize=True)
                        .reset_index(name='perc'))
        g = sns.catplot(x=param, y='perc', kind='bar', data=data_int, hue='kmeans_cluster', ci=68, height=5, aspect=4)
    except Exception as err:
        print(f'  skipped {param}: {err!r}\n')
        return
    plt.show()

    print('\n')


for param in data_cat:
    print(param)
    # rows coded 9 are left out; NOTE(review): presumably 9 is a missing/unknown code — confirm
    _plot_cluster_shares(data_viz, param, exclude=9)

for param in data_audit:
    print(param)
    _plot_cluster_shares(data_viz, param)

In [None]:
#does drinking cluster affect follow-up visits
# rows per (visit, cluster) as a table, then the same counts as a bar chart
visit_cluster_counts = data_clusters_TBI.groupby(['VisitSeq', 'kmeans_cluster'])['kmeans_cluster'].count()
print(visit_cluster_counts)
sns.countplot(data=data_clusters_TBI, x='VisitSeq', hue='kmeans_cluster', palette="rocket")

In [None]:
#get TBIIDs of participants with a second visit and use to filter
has_second_visit = data_clusters_TBI['VisitSeq'] == 2
TBIID_v2 = data_clusters_TBI.loc[has_second_visit, 'TBIID'].values

#create new df with only participants who came to both visits 1 and 2
returned_mask = data_clusters_TBI['TBIID'].isin(TBIID_v2)
data_clusters_TBI_v12_only = data_clusters_TBI[returned_mask]
print(data_clusters_TBI_v12_only.shape)
data_clusters_TBI_v12_only.head()

In [None]:
#viz by visit (1 vs 2) and cluster, restricted to participants who have a second visit

data_viz = data_clusters_TBI_v12_only

for param in cont_vars:
    print(param)

    try:
        g = sns.catplot(x='VisitSeq', y=param, kind='bar', hue='kmeans_cluster', data=data_viz, ci=68, height=5, aspect=4)
    except Exception as err:
        # keep going, but say which variable could not be plotted and why
        print(f'  skipped {param}: {err!r}\n')
        continue
    plt.show()

    print('\n')

In [None]:
#select only visit 1 (all groups) and save it with the cluster assignments
visit1_mask = data_clusters['VisitSeq'] == 1
data_clusters_visit1 = data_clusters.loc[visit1_mask]
data_clusters_visit1.to_csv('data_auditc_clustering_visit1.csv')

In [None]:
#get AUDIT counts and percents for chi squared analysis and viz
def _export_group_distribution(column, stem):
    """Tabulate `column` within each Group at visit 1 and save both tables.

    Writes '<stem>_counts.csv' (rows per value) and '<stem>_perc.csv' (those counts
    divided by the group's number of non-missing values, i.e. a fraction between
    0 and 1) and returns the pair (counts, perc).
    """
    by_group = data_clusters_visit1.groupby('Group')[column]
    counts = by_group.value_counts()
    perc = counts / by_group.count()
    counts.to_csv(f'{stem}_counts.csv')
    perc.to_csv(f'{stem}_perc.csv')
    return counts, perc


AUDIT_tot_counts, AUDIT_tot_perc = _export_group_distribution('auditc', 'AUDIT_tot')
AUDIT_1_counts, AUDIT_1_perc = _export_group_distribution('AUDIT1', 'AUDIT_1')
AUDIT_2_counts, AUDIT_2_perc = _export_group_distribution('AUDIT2', 'AUDIT_2')
AUDIT_3_counts, AUDIT_3_perc = _export_group_distribution('AUDIT3', 'AUDIT_3')

In [None]:
def _export_distribution(by, column, counts_csv, perc_csv):
    """Tabulate `column` within each `by` group at visit 1 and save both tables.

    `counts_csv` receives the number of rows per value; `perc_csv` receives those
    counts divided by the group's number of non-missing values (a fraction between
    0 and 1). Returns the pair (counts, perc).
    """
    grouped = data_clusters_visit1.groupby(by)[column]
    counts = grouped.value_counts()
    perc = counts / grouped.count()
    counts.to_csv(counts_csv)
    perc.to_csv(perc_csv)
    return counts, perc


#get value counts for AUDIT-C for each cluster (not split by group)
AUDIT_tot_counts_cluster, AUDIT_tot_perc_cluster = _export_distribution(
    ['kmeans_cluster'], 'auditc', 'AUDIT_tot_counts_cluster.csv', 'AUDIT_tot_perc_cluster.csv')
AUDIT_1_counts_cluster, AUDIT_1_perc_cluster = _export_distribution(
    ['kmeans_cluster'], 'AUDIT1', 'AUDIT_1_counts_cluster.csv', 'AUDIT_1_perc_cluster.csv')
AUDIT_2_counts_cluster, AUDIT_2_perc_cluster = _export_distribution(
    ['kmeans_cluster'], 'AUDIT2', 'AUDIT_2_counts_cluster.csv', 'AUDIT_2_perc_cluster.csv')
AUDIT_3_counts_cluster, AUDIT_3_perc_cluster = _export_distribution(
    ['kmeans_cluster'], 'AUDIT3', 'AUDIT_3_counts_cluster.csv', 'AUDIT_3_perc_cluster.csv')

#get value counts by group for each cluster
AUDIT_cluster_counts, AUDIT_cluster_perc = _export_distribution(
    'Group', 'kmeans_cluster', 'AUDIT_cluster_counts.csv', 'AUDIT_cluster_perc.csv')