Careless responder survey level clustering

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby
import datetime as dt
import scipy as sp
import math

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise the pandas display limits so long/wide frames are printed without truncation.
for _option, _value in (('display.max_rows', 500),
                        ('display.max_columns', 500),
                        ('display.width', 1000)):
    pd.set_option(_option, _value)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas and scikit-learn deprecation notices.
import warnings
warnings.simplefilter('ignore')

# Print numpy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

In [None]:
# Location of the pickled survey data (surveys_CR.pkl).
# The original absolute path remains the default; set the TILES_SURVEYS_CR_PATH
# environment variable to run the notebook on another machine without editing it.
data_path = os.environ.get(
    'TILES_SURVEYS_CR_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/surveys_CR.pkl',
)

In [None]:
# Read the pickled frame containing the data from all surveys.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
full_data = pd.read_pickle(data_path)
full_data = pd.DataFrame(data=full_data)
full_data = full_data.reset_index(drop=True)

print('Original data shape:\n', full_data.shape, '\n')
# ensure no replicate ID (212 participants in study)
print('Original data unique IDs:\n', full_data['ParticipantID'].unique().shape, '\n')
# same uniqueness check on the MitreID identifier
print('Original data unique IDs:\n', full_data['MitreID'].unique().shape, '\n')
# how much missing data is there?
print('Original data missing value counts:\n', full_data.isnull().sum(), '\n')
# what is the data type of each column?
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being passed to print() (which would print "None").
print('Original data data types:')
full_data.info()
print()

In [None]:
# Number of rows recorded for each value of 'survey_type'.
survey_type_counts = full_data['survey_type'].value_counts()
survey_type_counts

In [None]:
# Number of distinct 'wave_study_day' values (NaN counted as a value) -- should be 71.
full_data['wave_study_day'].nunique(dropna=False)

In [None]:
# Preview the first five rows of the loaded data.
full_data.head(n=5)

In [None]:
# Add 'time_to_complete_log', the natural log of 'time_to_complete', as a new feature.
full_data = full_data.assign(
    time_to_complete_log=np.log(full_data['time_to_complete'])
)

In [None]:
# Report, per survey type, how many rows have a negative log completion time,
# keep only the rows whose log time is strictly positive, then repeat the
# report on the filtered frame as a sanity check.
# NOTE(review): the `> 0` filter also removes rows whose log time is exactly 0
# or NaN, which the `< 0` report does not count -- confirm this is intended.
log_time = full_data['time_to_complete_log']
print(full_data.loc[log_time < 0, 'survey_type'].value_counts())
full_data = full_data.loc[log_time > 0]
print(full_data.loc[full_data['time_to_complete_log'] < 0, 'survey_type'].value_counts())

In [None]:
# Move the identifying columns into a MultiIndex; later cells filter and group
# on these index levels by name.
index_columns = ['MitreID', 'survey_type', 'wave_study_day', 'wave_study_date_bin']
full_data = full_data.set_index(index_columns)
full_data.head()

In [None]:
# Candidate careless-responder (CR) feature columns for the engage survey.
engage_CRs = [
    # mean-difference and completion-time columns
    'mean_diff_hinderance_vs_support',
    'time_to_complete_log',
    # longstring_* columns
    'longstring_count_engage',
    'longstring_answer_engage',
    'longstring_count_norm_engage',
    'longstring_mult_engage',
    # ls_* columns
    'ls_auc_engage',
    'ls_std_engage',
    'ls_skew_engage',
    'ls_kurt_engage',
    'ls_auc_seeded_engage',
    'ls_std_seeded_engage',
    'ls_skew_seeded_engage',
    'ls_kurt_seeded_engage',
    # context_* columns
    'context_homevsworking',
    'context_workvsactivities',
    'context_workvswork',
    'context_drivevsdrive',
]

# Candidate careless-responder (CR) feature columns for the psych flex survey.
psych_flex_CRs = [
    # longstring_* columns
    'longstring_count_pf',
    'longstring_answer_pf',
    'longstring_count_norm_pf',
    'longstring_mult_pf',
    # ls_* columns
    'ls_auc_pf',
    'ls_std_pf',
    'ls_skew_pf',
    'ls_kurt_pf',
    'ls_auc_seeded_pf',
    'ls_std_seeded_pf',
    'ls_skew_seeded_pf',
    'ls_kurt_seeded_pf',
    # completion time
    'time_to_complete_log',
]

# Candidate careless-responder (CR) feature columns for the jobs-at-work survey.
jobsatwork_CRs = [
    # mean_diff_* columns
    'mean_diff_ave',
    'mean_diff_affect_posneg',
    'mean_diff_nervous_anxiety',
    'mean_diff_irb_irb',
    'mean_diff_dalal_posneg',
    # normalised longstring counts
    'longstring_count_norm_ave',
    'longstring_count_norm_affect',
    'longstring_count_norm_irbd',
    'longstring_count_norm_dalal',
    # completion time
    'time_to_complete_log',
    # *_affect columns
    'longstring_count_affect',
    'longstring_answer_affect',
    'longstring_mult_affect',
    'ls_auc_affect',
    'ls_std_affect',
    'ls_skew_affect',
    'ls_kurt_affect',
    'ls_auc_seeded_affect',
    'ls_std_seeded_affect',
    'ls_skew_seeded_affect',
    'ls_kurt_seeded_affect',
    # *_irbd columns
    'longstring_count_irbd',
    'longstring_answer_irbd',
    'longstring_mult_irbd',
    'ls_auc_irbd',
    'ls_std_irbd',
    'ls_skew_irbd',
    'ls_kurt_irbd',
    'ls_auc_seeded_irbd',
    'ls_std_seeded_irbd',
    'ls_skew_seeded_irbd',
    'ls_kurt_seeded_irbd',
    # *_dalal columns
    'longstring_count_dalal',
    'longstring_answer_dalal',
    'longstring_mult_dalal',
    'ls_auc_dalal',
    'ls_std_dalal',
    'ls_skew_dalal',
    'ls_kurt_dalal',
    'ls_auc_seeded_dalal',
    'ls_std_seeded_dalal',
    'ls_skew_seeded_dalal',
    'ls_kurt_seeded_dalal',
    # context_* columns
    'context_homevsworking',
    'context_workvsactivities',
    'context_workvswork',
    'context_drivevsdrive',
]

# Candidate careless-responder (CR) feature columns for the jobs-not-at-work survey.
jobsnotatwork_CRs = [
    # mean_diff_* columns
    'mean_diff_ave',
    'mean_diff_affect_posneg',
    'mean_diff_nervous_anxiety',
    # *_affect columns
    'longstring_count_affect',
    'longstring_answer_affect',
    'longstring_count_norm_affect',
    'longstring_mult_affect',
    'ls_auc_affect',
    'ls_std_affect',
    'ls_skew_affect',
    'ls_kurt_affect',
    'ls_auc_seeded_affect',
    'ls_std_seeded_affect',
    'ls_skew_seeded_affect',
    'ls_kurt_seeded_affect',
    # completion time
    'time_to_complete_log',
    # context_* columns
    'context_homevsworking',
    'context_workvsactivities',
    'context_workvswork',
    'context_drivevsdrive',
]

# Candidate careless-responder (CR) feature columns for the health survey.
health_CRs = [
    # mean_diff_* columns
    'mean_diff_ave',
    'mean_diff_affect_posneg',
    'mean_diff_nervous_anxiety',
    # intake_check_* columns
    'intake_check_alc',
    'intake_check_nic',
    # completion time
    'time_to_complete_log',
    # *_affect columns
    'longstring_count_affect',
    'longstring_answer_affect',
    'longstring_count_norm_affect',
    'longstring_mult_affect',
    'ls_auc_affect',
    'ls_std_affect',
    'ls_skew_affect',
    'ls_kurt_affect',
    'ls_auc_seeded_affect',
    'ls_std_seeded_affect',
    'ls_skew_seeded_affect',
    'ls_kurt_seeded_affect',
    # context_* columns
    'context_homevsworking',
    'context_workvsactivities',
    'context_workvswork',
    'context_drivevsdrive',
]

# Candidate careless-responder (CR) feature columns for the personality survey.
# The order matters: a later cell slices columns by label range.
personality_CRs = [
    # mean_diff_* columns
    'mean_diff_ave',
    'mean_diff_affect_posneg',
    'mean_diff_nervous_anxiety',
    'mean_diff_extraversion',
    'mean_diff_agreeableness',
    'mean_diff_conscientiousness',
    'mean_diff_neuroticism',
    'mean_diff_openness',
    # normalised longstring counts
    'longstring_count_norm_ave',
    'longstring_count_norm_affect',
    'longstring_count_norm_personality',
    # completion time
    'time_to_complete_log',
    # *_affect columns
    'longstring_count_affect',
    'longstring_answer_affect',
    'longstring_mult_affect',
    'ls_auc_affect',
    'ls_std_affect',
    'ls_skew_affect',
    'ls_kurt_affect',
    'ls_auc_seeded_affect',
    'ls_std_seeded_affect',
    'ls_skew_seeded_affect',
    'ls_kurt_seeded_affect',
    # *_personality columns
    'longstring_count_personality',
    'longstring_answer_personality',
    'longstring_mult_personality',
    'ls_auc_personality',
    'ls_std_personality',
    'ls_skew_personality',
    'ls_kurt_personality',
    'ls_auc_seeded_personality',
    'ls_std_seeded_personality',
    'ls_skew_seeded_personality',
    'ls_kurt_seeded_personality',
    # context_* columns
    'context_homevsworking',
    'context_workvsactivities',
    'context_workvswork',
    'context_drivevsdrive',
]

## Clustering

### Personality

In [None]:
# Rows whose 'survey_type' index level is 'personality', restricted to the
# candidate personality features. .copy() makes this an independent frame so
# the columns added below are not written into a slice of full_data.
is_personality = full_data.index.get_level_values('survey_type') == 'personality'
personality = full_data.loc[is_personality, personality_CRs].copy()

# Average the trait-level mean differences into a single feature. The columns
# are listed explicitly so the result no longer depends on the ordering of
# personality_CRs.
# NOTE(review): 'mean_diff_openness' is excluded, exactly as the original label
# slice 'mean_diff_extraversion':'mean_diff_neuroticism' excluded it -- confirm
# whether openness was meant to be part of this mean.
pers_trait_cols = ['mean_diff_extraversion', 'mean_diff_agreeableness',
                   'mean_diff_conscientiousness', 'mean_diff_neuroticism']
personality['mean_diff_pers'] = personality[pers_trait_cols].mean(axis=1)

# Drop rows with any missing value; the shape is printed before and after.
print(personality.shape)
personality = personality.dropna(axis=0)
print(personality.shape)
personality.head(1)

In [None]:
# Coefficient of variation of every personality column, used to pick the
# features with the highest relative spread. The (column, value) pairs are
# sorted ascending, so the highest values are at the end of the list.
# Imported explicitly: `import scipy as sp` on its own does not load the
# scipy.stats submodule on older SciPy releases, so `sp.stats` only worked if
# another import had already pulled it in.
from scipy.stats import variation

personality_var = variation(personality, axis=0)
sorted_var = sorted(zip(personality.columns.values, personality_var), key=lambda x: x[1])
sorted_var

In [None]:
# One 50-bin histogram per candidate personality feature; the feature name is
# printed above each figure.
for feature in personality_CRs:
    print(feature)
    feature_values = personality[feature]
    feature_values.hist(bins=50)
    plt.show()

In [None]:
# Features used for the survey-level KMeans clustering of the personality survey.
personality_CRs_final = ['mean_diff_pers', 'time_to_complete_log',
                         'longstring_count_affect', 'longstring_mult_affect',
                         'longstring_count_personality', 'longstring_mult_personality']

# .copy() gives an independent frame, so adding a 'clusters' column to it later
# does not write into a slice of `personality`.
personality_features = personality[personality_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()

#full feature set
personality_scaled = scaler.fit_transform(personality_features)

#determine optimal k: silhouette score for k = 2..9
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(personality_scaled)
    scores.append(silhouette_score(personality_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Personality kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled personality features.
# n_init is pinned to 10 (the default before scikit-learn 1.4): the hand-coded
# label -> good_bad mapping used afterwards depends on the fit being stable
# across library versions.
personality_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
personality_km_survey.fit(personality_scaled)
print(silhouette_score(personality_scaled, personality_km_survey.labels_))

# Store the labels on the full personality frame (generic and survey-specific
# column) and on the feature frame used for plotting. .assign builds a new
# frame instead of writing into a column subset.
personality['clusters'] = personality_km_survey.labels_
personality['clusters_pers'] = personality_km_survey.labels_
personality_features = personality_features.assign(clusters=personality_km_survey.labels_)

print(personality_features['clusters'].value_counts())

sns.pairplot(personality_features, hue='clusters');

In [None]:
# Binary flag: 1 for rows labelled cluster 1, 0 otherwise.
# NOTE(review): cluster numbering is arbitrary -- re-check that cluster 1 is
# still the intended group whenever the clustering is re-fit.
personality['good_bad'] = (personality['clusters'] == 1).astype('int64')
personality.head()

In [None]:
# Features used for the agglomerative clustering of the personality survey.
# Note this rebinds personality_CRs_final / personality_features /
# personality_scaled, replacing the values used for the KMeans run.
personality_CRs_final = ['mean_diff_affect_posneg', 'mean_diff_pers',
                   'time_to_complete_log',
       'longstring_count_affect', 'longstring_answer_affect',
       'longstring_mult_affect',
       'ls_auc_affect', 'ls_std_affect', 'ls_skew_affect',
       'ls_kurt_affect']


# .copy() gives an independent frame rather than a slice of `personality`.
personality_features = personality[personality_CRs_final].copy()

from sklearn.cluster import AgglomerativeClustering

# center and scale the data
scaler = StandardScaler()

#full feature set
personality_scaled = scaler.fit_transform(personality_features)

#determine optimal k: silhouette score for k = 2..9
k_range = range(2,10)
scores = []
for k in k_range:
    clustering = AgglomerativeClustering(n_clusters=k)
    clustering.fit(personality_scaled)
    scores.append(silhouette_score(personality_scaled, clustering.labels_))

# plot the results (title corrected: this sweep is agglomerative, not kmeans)
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Personality agglomerative clustering at survey level');

In [None]:
# Final 2-cluster agglomerative clustering on the scaled personality features.
personality_ag_survey = AgglomerativeClustering(n_clusters=2)
personality_ag_survey.fit(personality_scaled)
print(silhouette_score(personality_scaled, personality_ag_survey.labels_))

# NOTE(review): this overwrites the KMeans labels previously stored in
# 'clusters' and 'clusters_pers'; a 'good_bad' column computed earlier from the
# KMeans labels is NOT updated here -- confirm which labelling should feed the
# cross-survey comparison.
personality['clusters'] = personality_ag_survey.labels_
personality['clusters_pers'] = personality_ag_survey.labels_

print(personality['clusters'].value_counts())

# children_ is an (n_samples - 1, 2) array, so each row holds two node ids and
# the original per-row `child > 784` test raised "truth value of an array is
# ambiguous". The comparison is now elementwise: ids above 784 are kept and the
# rest are replaced by 0.
# NOTE(review): 784 is a hard-coded threshold, presumably tied to the number
# of samples -- confirm.
children = personality_ag_survey.children_
x = np.where(children > 784, children, 0)

#sns.pairplot(personality_features, hue='clusters')

### Health

In [None]:
# Rows whose 'survey_type' index level is 'health', restricted to the candidate
# health features; rows with any missing value are dropped and the shape is
# printed before and after.
is_health = full_data.index.get_level_values('survey_type') == 'health'
health = full_data.loc[is_health, health_CRs]
print(health.shape)
health = health.dropna(axis=0)
print(health.shape)

In [None]:
# Features used for the survey-level KMeans clustering of the health survey.
health_CRs_final = ['time_to_complete_log',
       'longstring_count_affect', 'longstring_mult_affect']

# .copy() gives an independent frame, so adding a 'clusters' column to it later
# does not write into a slice of `health`.
health_features = health[health_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()

#full feature set
health_scaled = scaler.fit_transform(health_features)

#determine optimal k: silhouette score for k = 2..9
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(health_scaled)
    scores.append(silhouette_score(health_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Health kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled health features.
# n_init is pinned to 10 (the default before scikit-learn 1.4): the hand-coded
# label -> good_bad mapping used afterwards depends on the fit being stable
# across library versions.
health_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
health_km_survey.fit(health_scaled)
print(silhouette_score(health_scaled, health_km_survey.labels_))

# Store the labels on the full health frame (generic and survey-specific
# column) and on the feature frame used for plotting. .assign builds a new
# frame instead of writing into a column subset.
health['clusters'] = health_km_survey.labels_
health['clusters_health'] = health_km_survey.labels_
health_features = health_features.assign(clusters=health_km_survey.labels_)

print(health_features['clusters'].value_counts())

sns.pairplot(health_features, hue='clusters');

In [None]:
# Binary flag: 0 for rows labelled cluster 0, 1 otherwise.
# NOTE(review): cluster numbering is arbitrary -- re-check this mapping
# whenever the clustering is re-fit.
health['good_bad'] = (health['clusters'] != 0).astype('int64')
health.head()

### Jobs at work

In [None]:
# Rows whose 'survey_type' index level is 'jobs_atwork', restricted to the
# candidate jobs-at-work features. .copy() makes this an independent frame so
# the columns added below are not written into a slice of full_data.
is_jobs_atwork = full_data.index.get_level_values('survey_type') == 'jobs_atwork'
jobs = full_data.loc[is_jobs_atwork, jobsatwork_CRs].copy()

# Average the two job-scale mean differences into one feature. The columns are
# listed explicitly (the same two the original label slice covered) so the
# result does not depend on the ordering of jobsatwork_CRs.
jobs_scale_cols = ['mean_diff_irb_irb', 'mean_diff_dalal_posneg']
jobs['mean_diff_jobs'] = jobs[jobs_scale_cols].mean(axis=1)

# Drop rows with any missing value; the shape is printed before and after.
print(jobs.shape)
jobs = jobs.dropna(axis=0)
print(jobs.shape)
jobs.head(1)

In [None]:
# Features used for the survey-level KMeans clustering of the jobs-at-work survey.
jobsatwork_CRs_final = ['mean_diff_jobs', 'time_to_complete_log',
                  'longstring_count_affect', 'longstring_mult_affect']

#'longstring_count_irbd', 'longstring_mult_irbd', 'longstring_count_dalal', 'longstring_mult_dalal'

# .copy() gives an independent frame, so adding a 'clusters' column to it later
# does not write into a slice of `jobs`.
jobs_features = jobs[jobsatwork_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()

#full feature set
jobs_scaled = scaler.fit_transform(jobs_features)

#determine optimal k: silhouette score for k = 2..19
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,20)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(jobs_scaled)
    scores.append(silhouette_score(jobs_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Jobs at work kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled jobs-at-work features.
# n_init is pinned to 10 (the default before scikit-learn 1.4): the hand-coded
# label -> good_bad mapping used afterwards depends on the fit being stable
# across library versions.
jobs_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
jobs_km_survey.fit(jobs_scaled)
print(silhouette_score(jobs_scaled, jobs_km_survey.labels_))

# Store the labels on the full jobs frame (generic and survey-specific column)
# and on the feature frame used for plotting. .assign builds a new frame
# instead of writing into a column subset.
jobs['clusters'] = jobs_km_survey.labels_
jobs['clusters_jobs'] = jobs_km_survey.labels_
jobs_features = jobs_features.assign(clusters=jobs_km_survey.labels_)

print(jobs_features['clusters'].value_counts())

sns.pairplot(jobs_features, hue='clusters');

In [None]:
# Binary flag: 0 for rows labelled cluster 0, 1 otherwise.
# NOTE(review): cluster numbering is arbitrary -- re-check this mapping
# whenever the clustering is re-fit.
jobs['good_bad'] = (jobs['clusters'] != 0).astype('int64')
jobs.head()

### Engage

In [None]:
# Rows whose 'survey_type' index level is 'engage_psycap', restricted to the
# candidate engage features; rows with any missing value are dropped and the
# shape is printed before and after.
is_engage = full_data.index.get_level_values('survey_type') == 'engage_psycap'
engage = full_data.loc[is_engage, engage_CRs]
print(engage.shape)
engage = engage.dropna(axis=0)
print(engage.shape)
engage.head(1)

In [None]:
# Features used for the survey-level KMeans clustering of the engage survey.
engage_CRs_final = ['mean_diff_hinderance_vs_support', 'time_to_complete_log',
              'longstring_count_engage', 'longstring_mult_engage']


# .copy() gives an independent frame, so adding a 'clusters' column to it later
# does not write into a slice of `engage`.
engage_features = engage[engage_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()

#full feature set
engage_scaled = scaler.fit_transform(engage_features)

#determine optimal k: silhouette score for k = 2..9
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(engage_scaled)
    scores.append(silhouette_score(engage_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Engage kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled engage features.
# n_init is pinned to 10 (the default before scikit-learn 1.4): the hand-coded
# label -> good_bad mapping used afterwards depends on the fit being stable
# across library versions.
engage_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
engage_km_survey.fit(engage_scaled)
print(silhouette_score(engage_scaled, engage_km_survey.labels_))

# Store the labels on the full engage frame (generic and survey-specific
# column) and on the feature frame used for plotting. .assign builds a new
# frame instead of writing into a column subset.
engage['clusters'] = engage_km_survey.labels_
engage['clusters_engage'] = engage_km_survey.labels_
engage_features = engage_features.assign(clusters=engage_km_survey.labels_)

print(engage_features['clusters'].value_counts())

sns.pairplot(engage_features, hue='clusters');

In [None]:
# Binary flag: 0 for rows labelled cluster 0, 1 otherwise.
# NOTE(review): cluster numbering is arbitrary -- re-check this mapping
# whenever the clustering is re-fit.
engage['good_bad'] = (engage['clusters'] != 0).astype('int64')
engage.head()

### Psych Flex

In [None]:
# Rows whose 'survey_type' index level is 'psych_flex', restricted to the
# candidate psych flex features; rows with any missing value are dropped and
# the shape is printed before and after.
is_psych_flex = full_data.index.get_level_values('survey_type') == 'psych_flex'
psych_flex = full_data.loc[is_psych_flex, psych_flex_CRs]
print(psych_flex.shape)
psych_flex = psych_flex.dropna(axis=0)
print(psych_flex.shape)
psych_flex.head(1)

In [None]:
# Features used for the survey-level KMeans clustering of the psych flex survey.
psych_flex_CRs_final = ['time_to_complete_log',
                        'longstring_count_pf', 'longstring_mult_pf']


# .copy() gives an independent frame, so adding a 'clusters' column to it later
# does not write into a slice of `psych_flex`.
psych_flex_features = psych_flex[psych_flex_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()

#full feature set
psych_flex_scaled = scaler.fit_transform(psych_flex_features)

#determine optimal k: silhouette score for k = 2..9
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(psych_flex_scaled)
    scores.append(silhouette_score(psych_flex_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Psych flex kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled psych flex features.
# n_init is pinned to 10 (the default before scikit-learn 1.4): the hand-coded
# label -> good_bad mapping used afterwards depends on the fit being stable
# across library versions.
psych_flex_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
psych_flex_km_survey.fit(psych_flex_scaled)
print(silhouette_score(psych_flex_scaled, psych_flex_km_survey.labels_))

# Store the labels on the full psych flex frame (generic and survey-specific
# column) and on the feature frame used for plotting. .assign builds a new
# frame instead of writing into a column subset.
psych_flex['clusters'] = psych_flex_km_survey.labels_
psych_flex['clusters_psych_flex'] = psych_flex_km_survey.labels_
psych_flex_features = psych_flex_features.assign(clusters=psych_flex_km_survey.labels_)

print(psych_flex_features['clusters'].value_counts())

sns.pairplot(psych_flex_features, hue='clusters');

In [None]:
# Binary flag: 0 for rows labelled cluster 1, 1 otherwise (the reverse of the
# mapping used for the personality survey).
# NOTE(review): cluster numbering is arbitrary -- re-check this mapping
# whenever the clustering is re-fit.
psych_flex['good_bad'] = (psych_flex['clusters'] != 1).astype('int64')
psych_flex.head()

### Compare clustering across survey types

In [None]:
# Stack the five per-survey frames row-wise; columns that exist for only some
# surveys are filled with NaN for the others.
survey_frames = [personality, health, jobs, engage, psych_flex]
clusters_df = pd.concat(survey_frames, axis=0)
clusters_df.head()

In [None]:
# Share of each MitreID's rows flagged good_bad == 1.
# good_bad is a 0/1 flag, so its group mean equals count(good) / count(all).
# Unlike dividing two separately grouped counts, this gives 0.0 (not NaN) for
# a MitreID with no rows flagged 1.
count_df_MitreID = clusters_df.groupby('MitreID')['good_bad'].mean()
count_df_MitreID = count_df_MitreID.reset_index(name='good_ratio')
count_df_MitreID.head()

In [None]:
# Bar plot of the good ratio per MitreID. x/y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x='MitreID', y='good_ratio', data=count_df_MitreID, kind='bar', height=5, aspect=4);

In [None]:
#get ratio of good to total for each participant, survey type and date bin
# good_bad is a 0/1 flag, so its group mean equals count(good) / count(all).
# Unlike dividing two separately grouped counts, this gives 0.0 (not NaN) for
# groups with no rows flagged 1, so those groups are no longer silently left
# out of the averages drawn by the bar plots that use count_df.
ratio_group_keys = ['MitreID', 'survey_type', 'wave_study_date_bin']
count_df = clusters_df.groupby(ratio_group_keys)['good_bad'].mean()
count_df = count_df.reset_index(name='good_ratio')
count_df.head()

In [None]:
# Good ratio per survey type, one panel per date bin. x/y are passed by
# keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x='survey_type', y='good_ratio', data=count_df, kind='bar', col='wave_study_date_bin');

In [None]:
# Good ratio per MitreID, one row of panels per survey type. x/y are passed by
# keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x='MitreID', y='good_ratio', data=count_df, kind='bar', height=5, aspect=4, col='survey_type', col_wrap=1);

In [None]:
# One bar plot per MitreID showing its good ratio for each survey type; the
# MitreID is printed above each figure. x/y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
for part in count_df['MitreID'].unique():
    print(part)
    sns.catplot(x='survey_type', y='good_ratio', data=count_df[count_df['MitreID'] == part], kind='bar')
    plt.show()

In [None]:
# Mean cluster label per MitreID for each survey type; MitreIDs missing any of
# the five survey means are dropped. The shape is printed before and after.
survey_cluster_columns = ['clusters_pers', 'clusters_health', 'clusters_jobs',
                          'clusters_engage', 'clusters_psych_flex']
clusters_survey = clusters_df.groupby('MitreID')[survey_cluster_columns].mean()
print(clusters_survey.shape)
clusters_survey = clusters_survey.dropna(axis=0)
print(clusters_survey.shape)
clusters_survey.head()

In [None]:
# center and scale the data
scaler = StandardScaler()

# Scale only the five per-survey cluster means. Selecting them explicitly keeps
# this cell repeatable: a 'clusters' column added to clusters_survey by a later
# cell is not picked up as a feature when this cell is re-run.
cluster_mean_cols = ['clusters_pers', 'clusters_health', 'clusters_jobs',
                     'clusters_engage', 'clusters_psych_flex']
clusters_survey_scaled = scaler.fit_transform(clusters_survey[cluster_mean_cols])

#determine optimal k: silhouette score for k = 2..9
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the sweep
# gives the same result on newer scikit-learn releases.
k_range = range(2,10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(clusters_survey_scaled)
    scores.append(silhouette_score(clusters_survey_scaled, km_ss.labels_))

# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('clusters_survey_scaled kmeans at survey level');

In [None]:
# Final 2-cluster KMeans on the scaled per-MitreID cluster means.
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the fit is
# the same on newer scikit-learn releases.
clusters_km = KMeans(n_clusters=2, n_init=10, random_state=1234)
clusters_km.fit(clusters_survey_scaled)
print(silhouette_score(clusters_survey_scaled, clusters_km.labels_))

# Attach the labels to clusters_survey (this adds a 'clusters' column to it).
clusters_survey['clusters'] = clusters_km.labels_

print(clusters_survey['clusters'].value_counts())

sns.pairplot(clusters_survey, hue='clusters');

In [None]:
# Pairwise Pearson correlations between the columns of clusters_survey.
clusters_survey.corr(method='pearson')

In [None]:
# Pairwise scatter plots of the clusters_survey columns with fitted regression lines.
sns.pairplot(data=clusters_survey, kind='reg')

In [None]:
# Save clusters_survey to 'clusters_survey.csv' in the current working directory.
clusters_survey.to_csv(path_or_buf='clusters_survey.csv')

In [None]:
# Ward-linkage hierarchical clustering of the scaled engage features, drawn as
# a dendrogram on a 25x10 figure.
from scipy.cluster.hierarchy import dendrogram, linkage

Z = linkage(engage_scaled, method='ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)

## TSNE 

In [None]:
%%time
# center and scale the data
scaler = StandardScaler()

#full feature set
clusters_survey_scaled = scaler.fit_transform(clusters_survey)

# This function can take a long to run
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
tsne_clusters_survey_scaled = tsne.fit_transform(clusters_survey_scaled)

In [None]:
# Put the 2-D t-SNE coordinates in a frame indexed like clusters_survey and
# join them to the clusters_survey columns.
print(tsne_clusters_survey_scaled.shape)
tsne_clusters_survey_scaled_df = pd.DataFrame(data = tsne_clusters_survey_scaled, columns = ['tsne_0', 'tsne_1'], index = clusters_survey.index)
tsne_clusters_survey_scaled_df = pd.concat([clusters_survey, tsne_clusters_survey_scaled_df], axis = 1)

# The original title promised labels but none were drawn, and no ground truth
# exists in this notebook; the points are now coloured by the KMeans 'clusters'
# column of clusters_survey and the title says so.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', hue = 'clusters', data = tsne_clusters_survey_scaled_df)
plt.title("Projection of the data on 2 components + KMeans cluster labels");