# Careless responder (CR) survey-level clustering

In [None]:
#getting and working with data
import datetime as dt
import math
import os
import re
from itertools import groupby

import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

#visualizing results
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
#import yellowbrick as yb

#modelling
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.cluster import silhouette_score
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [None]:
# Location of the pickled survey-level CR feature table.
# Set the TILES_SURVEYS_CR_PATH environment variable to run the notebook on
# another machine; when it is unset the original local path is used.
data_path = os.environ.get(
    'TILES_SURVEYS_CR_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/surveys_CR.pkl',
)

In [None]:
# Read the pickled table containing data from all surveys.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
full_data = pd.read_pickle(data_path)
full_data = pd.DataFrame(data=full_data).reset_index(drop=True)

print('Original data shape:\n', full_data.shape, '\n')
# ensure no replicate IDs (212 participants in study)
print('Original data unique ParticipantIDs:\n', full_data['ParticipantID'].unique().shape, '\n')
print('Original data unique MitreIDs:\n', full_data['MitreID'].unique().shape, '\n')
# how much missing data is there?
print('Original data missing value counts:\n', full_data.isnull().sum(), '\n')
# what is the data type of each column?
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own line rather than being passed to print() (which would show "None").
print('Original data data types:')
full_data.info()
print()

In [None]:
# Number of rows per survey type (survey_type is still a regular column here)
full_data.loc[:, 'survey_type'].value_counts()

In [None]:
# Sanity check: the number of distinct wave study days should be 71
# (dropna=False so a missing value is counted the same way unique() counts it)
full_data['wave_study_day'].nunique(dropna=False)

In [None]:
# Show the first row to inspect the available columns
full_data.head(n=1)

In [None]:
# Key each row by participant, survey type and wave timing.
index_cols = ['MitreID', 'survey_type', 'wave_study_day', 'wave_study_date_bin']
# Only move the key columns into the index while they are still columns, so
# re-running this cell is a no-op instead of raising KeyError.
if set(index_cols).issubset(full_data.columns):
    full_data = full_data.set_index(index_cols)
full_data.head()

### explore CR across survey types

In [None]:
# Rows per survey type, read from the 'survey_type' level of the index
full_data.index.get_level_values(level='survey_type').value_counts()

In [None]:
# CR feature columns available for each survey type.
# Column families that repeat across surveys are built once and concatenated,
# so every list keeps exactly the same members and order as before.

# The four 'context_*' columns shared by most survey types.
_context_CRs = ['context_homevsworking', 'context_workvsactivities',
                'context_workvswork', 'context_drivevsdrive']


def _ls_stat_CRs(scale):
    """Return the eight 'ls_*' columns for `scale`: auc, std, skew, kurt, then their '_seeded' variants."""
    return ['ls_' + stat + variant + '_' + scale
            for variant in ('', '_seeded')
            for stat in ('auc', 'std', 'skew', 'kurt')]


engage_CRs = (
    ['mean_diff_hinderance_vs_support', 'time_to_complete',
     'longstring_count_engage', 'longstring_answer_engage',
     'longstring_count_norm_engage', 'longstring_mult_engage']
    + _ls_stat_CRs('engage')
    + _context_CRs
)

psych_flex_CRs = (
    ['longstring_count_pf', 'longstring_answer_pf',
     'longstring_count_norm_pf', 'longstring_mult_pf']
    + _ls_stat_CRs('pf')
    + ['time_to_complete']
)

jobsatwork_CRs = (
    ['mean_diff_ave', 'mean_diff_affect_posneg', 'mean_diff_nervous_anxiety',
     'mean_diff_irb_irb', 'mean_diff_dalal_posneg',
     'longstring_count_norm_ave', 'longstring_count_norm_affect',
     'longstring_count_norm_irbd', 'longstring_count_norm_dalal',
     'time_to_complete',
     'longstring_count_affect', 'longstring_answer_affect', 'longstring_mult_affect']
    + _ls_stat_CRs('affect')
    + ['longstring_count_irbd', 'longstring_answer_irbd', 'longstring_mult_irbd']
    + _ls_stat_CRs('irbd')
    + ['longstring_count_dalal', 'longstring_answer_dalal', 'longstring_mult_dalal']
    + _ls_stat_CRs('dalal')
    + _context_CRs
)

jobsnotatwork_CRs = (
    ['mean_diff_ave', 'mean_diff_affect_posneg', 'mean_diff_nervous_anxiety',
     'longstring_count_affect', 'longstring_answer_affect',
     'longstring_count_norm_affect', 'longstring_mult_affect']
    + _ls_stat_CRs('affect')
    + ['time_to_complete']
    + _context_CRs
)

health_CRs = (
    ['mean_diff_ave', 'mean_diff_affect_posneg', 'mean_diff_nervous_anxiety',
     'intake_check_alc', 'intake_check_nic', 'time_to_complete',
     'longstring_count_affect', 'longstring_answer_affect',
     'longstring_count_norm_affect', 'longstring_mult_affect']
    + _ls_stat_CRs('affect')
    + _context_CRs
)

personality_CRs = (
    ['mean_diff_ave', 'mean_diff_affect_posneg', 'mean_diff_nervous_anxiety',
     'mean_diff_extraversion', 'mean_diff_agreeableness',
     'mean_diff_conscientiousness', 'mean_diff_neuroticism', 'mean_diff_openness',
     'longstring_count_norm_ave', 'longstring_count_norm_affect',
     'longstring_count_norm_personality',
     'time_to_complete',
     'longstring_count_affect', 'longstring_answer_affect', 'longstring_mult_affect']
    + _ls_stat_CRs('affect')
    + ['longstring_count_personality', 'longstring_answer_personality',
       'longstring_mult_personality']
    + _ls_stat_CRs('personality')
    + _context_CRs
)

## Clustering

In [None]:
# personality_CRs is defined once in the column-list cell earlier in the
# notebook; the identical copy that used to sit here was removed.
is_personality = full_data.index.get_level_values('survey_type') == 'personality'
# .copy() gives an independent frame, so the column added below is not written
# into a slice of full_data.
personality = full_data.loc[is_personality, personality_CRs].copy()
print(personality.shape)

# Row-wise mean of the trait mean-difference scores. The columns are listed
# explicitly so the result does not depend on the order of personality_CRs.
# NOTE(review): 'mean_diff_openness' is not part of this mean (the original
# label slice stopped at neuroticism) -- confirm that is intended.
pers_mean_diff_cols = ['mean_diff_extraversion', 'mean_diff_agreeableness',
                       'mean_diff_conscientiousness', 'mean_diff_neuroticism']
personality['mean_diff_pers'] = personality[pers_mean_diff_cols].mean(axis=1)

# keep only surveys with every CR feature present
personality = personality.dropna(axis=0)
print(personality.shape)
personality.head(1)

In [None]:
# Coefficient of variation (std / mean) of every candidate feature, listed from
# lowest to highest, to help pick the features used for clustering.
# scipy.stats is imported explicitly at the top of the notebook: `sp.stats`
# only resolves when some other package has already loaded that submodule.
personality_var = stats.variation(personality, axis=0)
sorted_var = sorted(zip(personality.columns.values, personality_var), key=lambda x: x[1])
sorted_var

In [None]:
personality_CRs_final = ['mean_diff_pers',
                         'longstring_count_affect',
                         'longstring_mult_affect',
                         'longstring_count_personality',
                         'longstring_mult_personality']

# .copy() so that adding a 'clusters' column later never writes into a slice of `personality`
personality_features = personality[personality_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()
personality_scaled = scaler.fit_transform(personality_features)

# determine optimal k: silhouette score for each candidate number of clusters
k_range = range(2, 10)
scores = []
for k in k_range:
    # n_init is pinned because the library default differs between scikit-learn releases
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(personality_scaled)
    scores.append(silhouette_score(personality_scaled, km_ss.labels_))

# plot the results (trailing ';' hides the Text repr)
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Personality kmeans at survey level');

In [None]:
# Final personality model with k=2.
# n_init is pinned because the library default differs between scikit-learn releases.
personality_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
personality_km_survey.fit(personality_scaled)
print(silhouette_score(personality_scaled, personality_km_survey.labels_))

# assign() returns a new frame, so no column is written into a slice and the
# cell can be re-run safely. Labels are in the same row order as the frames
# because personality_scaled was built from personality_features.
personality = personality.assign(clusters=personality_km_survey.labels_)
personality_features = personality_features.assign(clusters=personality_km_survey.labels_)

print(personality_features['clusters'].value_counts())

sns.pairplot(personality_features, hue='clusters');

In [None]:
# Label cluster 0 as 'good' and every other cluster as 'bad'.
# NOTE(review): k-means label ids are arbitrary -- confirm from the pairplot
# that cluster 0 is still the attentive group whenever the model is refit.
personality['good_bad'] = personality['clusters'].eq(0).map({True: 'good', False: 'bad'})
personality.head()

In [None]:
# health_CRs is defined once in the column-list cell earlier in the notebook;
# the identical copy that used to sit here was removed.
is_health = full_data.index.get_level_values('survey_type') == 'health'
# .copy() gives an independent frame instead of a slice of full_data
health = full_data.loc[is_health, health_CRs].copy()
print(health.shape)
# keep only surveys with every CR feature present
health = health.dropna(axis=0)
print(health.shape)

health_CRs_final = ['longstring_count_affect', 'longstring_mult_affect',
                    'ls_auc_affect', 'ls_std_affect']

# .copy() so that adding a 'clusters' column later never writes into a slice of `health`
health_features = health[health_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()
health_scaled = scaler.fit_transform(health_features)

# determine optimal k: silhouette score for each candidate number of clusters
k_range = range(2, 10)
scores = []
for k in k_range:
    # n_init is pinned because the library default differs between scikit-learn releases
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(health_scaled)
    scores.append(silhouette_score(health_scaled, km_ss.labels_))

# plot the results (trailing ';' hides the Text repr)
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Health kmeans at survey level');

In [None]:
# Final health model with k=3.
# n_init is pinned because the library default differs between scikit-learn releases.
health_km_survey = KMeans(n_clusters=3, n_init=10, random_state=1234)
health_km_survey.fit(health_scaled)
print(silhouette_score(health_scaled, health_km_survey.labels_))

# assign() returns a new frame, so no column is written into a slice and the
# cell can be re-run safely.
health = health.assign(clusters=health_km_survey.labels_)
health_features = health_features.assign(clusters=health_km_survey.labels_)

print(health_features['clusters'].value_counts())

sns.pairplot(health_features, hue='clusters');

In [None]:
# Label cluster 0 as 'bad' and the remaining clusters as 'good'.
# NOTE(review): k-means label ids are arbitrary -- confirm from the pairplot
# that cluster 0 is still the careless group whenever the model is refit.
health['good_bad'] = health['clusters'].eq(0).map({True: 'bad', False: 'good'})
health.head()

In [None]:
# jobsatwork_CRs is defined once in the column-list cell earlier in the
# notebook; the identical copy that used to sit here was removed.
is_jobs_atwork = full_data.index.get_level_values('survey_type') == 'jobs_atwork'
# .copy() gives an independent frame instead of a slice of full_data
jobs = full_data.loc[is_jobs_atwork, jobsatwork_CRs].copy()
print(jobs.shape)
# keep only surveys with every CR feature present
jobs = jobs.dropna(axis=0)
print(jobs.shape)

jobsatwork_CRs_final = ['mean_diff_dalal_posneg', 'time_to_complete',
                        'longstring_count_affect', 'longstring_mult_affect',
                        'longstring_count_dalal',
                        'longstring_mult_dalal']

# .copy() so that adding a 'clusters' column later never writes into a slice of `jobs`
jobs_features = jobs[jobsatwork_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()
jobs_scaled = scaler.fit_transform(jobs_features)

# determine optimal k: silhouette score for each candidate number of clusters
k_range = range(2, 10)
scores = []
for k in k_range:
    # n_init is pinned because the library default differs between scikit-learn releases
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(jobs_scaled)
    scores.append(silhouette_score(jobs_scaled, km_ss.labels_))

# plot the results (trailing ';' hides the Text repr)
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Jobs at work kmeans at survey level');

In [None]:
# Final jobs-at-work model with k=7.
# n_init is pinned because the library default differs between scikit-learn releases.
jobs_km_survey = KMeans(n_clusters=7, n_init=10, random_state=1234)
jobs_km_survey.fit(jobs_scaled)
print(silhouette_score(jobs_scaled, jobs_km_survey.labels_))

# assign() returns a new frame, so no column is written into a slice and the
# cell can be re-run safely.
jobs = jobs.assign(clusters=jobs_km_survey.labels_)
jobs_features = jobs_features.assign(clusters=jobs_km_survey.labels_)

print(jobs_features['clusters'].value_counts())

sns.pairplot(jobs_features, hue='clusters');

In [None]:
# engage_CRs is defined once in the column-list cell earlier in the notebook;
# the identical copy that used to sit here was removed.
is_engage = full_data.index.get_level_values('survey_type') == 'engage_psycap'
# .copy() gives an independent frame instead of a slice of full_data
engage = full_data.loc[is_engage, engage_CRs].copy()
print(engage.shape)
# keep only surveys with every CR feature present
engage = engage.dropna(axis=0)
print(engage.shape)

engage_CRs_final = ['mean_diff_hinderance_vs_support', 'ls_auc_engage', 'ls_std_engage',
                    'longstring_count_engage', 'longstring_mult_engage']

# .copy() so that adding a 'clusters' column later never writes into a slice of `engage`
engage_features = engage[engage_CRs_final].copy()

# center and scale the data
scaler = StandardScaler()
engage_scaled = scaler.fit_transform(engage_features)

# determine optimal k: silhouette score for each candidate number of clusters
k_range = range(2, 10)
scores = []
for k in k_range:
    # n_init is pinned because the library default differs between scikit-learn releases
    km_ss = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_ss.fit(engage_scaled)
    scores.append(silhouette_score(engage_scaled, km_ss.labels_))

# plot the results (trailing ';' hides the Text repr)
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.title('Engage kmeans at survey level');

In [None]:
# Final engage model with k=2.
# n_init is pinned because the library default differs between scikit-learn releases.
engage_km_survey = KMeans(n_clusters=2, n_init=10, random_state=1234)
engage_km_survey.fit(engage_scaled)
print(silhouette_score(engage_scaled, engage_km_survey.labels_))

# assign() returns a new frame, so no column is written into a slice and the
# cell can be re-run safely.
engage = engage.assign(clusters=engage_km_survey.labels_)
engage_features = engage_features.assign(clusters=engage_km_survey.labels_)

print(engage_features['clusters'].value_counts())

sns.pairplot(engage_features, hue='clusters');

In [None]:
# Label cluster 0 as 'good' and every other cluster as 'bad'.
# NOTE(review): k-means label ids are arbitrary -- confirm from the pairplot
# that cluster 0 is still the attentive group whenever the model is refit.
engage['good_bad'] = engage['clusters'].eq(0).map({True: 'good', False: 'bad'})
engage.head()

In [None]:
# Stack the labelled engage, personality and health frames row-wise
# (columns missing from a frame are filled with NaN by concat)
clusters_df = pd.concat([engage, personality, health])
clusters_df.head(n=5)

In [None]:
# Share of each participant's surveys labelled 'good', per survey type.
# Taking the mean of a boolean keeps participant/survey groups with zero 'good'
# surveys at 0.0; dividing two separately filtered counts would turn them into NaN.
is_good = clusters_df['good_bad'].eq('good')
count_df = (
    is_good
    .groupby(level=['MitreID', 'survey_type'])
    .mean()
    .reset_index(name='count')  # column name kept as 'count'; the values are proportions
)

# one bar chart per participant
for part in count_df['MitreID'].unique():
    # x / y are passed by keyword: current seaborn no longer accepts them positionally
    grid = sns.catplot(x='survey_type', y='count',
                       data=count_df[count_df['MitreID'] == part], kind='bar')
    grid.set_axis_labels('survey_type', 'proportion good')
    plt.show()
    # release the figure so a long participant loop does not accumulate open figures
    plt.close()

## TSNE 

In [None]:
%%time

# This cell can take a long time to run.
# TSNE is imported with the other imports at the top of the notebook.
# NOTE(review): the features_*_full_scaled arrays are not created anywhere in
# the visible notebook -- they must be defined before this cell can run.
# random_state is fixed so the embedding is reproducible between runs.
tsne = TSNE(n_components=2, random_state=1234)
tsne_features_engage_full = tsne.fit_transform(features_engage_full_scaled)
tsne_features_psychflex_full = tsne.fit_transform(features_psychflex_full_scaled)
tsne_features_jobs_full = tsne.fit_transform(features_jobs_atwork_full_scaled)

In [None]:
# Report the shape of the embedding computed in the previous cell
# (the name printed before, `tsne_features_engage`, is never defined).
print(tsne_features_engage_full.shape)
# NOTE(review): engage_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_engage_df = pd.DataFrame(data = tsne_features_engage_full, columns = ['tsne_0', 'tsne_1'], index = engage_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_engage_df = pd.concat([engage_only_CRs, tsne_features_engage_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_engage_df)
plt.title("Projection of the data on 2 components + ground truth labels");

In [None]:
# Report the shape of the embedding computed in the t-SNE cell above
# (the name printed before, `tsne_features_psychflex`, is never defined).
print(tsne_features_psychflex_full.shape)
# NOTE(review): psych_flex_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_psychflex_df = pd.DataFrame(data = tsne_features_psychflex_full, columns = ['tsne_0', 'tsne_1'], index = psych_flex_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_psychflex_df = pd.concat([psych_flex_only_CRs, tsne_features_psychflex_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_psychflex_df)
plt.title("Projection of the data on 2 components + ground truth labels");

In [None]:
# Report the shape of the embedding computed in the t-SNE cell above
# (the name printed before, `tsne_features_jobs`, is never defined).
print(tsne_features_jobs_full.shape)
# NOTE(review): jobs_atwork_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_jobs_df = pd.DataFrame(data = tsne_features_jobs_full, columns = ['tsne_0', 'tsne_1'], index = jobs_atwork_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_jobs_df = pd.concat([jobs_atwork_only_CRs, tsne_features_jobs_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_jobs_df)
plt.title("Projection of the data on 2 components + ground truth labels");

In [None]:
%%time

# This cell can take a long time to run.
# TSNE is imported with the other imports at the top of the notebook.
# NOTE(review): the features_*_select_scaled arrays are not created anywhere in
# the visible notebook -- they must be defined before this cell can run.
# random_state is fixed so the embedding is reproducible between runs.
tsne = TSNE(n_components=2, random_state=1234)
tsne_features_engage_select = tsne.fit_transform(features_engage_select_scaled)
tsne_features_psychflex_select = tsne.fit_transform(features_psychflex_select_scaled)
tsne_features_jobs_select = tsne.fit_transform(features_jobs_atwork_select_scaled)

In [None]:
# Report the shape of the selected-feature embedding computed in the previous cell
# (the name printed before, `tsne_features_engage`, is never defined).
print(tsne_features_engage_select.shape)
# NOTE(review): engage_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_engage_df = pd.DataFrame(data = tsne_features_engage_select, columns = ['tsne_0', 'tsne_1'], index = engage_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_engage_df = pd.concat([engage_only_CRs, tsne_features_engage_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_engage_df)
plt.title("Projection of the data on 2 components + ground truth labels");

In [None]:
# Report the shape of the selected-feature embedding computed in the t-SNE cell above
# (the name printed before, `tsne_features_psychflex`, is never defined).
print(tsne_features_psychflex_select.shape)
# NOTE(review): psych_flex_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_psychflex_df = pd.DataFrame(data = tsne_features_psychflex_select, columns = ['tsne_0', 'tsne_1'], index = psych_flex_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_psychflex_df = pd.concat([psych_flex_only_CRs, tsne_features_psychflex_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_psychflex_df)
plt.title("Projection of the data on 2 components + ground truth labels");

In [None]:
# Report the shape of the selected-feature embedding computed in the t-SNE cell above
# (the name printed before, `tsne_features_jobs`, is never defined).
print(tsne_features_jobs_select.shape)
# NOTE(review): jobs_atwork_only_CRs is not created in the visible notebook -- confirm where it comes from.
tsne_features_jobs_df = pd.DataFrame(data = tsne_features_jobs_select, columns = ['tsne_0', 'tsne_1'], index = jobs_atwork_only_CRs.index)
# put the two t-SNE coordinates next to the original CR features
tsne_features_jobs_df = pd.concat([jobs_atwork_only_CRs, tsne_features_jobs_df], axis = 1)

# NOTE(review): the title mentions ground truth labels, but no hue/labels are plotted.
plt.figure(figsize=(7,7))
sns.scatterplot(x = 'tsne_0', y = 'tsne_1', data = tsne_features_jobs_df)
plt.title("Projection of the data on 2 components + ground truth labels");