Careless responder feature engineering and analysis (initial focus on psych_flex survey only)

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby
import datetime as dt
import scipy as sp

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise pandas' display limits so wide survey frames are printed without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning -- confirm that is intended.
import warnings; warnings.simplefilter('ignore')
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

In [None]:
# Location of the pickled frame that is loaded in the next cell.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_path = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/final_data_complete.pkl'

In [None]:
# Read in the pickled frame containing data from all surveys.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
data = pd.read_pickle(data_path)
data = pd.DataFrame(data = data)
data = data.reset_index(drop=True)

print('Original data shape:\n', data.shape, '\n')
# ensure no replicate IDs (212 participants in study) -- checked on both ID columns
print('Original data unique ParticipantIDs:\n', data['ParticipantID'].unique().shape, '\n')
print('Original data unique MitreIDs:\n', data['MitreID'].unique().shape, '\n')
# how much missing data is there?
print('Original data missing value counts:\n', data.isnull().sum(), '\n')
# what is the data type of each column?
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own instead of being nested inside print().
print('Original data data types:')
data.info()
print()

In [None]:
# number of rows per survey type
data['survey_type'].value_counts()

In [None]:
# number of distinct study days (a missing value counts as one entry); should be 71
data['wave_study_day'].nunique(dropna=False)

In [None]:
#create study date bins
# pd.cut with an integer splits the observed range of wave_study_day into 5 equal-width intervals
data['wave_study_date_bin'] = pd.cut(data['wave_study_day'], 5)
# rows per bin
data['wave_study_date_bin'].value_counts()

In [None]:
# preview the first rows of the full survey frame
data.head()

## Features and clustering on Engage surveys

### notes for CR features for Engage surveys

Context question
- Semantic Antonyms
    - if context1 = home (0), then context2 ≠ work and work related (0)
    - if context1 = work (1), then context2 ≠ leisure sports (2), household activities (5), org/civic (9)
- Semantic Synonyms
    - if context1 = work (1), then context2 most likely work and work related (0)
    - If context1 = vehicle (5), then context2 most likely travel or commute (11)
- Internal consistency
    - if context1 = 5 (other) then should have a write in
    - if context2 = 12 (other) then should have a write in

Longstring
- All questions use same scale (1=not at all, 7=very much), but there are 5 different constructs assessed

Semantic consistency
- Internal consistency (within construct) should be greater than consistency across constructs

Semantic synonyms 
- not applicable 

Semantic antonyms
- Hindrance stressors should be negatively correlated with support 


In [None]:
#split off completed engage and related columns
# .copy() makes engage_only an independent frame, so the feature columns added
# in the following cells are written to it explicitly rather than to a
# selection taken from `data`.
engage_completed = (data['survey_type'] == 'engage_psycap') & (data['completed'] == 1.0)
engage_only = data[engage_completed].copy()

print(engage_only.shape)
# number of distinct participants with at least one completed engage survey
engage_only['ParticipantID'].unique().shape

In [None]:
#context semantic consistency features (1 = flagged, 0 = not flagged)

location = engage_only['location_num']
activity = engage_only['activity_num']

# if at home (location 0) should not be working (activity 0)
home_but_working = (location == 0) & (activity == 0)
# if at work (location 1) should not be playing sports (2), doing household
# activities (5) or civic duties (9)
work_but_off_duty = (location == 1) & activity.isin([2, 5, 9])

context_check = (home_but_working | work_but_off_duty).astype('int64').tolist()

# if put other (location 5 / activity 11) then should have write in
write_in_location = (location == 5).astype('int64').tolist()
write_in_activity = (activity == 11).astype('int64').tolist()

engage_only['context_check'] = context_check
engage_only['write_in_location'] = write_in_location
engage_only['write_in_activity'] = write_in_activity

In [None]:
#long string analysis over engage_3:engage_29: for every survey, find the longest
#run of identical consecutive answers (its length) and the answer that formed it

max_strings = []
max_answers = []

for _, responses in engage_only.iterrows():
    longest_len = 0
    longest_answer = None
    for answer, run in groupby(responses['engage_3':'engage_29']):
        run_len = len(list(run))
        # strict '>' keeps the earliest run when several share the maximum length
        if run_len > longest_len:
            longest_len = run_len
            longest_answer = answer
    max_strings.append(longest_len)
    max_answers.append(longest_answer)

engage_only['longest_string_count'] = max_strings
engage_only['longest_string_answer'] = max_answers

In [None]:
# Skew of each survey's engage_3..engage_29 answers with one extra value of 4.0
# appended before the skew is computed.
# NOTE(review): 4.0 is presumably the scale midpoint (answers run 1-7), added so
# that constant answer strings still have a defined skew -- confirm.
skew_seeded = [
    sp.stats.skew(np.append(responses.loc['engage_3':'engage_29'].values, 4.0))
    for _, responses in engage_only.iterrows()
]

engage_only['skew_seeded'] = skew_seeded

In [None]:
#create features that are the per-survey response skew and kurtosis
engage_items = engage_only.loc[:, 'engage_3':'engage_29']
engage_only['skew'] = engage_items.skew(axis=1)
engage_only['kurtosis'] = engage_items.kurt(axis=1)

In [None]:
# drop the surveys for which the seeded skew could not be computed
engage_only = engage_only.dropna(subset=['skew_seeded'])
engage_only.shape

In [None]:
# careless-responding feature table for the engage surveys, indexed by MitreID
engage_feature_cols = ['context_check', 'write_in_location', 'write_in_activity',
                       'longest_string_count', 'skew_seeded', 'skew', 'kurtosis',
                       'time_to_complete']
engage_only_features = engage_only.set_index('MitreID')[engage_feature_cols]
engage_only_features.head()

In [None]:
# pairwise correlations between the engage careless-responding features
engage_only_features.corr()

In [None]:
# scatter matrix of the engage features with a regression fit in each panel
sns.pairplot(engage_only_features, kind='reg')

### notes for CR features for Psych Flex

Should have answered every question

Longstring
- Legitimate longstrings of  ≥ 8 are unlikely for response “5”
    - make column with longest string
    - make column with number that longest string consisted of

Semantic consistency
- Legitimate scores of pf_mgt=5 are almost impossible

Semantic antonyms
- Not applicable

Semantic synonyms 
- not applicable 


In [None]:
#split off completed PF and related columns
# .copy() makes psych_flex_only an independent frame, so the feature and cluster
# columns added in later cells are written to it explicitly rather than to a
# selection taken from `data`.
pf_completed = (data['survey_type'] == 'psych_flex') & (data['completed'] == 1.0)
psych_flex_only = data[pf_completed].copy()

print(psych_flex_only.shape)
# number of distinct participants with at least one completed PF survey
psych_flex_only['ParticipantID'].unique().shape

## Survey level features and clustering

In [None]:
#long string analysis over pf_03:pf_15: per survey, the length of the longest run
#of identical consecutive answers and the answer that the run consisted of

max_strings = []
max_answers = []

for _, survey in psych_flex_only.iterrows():
    best_answer, best_count = None, 0
    for answer, same_answers in groupby(survey['pf_03':'pf_15']):
        count = sum(1 for _ in same_answers)
        # only a strictly longer run replaces the current best, so ties keep the first run
        if count > best_count:
            best_answer, best_count = answer, count
    max_strings.append(best_count)
    max_answers.append(best_answer)

psych_flex_only['longest_string_count'] = max_strings
psych_flex_only['longest_string_answer'] = max_answers

In [None]:
#create features that are the per-survey response skew and kurtosis
pf_items = psych_flex_only.loc[:, 'pf_03':'pf_15']
psych_flex_only['skew'] = pf_items.skew(axis=1)
psych_flex_only['kurtosis'] = pf_items.kurt(axis=1)

In [None]:
#create feature that is count of exp questions that are 1 (eg answered) - this didn't work as a feature
# row-wise sum over the columns exp_0 .. exp_13 (label slice, both ends included)
psych_flex_only['exp_count'] = psych_flex_only.loc[:, 'exp_0':'exp_13'].sum(axis=1)
# row-wise mean over every column positioned from exp_neg through exp_neut
# NOTE(review): this depends on the frame's column order -- presumably exp_neg,
# exp_pos and exp_neut are adjacent; confirm
psych_flex_only['exp_mean'] = psych_flex_only.loc[:, 'exp_neg':'exp_neut'].mean(axis=1)

In [None]:
# drop the surveys for which no kurtosis could be computed
psych_flex_only = psych_flex_only.dropna(subset=['kurtosis'])
psych_flex_only.shape

In [None]:
# Survey-level feature tables, all indexed by MitreID: the full set plus two
# reduced sets that carry either skew or kurtosis as the distribution feature.
pf_by_id = psych_flex_only.set_index('MitreID')

PF_survey_features = pf_by_id[['longest_string_count', 'exp_count', 'exp_mean', 'skew', 'kurtosis', 'time_to_complete']]
PF_survey_features_skew = pf_by_id[['longest_string_count', 'skew', 'time_to_complete']]
PF_survey_features_kurtosis = pf_by_id[['longest_string_count', 'kurtosis', 'time_to_complete']]
PF_survey_features.head()

In [None]:
# pairwise correlations between the PF survey-level features
PF_survey_features.corr()

In [None]:
# scatter matrix of the PF survey-level features with a regression fit in each panel
sns.pairplot(PF_survey_features, kind='reg')

In [None]:
# center and scale the data
scaler = StandardScaler()

# only the kurtosis variant (longest_string_count, kurtosis, time_to_complete) is clustered
# NOTE(review): a later cell adds a string label column to PF_survey_features_kurtosis,
# so re-running this cell after that one will fail on the non-numeric column.
PF_survey_features_scaled = scaler.fit_transform(PF_survey_features_kurtosis)

In [None]:
# silhouette coefficient of a k-means fit for every candidate number of clusters
k_range = range(2,10)
scores = []
for n_clusters in k_range:
    km_ss = KMeans(n_clusters=n_clusters, random_state=1).fit(PF_survey_features_scaled)
    scores.append(silhouette_score(PF_survey_features_scaled, km_ss.labels_))

# silhouette coefficient as a function of the number of clusters
plt.plot(k_range, scores)
plt.ylabel('Silhouette Coefficient')
plt.xlabel('Number of clusters')
plt.title('PF kmeans at survey level')

In [None]:
# Final survey-level k-means model with 5 clusters.
# NOTE(review): k=5 is presumably read off the silhouette plot; the seed here (1234)
# differs from the seed used in the silhouette sweep (1) -- confirm both are intended.
PF_km_survey = KMeans(n_clusters=5,random_state=1234)
PF_km_survey.fit(PF_survey_features_scaled)
# labels_ follow the row order of the scaled matrix, i.e. of PF_survey_features_kurtosis
PF_survey_features_kurtosis['kmeans_scaled_survey'] = ["cluster_" + str(label) for label in PF_km_survey.labels_ ]
PF_survey_features_kurtosis.head()

In [None]:
# number of surveys per survey-level cluster
# The labels live on PF_survey_features_kurtosis (the frame that was clustered);
# PF_survey_features has no 'kmeans_scaled_survey' column.
PF_survey_features_kurtosis['kmeans_scaled_survey'].value_counts()

In [None]:
# scatter matrix of the clustered features, coloured by survey-level cluster
sns.pairplot(PF_survey_features_kurtosis, hue = 'kmeans_scaled_survey')

In [None]:
#add cluster column to PF survey df
# The survey-level labels are stored on PF_survey_features_kurtosis (the frame that
# was scaled and clustered), so they are read from there; PF_survey_features never
# receives a 'kmeans_scaled_survey' column.
participants = psych_flex_only['MitreID'].unique()

for part in participants:
    # the feature frame is indexed by MitreID and keeps psych_flex_only's row order,
    # so each participant's labels line up with that participant's survey rows
    part_labels = PF_survey_features_kurtosis.loc[PF_survey_features_kurtosis.index == part, 'kmeans_scaled_survey'].values
    psych_flex_only.loc[psych_flex_only['MitreID'] == part, 'kmeans_cluster_survey'] = part_labels

psych_flex_only.head()

In [None]:
#add cluster ratio to PF survey df: for every participant, the share of that
#participant's surveys that fall in survey-level cluster_0 and in cluster_2
# NOTE(review): cluster_0 / cluster_2 are presumably the clusters of interest for
# careless responding; k-means label numbers are arbitrary -- confirm after refitting.
part_ids = psych_flex_only['MitreID']

for cluster_label, ratio_col in (('cluster_0', 'kmeans_cluster_survey_ratio_c0'),
                                 ('cluster_2', 'kmeans_cluster_survey_ratio_c2')):
    # 1.0 where the survey is in the cluster, 0.0 otherwise; the per-participant
    # mean of that indicator is (surveys in cluster) / (all surveys)
    in_cluster = (psych_flex_only['kmeans_cluster_survey'] == cluster_label).astype(float)
    psych_flex_only[ratio_col] = in_cluster.groupby(part_ids).transform('mean')

psych_flex_only.head()

## Participant level features and clustering

In [None]:
#create participant level CR features df
# Every feature is taken from the same groupby, so each value stays attached to
# its own MitreID. (Pairing MitreID.unique() -- order of appearance, NaN included --
# with groupby(...).values -- sorted keys, NaN dropped -- can attach features to
# the wrong participant.)
pf_by_participant = psych_flex_only.groupby('MitreID')
print('participants with PF surveys:', pf_by_participant.ngroups)

PF_part_features = pd.DataFrame({
    #long string count features
    'ls_count_ave': pf_by_participant['longest_string_count'].mean(),
    'ls_count_kurt': pf_by_participant['longest_string_count'].apply(pd.Series.kurt),
    #time to complete survey features
    'ttc_ave': pf_by_participant['time_to_complete'].mean(),
    'ttc_kurtosis': pf_by_participant['time_to_complete'].apply(pd.Series.kurt),
})
PF_part_features.index.name = 'MitreID'

# Other aggregates that were tried and left disabled: variance/skew of
# longest_string_count, mean/min/max/skew of longest_string_answer, variance/skew
# of time_to_complete, and survey_count ratio (count / 50) and max.

# participants without a defined long-string kurtosis are dropped
PF_part_features = PF_part_features.dropna(subset=['ls_count_kurt'])
print(PF_part_features.shape)
PF_part_features.head()

In [None]:
# scatter matrix of the participant-level features
sns.pairplot(PF_part_features)

In [None]:
# center and scale the data
scaler = StandardScaler()

# NOTE(review): a later cell adds a string label column to PF_part_features, so
# re-running this cell after that one will fail on the non-numeric column.
PF_part_features_scaled = scaler.fit_transform(PF_part_features)

In [None]:
# silhouette coefficient of a k-means fit for every candidate number of clusters
k_range = range(2,10)
scores = []
for n_clusters in k_range:
    km_ss = KMeans(n_clusters=n_clusters, random_state=1).fit(PF_part_features_scaled)
    scores.append(silhouette_score(PF_part_features_scaled, km_ss.labels_))

# silhouette coefficient as a function of the number of clusters
plt.plot(k_range, scores)
plt.title('PF kmeans at participant level')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')

In [None]:
# Final participant-level k-means model with 4 clusters.
# NOTE(review): k=4 is presumably read off the silhouette plot; the seed here (1234)
# differs from the seed used in the silhouette sweep (1) -- confirm both are intended.
PF_km_part = KMeans(n_clusters=4,random_state=1234)
PF_km_part.fit(PF_part_features_scaled)
# labels_ follow the row order of the scaled matrix, i.e. of PF_part_features
PF_part_features['kmeans_scaled_part'] = ["cluster_" + str(label) for label in PF_km_part.labels_ ]
PF_part_features.head()

In [None]:
# number of participants per participant-level cluster
PF_part_features['kmeans_scaled_part'].value_counts()

In [None]:
# scatter matrix of the participant-level features, coloured by participant cluster
sns.pairplot(PF_part_features, hue='kmeans_scaled_part')

In [None]:
#fill out df at participant level: one row per clustered participant holding the
#participant-level columns taken from that participant's first psych_flex row,
#followed by the participant's CR features and cluster label

participants = PF_part_features.index

# columns copied from the participant's first psych_flex row
demog_cols = [
    'ParticipantID', 'ID', 'PrimaryUnit', 'SmartPhone',
    'Sex', 'Shift', 'Wave', 'GenInst', 'gender', 'age', 'bornUS',
    'country', 'lang', 'englyrs', 'educ', 'jobstat', 'occup',
    'occup_TEXT', 'supervise', 'quantsup', 'size', 'duration',
    'income', 'record_id', 'race', 'ethnic', 'relationship',
    'pregnant', 'children', 'housing', 'household___1',
    'household___2', 'household___3', 'household___4', 'household___5',
    'household___6', 'household___7', 'currentposition',
    'position_other', 'certifications', 'nurseyears', 'shift', 'hours',
    'overtime', 'commute_type', 'commute_time', 'extrajob',
    'extrahours', 'student', 'demographics_complete',
    'time_to_complete_demogs', 'demographics_timestamp',
    'rand_36_item_sf_health_survey_instrument_version_1_timestamp',
    'satisfaction_with_life_scale_swls_timestamp',
    'perceived_stress_scale_pss_timestamp', 'mpfi24_timestamp',
    'shipley.vocab', 'shipley.abs', 'irb', 'itp', 'ocb',
    'inter.deviance', 'org.deviance', 'extraversion', 'agreeableness',
    'conscientiousness', 'neuroticism', 'openness', 'pos.affect',
    'neg.affect', 'stai.trait', 'audit', 'gats.status',
    'gats.quantity', 'gats.quantity.sub', 'ipaq', 'psqi', 'mpfi24_01',
    'mpfi24_02', 'mpfi24_03', 'mpfi24_04', 'mpfi24_05', 'mpfi24_06',
    'mpfi24_07', 'mpfi24_08', 'mpfi24_09', 'mpfi24_10', 'mpfi24_11',
    'mpfi24_12', 'mpfi24_13', 'mpfi24_14', 'mpfi24_15', 'mpfi24_16',
    'mpfi24_17', 'mpfi24_18', 'mpfi24_19', 'mpfi24_20', 'mpfi24_21',
    'mpfi24_22', 'mpfi24_23', 'mpfi24_24', 'General_Health',
    'Physical_Functioning', 'Limits_Physical', 'Emotional_Wellbeing',
    'Limits_Emotional', 'Social_Functioning', 'Pain', 'energy',
    'fatigue', 'LifeSatisfaction', 'Stress', 'WAAQ', 'Flexibility',
    'Inflexibility', 'Acceptance', 'Awareness', 'Self_as_Context',
    'Defusion', 'Values', 'Action', 'Avoidance', 'LackofAwareness',
    'Self_as_Content', 'Fusion', 'LackofValues', 'Inaction',
    'Engagement', 'Engage_Vigor', 'Engage_Dedication',
    'Engage_Absorbtion', 'PsyCap', 'Psycap_Hope', 'Psycap_Efficacy',
    'Psycap_Reslilience', 'Psycap_Optimism', 'challengestressors',
    'Hindrancestressors', 'poststudy_survey_timestamp_post',
    'General_Health_post', 'Physical_Functioning_post',
    'Limits_Physical_post', 'Emotional_Wellbeing_post',
    'Limits_Emotional_post', 'Social_Functioning_post', 'Pain_post',
    'energy_post', 'fatigue_post', 'LifeSatisfaction_post',
    'Stress_post', 'WAAQ_post', 'Flexibility_post',
    'Inflexibility_post', 'Acceptance_post', 'Awareness_post',
    'Self_as_Context_post', 'Defusion_post', 'Values_post',
    'Action_post', 'Avoidance_post', 'LackofAwareness_post',
    'Self_as_Content_post', 'Fusion_post', 'LackofValues_post',
    'Inaction_post', 'Engagement_post', 'Engage_Vigor_post',
    'Engage_Dedication_post', 'Engage_Absorbtion_post', 'PsyCap_post',
    'Psycap_Hope_post', 'Psycap_Efficacy_post',
    'Psycap_Reslilience_post', 'Psycap_Optimism_post',
    'challengestressors_post', 'Hindrancestressors_post', 'PsyFlexTot',
    'PsyFlexSDTot', 'Context_Neg_Tot', 'Context_Pos_Tot',
    'Context_All_Tot',
]

participant_rows = []

for part in participants:
    data_int = psych_flex_only.loc[psych_flex_only['MitreID'] == part, demog_cols]
    # iloc[[0]] keeps the first survey row as a one-row frame (column dtypes intact);
    # reset_index() stores that row's original label in an 'index' column
    df_part_long = data_int.iloc[[0]].reset_index()
    part_features = PF_part_features[PF_part_features.index == part].reset_index()
    participant_rows.append(pd.concat([df_part_long, part_features], axis=1))

# DataFrame.append is deprecated (pandas 1.4) and removed (pandas 2.0), and growing a
# frame inside a loop is quadratic, so the one-row frames are concatenated once.
# ignore_index=True gives every participant a unique row label instead of label 0
# on every row.
if participant_rows:
    PF_part_clustering_demog = pd.concat(participant_rows, ignore_index=True)
else:
    PF_part_clustering_demog = pd.DataFrame()

#confirm the two data tables are now the same length
print('PF_part_features and PF_part_clustering_demog are the same length:', PF_part_features.shape[0] == PF_part_clustering_demog.shape[0])
print(PF_part_features.shape[0])
print(PF_part_clustering_demog.shape[0])
# every row's 'ID' (from the survey data) should equal its 'MitreID' (from the feature table)
print('does the math make sense?', PF_part_clustering_demog.shape[0] == (PF_part_clustering_demog['ID'].values == PF_part_clustering_demog['MitreID'].values).sum())
PF_part_clustering_demog.head()

In [None]:
#new df of all survey data with MitreIDs that match clustering MitreIDs
clustered_ids = set(PF_part_clustering_demog['ID'].unique())
# MitreIDs present in the survey data but absent from the participant clustering table
ID_diff = list(set(data['MitreID'].unique()) - clustered_ids)

not_clustered_data = data['MitreID'].isin(ID_diff)
data_match = data.loc[~not_clustered_data]

not_clustered_pf = psych_flex_only['MitreID'].isin(ID_diff)
psych_flex_only_match = psych_flex_only.loc[~not_clustered_pf]
psych_flex_only_match.head()

In [None]:
#split off completed engage 
# NOTE: this rebinds engage_only -- the frame built earlier (with the careless-responding
# feature columns) is replaced by a fresh selection from data_match without those columns.
engage_only = data_match[(data_match['survey_type'] == 'engage_psycap') & (data_match['completed'] == 1.0)]
# number of MitreID groups that have completed engage surveys
engage_only.groupby('MitreID')['engage_mgt'].mean().shape

In [None]:
# Per-participant aggregates are looked up by MitreID instead of being pasted in
# positionally: groupby returns its result in sorted-key order and only for
# participants that have rows, so `.values` assignment is misaligned whenever the
# table's row order differs and fails when the number of groups differs (for
# example a participant with PF surveys but no completed engage survey).
part_ids = PF_part_clustering_demog['MitreID']
pf_by_part = psych_flex_only_match.groupby('MitreID')
engage_by_part = engage_only.groupby('MitreID')


def _aligned(per_participant):
    """Return the MitreID-indexed values ordered like PF_part_clustering_demog's rows (NaN when absent)."""
    return part_ids.map(per_participant).values


#surveys completed features
# NOTE(review): 50 is presumably the number of PF surveys a participant could complete -- confirm
PF_part_clustering_demog['survey_complete_ratio'] = _aligned(pf_by_part['survey_count'].count() / 50)
PF_part_clustering_demog['survey_complete_max'] = _aligned(pf_by_part['survey_count'].max())
# the ratio columns are constant within a participant, so max() just picks that value
PF_part_clustering_demog['kmeans_cluster_survey_ratio_c0'] = _aligned(pf_by_part['kmeans_cluster_survey_ratio_c0'].max())
PF_part_clustering_demog['kmeans_cluster_survey_ratio_c2'] = _aligned(pf_by_part['kmeans_cluster_survey_ratio_c2'].max())

# participant means of the PF survey scores
for pf_col in ['pf_mgt', 'exp_neg', 'exp_pos', 'exp_neut']:
    PF_part_clustering_demog[pf_col + '_ave'] = _aligned(pf_by_part[pf_col].mean())

# participant means of the engage survey scores
for engage_col in ['engage_mgt', 'psycap_mgt', 'support_mgt', 'challenge_mgt', 'hindrance_mgt']:
    PF_part_clustering_demog[engage_col + '_ave'] = _aligned(engage_by_part[engage_col].mean())

In [None]:
# size and first rows of the participant-level table after adding the aggregates
print(PF_part_clustering_demog.shape)
PF_part_clustering_demog.head()

In [None]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_incentives = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/Weekly incentives per participant.xlsx'

#read in incentives df
incentives_df = pd.read_excel(path_incentives)
incentives_df = pd.DataFrame(data = incentives_df)
# preview, ordered by participant ID
incentives_df.sort_values('MITRE ID').head()

In [None]:
# column names of the incentives sheet (a later cell uses 'Total Incentive ' with a trailing space)
incentives_df.columns.values

In [None]:
# Attach each participant's total incentive by MitreID.
# A keyed lookup replaces the per-participant loop whose bare `except: pass` hid
# every failure. The outcome for the cases that loop skipped is unchanged:
# participants with no incentive row, or with more than one (ambiguous) row,
# get NaN.
incentive_by_id = (
    incentives_df
    .drop_duplicates(subset='MITRE ID', keep=False)   # drop IDs that occur more than once
    .set_index('MITRE ID')['Total Incentive ']        # note the trailing space in the column name
)

PF_part_clustering_demog['total_incentive'] = PF_part_clustering_demog['MitreID'].map(incentive_by_id).values

# make the unmatched participants visible instead of silently skipping them
print('participants without a total incentive:', PF_part_clustering_demog['total_incentive'].isnull().sum())

In [None]:
#save to csv
# written relative to the current working directory; the row index is included as the first column
PF_part_clustering_demog.to_csv('PF_clustering_participant.csv')

In [None]:
#add participant-level cluster column to PF survey df
# PF_part_features is indexed by MitreID, so its label column works as a lookup
# table; surveys of participants that were not clustered get NaN
part_cluster_by_id = PF_part_features['kmeans_scaled_part']
psych_flex_only['kmeans_cluster_part'] = psych_flex_only['MitreID'].map(part_cluster_by_id)

psych_flex_only.head()

## viz for JV conference

In [None]:
#break off only good responders
# NOTE(review): cluster_1 is presumably the careless-responder cluster; k-means label
# numbers are arbitrary, so confirm this after any refit.
# .copy() makes this an independent frame, so the column added in the next cell is
# written to it explicitly rather than to a selection of psych_flex_only.
psych_flex_noCR = psych_flex_only[psych_flex_only['kmeans_cluster_part'] != 'cluster_1'].copy()

In [None]:
#add change from mean for pf_mgt to PF survey df
# groupby(...).transform('mean') returns each participant's mean pf_mgt repeated on
# that participant's rows, replacing the loop that rescanned the frame once per
# participant.
participant_mean = psych_flex_noCR.groupby('MitreID')['pf_mgt'].transform('mean')
psych_flex_noCR['pf_mgt_change'] = psych_flex_noCR['pf_mgt'] - participant_mean

psych_flex_noCR.head()

In [None]:
#plt.style.use('seaborn-white')
plt.rcParams['figure.figsize'] = 10,5
# pf_mgt against survey_count with a 68% confidence band (ci=68)
g = sns.lineplot(x='survey_count', y="pf_mgt", data=psych_flex_noCR, lw=1, ci=68, palette="rocket", legend=False)

In [None]:
# NOTE(review): the 'seaborn-white' style name and the `ci` argument are not accepted by
# every matplotlib / seaborn release -- confirm against the installed versions.
plt.style.use('seaborn-white')
plt.rcParams['figure.figsize'] = 10,5
# pf_mgt against wave_study_date with a 68% confidence band (ci=68)
g = sns.lineplot(y="pf_mgt", data=psych_flex_noCR, x='wave_study_date', ci=68, palette="rocket")

In [None]:
plt.style.use('seaborn-white')
plt.rcParams['figure.figsize'] = 10,5
# pf_mgt per activity code (no ci given, so seaborn's default error bars are drawn)
g = sns.barplot(y="pf_mgt", data=psych_flex_noCR, x='activity_num', palette="rocket")

In [None]:
plt.style.use('seaborn-white')
plt.rcParams['figure.figsize'] = 10,5
# deviation of pf_mgt from the participant's own mean, per activity code, 68% CI error bars
g = sns.barplot(y="pf_mgt_change", data=psych_flex_noCR, x='activity_num', ci=68, palette="rocket")
#g.set(ylim=(0, 5))

In [None]:
# pf_mgt_change against the value of each individual experience item (exp_0 .. exp_13)
exp_item_cols = ['exp_' + str(item) for item in range(14)]
psych_flex_noCR_exp_cols = psych_flex_noCR[['pf_mgt_change'] + exp_item_cols]
# long format: one row per (survey, experience item) with the item's value in 'vals'
psych_flex_noCR_exp_melt = psych_flex_noCR_exp_cols.melt('pf_mgt_change', var_name='experience',  value_name='vals')
# sns.factorplot is the deprecated name of catplot (renamed in seaborn 0.9, the release
# that also introduced the lineplot used in this notebook). catplot defaults to
# kind='strip', so kind='point' is passed to keep the point plot factorplot drew.
g = sns.catplot(x="vals", y="pf_mgt_change", hue='experience', data=psych_flex_noCR_exp_melt, kind='point')

In [None]:
# pf_mgt_change against the value of each experience group score (positive / negative / neutral)
psych_flex_noCR_exp_cols = psych_flex_noCR[['pf_mgt_change', 'exp_pos', 'exp_neg', 'exp_neut']]
# long format: one row per (survey, experience group) with the group's value in 'vals'
psych_flex_noCR_exp_melt = psych_flex_noCR_exp_cols.melt('pf_mgt_change', var_name='experience_group',  value_name='vals')
# sns.factorplot is the deprecated name of catplot (renamed in seaborn 0.9, the release
# that also introduced the lineplot used in this notebook). catplot defaults to
# kind='strip', so kind='point' is passed to keep the point plot factorplot drew.
g = sns.catplot(x="vals", y="pf_mgt_change", hue='experience_group', data=psych_flex_noCR_exp_melt, kind='point')

In [None]:
# one bar chart of pf_mgt_change per experience group, laid out on a 4x4 grid
exp_col = ['exp_pos', 'exp_neg', 'exp_neut']

plt.figure(figsize=(30,20))
for i, name in enumerate(exp_col, start=1):
    plt.subplot(4, 4, i)
    sns.barplot(y="pf_mgt_change", data=psych_flex_noCR, x=name, palette="rocket")

In [None]:
# one bar chart of pf_mgt per experience item (exp_0 .. exp_13), laid out on a 4x4 grid
exp_col = ['exp_' + str(item) for item in range(14)]

plt.figure(figsize=(20,15))
for i, name in enumerate(exp_col, start=1):
    plt.subplot(4, 4, i)
    sns.barplot(y="pf_mgt", data=psych_flex_noCR, x=name, palette="rocket")

## participant level clustering viz

In [None]:
# one bar chart per participant-level feature, split by participant cluster (68% CI)
# ('ttc_skew' is left out: that plot was disabled)
cluster_features = ["ls_count_ave", "ls_count_kurt", "ttc_ave",
                    "kmeans_cluster_survey_ratio_c0", "kmeans_cluster_survey_ratio_c2"]

for feature in cluster_features:
    g = sns.barplot(y=feature, data=PF_part_clustering_demog, x='kmeans_scaled_part', ci=68)
    plt.show()

In [None]:
# number of PF surveys per participant-level cluster, split by study-date bin
g = sns.countplot(x="kmeans_cluster_part", data=psych_flex_only, hue='wave_study_date_bin')


In [None]:
# total incentive per participant-level cluster (68% CI error bars)
g = sns.barplot(y="total_incentive", data=PF_part_clustering_demog, x='kmeans_scaled_part', ci=68)

In [None]:
# survey-completion features per participant-level cluster (68% CI error bars)
for completion_feature in ("survey_complete_ratio", "survey_complete_max"):
    g = sns.barplot(y=completion_feature, data=PF_part_clustering_demog, x='kmeans_scaled_part', ci=68)
    plt.show()

In [None]:
# list every column of the participant-level table
PF_part_clustering_demog.columns.values 

In [None]:
# Columns of PF_part_clustering_demog that the next cell plots per participant cluster.
# NOTE(review): the final entries 'exp_neg', 'exp_pos' and 'exp_neut' do not appear to be
# added to PF_part_clustering_demog in this notebook (only their '_ave' versions are),
# so they are presumably skipped by the plotting loop -- confirm.
variables = ['shipley.vocab', 'shipley.abs', 'irb',
       'itp', 'ocb', 'inter.deviance', 'org.deviance', 'extraversion',
       'agreeableness', 'conscientiousness', 'neuroticism', 'openness',
       'pos.affect', 'neg.affect', 'stai.trait', 'audit', 'gats.status',
       'gats.quantity', 'gats.quantity.sub', 'ipaq', 'psqi', 
       'gender', 'bornUS', 'englyrs', 'educ',
       'jobstat', 'supervise', 'quantsup', 'size',
       'duration', 'income', 'relationship',
       'pregnant', 'children', 'housing','nurseyears', 'shift', 'hours',
       'overtime', 'commute_time', 'extrajob',
       'extrahours', 'student', 
       'mpfi24_01', 'mpfi24_02', 'mpfi24_03', 'mpfi24_04', 'mpfi24_05',
       'mpfi24_06', 'mpfi24_07', 'mpfi24_08', 'mpfi24_09', 'mpfi24_10',
       'mpfi24_11', 'mpfi24_12', 'mpfi24_13', 'mpfi24_14', 'mpfi24_15',
       'mpfi24_16', 'mpfi24_17', 'mpfi24_18', 'mpfi24_19', 'mpfi24_20',
       'mpfi24_21', 'mpfi24_22', 'mpfi24_23', 'mpfi24_24',
       'General_Health', 'Physical_Functioning', 'Limits_Physical',
       'Emotional_Wellbeing', 'Limits_Emotional', 'Social_Functioning',
       'Pain', 'energy', 'fatigue', 'LifeSatisfaction', 'Stress', 'WAAQ',
       'Flexibility', 'Inflexibility', 'Acceptance', 'Awareness',
       'Self_as_Context', 'Defusion', 'Values', 'Action', 'Avoidance',
       'LackofAwareness', 'Self_as_Content', 'Fusion', 'LackofValues',
       'Inaction', 'Engagement', 'Engage_Vigor', 'Engage_Dedication',
       'Engage_Absorbtion', 'PsyCap', 'Psycap_Hope', 'Psycap_Efficacy',
       'Psycap_Reslilience', 'Psycap_Optimism', 'challengestressors',
       'Hindrancestressors', 
       'General_Health_post', 'Physical_Functioning_post',
       'Limits_Physical_post', 'Emotional_Wellbeing_post',
       'Limits_Emotional_post', 'Social_Functioning_post', 'Pain_post',
       'energy_post', 'fatigue_post', 'LifeSatisfaction_post',
       'Stress_post', 'WAAQ_post', 'Flexibility_post',
       'Inflexibility_post', 'Acceptance_post', 'Awareness_post',
       'Self_as_Context_post', 'Defusion_post', 'Values_post',
       'Action_post', 'Avoidance_post', 'LackofAwareness_post',
       'Self_as_Content_post', 'Fusion_post', 'LackofValues_post',
       'Inaction_post', 'Engagement_post', 'Engage_Vigor_post',
       'Engage_Dedication_post', 'Engage_Absorbtion_post', 'PsyCap_post',
       'Psycap_Hope_post', 'Psycap_Efficacy_post',
       'Psycap_Reslilience_post', 'Psycap_Optimism_post',
       'challengestressors_post', 'Hindrancestressors_post', 'PsyFlexTot',
       'PsyFlexSDTot', 'Context_Neg_Tot', 'Context_Pos_Tot',
       'Context_All_Tot', 
       'survey_complete_ratio', 'survey_complete_max',
       'kmeans_cluster_survey_ratio_c0', 'kmeans_cluster_survey_ratio_c2', 'pf_mgt_ave', 'exp_neg_ave',
       'exp_pos_ave', 'exp_neut_ave', 'engage_mgt_ave', 'psycap_mgt_ave',
       'support_mgt_ave', 'challenge_mgt_ave', 'hindrance_mgt_ave',
       'total_incentive', 'exp_neg', 'exp_pos', 'exp_neut']

In [None]:
# Bar chart of every listed variable per participant-level cluster (68% CI error bars).
# Plotting stays best-effort: a variable that is missing or cannot be converted to
# float is skipped, but the reason is reported instead of being silently swallowed.
# `except Exception` (not a bare except) lets a keyboard interrupt stop the loop.
skipped_variables = {}

for var in variables:
    print(var)
    try:
        # note: this casts the column to float in PF_part_clustering_demog itself
        PF_part_clustering_demog[var] = PF_part_clustering_demog[var].astype('float')
        g = sns.barplot(y=var, data=PF_part_clustering_demog, x='kmeans_scaled_part', ci=68)
        plt.show()
    except Exception as err:
        skipped_variables[var] = repr(err)
        print('  skipped:', repr(err))

print('variables skipped:', len(skipped_variables))