In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Silence every warning for the rest of the session. Note this also hides
# deprecation notices raised by the libraries imported above.
import warnings; warnings.simplefilter('ignore')
# Print small floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Location of the survey-composites CSV. Set the TILES_COMPOSITES_CSV
# environment variable to point the notebook at a copy of the file on another
# machine; when it is unset the original local path below is used unchanged.
path = os.environ.get(
    'TILES_COMPOSITES_CSV',
    'C:/Users/Schindler/Documents/Schindler_Lab/ML projects/TILES/Data_googledrive/Demog, PRE, PST survey composites.csv',
)

In [None]:
# Read the raw composites table from `path`. read_csv already returns a
# DataFrame, so no extra wrapping is needed.
data = pd.read_csv(path)
print('Composites shape:\n', data.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# on its own line; passing it to print() would emit a stray "None".
print('Composites data types:')
data.info()
print()

# Columns carried into `data_pre`. 'ID' is listed first because it becomes the
# index below. Names are kept exactly as they appear in the CSV header.
pre_columns = [
    'ID', 'shipley.vocab', 'shipley.abs', 'irb', 'itp',
    'ocb', 'inter.deviance', 'org.deviance', 'extraversion',
    'agreeableness', 'conscientiousness', 'neuroticism', 'openness',
    'pos.affect', 'neg.affect', 'stai.trait', 'audit', 'gats.status',
    'gats.quantity', 'ipaq', 'psqi', 'gender', 'age',
    'bornUS', 'country', 'lang', 'englyrs', 'educ', 'jobstat', 'occup',
    'occup_TEXT', 'supervise', 'quantsup', 'size', 'duration',
    'income', 'record_id', 'race', 'ethnic', 'relationship',
    'pregnant', 'children', 'housing', 'household___1',
    'household___2', 'household___3', 'household___4', 'household___5',
    'household___6', 'household___7', 'currentposition',
    'position_other', 'certifications', 'nurseyears', 'shift', 'hours',
    'overtime', 'commute_type', 'commute_time', 'extrajob',
    'extrahours', 'student', 'mpfi24_01', 'mpfi24_02', 'mpfi24_03', 'mpfi24_04', 'mpfi24_05',
    'mpfi24_06', 'mpfi24_07', 'mpfi24_08', 'mpfi24_09', 'mpfi24_10',
    'mpfi24_11', 'mpfi24_12', 'mpfi24_13', 'mpfi24_14', 'mpfi24_15',
    'mpfi24_16', 'mpfi24_17', 'mpfi24_18', 'mpfi24_19', 'mpfi24_20',
    'mpfi24_21', 'mpfi24_22', 'mpfi24_23', 'mpfi24_24',
    'General_Health', 'Physical_Functioning', 'Limits_Physical',
    'Emotional_Wellbeing', 'Limits_Emotional', 'Social_Functioning',
    'Pain', 'energy', 'fatigue', 'LifeSatisfaction', 'Stress', 'WAAQ',
    'Flexibility', 'Inflexibility', 'Acceptance', 'Awareness',
    'Self_as_Context', 'Defusion', 'Values', 'Action', 'Avoidance',
    'LackofAwareness', 'Self_as_Content', 'Fusion', 'LackofValues',
    'Inaction', 'Engagement', 'Engage_Vigor', 'Engage_Dedication',
    'Engage_Absorbtion', 'PsyCap', 'Psycap_Hope', 'Psycap_Efficacy',
    'Psycap_Reslilience', 'Psycap_Optimism', 'challengestressors',
    'Hindrancestressors',
]

# Select the columns and index the result by ID; `data` itself is left untouched.
data_pre = data[pre_columns].set_index('ID')
print(data_pre.shape)
data_pre.head()

In [None]:
# Cells that hold exactly a single space are recoded as NaN so that pandas
# counts them as missing.
data_pre = data_pre.replace(to_replace=' ', value=np.nan)

# Missing-value tally per column, largest first.
missing_per_column = data_pre.isna().sum().sort_values(ascending=False)
print('Composites missing value counts:\n', missing_per_column, '\n')

In [None]:
# Shape before any filtering.
print(data_pre.shape)

# Column filter: keep only columns that have at least 210 non-missing values.
data_clean = data_pre.dropna(axis='columns', thresh=210)
print(data_clean.shape)

# Row filter: discard every row that still contains a NaN.
data_clean = data_clean.dropna(axis='index')
print(data_clean.shape)

data_clean.head()

In [None]:
# Descriptive summary of the cleaned frame (displayed as the cell output).
data_clean.describe()

In [None]:
# Dtypes before conversion. info() prints its own report and returns None, so
# it is not wrapped in print() (which would add a stray "None" line).
data_clean.info()

# Map the three 'gats.status' categories to ordered integer codes, then cast
# every column to float. assign() builds a new frame instead of writing into
# the dropna() result in place.
gats_codes = {'never': 0, 'past': 1, 'current': 2}
gats_numeric = data_clean['gats.status'].replace(gats_codes)
data_clean = data_clean.assign(**{'gats.status': gats_numeric}).astype('float')

# Dtypes after conversion.
data_clean.info()

In [None]:
# Single-row grid of regression plots: 'supervise' on the y-axis against every
# column of the cleaned frame on the x-axis.
# `columns` stays a notebook-level name; the distribution-plot loop reuses it.
columns = data_clean.columns.values
sns.pairplot(
    data=data_clean,
    x_vars=columns,
    y_vars=['supervise'],
    kind='reg',
)

Visualization

In [None]:
# Annotated heatmap of the pairwise correlations between the cleaned columns,
# drawn on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
corr = data_clean.corr()
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# One two-bin distribution plot per entry of `columns`, each shown as its own
# figure before the next one is drawn.
for feature in columns:
    feature_values = data_clean[feature]
    sns.distplot(feature_values, bins=2)
    plt.show()

K-means clustering

In [None]:
# Columns of `data_pre` used as clustering features.
demo_columns = [
    'race', 'ethnic', 'relationship', 'pregnant',
    'children', 'housing', 'currentposition',
    'position_other', 'certifications', 'nurseyears', 'shift', 'hours',
    'overtime', 'commute_type', 'commute_time', 'extrajob',
    'extrahours', 'student',
]
data_demo = data_pre[demo_columns]
print(data_demo.shape)
print(data_demo.isna().sum())

# Keep columns with at least 175 non-missing values, then keep rows with at
# least 16 non-missing values among the surviving columns.
data_demo_clean = data_demo.dropna(axis=1, thresh=175)
print(data_demo_clean.shape)
data_demo_clean = data_demo_clean.dropna(axis=0, thresh=16)
print(data_demo_clean.shape)

# info() prints its own report and returns None; calling it bare avoids the
# stray "None" that print(info()) would add. Shown before and after the cast.
data_demo_clean.info()
data_demo_clean = data_demo_clean.astype('float')
data_demo_clean.info()

# True here means NaNs survived the threshold-based row filter.
# NOTE(review): if this prints True, confirm the scaling/clustering cells that
# consume this frame can accept NaN input.
print(data_demo_clean.isnull().values.any())
data_demo_clean.head()

In [None]:
# Standardize the clustering features: learn the per-column centering/scaling
# statistics first, then apply them to the same frame.
scaler = StandardScaler().fit(data_demo_clean)
data_scaled = scaler.transform(data_demo_clean)

In [None]:
# Sweep the number of clusters and record the silhouette score of each
# K-means solution fitted on the scaled features.
k_range = range(2, 50)
scores = []
for n_clusters in k_range:
    km_ss = KMeans(n_clusters=n_clusters, random_state=39)
    cluster_labels = km_ss.fit_predict(data_scaled)
    scores.append(silhouette_score(data_scaled, cluster_labels))

# Silhouette score as a function of the cluster count.
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')

In [None]:
# Fit the two-cluster solution on `data_scaled`, the standardized feature
# matrix built from `data_demo_clean` (the only scaled matrix this notebook
# defines).
km2 = KMeans(n_clusters=2, random_state=1234)
km2.fit(data_scaled)

# km2.labels_ has one entry per row of `data_demo_clean`, which can have fewer
# rows than `data` after the dropna() filters. Align the labels to `data` by ID
# rather than by position; rows that were not clustered get NaN.
cluster_by_id = pd.Series(
    ["cluster_" + str(label) for label in km2.labels_],
    index=data_demo_clean.index,
)
# assumes 'ID' values are unique in the CSV — TODO confirm
data['kmeans_2_scaled'] = data['ID'].map(cluster_by_id)

# Per-cluster means; numeric_only=True keeps this working when `data` also
# contains text columns. Rows with a NaN cluster label are excluded by groupby.
data.groupby('kmeans_2_scaled').mean(numeric_only=True)

In [None]:
# Count cluster labels within each 'Severity' group.
# NOTE(review): 'Severity' does not appear in any column list used earlier in
# this notebook — confirm the raw CSV contains it, otherwise this raises KeyError.
data.groupby('Severity')['kmeans_2_scaled'].value_counts()