In [None]:
#getting and working with data
import os
import re

import numpy as np
import pandas as pd

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Notebook-wide display settings: raise the pandas row/column display limits
# to 500 and the console width to 1000 characters.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Silence all warnings for the rest of the session and have numpy print
# floats in fixed-point rather than scientific notation.
import warnings
warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [None]:
# Location of the PsyFlex EMA CSV export. Set the PSYFLEX_EMA_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
path = os.environ.get(
    'PSYFLEX_EMA_CSV',
    'C:/Users/Schindler/Documents/Schindler_Lab/ML projects/TILES/Data/EMA PsyFlex.csv',
)

In [None]:
# Load the raw EMA export; read_csv already returns a DataFrame, so no
# further wrapping is needed.
data = pd.read_csv(path)
print('Original PsyFlex EMA shape:\n', data.shape, '\n')
#replace single-space entries with 0 (e.g. these are not missing values but incorrectly entered)
data = data.replace(to_replace=' ', value=0)
print('Original PsyFlex EMA survey type counts:\n', data['survey_type'].value_counts(), '\n')
print('Original PsyFlex EMA missing value couts:\n', data.isnull().sum(), '\n')
#determine data types
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line; nesting it inside print() emitted the report
# *before* the label and then printed a stray "None".
print('Original PsyFlex EMA data types:')
data.info()
print()
data.head()

In [None]:
#select survey type
data_psych_flex_orig = data[data['survey_type'] == 'psych_flex']
print('Original PsyFlex survey shape:\n', data_psych_flex_orig.shape, '\n')

# Identifier/timestamp columns that become the row index, and the summary
# score columns that are kept as values.
# NOTE(review): 'Conttext_All' is spelled as in the original cell; presumably
# it matches the CSV header — verify before renaming.
psych_flex_index_cols = ['Timestamp', 'survey_id', 'participant_id', 'ID', 'survey_type',
                         'survey_dt', 'completed_ts_utc']
psych_flex_score_cols = ['PsyFlex', 'Psy_Flex_SD', 'Conttext_All', 'Context_Neg', 'Context_Pos']

#drop individual questions and set index
# Chained (non-inplace) so a new frame is built instead of mutating a slice
# of `data`.
data_psych_flex_orig_short = (
    data_psych_flex_orig[psych_flex_index_cols + psych_flex_score_cols]
    .set_index(psych_flex_index_cols)
)

#find how many missing
print('Original PsyFlex survey data missing value counts:\n', data_psych_flex_orig_short.isnull().sum(), '\n')

#drop anything with a na
data_psych_flex_nonull = data_psych_flex_orig_short.dropna()
#confirm no more missing
print('Missing check:\n', data_psych_flex_nonull.isnull().sum(), '\n')

print('No null PsyFlex survey shape:\n', data_psych_flex_nonull.shape, '\n')

print("Fraction of data kept:\n", len(data_psych_flex_nonull) / len(data_psych_flex_orig_short), '\n')

#convert data types
# One astype call with a column->dtype mapping replaces five per-column
# assignments onto the dropna() result.
data_psych_flex_nonull = data_psych_flex_nonull.astype(
    {col: 'float' for col in psych_flex_score_cols}
)

# DataFrame.info() prints its own report and returns None, so it must not be
# passed to print() (that printed a stray "None" after the report).
print("Confirm data type float:")
data_psych_flex_nonull.info()
print()

data_psych_flex_nonull.head()

In [None]:
# Pairwise plot grid of the cleaned PsyFlex frame; kind='reg' requests
# regression-style panels from seaborn.
sns.pairplot(data_psych_flex_nonull, kind = 'reg')

In [None]:
# Annotated heatmap of the pairwise correlations between the cleaned PsyFlex
# columns, drawn on a large (20x20) figure.
fig, ax = plt.subplots(figsize=(20, 20))
corr = data_psych_flex_nonull.corr()
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Modelling inputs. Despite its name, `psy_flex_regressor` holds the value
# being predicted (the PsyFlex column); `psy_flex_features` is a one-column
# frame with the single predictor, Context_Pos.
psy_flex_regressor = data_psych_flex_nonull.loc[:, 'PsyFlex']
psy_flex_features = data_psych_flex_nonull.loc[:, ['Context_Pos']]
psy_flex_features.head()

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    psy_flex_features,
    psy_flex_regressor,
    test_size=0.3,
    random_state=1,
)

# Standardise the training predictors; the scaler is fitted on the training
# split only. X_test is not transformed in this cell.
scaler = StandardScaler().fit(X_train)
features_scaled = scaler.transform(X_train)

In [None]:
# Three-fold split of the scaled training rows, materialised once as a list
# of (train_indices, test_indices) pairs so the same folds can be handed to
# every cross_val_predict call.
k_fold = KFold(n_splits=3)
cv = [
    (train_idx, test_idx)
    for train_idx, test_idx in k_fold.split(features_scaled, y_train)
]

In [None]:
# Candidate models. The target (PsyFlex) is cast to float upstream and every
# model is scored with r^2 / MAE, so all estimators here are regressors.

# Baseline that always predicts the training mean — the regression
# counterpart of a "most frequent class" dummy (DummyRegressor takes no
# random_state).
dm_cv = DummyRegressor(strategy='mean')
lr_cv = LinearRegression()
rf_cv = RandomForestRegressor(max_depth=4, random_state=0)
# SVC is a classifier; SVR is the support-vector estimator for a continuous
# target. (probability=True has no SVR equivalent and is dropped.)
svm_cv = SVR()
kn_cv = KNeighborsRegressor()

In [None]:
# Out-of-fold predictions for the baseline on the shared CV splits. These
# were previously computed and never scored; reporting them gives an estimate
# that is not based on rows the model was fitted on.
y_pred_dm = cross_val_predict(dm_cv, features_scaled, y_train, cv=cv, method='predict')
print("CV r^2 for dummy data:", metrics.r2_score(y_train, y_pred_dm))
print("CV MAE for dummy data:", metrics.mean_absolute_error(y_train, y_pred_dm))

# In-sample fit on the full training split (scores below are computed on the
# same rows the model was fitted on).
dm_cv.fit(features_scaled, y_train)
y_pred_dummy = dm_cv.predict(features_scaled)
print("r^2 for dummy data:", metrics.r2_score(y_train, y_pred_dummy))
print("MAE for dummy data:",metrics.mean_absolute_error(y_train, y_pred_dummy))

In [None]:
# Out-of-fold predictions for linear regression on the shared CV splits.
# They get their own name so the in-sample predictions below no longer
# overwrite them before they are scored.
y_pred_lr_cv = cross_val_predict(lr_cv, features_scaled, y_train, cv=cv, method='predict')
print("CV r^2 for lr data:", metrics.r2_score(y_train, y_pred_lr_cv))
print("CV MAE for lr data:", metrics.mean_absolute_error(y_train, y_pred_lr_cv))

# In-sample fit on the full training split; y_pred_lr ends up holding the
# in-sample predictions, exactly as before.
lr_cv.fit(features_scaled, y_train)
y_pred_lr = lr_cv.predict(features_scaled)
print("r^2 for lr data:", metrics.r2_score(y_train, y_pred_lr))
print("MAE for lr data:",metrics.mean_absolute_error(y_train, y_pred_lr))

In [None]:
# Out-of-fold predictions for the random forest on the shared CV splits,
# scored here instead of being discarded.
y_pred_rf_cv = cross_val_predict(rf_cv, features_scaled, y_train, cv=cv, method='predict')
print("CV r^2 for rf data:", metrics.r2_score(y_train, y_pred_rf_cv))
print("CV MAE for rf data:", metrics.mean_absolute_error(y_train, y_pred_rf_cv))

# In-sample fit on the full training split. The predictions are stored in
# y_pred_rf; the previous version wrote them to y_pred_lr (copy-paste slip),
# silently replacing the linear-regression predictions.
rf_cv.fit(features_scaled, y_train)
y_pred_rf = rf_cv.predict(features_scaled)
print("r^2 for rf data:", metrics.r2_score(y_train, y_pred_rf))
print("MAE for rf data:",metrics.mean_absolute_error(y_train, y_pred_rf))

K-means clustering

In [None]:
# NOTE(review): features_clust_scaled is not defined in any cell visible
# here — confirm it is created before this cell on a fresh kernel.

# Fit k-means for k = 2..9 and record the silhouette score of each solution.
k_range = range(2, 10)
scores = []
for k in k_range:
    km_ss = KMeans(n_clusters=k, random_state=1).fit(features_clust_scaled)
    scores += [silhouette_score(features_clust_scaled, km_ss.labels_)]

# Silhouette score as a function of the number of clusters.
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')

In [None]:
# Two-cluster k-means solution on the scaled clustering features.
km2 = KMeans(n_clusters=2, random_state=1234)
km2.fit(features_clust_scaled)

# Attach a readable cluster label to every row of `data`.
# NOTE(review): this assumes features_clust_scaled has one row per row of
# `data`, in the same order — confirm where features_clust_scaled is built.
data['kmeans_2_scaled'] = ["cluster_" + str(label) for label in km2.labels_]

# Per-cluster means. `data` still carries text columns (e.g. survey_type),
# which cannot be averaged; numeric_only=True restricts the summary to
# numeric columns instead of raising TypeError on pandas >= 2.0.
data.groupby('kmeans_2_scaled').mean(numeric_only=True)