In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise pandas' display limits so wide/long frames are shown with less truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Ignore every warning raised from here on.
import warnings
warnings.simplefilter('ignore')

# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Directory that holds the TILES CSV exports. Set the TILES_DATA_DIR environment
# variable to read the files from another location; when it is unset the
# original hard-coded directory is used, so existing setups keep working.
TILES_DATA_DIR = os.environ.get(
    'TILES_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/Data_googledrive',
).rstrip('/\\')

# Paths are joined with '/' so the default values are identical to the
# original literals on every platform.
path_EMA_MOSAIC = TILES_DATA_DIR + '/EMA surveys -MOSAIC.csv'
path_demog_prepost = TILES_DATA_DIR + '/Demog, PRE, PST survey composites.csv'

In [None]:
# Read data_demog_prepost: demographic information plus the pre/post questionnaires.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data_demog_prepost = pd.read_csv(path_demog_prepost)
print('Original demog_prepost shape:\n', data_demog_prepost.shape, '\n')
# Replace blanks (cells holding exactly one space) with NaN.
data_demog_prepost = data_demog_prepost.replace(' ', np.nan)
# Check for replicate IDs: the unique-ID count should equal the row count
# if there is one row per participant in the study.
print('Original demog_prepost unique IDs:\n', data_demog_prepost['ID'].unique().shape, '\n')
print('Original demog_prepost missing value couts:\n', data_demog_prepost.isnull().sum(), '\n')
# DataFrame.info() prints its report itself and returns None, so it must not be
# passed to print() (that printed the report first and then a literal "None").
print('Original demog_prepost data types:')
data_demog_prepost.info()
print()

In [None]:
# Carve the demographic columns out of the combined demog/pre/post frame.
data_demog = data_demog_prepost.loc[:, [
    'ID', 'date_time', 'GenInst',
    'gender', 'age', 'bornUS', 'country', 'lang', 'englyrs', 'educ',
    'jobstat', 'occup', 'occup_TEXT', 'supervise', 'quantsup', 'size',
    'duration', 'income',
    'record_id', 'redcap_event_name', 'demographics_timestamp',
    'race', 'ethnic', 'relationship', 'pregnant', 'children', 'housing',
    'household___1', 'household___2', 'household___3', 'household___4',
    'household___5', 'household___6', 'household___7',
    'currentposition', 'position_other', 'certifications', 'nurseyears',
    'shift', 'hours', 'overtime',
    'commute_type', 'commute_time',
    'extrajob', 'extrahours', 'student',
    'demographics_complete',
]]
data_demog.head()

In [None]:
# Missing-value count per column and the frame's shape before trimming.
print(data_demog.isnull().sum())
print(data_demog.shape)
# Keep only the columns that hold at least 100 non-null entries.
data_demog = data_demog.dropna(axis='columns', thresh=100)
print(data_demog.shape)
# Expose the participant ID under the name 'uid' as well.
data_demog = data_demog.assign(uid=data_demog['ID'])
data_demog.head()

In [None]:
# Read in the MOSAIC EMA survey responses.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data_EMA = pd.read_csv(path_EMA_MOSAIC)
print('Original MOSAIC EMA shape:\n', data_EMA.shape, '\n')
print('Original MOSAIC EMA survey type counts:\n', data_EMA['survey_type'].value_counts(), '\n')
print('Original MOSAIC EMA missing value couts:\n', data_EMA.isnull().sum(), '\n')
# DataFrame.info() prints its report itself and returns None, so it must not be
# passed to print() (that printed the report first and then a literal "None").
print('Original MOSAIC EMA data types:')
data_EMA.info()
print()

In [None]:
# Add a 'date' column: the part of each timestamp string before the first space.
data_EMA['date'] = data_EMA['timestamp'].str.split(' ', expand=True).loc[:, 0]

In [None]:
# Summary statistics of the derived 'date' column.
data_EMA['date'].describe()

In [None]:
# Distinct 'date' values for every (survey_type, uid) pair.
data_EMA.groupby(['survey_type', 'uid'])['date'].unique()

In [None]:
# The 'date' column in sorted order (values are strings, so the sort is lexical).
data_EMA['date'].sort_values()

In [None]:
#plt.xticks(dates, fontsize=14)
import time
import datetime

# Distinct survey dates (strings taken from the 'timestamp' column).
dates = data_EMA['date'].unique()

# Convert every date to seconds since the epoch (local time). The original cell
# referenced an undefined name `d`; iterate over `dates` instead and skip
# missing values, which strptime cannot parse.
# NOTE(review): assumes dates are formatted as YYYY-MM-DD -- confirm against the CSV.
date_epochs = [
    time.mktime(datetime.datetime.strptime(d, "%Y-%m-%d").timetuple())
    for d in dates
    if pd.notna(d)
]
date_epochs[:5]


In [None]:
# Number of responses (non-null uid entries) per date, one line per survey type.
for survey in ('health', 'job', 'personality'):
    data_EMA[data_EMA['survey_type'] == survey].groupby(['date'])['uid'].count().plot.line()
plt.legend(('health', 'job', 'personality'))
plt.ylabel('number of responses')
# plt.show must be called; the bare name `plt.show` only echoed the function object.
plt.show()

In [None]:
# The 'time' column is only created in a later cell, so on a fresh kernel this
# cell could fail with a KeyError. Derive it here (same expression as the later
# cell) when it is not present yet; an existing column is left untouched.
if 'time' not in data_EMA.columns:
    data_EMA['time'] = data_EMA['timestamp'].str.split(' ', expand=True)[1]
# Summary statistics of the 'time' column.
data_EMA['time'].describe()

In [None]:
# Add a 'time' column: the second space-separated field of each timestamp string.
data_EMA['time'] = data_EMA['timestamp'].str.split(' ', expand=True).loc[:, 1]
# Number of respondents by survey type and time, one line per survey type.
for survey in ('health', 'job', 'personality'):
    survey_rows = data_EMA.loc[data_EMA['survey_type'] == survey]
    survey_rows.groupby(['time'])['uid'].count().plot.line()
plt.legend(('health', 'job', 'personality'))

In [None]:
# The data consists of three different EMA surveys (health, job, personality),
# each with a different set of questions asked.
# Non-null entry count of every column, per survey type.
data_EMA.groupby('survey_type').count()

In [None]:
# Mean of the 'event_mgt' column over all EMA rows.
data_EMA['event_mgt'].mean()

In [None]:
# Non-null entry count of every column for each (survey_type, uid) pair.
data_EMA.groupby(['survey_type', 'uid']).count()

In [None]:
# Count of non-null 'survey_type' entries for each uid.
data_EMA.groupby('uid')['survey_type'].count()

In [None]:
# Count of non-null 'uid' entries for each (survey_type, stress_mgt) combination.
data_EMA.groupby(['survey_type', 'stress_mgt'])['uid'].count()

In [None]:
# Distribution of 'stress_mgt' for each survey type, overlaid on one axes.
# Each curve is labelled and a legend is drawn; without them the three
# distributions cannot be told apart.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# because the seaborn version in use is not visible from this notebook.
for survey in ('health', 'job', 'personality'):
    sns.distplot(
        data_EMA[data_EMA['survey_type'] == survey]['stress_mgt'].dropna(),
        label=survey,
    )
plt.legend()
plt.show()

In [None]:
# Mean 'stress_mgt' per date, drawn as one line per survey type.
for survey in ('health', 'job', 'personality'):
    survey_rows = data_EMA.loc[data_EMA['survey_type'] == survey]
    survey_rows.groupby(['date'])['stress_mgt'].mean().plot.line()
plt.legend(('health', 'job', 'personality'))

In [None]:
# Stress is the starting variable of interest: mean of every numeric column
# for each (survey_type, stress_mgt) combination.
# numeric_only=True keeps string columns (timestamp, date, time, ...) out of
# the aggregation; pandas >= 2.0 raises a TypeError on them otherwise.
data_EMA.groupby(['survey_type', 'stress_mgt']).mean(numeric_only=True)

In [None]:
# Look at how each parameter changes with the reported stress level within each
# survey type (secondary question: does the survey type affect responses?).

# Column names (i.e. parameters): the numeric columns other than 'stress_mgt'.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# string columns instead of silently dropping them.
parameters = data_EMA.groupby('stress_mgt').mean(numeric_only=True).columns.values

# Group once and reuse the grouping for both the means and the error bars.
by_survey_stress = data_EMA.groupby(['survey_type', 'stress_mgt'])

for param in parameters:
    # Bar height = group mean, error bar = standard error of the mean.
    by_survey_stress[param].mean().plot(kind='bar', yerr=by_survey_stress[param].sem())
    # Pass the name itself; a list would be rendered as "['name']".
    plt.ylabel(param)
    plt.show()

In [None]:
# Distinct 'work_mgt' values observed within each survey type.
data_EMA.groupby('survey_type')['work_mgt'].unique()

In [None]:
# Inner-join the EMA responses with the demographic frame on the shared 'uid' key.
demo_EMA = pd.merge(data_EMA, data_demog, how='inner', on='uid')
print(demo_EMA.shape)
demo_EMA.info()
demo_EMA.head()

In [None]:
# Count of non-null 'uid' entries for each (gender, survey_type, stress_mgt) combination.
demo_EMA.groupby(['gender', 'survey_type', 'stress_mgt'])['uid'].count()

In [None]:
# Preview of a '<param>.png' file name.
# NOTE(review): `param` is whatever value the preceding plotting loop left
# behind, so this cell only works after that loop has run -- looks like a
# leftover scratch check; confirm whether it is still needed.
param + '.png'

In [None]:
# Look at how each parameter changes with the reported stress level across
# gender, saving one bar chart per parameter as '<param>.png'.

# Column names (i.e. parameters): the numeric EMA columns other than 'stress_mgt'.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# string columns instead of silently dropping them.
parameters = data_EMA.groupby('stress_mgt').mean(numeric_only=True).columns.values

# Group once and reuse the grouping for both the means and the error bars.
by_gender_stress = demo_EMA.groupby(['gender', 'stress_mgt'])

for param in parameters:
    fig, ax = plt.subplots(figsize=(10, 10))
    # Bar height = group mean, error bar = standard error of the mean.
    by_gender_stress[param].mean().plot(kind='bar', yerr=by_gender_stress[param].sem(), ax=ax)
    # Pass the name itself; a list would be rendered as "['name']".
    ax.set_ylabel(param)
    fig.savefig(str(param + '.png'))
    plt.show()
    # Release the figure so a long parameter list does not pile up open figures.
    plt.close(fig)

In [None]:
# Create a separate data frame for each survey.
# The original cell read from an undefined name `data`; the EMA frame loaded
# in this notebook is `data_EMA`.

def _split_survey(ema, survey_type, title, min_non_null=100):
    """Return the rows of one survey type with sparsely filled columns removed.

    Prints the title, the per-column missing-value counts of the survey's rows
    and the shape of the trimmed frame.

    Parameters
    ----------
    ema : DataFrame with a 'survey_type' column.
    survey_type : value of 'survey_type' to select.
    title : heading printed before the report.
    min_non_null : columns with fewer non-null entries than this are dropped.
    """
    survey_rows = ema[ema['survey_type'] == survey_type]
    print(title, '\n')
    print('Missing values per column', '\n', survey_rows.isnull().sum(), '\n')
    trimmed = survey_rows.dropna(thresh = min_non_null, axis = 1)
    print(trimmed.shape, '\n', '\n')
    return trimmed


data_health = _split_survey(data_EMA, 'health', 'Health EMA data')

data_job = _split_survey(data_EMA, 'job', 'Job EMA data')

data_personality = _split_survey(data_EMA, 'personality', 'Personality EMA data')