Combine study data into a single "tidy" data frame for subsequent processing and analysis.

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby
import datetime as dt
from numpy import median

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise the pandas display limits so wide/long frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Silence all warnings for the remainder of the session.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Location of the pickled, combined study data that is loaded via pd.read_pickle.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/data_final_combined.pkl'

In [None]:
#read in the pickled, combined participant/survey data
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
data = pd.read_pickle(data_path)
# make sure we are working with a DataFrame regardless of what the pickle held
data = pd.DataFrame(data=data)
#data_PF.reset_index(inplace=True)

print('Original data shape:\n', data.shape, '\n')
#number of distinct participant IDs (212 participants in study)
print('Original data unique IDs:\n', data['ParticipantID'].unique().shape, '\n')
#number of distinct Mitre IDs (212 participants in study)
print('Original data unique MitreIDs:\n', data['MitreID'].unique().shape, '\n')
#how much missing data is there?
print('Original data missing value counts:\n', data.isnull().sum(), '\n')
#what is the data type of each column?
# DataFrame.info() prints its own report and returns None, so it must be called
# on its own line; wrapping it in print() showed the report first and then "None".
print('Original data data types:')
data.info()
print('\n')

In [None]:
# Derive the weekday name from wave_study_date and store it as an ordered
# categorical running Monday -> Sunday.
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
weekday_names = data['wave_study_date'].dt.day_name()
data['day_of_week'] = pd.Categorical(weekday_names, categories=weekday_order, ordered=True)

In [None]:
# Convert the survey-response columns to a numeric dtype, one column at a time.
numeric_columns = [
    'activity_num',
    'location_num',
    'pf_mgt',
    'context1',
    'context2',
    'context3',
    'context4',
    'stress',
    'anxiety',
    'work',
]
for column_name in numeric_columns:
    data[column_name] = pd.to_numeric(data[column_name])

In [None]:
# Dump the distinct values of every column to spot entries that need updating/cleaning.
final_cols = data.columns
for column_name in final_cols:
    # column name, its unique values, then blank lines as a visual separator
    print(column_name, data[column_name].unique(), '\n', sep='\n')

In [None]:
# Distinct survey types present in the data.
data['survey_type'].unique()

In [None]:
# How often each location_num value occurs among the psych_flex surveys.
data[data['survey_type'] == 'psych_flex']['location_num'].value_counts()

In [None]:
# Create a 'working' column: 1.0 if the participant was working, 0.0 if not, NaN if unknown.
#   psych_flex / engage_psycap : activity_num == 0          -> 1.0
#                                activity_num == -1 or > 0  -> 0.0
#   job / health / personality : context2 == 1              -> 1.0   (MGT surveys)
#                                context2 > 1               -> 0.0
# Any other survey type or response value (including missing) stays NaN.
# The rules are evaluated column-wise with np.select instead of looping over
# rows with iterrows(); the resulting values are the same.
survey_type = data['survey_type']
activity = data['activity_num']
context = data['context2']

is_activity_survey = survey_type.isin(['psych_flex', 'engage_psycap'])
is_mgt_survey = survey_type.isin(['job', 'health', 'personality'])

working = np.select(
    [
        is_activity_survey & (activity == 0.0),
        is_activity_survey & ((activity == -1.0) | (activity > 0.0)),
        is_mgt_survey & (context == 1.0),
        is_mgt_survey & (context > 1.0),
    ],
    [1.0, 0.0, 1.0, 0.0],
    default=np.nan,
)

print(len(working))
data['working'] = working

In [None]:
# Create an 'at_work' column: 1.0 if the participant was at work, 0.0 if not, NaN if unknown.
#   engage_psycap              : activity_num == 0          -> 1.0
#                                activity_num == -1 or > 0  -> 0.0
#   job / health / personality : context2 == 1              -> 1.0   (MGT surveys)
#                                context2 > 1               -> 0.0
# Every other survey type (psych_flex did not ask about location) and every
# other response value (including missing) stays NaN.
# NOTE(review): the original description of this cell said engage_psycap should be
# judged by 'location_num' == 1, but the code has always tested 'activity_num'
# (the same rule as the 'working' column). Behaviour is kept as-is here; confirm
# which column is intended.
# The rules are evaluated column-wise with np.select instead of looping over
# rows with iterrows(); the resulting values are the same.
survey_type = data['survey_type']
activity = data['activity_num']
context = data['context2']

is_engage_survey = survey_type == 'engage_psycap'
is_mgt_survey = survey_type.isin(['job', 'health', 'personality'])

at_work = np.select(
    [
        is_engage_survey & (activity == 0.0),
        is_engage_survey & ((activity == -1.0) | (activity > 0.0)),
        is_mgt_survey & (context == 1.0),
        is_mgt_survey & (context > 1.0),
    ],
    [1.0, 0.0, 1.0, 0.0],
    default=np.nan,
)

print(len(at_work))
data['at_work'] = at_work

In [None]:
# Number of rows per survey type.
data['survey_type'].value_counts()

In [None]:
# Number of distinct wave_study_day values in the data.
len(data['wave_study_day'].unique())

In [None]:
# Line plot of the number of survey rows per study day, one line per survey type.
plt.figure(figsize=(25, 10))
for survey_name in ('job', 'health', 'personality', 'psych_flex', 'engage_psycap'):
    per_day = data[data['survey_type'] == survey_name].groupby(['wave_study_day'])['index'].count()
    # label= binds the legend entry to this line instead of relying on plot order
    ax = per_day.plot(label=survey_name)
plt.legend()
plt.title('Surveys sent across time')
ax.set_ylabel('Survey count')
# plt.show has to be *called*; the bare attribute reference did nothing
plt.show()

In [None]:
# Row counts per survey type, with one bar per day of the week.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=data, x="survey_type", hue='day_of_week', ax=ax)

In [None]:
# Line plot of the number of survey rows per day of the week, one line per survey type.
plt.figure(figsize=(25, 10))
for survey_name in ('job', 'health', 'personality', 'psych_flex', 'engage_psycap'):
    # observed=False keeps every weekday category in the result (count 0 when a
    # day has no rows) so the points stay aligned with the seven tick labels below
    per_weekday = (
        data[data['survey_type'] == survey_name]
        .groupby(['day_of_week'], observed=False)['index']
        .count()
    )
    # label= binds the legend entry to this line instead of relying on plot order
    ax = per_weekday.plot(label=survey_name)
plt.legend()
plt.title('Surveys sent by day of week')
plt.xticks(np.arange(7), ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
ax.set_ylabel('Survey count')
# plt.show has to be *called*; the bare attribute reference did nothing
plt.show()

In [None]:
# Counts of each at_work value within every survey type.
data.groupby('survey_type')['at_work'].value_counts()

In [None]:
# Row counts per 'working' value, with one bar per survey type.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=data, x="working", hue='survey_type', ax=ax)

In [None]:
# Line plot of the mean time_to_complete per study day, one line per survey type.
plt.figure(figsize=(25, 10))
for survey_name in ('job', 'health', 'personality', 'psych_flex', 'engage_psycap'):
    mean_per_day = (
        data[data['survey_type'] == survey_name]
        .groupby(['wave_study_day'])['time_to_complete']
        .mean()
    )
    # label= binds the legend entry to this line instead of relying on plot order
    ax = mean_per_day.plot(label=survey_name)
plt.legend()
plt.title('Mean time to complete survey across time')
ax.set_ylabel('Mean time to complete survey')
# plt.show has to be *called*; the bare attribute reference did nothing
plt.show()

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="working", y="time_to_complete", data=data, hue='survey_type', kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="survey_type", y="time_to_complete", data=data, hue='Sex', col='working', kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="working", y="time_to_complete", data=data, hue='survey_type', col="Shift", kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="working", y="time_to_complete", data=data, hue='survey_type', col="Sex", kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="working", y="time_to_complete", data=data, hue='survey_type', col="day_of_week", kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="Shift", y="time_to_complete", data=data, hue='survey_type', col="Wave", kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="working", y="pf_mgt", data=data, hue='day_of_week', kind="bar")

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="wave_study_day", y="pf_mgt", data=data, col="working", kind="bar")

In [None]:
sns.catplot(x="day_of_week", y="pf_mgt", data=data, col="Shift", kind="bar")

In [None]:
sns.catplot(x="day_of_week", y="stress", data=data, col="Shift", kind="bar")

In [None]:
sns.catplot(x="day_of_week", y="anxiety", data=data, col="Shift", kind="bar")

In [None]:
data['pf_mgt_round'] =  data['pf_mgt'].round()
data['pf_mgt_round'].value_counts()

In [None]:
plt.figure(figsize=(25, 10))
sns.catplot(x="pf_mgt_round", y="time_to_complete", data=data, col="working", kind="bar")

In [None]:
#save the full data frame (including the derived columns added to it) to csv
# NOTE(review): 'data.csv' is relative to the current working directory, and
# to_csv is called with its defaults - presumably the row index is written as
# the first column; confirm that is wanted.
data.to_csv('data.csv')

In [None]:
#split off data for surveys that were completed
# NOTE(review): data_PF_final is not defined anywhere in the visible notebook; it
# must already exist in the session (or this cell was carried over from another
# notebook) - confirm before running top to bottom.
# .copy() makes data_PF_completed an independent frame, so columns can be added
# to it later without assigning into a filtered slice of data_PF_final.
data_PF_completed = data_PF_final[data_PF_final['completed'] == 1].copy()
#data_PF_completed.reset_index(inplace=True)
print(data_PF_completed.shape)
#number of null values per column among the completed surveys
data_PF_completed.isnull().sum()

In [None]:
# Completed surveys should have no NaN in the columns from 'activity' through
# 'pf_mgt'; collect the rows where at least one of those columns is missing.
required_answers = data_PF_completed.loc[:, 'activity':'pf_mgt']
has_missing_answer = required_answers.isnull().any(axis=1)
pf_no_comp = data_PF_completed.loc[has_missing_answer]
print(pf_no_comp.shape)
#pf_no_comp.to_csv('pf_no_comp.csv')
#participants did not have to answer 'exp_0' to 'exp_13', so no missing values

In [None]:
# Long-string analysis: for every completed survey, take the answers in columns
# 'pf_03' through 'pf_15' and record the length of the longest run of identical
# consecutive answers.
max_string = [
    max(len(list(run)) for _answer, run in groupby(survey.loc['pf_03':'pf_15']))
    for _row_label, survey in data_PF_completed.iterrows()
]

data_PF_completed['longest_string_pf'] = max_string