Explore the psychological flexibility (PF) EMA data after preprocessing (see the other notebooks for the preprocessing code), and identify outliers and careless responding among participants and across survey administration dates.

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

#raise pandas' display limits (rows, columns, line width) for notebook output
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

#silence every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')
#print numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
#absolute, machine-specific location of the PF csv that is loaded below; edit when running elsewhere
path_PF_clean = "C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/pf_final.csv"

In [None]:
#read in csv from preprocessed psychological flexibility EMAs
#(read_csv already returns a DataFrame, so it does not need to be wrapped again)
data_PF = pd.read_csv(path_PF_clean)

print('Original data_PF_S3 shape:\n', data_PF.shape, '\n')
#ensure no replicate ID (211 participants in study)
print('Original data_PF_S3 unique IDs:\n', data_PF['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF.isnull().sum(), '\n')
#what is the data type of each column?
#info() writes its report to stdout itself and returns None, so it is called
#on its own line instead of being passed to print()
print('Original data_PF_S3 data types:')
data_PF.info()
print()
#what is the participant response rate across the entire study?
print('Non-response rate for PF survey:\n', data_PF['completed_ts_utc'].isnull().sum() / data_PF.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0)
#NOTE(review): completion is derived from 'results_updated' here, while the non-response
#rate above uses 'completed_ts_utc' -- confirm the two columns always agree
data_PF['completed'] = np.where(data_PF['results_updated'].isnull(), 0, 1)

In [None]:
#make new column corresponding to the number in series of survey sent for each participant
#(0 = first row of that participant, in the frame's current row order)
part_id = data_PF['participant_id'].unique()
#cumcount numbers the rows inside every participant group in a single pass, instead of
#re-filtering the whole frame once per participant; the cast keeps the float dtype that
#the per-participant assignment into a new column produced
data_PF['survey_count'] = data_PF.groupby('participant_id').cumcount().astype(float)

In [None]:
#change dates from objects to datetimes
#every column gets the explicit [ns] unit: the unit-less 'datetime64' alias is
#rejected by pandas 2.x, and this keeps all five conversions consistent
datetime_cols = ['survey_dt', 'delivered_ts_utc', 'started_ts_utc', 'completed_ts_utc', 'ingested_ts_utc']
for col in datetime_cols:
    data_PF[col] = data_PF[col].astype('datetime64[ns]')

#confirm change (info() prints its own report and returns None, so it is not wrapped in print)
data_PF.info()

In [None]:
#make new column of only numeric corresponding to activity questions (code -1 for write in responses)
def _activity_code(value):
    """Return *value* as a float, or -1 when it cannot be converted to a number."""
    try:
        return float(value)
    except (TypeError, ValueError):
        #only conversion failures are mapped to -1; any other error still surfaces
        return -1

#iterate over the single column that is needed rather than over whole rows
activity_num = [_activity_code(value) for value in data_PF['activity']]

data_PF['activity_num'] = activity_num

In [None]:
#compute time between when survey is sent and when participant starts the survey
#stored as a float number of whole seconds (NaN where a timestamp is missing);
#total_seconds() is used because astype('timedelta64[s]') keeps a timedelta dtype
#in pandas 2.x, which breaks numeric comparisons on these columns
data_PF['start_delay'] = np.floor((data_PF['started_ts_utc'] - data_PF['delivered_ts_utc']).dt.total_seconds())
#compute time between when participant starts the survey and when they complete it
data_PF['time_to_complete'] = np.floor((data_PF['completed_ts_utc'] - data_PF['started_ts_utc']).dt.total_seconds())

In [None]:
#number of rows (surveys) per participant; the expectation is 50 for each
data_PF.loc[:, 'participant_id'].value_counts()

In [None]:
#split off data for surveys that were completed
#take an explicit copy so that columns added to this frame later are written to
#an independent DataFrame and not to a slice of data_PF
data_PF_completed = data_PF[data_PF['completed'] == 1].copy()
print(data_PF_completed.shape)
#null survey questions for surveys that were completed
data_PF_completed.isnull().sum()

In [None]:
#completed surveys should have an answer in every column from 'activity' through 'pf_mgt';
#collect the rows where at least one of those columns is missing
required_answers = data_PF_completed.loc[:, 'activity':'pf_mgt']
has_missing_answer = required_answers.isnull().any(axis=1)
pf_no_comp = data_PF_completed.loc[has_missing_answer]
print(pf_no_comp.shape)
#save the incomplete rows for inspection
pf_no_comp.to_csv('pf_no_comp.csv')
#participants did not have to answer 'exp_0' to 'exp_13', so no missing values

In [None]:
#compute new column containing long string analysis results (e.g. max length of same number answered for pf_03:pf_15)
def _longest_run(answers):
    """Return the length of the longest run of equal consecutive values (0 if *answers* is empty)."""
    return max((sum(1 for _ in run) for _, run in groupby(answers)), default=0)

#slice the answer columns once, then scan each row of that block; this avoids
#building a Series of every column for every survey
pf_answers = data_PF_completed.loc[:, 'pf_03':'pf_15']
max_string = [_longest_run(answers) for answers in pf_answers.to_numpy()]

data_PF_completed['longest_string_pf'] = max_string

In [None]:
#number of participants sent surveys across time (one point per position in the survey series)
sent_per_survey = data_PF.groupby('survey_count')['participant_id'].count()
ax = sent_per_survey.plot()
#number of participants completed surveys across time, drawn on the same axes
completed_per_survey = data_PF[data_PF['completed'] == 1].groupby('survey_count')['completed'].count()
ax = completed_per_survey.plot(ax=ax)
plt.legend(('sent', 'completed'))
plt.title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#number of participants sent surveys across time (one point per survey date)
sent_per_date = data_PF.groupby('survey_dt')['participant_id'].count()
ax = sent_per_date.plot()
#number of participants completed surveys across time, drawn on the same axes
completed_per_date = data_PF[data_PF['completed'] == 1].groupby('survey_dt')['completed'].count()
ax = completed_per_date.plot(ax=ax)
plt.legend(('sent', 'completed'))
plt.title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#percentage of sent surveys that were completed, per position in the survey series
sent_counts = data_PF.groupby('survey_count')['participant_id'].count()
completed_counts = data_PF[data_PF['completed'] == 1].groupby('survey_count')['participant_id'].count()
ax = (completed_counts / sent_counts * 100).plot()

ax.set_ylabel('Response rate (%)')
plt.title('Survey response rate across time')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#mean delay to starting survey across survey dates (completed surveys only)
mean_start_delay = data_PF_completed.groupby('survey_count')['start_delay'].mean()
ax = mean_start_delay.plot()

plt.title('Delay to starting survey across survey dates')
#'start_delay' is computed in seconds, so the axis is labelled accordingly
ax.set_ylabel('Seconds')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#mean time to complete survey across survey dates (completed surveys only)
mean_time_to_complete = data_PF_completed.groupby('survey_count')['time_to_complete'].mean()
ax = mean_time_to_complete.plot()

plt.title('Time to complete survey across survey dates')
#'time_to_complete' is computed in seconds, so the axis is labelled accordingly
ax.set_ylabel('Seconds')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#distribution of each participant's mean time to complete a survey
mean_time_per_participant = data_PF_completed.groupby(['participant_id'])['time_to_complete'].mean()
ax = mean_time_per_participant.plot(kind='hist', bins = 75)

plt.title('Time to complete survey across participants')
ax.set_ylabel('Number of participants')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#mean pf_mgt across survey dates (completed surveys only)
mean_pf_mgt = data_PF_completed.groupby(['survey_count'])['pf_mgt'].mean()
ax = mean_pf_mgt.plot()

plt.title('pf_mgt across survey dates')
ax.set_ylabel('pf_mgt')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#mean longest string (longest run of identical answers) across survey dates
mean_longest_string = data_PF_completed.groupby('survey_count')['longest_string_pf'].mean()
ax = mean_longest_string.plot()

plt.title('Longest string across survey dates')
ax.set_ylabel('Longest string')
#show must be called (with parentheses) to render the figure
plt.show()

In [None]:
#variables compared against each other in the pairwise plots and summaries
dep_vars = [
    'pf_mgt',
    'exp_neg',
    'exp_pos',
    'exp_neut',
    'survey_count',
    'start_delay',
    'time_to_complete',
    'longest_string_pf',
]
#keep only surveys with a value for every variable, then draw pairwise regression plots
complete_cases = data_PF_completed[dep_vars].dropna()
sns.pairplot(complete_cases, kind='reg')

In [None]:
#pairwise correlation matrix of the dependent variables for completed surveys
data_PF_completed.loc[:, dep_vars].corr()

In [None]:
#bar chart of each dependent variable's mean per activity code, with SEM error bars
for param in dep_vars:
    plt.figure(figsize=(10,5))
    #build the grouping once and reuse it for both the means and the error bars
    by_activity = data_PF_completed.groupby(['activity_num'])[param]
    by_activity.mean().plot(kind='bar', yerr=by_activity.sem())
    #pass the name itself; a list would be rendered as "['name']" on the axis
    plt.ylabel(param)
    plt.show()

In [None]:
#bar chart of each dependent variable's mean per longest-string value, with SEM error bars
for param in dep_vars:
    plt.figure(figsize=(10,5))
    #build the grouping once and reuse it for both the means and the error bars
    by_longest_string = data_PF_completed.groupby(['longest_string_pf'])[param]
    by_longest_string.mean().plot(kind='bar', yerr=by_longest_string.sem())
    #pass the name itself; a list would be rendered as "['name']" on the axis
    plt.ylabel(param)
    plt.show()

In [None]:
#histogram of completion times that are at or below 5 (in the units of 'time_to_complete')
quick_mask = data_PF_completed['time_to_complete'] <= 5
data_PF_completed.loc[quick_mask, 'time_to_complete'].plot(kind='hist', bins=50)

In [None]:
#preview the first five completed surveys
data_PF_completed.head(5)

In [None]:
#how many completed surveys have a completion time at or below 5
data_PF_completed.loc[data_PF_completed['time_to_complete'] <= 5, 'time_to_complete'].shape