Preprocess and clean psychological flexibility EMA data from S3 bucket

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

#let pandas show up to 500 rows / 500 columns and use a 1000-character wide console
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

#silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')

#print small numpy floats in fixed-point instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
#location of the S3 survey export; set the PF_S3_CSV_PATH environment variable to
#point the notebook at a copy stored elsewhere, otherwise the original lab path is used
path_PF_S3 = os.environ.get(
    'PF_S3_CSV_PATH',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/PF_survey_fix/S3_app_surveys_updated.csv',
)

In [None]:
#read in csv from S3 bucket; psychological flexibility EMAs in long form
#(pd.read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame again)
data_PF_S3 = pd.read_csv(path_PF_S3)
print('Original data_PF_S3 shape:\n', data_PF_S3.shape, '\n')
#ensure no replicate ID (211 participants in study)
print('Original data_PF_S3 unique IDs:\n', data_PF_S3['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF_S3.isnull().sum(), '\n')
#what is the data type of each column?
#info() writes its own report and returns None, so it is called on its own line;
#passing it to print() emitted the report first and then a stray 'None'
print('Original data_PF_S3 data types:')
data_PF_S3.info()
print()
#what is the participant non-response rate across the entire study?
#the column with the most missing values is used as the count of unanswered surveys
print('Non-response rate over all PF surveys:\n', data_PF_S3.isnull().sum().max() / data_PF_S3.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0)
data_PF_S3['completed'] = np.where(data_PF_S3['results_updated'].isnull(), 0, 1)

In [None]:
#change dates from objects to datetimes
#the unit is always spelled out: pandas 2.x refuses the unit-less 'datetime64' dtype
#that was previously used for survey_dt
for ts_column in ['survey_dt', 'delivered_ts_utc', 'started_ts_utc',
                  'completed_ts_utc', 'ingested_ts_utc']:
    data_PF_S3[ts_column] = data_PF_S3[ts_column].astype('datetime64[ns]')

#confirm change
#info() prints its report itself and returns None, so it is not wrapped in print()
data_PF_S3.info()

In [None]:
print('data_PF_S3 shape:\n', data_PF_S3.shape, '\n')
print('data_PF_S3 unique survey_id shape:\n', data_PF_S3['survey_id'].unique().shape, '\n')
print('data_PF_S3 unique participant_id shape:\n', data_PF_S3['participant_id'].unique().shape, '\n')
print('data_PF_S3 unique survey_types:\n', data_PF_S3['survey_type'].value_counts(), '\n')

#what is the participant non-response rate for each survey type?
#each rate divides the largest per-column missing count by the row count of the SAME
#subset; the second rate used to be divided by the psych_flex row count by mistake
is_psych_flex = data_PF_S3['survey_type'] == 'psych_flex'
psych_flex_rows = data_PF_S3[is_psych_flex]
other_survey_rows = data_PF_S3[~is_psych_flex]
print('Non-response rate over psych_flex survey:\n',
      psych_flex_rows.isnull().sum().max() / psych_flex_rows.shape[0] * 100, '%')
print('Non-response rate over engage_psycap survey:\n',
      other_survey_rows.isnull().sum().max() / other_survey_rows.shape[0] * 100, '%')

In [None]:
#split off the psych_flex survey
#.copy() makes this an independent frame, so the columns added to it in later cells
#are not written onto a slice of data_PF_S3 (the SettingWithCopyWarning case)
data_PF_S3_PFonly = data_PF_S3[data_PF_S3['survey_type'] == 'psych_flex'].copy()
print(data_PF_S3_PFonly.shape)

In [None]:
#start_delay: time from survey delivery until the participant opened it
data_PF_S3_PFonly['start_delay'] = data_PF_S3_PFonly['started_ts_utc'].sub(
    data_PF_S3_PFonly['delivered_ts_utc'])
#time_to_complete: time from opening the survey until it was completed
data_PF_S3_PFonly['time_to_complete'] = data_PF_S3_PFonly['completed_ts_utc'].sub(
    data_PF_S3_PFonly['started_ts_utc'])

In [None]:
#number of psych_flex rows per participant, most frequent first
#(the expectation is 50 survey entries for every participant)
surveys_per_participant = data_PF_S3_PFonly['participant_id'].value_counts()
surveys_per_participant

In [None]:
#show every psych_flex row belonging to one example participant
is_example_participant = data_PF_S3_PFonly['participant_id'].eq('f9264a9d-99d8-4937-8bda-eb620c0c1ed7')
data_PF_S3_PFonly.loc[is_example_participant]

In [None]:
#scratch reference for pulling a regex match out of a string column with str.extract.
#Kept as a comment only: no `df` is defined in this notebook, so running the line
#stopped the cell with a NameError.
#df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True)

#sample string in the dict-like layout that the extraction regex further down targets
#(14 "question: answer" pairs followed by one four-item list)
example_results = "{'11': 5, '12': 5, '9': 3, '7': 4, '3': 4, '4': 5, '8': 4, '13': 4, '10': 5, '1': 0, '15': 5, '6': 4, '14': 4, '5': 4, '2': [1, 3, 5, 7]}"
example_results

In [None]:
#grab the raw results entry stored under index label 44 to experiment with parsing it
x = data_PF_S3_PFonly.loc[44, 'results_updated']
x

In [None]:
#try to pull the answers out of the raw results string with a regex.
#The pattern is written as a raw string so \d, \s, \{ and \[ reach the regex engine
#untouched instead of being treated as (invalid) string escape sequences.
#NOTE(review): a repeated capture group only keeps its last repetition, so this returns
#one column per group (4 in total), not one column per question - confirm that is intended.
data_PF_S3_PFonly['results_updated'].str.extract(
    r"\{('(\d|\d\d)':\s(\d|\d\d),\s){14}'(\d|\d\d)':\s\[\d,\s\d,\s\d,\s\d\]\}")

In [None]:
import ast
import json

#the results strings appear to be Python dict literals with single-quoted keys (that is
#the layout the extraction regex above matches), which json.loads rejects as invalid JSON.
#ast.literal_eval parses such a literal into a dict without executing any code.
ast.literal_eval(x)

In [None]:
#break the raw results string stored under index label 44 into comma-separated fragments
x = data_PF_S3_PFonly.loc[44, 'results_updated'].split(',')
print(x)

#one pass per value in 0-14; every pass tests the same first fragment for the character '1'
#NOTE(review): the loop variable is never used inside the loop - confirm what was intended
questions = np.arange(15)
for number in questions:
    first_fragment_match = re.search(r'1', x[0])
    if first_fragment_match is not None:
        print(3)

In [None]:
#draw both counts on one explicitly created set of axes
fig, ax = plt.subplots()
#number of surveys sent per survey date
data_PF_S3_PFonly.groupby('survey_dt')['participant_id'].count().plot(ax=ax)
#number of surveys completed per survey date
data_PF_S3_PFonly[data_PF_S3_PFonly['completed'] == 1].groupby('survey_dt')['completed'].count().plot(ax=ax)
ax.legend(('sent', 'completed'))
ax.set_title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
#plt.show must be called; the bare name `plt.show` only referenced the function
plt.show()

In [None]:
#response rate per survey date = completed surveys / surveys sent, as a percentage
completed_per_day = data_PF_S3_PFonly[data_PF_S3_PFonly['completed'] == 1].groupby('survey_dt')['participant_id'].count()
sent_per_day = data_PF_S3_PFonly.groupby('survey_dt')['participant_id'].count()
ax = (completed_per_day / sent_per_day * 100).plot()

ax.set_ylabel('Response rate (%)')
plt.title('Survey response rate across time')
#plt.show must be called; the bare name `plt.show` only referenced the function
plt.show()