Preprocess and clean psychological flexibility EMA data from S3 bucket

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise pandas' display limits so long/wide frames are shown in full.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Ignore all warnings and print numpy floats in fixed-point notation.
import warnings
warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [None]:
# Location of the app-survey CSV export that the next cell reads.
path_PF_S3 = (
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/'
    'Clinical projects/TILES/PF_survey_fix/'
    'S3_app_surveys_updated.csv'
)

In [None]:
# Read in the CSV from the S3 bucket; psychological flexibility EMAs in long form.
# pd.read_csv already returns a DataFrame, so it is not wrapped a second time.
data_PF_S3 = pd.read_csv(path_PF_S3)
print('Original data_PF_S3 shape:\n', data_PF_S3.shape, '\n')
# ensure no replicate ID (211 participants in study)
print('Original data_PF_S3 unique IDs:\n', data_PF_S3['participant_id'].unique().shape, '\n')
# how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF_S3.isnull().sum(), '\n')
# What is the data type of each column?
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print() (which printed "None").
print('Original data_PF_S3 data types:')
data_PF_S3.info()
print()
# A survey counts as unanswered when 'results_updated' is empty. The same mask
# drives both the non-response rate and the 'completed' flag, so the two always
# agree (the rate no longer depends on whichever column has the most nulls).
no_response = data_PF_S3['results_updated'].isnull()
# what is the participant non-response rate across the entire study?
print('Non-response rate over all PF surveys:\n', no_response.mean() * 100, '%')
# add a binary column for if survey was completed (1) or not (0)
data_PF_S3['completed'] = np.where(no_response, 0, 1)

In [None]:
# Change the date/timestamp columns from objects to datetimes.
# An explicit unit is required: the unit-less 'datetime64' dtype that was used
# for survey_dt is rejected by pandas >= 2.0.
for _ts_column in (
    'survey_dt',
    'delivered_ts_utc',
    'started_ts_utc',
    'completed_ts_utc',
    'ingested_ts_utc',
):
    data_PF_S3[_ts_column] = data_PF_S3[_ts_column].astype('datetime64[ns]')

# Confirm the change. info() prints the report itself and returns None, so it
# is not wrapped in print().
data_PF_S3.info()

In [None]:
print('data_PF_S3 shape:\n', data_PF_S3.shape, '\n')
print('data_PF_S3 unique survey_id shape:\n', data_PF_S3['survey_id'].unique().shape, '\n')
print('data_PF_S3 unique participant_id shape:\n', data_PF_S3['participant_id'].unique().shape, '\n')
print('data_PF_S3 unique survey_types:\n', data_PF_S3['survey_type'].value_counts(), '\n')

# What is the participant non-response rate for each survey type?
# Each rate is computed within its own subset. Previously the engage_psycap
# count was divided by the number of psych_flex rows. A survey is unanswered
# when 'results_updated' is empty.
is_psych_flex = data_PF_S3['survey_type'] == 'psych_flex'
for _label, _subset in (
    ('psych_flex', data_PF_S3[is_psych_flex]),
    ('engage_psycap', data_PF_S3[~is_psych_flex]),
):
    print(f'Non-response rate over {_label} survey:\n',
          _subset['results_updated'].isnull().mean() * 100, '%')

In [None]:
# Split off the psych_flex survey.
# .copy() makes this an independent frame, so columns added to it later are not
# written to a slice of data_PF_S3 (warnings are silenced in this notebook, so
# the usual SettingWithCopyWarning would not be shown).
data_PF_S3_PFonly = data_PF_S3[data_PF_S3['survey_type'] == 'psych_flex'].copy()
print(data_PF_S3_PFonly.shape)

In [None]:
# Pull out the three timestamps that the two durations are built from.
delivered_at = data_PF_S3_PFonly['delivered_ts_utc']
started_at = data_PF_S3_PFonly['started_ts_utc']
finished_at = data_PF_S3_PFonly['completed_ts_utc']

# time between when the survey is sent and when the participant starts it
data_PF_S3_PFonly['start_delay'] = started_at - delivered_at
# time between when the participant starts the survey and when they complete it
data_PF_S3_PFonly['time_to_complete'] = finished_at - started_at

In [None]:
# Number of psych_flex rows per participant (50 are expected for each).
data_PF_S3_PFonly.loc[:, 'participant_id'].value_counts()

In [None]:
# Split off a new data frame holding only the rows with completed surveys.
# .copy() makes it independent of data_PF_S3_PFonly; it is modified in place
# later (reset_index(inplace=True)), which must not act on a slice.
data_PF_S3_PFonly_completed = data_PF_S3_PFonly[data_PF_S3_PFonly['completed'] == 1].copy()

## The 'results_updated' column contains the answers to psychological flexibility survey questions 1-15, which need to be processed according to JV's instructions as follows:

### instructions from JV for question 1, 3-15

relabel “1” as “activity”

create ordinal variable “pf_mgt” by computing mean of items 3-15 (range 1-5) – this is the main psychological flexibility score


### instructions from JV for question 2

create binary categorical variable “exp_0”.  
If question “2” response includes value of 0 or 0.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_1”.  
If question “2” response includes value of 1 or 1.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_2”.  
If question “2” response includes value of 2 or 2.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_3”.  
If question “2” response includes value of 3 or 3.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_4”.  
If question “2” response includes value of 4 or 4.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_5”.  
If question “2” response includes value of 5 or 5.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_6”.  
If question “2” response includes value of 6 or 6.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_7”.  
If question “2” response includes value of 7 or 7.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_8”.  
If question “2” response includes value of 8 or 8.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_9”.  
If question “2” response includes value of 9 or 9.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_10”.  
If question “2” response includes value of 10 or 10.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_11”.  
If question “2” response includes value of 11 or 11.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_12”.  
If question “2” response includes value of 12 or 12.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_13”.  
If question “2” response includes value of 13 or 13.0, code as 1, otherwise code as 0.


create ratio variable “exp_neg” by computing sum of variables: exp_0, exp_2, exp_4, exp_6, exp_8 (range 0-5)

create ratio variable “exp_pos” by computing sum of variables: exp_1, exp_3, exp_5, exp_7 (range 0-4)

create ratio variable “exp_neut” by computing sum of variables: exp_9, exp_10, exp_11, exp_12, exp_13 (range 0-5)


In [None]:
def _q2_codes(raw):
    """Return the set of integer codes (0-13) selected in a question-2 answer.

    ``raw`` may be a single number (``3`` or ``3.0``), a numeric string, a
    comma/space separated string, or a list/tuple/set of any of these.
    Entries that cannot be read as a number, or that fall outside 0-13, are
    ignored.
    """
    if raw is None:
        return set()
    if isinstance(raw, str):
        # Treat "10" as one code (not the characters '1' and '0') and
        # accept "1, 3" / "[1, 3]" style text as several codes.
        items = raw.strip('[](){} ').replace(',', ' ').split()
    elif isinstance(raw, (list, tuple, set, frozenset)):
        items = raw
    else:
        items = [raw]

    codes = set()
    for item in items:
        try:
            code = int(float(item))
        except (TypeError, ValueError, OverflowError):
            # not a finite number: skip this entry, keep the others
            continue
        if 0 <= code <= 13:
            codes.add(code)
    return codes


def _to_score(value):
    """Return ``value`` as a float, or NaN when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def process_pf(data):
    """Turn one 'results_updated' entry into a single-row DataFrame.

    Parameters
    ----------
    data : str
        Text of a Python dict literal that maps question numbers (the
        strings '1' to '15') to the participant's answers.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with these columns, in this order:

        * 'activity' - the answer to question 1
        * 'pf_03' ... 'pf_15' - the answers to questions 3-15
          (NaN when a question is absent)
        * 'pf_mgt' - mean of the numeric pf_03..pf_15 answers, NaN ignored
        * 'exp_0' ... 'exp_13' - 1 if question 2 includes that code, else 0
        * 'exp_neg' - exp_0 + exp_2 + exp_4 + exp_6 + exp_8
        * 'exp_pos' - exp_1 + exp_3 + exp_5 + exp_7
        * 'exp_neut' - exp_9 + exp_10 + exp_11 + exp_12 + exp_13

    Raises
    ------
    ValueError, SyntaxError
        If ``data`` is not a valid Python literal.
    """
    import ast

    # NOTE: this used to be eval(data). literal_eval accepts only plain
    # literals (dict/list/str/number/...), so text coming from the CSV can
    # never execute code.
    answers = ast.literal_eval(data)
    if not isinstance(answers, dict):
        # nothing usable: every question is treated as unanswered
        answers = {}

    # Question 2: one 0/1 indicator per code plus the three sums.
    selected = _q2_codes(answers.get('2'))
    q2_row = {f'exp_{code}': int(code in selected) for code in range(14)}
    q2_row['exp_neg'] = sum(q2_row[f'exp_{code}'] for code in (0, 2, 4, 6, 8))
    q2_row['exp_pos'] = sum(q2_row[f'exp_{code}'] for code in (1, 3, 5, 7))
    q2_row['exp_neut'] = sum(q2_row[f'exp_{code}'] for code in (9, 10, 11, 12, 13))

    # Questions 1 and 3-15. A missing question is NaN regardless of whether
    # question 2 is present (previously a missing question 2 left every item,
    # and therefore pf_mgt, at 0).
    row = {'activity': answers.get('1', np.nan)}
    item_answers = {f'pf_{q:02d}': answers.get(str(q), np.nan) for q in range(3, 16)}
    row.update(item_answers)

    # Main psychological flexibility score: mean of items 3-15, skipping NaN.
    item_scores = pd.Series([_to_score(v) for v in item_answers.values()], dtype='float64')
    row['pf_mgt'] = item_scores.mean()

    row.update(q2_row)
    return pd.DataFrame([row], columns=list(row))

In [None]:
# Process every completed survey entry into a one-row frame and stack them.
# Series.iteritems() and DataFrame.append() were removed in pandas 2.0, and
# appending inside a loop copies the whole frame on every iteration, so the
# rows are collected first and concatenated once.
_pf_rows = [
    process_pf(entry)
    for entry in data_PF_S3_PFonly_completed['results_updated']
]
# pd.concat raises on an empty list; fall back to an empty frame in that case
pf_final = pd.concat(_pf_rows) if _pf_rows else pd.DataFrame()
# Every one-row frame carries index 0. Renumber the rows 0..n-1; the old
# index is kept as an 'index' column, as before.
pf_final.reset_index(inplace=True)

In [None]:
# Renumber the completed-survey rows 0..n-1 (the previous index is kept as an
# 'index' column), then place the processed answers next to them column-wise.
data_PF_S3_PFonly_completed.reset_index(inplace=True)
pf_final = pd.concat(
    (data_PF_S3_PFonly_completed, pf_final),
    axis='columns',
)

In [None]:
# Show the first five rows of the merged frame.
pf_final.head(n=5)

In [None]:
# number of surveys sent on each survey date
sent_per_day = data_PF_S3_PFonly.groupby('survey_dt')['participant_id'].count()
# number of surveys completed on each survey date
completed_per_day = (
    data_PF_S3_PFonly[data_PF_S3_PFonly['completed'] == 1]
    .groupby('survey_dt')['completed']
    .count()
)

# Draw both lines on the same axes explicitly instead of relying on the
# implicit "current axes".
ax = sent_per_day.plot()
completed_per_day.plot(ax=ax)
ax.legend(('sent', 'completed'))
ax.set_title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
# plt.show must be called; the bare name `plt.show` does nothing
plt.show()

In [None]:
# surveys sent and surveys completed on each survey date
sent_per_day = data_PF_S3_PFonly.groupby('survey_dt')['participant_id'].count()
completed_per_day = (
    data_PF_S3_PFonly[data_PF_S3_PFonly['completed'] == 1]
    .groupby('survey_dt')['participant_id']
    .count()
)
# A date with no completed survey is absent from completed_per_day. Count it
# as 0 completions so it is plotted as 0 % instead of a gap (NaN).
completed_per_day = completed_per_day.reindex(sent_per_day.index, fill_value=0)

ax = (completed_per_day / sent_per_day * 100).plot()

ax.set_ylabel('Response rate (%)')
ax.set_title('Survey response rate across time')
# plt.show must be called; the bare name `plt.show` does nothing
plt.show()