Preprocess and clean psychological flexibility EMA data from S3 bucket

### The 'results_updated' column contains the answers to psychological flexibility survey questions 1-15; these need to be processed according to JV's instructions, as follows:

#### instructions from JV for question 1, 3-15

relabel “1” as “activity”

create ordinal variable “pf_mgt” by computing mean of items 3-15 (range 1-5) – this is the main psychological flexibility score


#### instructions from JV for question 2

create binary categorical variable “exp_0”.  
If question “2” response includes value of 0 or 0.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_1”.  
If question “2” response includes value of 1 or 1.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_2”.  
If question “2” response includes value of 2 or 2.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_3”.  
If question “2” response includes value of 3 or 3.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_4”.  
If question “2” response includes value of 4 or 4.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_5”.  
If question “2” response includes value of 5 or 5.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_6”.  
If question “2” response includes value of 6 or 6.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_7”.  
If question “2” response includes value of 7 or 7.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_8”.  
If question “2” response includes value of 8 or 8.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_9”.  
If question “2” response includes value of 9 or 9.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_10”.  
If question “2” response includes value of 10 or 10.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_11”.  
If question “2” response includes value of 11 or 11.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_12”.  
If question “2” response includes value of 12 or 12.0, code as 1, otherwise code as 0.

create binary categorical variable “exp_13”.  
If question “2” response includes value of 13 or 13.0, code as 1, otherwise code as 0.


create ratio variable “exp_neg” by computing sum of variables: exp_0, exp_2, exp_4, exp_6, exp_8 (range 0-5)

create ratio variable “exp_pos” by computing sum of variables: exp_1, exp_3, exp_5, exp_7 (range 0-4)

create ratio variable “exp_neut” by computing sum of variables: exp_9, exp_10, exp_11, exp_12, exp_13 (range 0-5)


In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re

#notebook display configuration: let pandas show long/wide frames without truncation
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

#silence every warning for the rest of the session
import warnings
warnings.simplefilter(action='ignore')

#print numpy floats in fixed-point rather than scientific notation
np.set_printoptions(suppress=True)

In [None]:
#local copy of the app-survey export downloaded from the S3 bucket
path_PF_S3 = (
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/PF_survey_fix/S3_app_surveys_updated.csv'
)

In [None]:
#read in csv from S3 bucket; psychological flexibility EMAs in long form
#pd.read_csv already returns a DataFrame, so no additional pd.DataFrame(...) wrap is needed
data_PF_S3 = pd.read_csv(path_PF_S3)
print(data_PF_S3.shape)

In [None]:
#quick overview of the raw export: overall size, distinct surveys/participants, rows per survey type
frame_shape = data_PF_S3.shape
print('data_PF_S3 shape:\n', frame_shape, '\n')

distinct_survey_ids = data_PF_S3['survey_id'].unique()
print('data_PF_S3 unique survey_id shape:\n', distinct_survey_ids.shape, '\n')

distinct_participant_ids = data_PF_S3['participant_id'].unique()
print('data_PF_S3 unique participant_id shape:\n', distinct_participant_ids.shape, '\n')

rows_per_survey_type = data_PF_S3['survey_type'].value_counts()
print('data_PF_S3 unique survey_types:\n', rows_per_survey_type, '\n')

In [None]:
#keep only the rows that belong to the psych_flex survey
is_psych_flex = data_PF_S3['survey_type'] == 'psych_flex'
data_PF_S3_PFonly = data_PF_S3[is_psych_flex]
print(data_PF_S3_PFonly.shape)

In [None]:
#count psych_flex entries per participant (each participant should have 50 survey entries)
participant_ids = data_PF_S3_PFonly['participant_id']
data_PF_S3_PFonly_participant_vc = participant_ids.value_counts()

#write the counts to csv so they can be shared with JV
data_PF_S3_PFonly_participant_vc.to_csv(path_or_buf='data_PF_S3_PFonly_participant_vc.csv')

In [None]:
#question-2 codes and the groups that are summed into exp_neg / exp_pos / exp_neut
_EXP_CODES = range(14)
_EXP_NEG_CODES = (0, 2, 4, 6, 8)
_EXP_POS_CODES = (1, 3, 5, 7)
_EXP_NEUT_CODES = (9, 10, 11, 12, 13)
_Q2_COLUMNS = [f'exp_{code}' for code in _EXP_CODES] + ['exp_neg', 'exp_pos', 'exp_neut']

#survey items that feed the psychological flexibility score pf_mgt
_PF_ITEMS = range(3, 16)
_PF_COLUMNS = [f'pf_{item:02d}' for item in _PF_ITEMS]


def _parse_survey_entry(data):
    """Parse one raw 'results_updated' cell.

    Returns None when the cell is not a string or cannot be parsed as a Python
    literal (i.e. the survey was not answered). Returns the parsed dict when the
    literal is a dict, and an empty dict for any other literal, so that such an
    entry is handled like a survey in which no question was answered.
    """
    import ast

    if not isinstance(data, str):
        return None
    try:
        #ast.literal_eval only accepts literals; unlike eval() it cannot execute
        #code that happens to be embedded in the csv
        parsed = ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    return parsed if isinstance(parsed, dict) else {}


def _encode_q2(answer):
    """Turn the question-2 answer into the exp_* indicator and sum columns.

    `answer` may be a single code (int, float such as 6.0, or str) or an
    iterable of codes. Each code is converted with int(). Returns a dict keyed
    by the exp_* column names, or None when any code cannot be converted or lies
    outside 0-13.
    """
    if isinstance(answer, bool):
        return None
    #a lone code is wrapped so it is never iterated character by character
    selections = [answer] if isinstance(answer, (int, float, str)) else answer

    flags = dict.fromkeys(_Q2_COLUMNS, 0)
    try:
        for item in selections:
            code = int(item)
            if code not in _EXP_CODES:
                #an unknown code must not create an extra exp_* column
                return None
            flags[f'exp_{code}'] = 1
    except (TypeError, ValueError, OverflowError):
        return None

    flags['exp_neg'] = sum(flags[f'exp_{code}'] for code in _EXP_NEG_CODES)
    flags['exp_pos'] = sum(flags[f'exp_{code}'] for code in _EXP_POS_CODES)
    flags['exp_neut'] = sum(flags[f'exp_{code}'] for code in _EXP_NEUT_CODES)
    return flags


def process_pf(data):
    """Process one psychological flexibility survey entry into a one-row DataFrame.

    Parameters
    ----------
    data : str or any
        Raw 'results_updated' cell: the string form of a dict keyed by question
        number ('1'-'15'). Anything that is not a parseable string is treated
        as an unanswered survey.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns, in this order: 'activity' (question 1),
        'pf_03'-'pf_15' (questions 3-15), 'pf_mgt' (mean of the answered pf
        items), 'exp_0'-'exp_13' (indicators for the question-2 codes) and
        'exp_neg', 'exp_pos', 'exp_neut' (sums of the indicator groups).

        Questions missing from the entry are NaN. If question 2 is missing or
        malformed, every exp_* column is NaN.
    """
    pf_row = dict.fromkeys(['activity'] + _PF_COLUMNS, np.nan)
    #NOTE(review): an unanswered/unparseable survey keeps exp_* at 0 (not NaN),
    #as in the previous implementation - confirm this is what the analysis expects
    q2_row = dict.fromkeys(_Q2_COLUMNS, 0)

    answers = _parse_survey_entry(data)
    if answers is not None:
        #question 2: multi-select, NaN everywhere when absent or malformed
        q2_flags = _encode_q2(answers['2']) if '2' in answers else None
        if q2_flags is None:
            q2_flags = dict.fromkeys(_Q2_COLUMNS, np.nan)
        q2_row = q2_flags

        #questions 1 and 3-15: copy whatever was answered
        pf_row['activity'] = answers.get('1', np.nan)
        for item, column in zip(_PF_ITEMS, _PF_COLUMNS):
            pf_row[column] = answers.get(str(item), np.nan)

    #main psychological flexibility score: mean over the answered items only
    pf_scores = pd.to_numeric(
        pd.Series([pf_row[column] for column in _PF_COLUMNS], dtype=object),
        errors='coerce',
    )
    pf_row['pf_mgt'] = pf_scores.mean()

    return pd.DataFrame([{**pf_row, **q2_row}], columns=list(pf_row) + _Q2_COLUMNS)

In [None]:
#names of the processed columns, in the order process_pf returns them
col_names = (
    ['activity']
    + [f'pf_{item:02d}' for item in range(3, 16)]
    + ['pf_mgt']
    + [f'exp_{code}' for code in range(14)]
    + ['exp_neg', 'exp_pos', 'exp_neut']
)

#pf_final = the psych_flex rows plus empty placeholder columns for the processed values
empty_processed = pd.DataFrame(columns=col_names, index=data_PF_S3_PFonly.index)
pf_final = pd.concat([data_PF_S3_PFonly, empty_processed], axis = 1)

#process every survey entry and write its values into the matching row
for index, entry in data_PF_S3_PFonly['results_updated'].items():
    processed_row = process_pf(entry)
    pf_final.loc[index, col_names] = processed_row.values[0]

pf_final.head()

In [None]:
#write the processed table to csv for subsequent analysis and for sharing with JV and the rest of the group
pf_final.to_csv(path_or_buf='pf_final.csv')