Explore the psychological flexibility (PF) EMA data after preprocessing (see the other notebooks for the preprocessing code), and identify outliers and careless responding among participants and across survey administration dates.

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

#let pandas display long and wide tables without truncating them
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

#silence all warnings for the rest of the session
import warnings
warnings.simplefilter('ignore')
#print numpy floats without scientific notation
np.set_printoptions(suppress=True)

In [None]:
#folder holding the final, preprocessed TILES csv files
#the default is the original workstation location; set the TILES_FINAL_DATA_DIR environment variable
#to run the notebook on another machine without editing this cell
_final_data_dir = os.environ.get(
    'TILES_FINAL_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data',
).rstrip('/\\')

#preprocessed psychological flexibility (PF) EMA survey data
path_PF_clean = _final_data_dir + '/pf_final.csv'
#participant information table
path_part_info = _final_data_dir + '/participant_info.csv'

In [None]:
#read in csv containing participant info (read_csv already returns a DataFrame, no re-wrapping needed)
data_part_info = pd.read_csv(path_part_info)

print('Original data_part_info shape:\n', data_part_info.shape, '\n')
#ensure no replicate IDs (the number of unique IDs should equal the number of rows)
print('Original data_part_info unique IDs:\n', data_part_info['ParticipantID'].unique().shape, '\n')
#how much missing data is there?
print('Original data_part_info missing value counts:\n', data_part_info.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() prints its own report and returns None, so it is called on its own line;
#wrapping it in print() would append a stray "None" to the output
print('Original data_part_info data types:')
data_part_info.info()
print()

data_part_info.head()

In [None]:
#there should only be a single entry for each participant; count the rows per ParticipantID to find duplicates
print(data_part_info['ParticipantID'].value_counts())
#examine every row whose ParticipantID occurs more than once
#(looked up from the data instead of hard-coding one ID, so this keeps working if the file changes)
print(data_part_info[data_part_info['ParticipantID'].duplicated(keep=False)])
#keep the first entry per participant and delete the rest
#NOTE: rows are matched on ParticipantID only - check the printout above to confirm they really are duplicates
data_part_info = data_part_info.drop_duplicates(subset='ParticipantID', keep='first')
print(data_part_info['ParticipantID'].shape)

In [None]:
#read in csv from preprocessed psychological flexibility EMAs (read_csv already returns a DataFrame)
data_PF = pd.read_csv(path_PF_clean)

print('Original data_PF_S3 shape:\n', data_PF.shape, '\n')
#how many distinct participants have PF survey rows?
print('Original data_PF_S3 unique IDs:\n', data_PF['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() prints its own report and returns None, so it is called on its own line;
#wrapping it in print() would append a stray "None" to the output
print('Original data_PF_S3 data types:')
data_PF.info()
print()
#what is the participant non-response rate across the entire study?
print('Non-response rate for PF survey:\n', data_PF['completed_ts_utc'].isnull().sum() / data_PF.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0)
#NOTE(review): completion is keyed on 'results_updated' here while the rate above uses 'completed_ts_utc' - confirm the two always agree
data_PF['completed'] = np.where(data_PF['results_updated'].isnull(), 0, 1)

In [None]:
#add the information contained in data_part_info to data_PF
#first create a long version of data_part_info that repeats each participant's info row once for every
#row that participant has in data_PF (so both tables end up with the same number of rows)
#NOTE: the per-participant blocks are stacked in order of first appearance in data_PF, so they only line up
#row-for-row with data_PF if each participant's surveys are stored contiguously there

participants = data_PF['participant_id'].unique()

#collect the per-participant blocks and concatenate them once at the end:
#DataFrame.append was removed in pandas 2.0 and re-copied the growing table on every iteration
part_blocks = []
for part in participants:
    part_info = data_part_info[data_part_info['ParticipantID'] == part]
    n_surveys = int((data_PF['participant_id'] == part).sum())
    part_blocks.append(pd.concat([part_info] * n_surveys, ignore_index=True))

#an empty data_PF yields no blocks; fall back to an empty frame as before
data_part_info_long = pd.concat(part_blocks) if part_blocks else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_PF and data_part_info_long are the same length:', data_PF.shape[0] == data_part_info_long.shape[0])
print(data_PF.shape[0])
print(data_part_info_long.shape[0])

In [None]:
#merge data tables side by side: row i of data_PF is paired with row i of data_part_info_long
#a positional pairing is only meaningful when both tables have the same number of rows, so stop early
#instead of silently producing shifted / NaN-padded rows
if data_PF.shape[0] != data_part_info_long.shape[0]:
    raise ValueError('data_PF and data_part_info_long must have the same number of rows to be combined')
#reset_index(drop=True) renumbers both tables 0..n-1 so concat pairs them by position
#drop=True avoids adding two identically named 'index' columns to the result, and not using inplace
#leaves the input tables untouched so this cell can be re-run safely
data_PF_final = pd.concat(
    [data_PF.reset_index(drop=True), data_part_info_long.reset_index(drop=True)],
    axis = 1)
print(data_PF_final.shape)
data_PF_final.head()

In [None]:
#number each participant's surveys 0, 1, 2, ... in row order and store the result in 'survey_count'
part_id = data_PF_final['participant_id'].unique()
for pid in part_id:
    #boolean mask selecting this participant's rows
    is_pid = data_PF_final['participant_id'] == pid
    data_PF_final.loc[is_pid, 'survey_count'] = np.arange(is_pid.sum())

In [None]:
#change dates from objects to datetimes
#every column uses the explicit 'datetime64[ns]' dtype: the unit-less 'datetime64' is rejected by pandas >= 2.0
date_cols = ['survey_dt', 'delivered_ts_utc', 'started_ts_utc', 'completed_ts_utc', 'ingested_ts_utc']
for date_col in date_cols:
    data_PF_final[date_col] = data_PF_final[date_col].astype('datetime64[ns]')

#confirm change (info() prints its own report and returns None, so it is not wrapped in print())
data_PF_final.info()

In [None]:
#make new column holding the numeric value of each 'activity' answer (code -1 for write in responses)
activity_num = []
#iterate over the one column that is needed instead of building a full row object per survey
for answer in data_PF_final['activity']:
    try:
        activity_num.append(float(answer))
    #only a failed conversion marks a write-in response; a bare except would also hide unrelated errors
    #and swallow KeyboardInterrupt
    except (TypeError, ValueError):
        activity_num.append(-1)

data_PF_final['activity_num'] = activity_num

In [None]:
#compute time (in seconds) between when survey is sent and when participant starts the survey
#.dt.total_seconds() gives float seconds on every pandas version; .astype('timedelta64[s]') only did so
#before pandas 2.0 and now keeps a timedelta dtype, which numeric comparisons such as `<= 600` cannot use
#NOTE: fractional seconds are kept rather than truncated
data_PF_final['start_delay'] = (data_PF_final['started_ts_utc'] - data_PF_final['delivered_ts_utc']).dt.total_seconds()
#compute time (in seconds) between when participant starts the survey and when they complete it
data_PF_final['time_to_complete'] = (data_PF_final['completed_ts_utc'] - data_PF_final['started_ts_utc']).dt.total_seconds()

In [None]:
#each participant should have 50 survey entries; tally the rows per participant to check
surveys_per_participant = data_PF_final['participant_id'].value_counts()
surveys_per_participant

In [None]:
#number of surveys sent at each position in the participants' survey series
ax = data_PF_final.groupby('survey_count')['participant_id'].count().plot()
#number of those surveys that were completed
ax = data_PF_final[data_PF_final['completed'] == 1].groupby('survey_count')['completed'].count().plot()
plt.legend(('sent', 'completed'))
plt.title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#number of surveys sent on each survey date
ax = data_PF_final.groupby('survey_dt')['participant_id'].count().plot()
#number of those surveys that were completed
ax = data_PF_final[data_PF_final['completed'] == 1].groupby('survey_dt')['completed'].count().plot()
plt.legend(('sent', 'completed'))
plt.title('Surveys sent vs completed across time')
ax.set_ylabel('Count')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#split off data for surveys that were completed
#.copy() makes this an independent table, so columns added to it afterwards are stored on this table
#itself rather than on a temporary slice of data_PF_final
data_PF_completed = data_PF_final[data_PF_final['completed'] == 1].copy()
print(data_PF_completed.shape)
#count of null values per column among the completed surveys
data_PF_completed.isnull().sum()

In [None]:
#a completed survey should have no nans in the columns 'activity' through 'pf_mgt'; find the rows that do
required_answers = data_PF_completed.loc[:, 'activity':'pf_mgt']
has_missing_answer = required_answers.isnull().sum(axis=1) > 0
pf_no_comp = data_PF_completed.loc[has_missing_answer]
print(pf_no_comp.shape)
#save the incomplete rows for inspection
pf_no_comp.to_csv('pf_no_comp.csv')
#participants did not have to answer 'exp_0' to 'exp_13'

In [None]:
#long string analysis: for every completed survey, find the longest run of identical consecutive
#answers across the items 'pf_03' through 'pf_15' and store it in a new column
max_string = [
    max(len(list(run)) for _, run in groupby(answers.loc['pf_03':'pf_15']))
    for _, answers in data_PF_completed.iterrows()
]

data_PF_completed['longest_string_pf'] = max_string

In [None]:
#new data frame of only the first 50 surveys per participant (survey_count 0-49)
#.copy() makes it an independent table so that columns can be added to it safely afterwards
data_PF_completed_first50 = data_PF_completed[data_PF_completed['survey_count'] < 50].copy()

In [None]:
#histogram (50 bins) of time_to_complete for the completed first-50 surveys
completion_times_first50 = data_PF_completed_first50['time_to_complete']
completion_times_first50.plot(kind='hist', bins=50)

In [None]:
#bin the first-50 table using its own values; the bins were previously cut from data_PF_completed
#(all completed surveys), so the bin edges were set by rows that are not part of this table
#create 10 equal-width bins of survey_count (bins of 5 surveys when the counts run from 0 to 49)
data_PF_completed_first50['survey_count_by5'] = pd.cut(data_PF_completed_first50['survey_count'], 10, labels=False)
#create bins of start_delay
#NOTE(review): 10 bins are used although the column name ends in '_3' - confirm the intended number of bins
data_PF_completed_first50['start_delay_3'] = pd.cut(data_PF_completed_first50['start_delay'], 10, labels=False)
#create 3 bins of time_to_complete
data_PF_completed_first50['time_to_complete_3'] = pd.cut(data_PF_completed_first50['time_to_complete'], 3, labels=False)
#create 3 bins of longest_string_pf
data_PF_completed_first50['longest_string_pf_3'] = pd.cut(data_PF_completed_first50['longest_string_pf'], 3, labels=False)

In [None]:
#response rate (%) at each position in the survey series = completed surveys / sent surveys * 100
#data_PF_final is used because 'survey_count' is added to that table, not to data_PF
completed_per_count = data_PF_final[data_PF_final['completed'] == 1].groupby('survey_count')['participant_id'].count()
sent_per_count = data_PF_final.groupby('survey_count')['participant_id'].count()
ax = (completed_per_count / sent_per_count * 100).plot()

ax.set_ylabel('Response rate (%)')
plt.title('Survey response rate across time')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#mean delay to starting the survey at each position in the survey series
ax = data_PF_completed.groupby('survey_count')['start_delay'].mean().plot()

plt.title('Delay to starting survey across survey dates')
#start_delay is computed in seconds, so the axis is labelled in seconds rather than 'Min'
ax.set_ylabel('Mean start delay (s)')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#mean time to complete the survey at each position in the survey series
ax = data_PF_completed.groupby('survey_count')['time_to_complete'].mean().plot()

plt.title('Time to complete survey across survey dates')
#time_to_complete is computed in seconds, so the axis is labelled in seconds rather than 'Min'
ax.set_ylabel('Mean time to complete (s)')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#preview the first five rows of the completed-survey table
data_PF_completed.head(5)

In [None]:
#distribution across participants of each participant's mean time to complete a survey
ax = data_PF_completed.groupby(['participant_id'])['time_to_complete'].mean().plot(kind='hist', bins = 75)

plt.title('Time to complete survey across participants')
ax.set_ylabel('Number of participants')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#mean pf_mgt at each position in the survey series
ax = data_PF_completed.groupby(['survey_count'])['pf_mgt'].mean().plot()

plt.title('pf_mgt across survey dates')
ax.set_ylabel('pf_mgt')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#mean longest string (longest run of identical answers) at each position in the survey series
ax = data_PF_completed.groupby('survey_count')['longest_string_pf'].mean().plot()

plt.title('Longest string across survey dates')
ax.set_ylabel('Longest string')
#plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
#variables to compare against each other
dep_vars = [
    'pf_mgt',
    'exp_neg',
    'exp_pos',
    'exp_neut',
    'survey_count',
    'start_delay',
    'time_to_complete',
    'longest_string_pf',
]
#pairwise plots with fitted regression lines, using only the rows where all of these variables are present
pairplot_rows = data_PF_completed[dep_vars].dropna()
sns.pairplot(pairplot_rows, kind='reg')

In [None]:
#pairwise correlation matrix of the variables listed in dep_vars
data_PF_completed.loc[:, dep_vars].corr()

In [None]:
#bar chart of the mean (error bars: standard error of the mean) of every variable in dep_vars, per activity code
#group once and reuse the grouping for both the mean and the sem
by_activity = data_PF_completed.groupby(['activity_num'])
for param in dep_vars:
    plt.figure(figsize=(10,5))
    by_activity[param].mean().plot(kind='bar', yerr=by_activity[param].sem())
    #pass the name itself: a one-element list would be rendered with brackets and quotes
    plt.ylabel(param)
    plt.show()

In [None]:
#bar chart of the mean (error bars: standard error of the mean) of every variable in dep_vars, per longest-string value
#group once and reuse the grouping for both the mean and the sem
by_longest_string = data_PF_completed.groupby(['longest_string_pf'])
for param in dep_vars:
    plt.figure(figsize=(10,5))
    by_longest_string[param].mean().plot(kind='bar', yerr=by_longest_string[param].sem())
    #pass the name itself: a one-element list would be rendered with brackets and quotes
    plt.ylabel(param)
    plt.show()

In [None]:
#histogram (50 bins) of time_to_complete, restricted to surveys with a value of at most 600
within_600 = data_PF_completed['time_to_complete'] <= 600
data_PF_completed.loc[within_600, 'time_to_complete'].plot(kind='hist', bins=50)

In [None]:
#how often each time_to_complete value occurs, most frequent first
completion_times = data_PF_completed['time_to_complete']
completion_times.value_counts()

In [None]:
#number of completed surveys with time_to_complete of at most 5 (displayed as a shape tuple)
very_fast = data_PF_completed['time_to_complete'] <= 5
data_PF_completed.loc[very_fast, 'time_to_complete'].shape