# Setup

## Imports

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns

## Set Paths

In [21]:
# set run
run_num = 1

if run_num ==1:
    # run 1
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/'
    subjects_run1 = pd.read_csv(path + '../run1_subjects.csv')
    subjects = subjects_run1.ParticipantIdentifier
elif run_num ==2:
    # run 2
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_2/'
    subjects_run2 = pd.read_csv(path + '../run2_subjects.csv')
    subjects = subjects_run2.ParticipantIdentifier
    
eda_reports_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_4_outputs/EDA/'

# Active Data

## Import Data

In [3]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

In [4]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyQuestionResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

  0%|          | 0/87 [00:00<?, ?it/s]

100%|██████████| 87/87 [00:02<00:00, 31.59it/s]


In [5]:
df.shape

(597839, 8)

In [6]:
# select relevant columns
df = df[['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'EndDate']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate
0,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report1,19,2022-10-21T21:55:56-04:00
1,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report2,2,2022-10-21T21:55:59-04:00
2,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info1,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
3,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info2,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
4,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info3,"{""totalEarnings"":0}",2022-10-24T07:04:42-04:00


In [7]:
df.isna().sum()

ParticipantIdentifier     0
ResultIdentifier          0
Answers                   2
EndDate                  15
dtype: int64

In [8]:
# Remove rows without valid EndDate value
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

# Select relevant subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [9]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4.75)).date() # trial day associated with sample (4:45am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 586408/586408 [01:00<00:00, 9686.29it/s]


In [None]:
# save to csv
if run_num ==1:
    # run 1
    df.to_csv(save_path + 'run1_survey_results.csv', index=False)
if run_num ==2:
    # run 2
    df.to_csv(save_path + 'run2_survey_results.csv', index=False)

# Gap App

## Self Report

### Affect

#### Select Data

In [10]:
df_affect = df.loc[df.ResultIdentifier.str.startswith('affect_')].reset_index(drop=True)
df_affect_am = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)
df_affect_pm = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (~df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)

#### Convert to Wide

In [15]:
df_affect_pm_wide = df_affect_pm.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_pm_wide = df_affect_pm_wide.rename_axis(None, axis=1)

df_affect_am_wide = df_affect_am.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_am_wide = df_affect_am_wide.rename_axis(None, axis=1)

In [16]:
# join
df_daily_affect_wide = df_affect_pm_wide.merge(df_affect_am_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

df_daily_affect_wide.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,1.0,4.0,5.0,4.0,1.0,3.0,4.0,4.0,...,,,,,,,,,,
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,1.0,2.0,1.0,1.0,1.0,3.0,3.0,3.0,...,2.0,2.0,1.0,3.0,4.0,4.0,4.0,5.0,4.0,3.0
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,3.0,1.0,4.0,5.0,1.0,4.0,2.0,5.0,...,,,,,,,,,,


In [47]:
df_complete_idDate.trial_date[0].date()

datetime.date(2022, 9, 27)

In [45]:
df_daily_affect_wide.trial_date[0]

datetime.date(2022, 9, 30)

In [48]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date

# Join with affect df
df_daily_affect_wide = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

#### EDA Profiling

In [17]:
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [52]:
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:], title="Affect | Pandas Profiling Report")
profile.to_file(eda_reports_path + "affect_report2.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 450/450 [00:23<00:00, 19.24it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.50s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 37.65it/s]


In [20]:
df_daily_affect_wide.loc[df_daily_affect_wide['affect_neg_angry']>5]

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
321,147400db-43d9-4155-8bf2-b85b8adf4315,2022-12-02,16.5,16.5,16.5,16.5,16.5,16.5,16.5,16.5,...,,,,,,,,,,
424,156895d9-0f71-4844-92dd-3fb298f84f2b,2022-12-17,11.0,2.0,10.5,10.5,10.5,2.0,16.0,20.5,...,1.0,1.0,1.0,1.0,1.0,2.0,4.0,3.0,2.0,4.0
505,1e7aef96-16cc-43f8-95d4-e3bc582eb6d3,2022-10-04,5.5,5.5,5.5,5.5,5.5,7.0,5.5,27.0,...,1.0,2.0,2.0,3.0,1.0,1.0,5.0,2.0,1.0,4.0
790,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,2022-09-29,12.0,22.0,12.0,12.0,12.0,12.0,7.0,12.0,...,,,,,,,,,,
2206,852b45ca-f506-42b2-b5bd-2b0072b1b28b,2022-10-03,5.5,11.5,17.5,5.5,17.0,22.0,22.0,22.5,...,,,,,,,,,,
2658,a57c95d0-1fcc-4a16-aba7-294259f89fd7,2022-10-08,21.5,16.0,6.0,27.0,17.0,27.5,12.0,12.5,...,,,,,,,,,,
2704,a57c95d0-1fcc-4a16-aba7-294259f89fd7,2022-12-14,16.0,11.0,22.0,11.0,11.5,16.5,16.5,21.0,...,,,,,,,,,,
3949,f889f1a4-9754-456e-ae08-092f992d3359,2022-10-09,5.5,5.5,20.5,5.5,5.5,17.0,5.5,22.0,...,,,,,,,,,,


#### Save

In [None]:
# save to csv
df_daily_affect_wide.to_csv(save_path + 'run2_affect.csv', index=False)