# Setup

## Imports

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


## Set Paths

In [11]:
# set run
run_num = 1

if run_num ==1:
    # run 1
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/'
    subjects_run1 = pd.read_csv(path + '../run1_subjects.csv')
    subjects = subjects_run1.ParticipantIdentifier
elif run_num ==2:
    # run 2
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_2/'
    subjects_run2 = pd.read_csv(path + '../run2_subjects.csv')
    subjects = subjects_run2.ParticipantIdentifier
    
eda_reports_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_4_outputs/EDA/'

# Active Data

## Import Data

In [30]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

deleted existing df


In [31]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyQuestionResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:02<00:00, 32.24it/s]


In [32]:
df.shape

(597839, 8)

In [33]:
# select relevant columns
df = df[['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'EndDate']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate
0,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report1,19,2022-10-21T21:55:56-04:00
1,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report2,2,2022-10-21T21:55:59-04:00
2,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info1,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
3,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info2,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
4,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info3,"{""totalEarnings"":0}",2022-10-24T07:04:42-04:00


In [34]:
df.isna().sum()

ParticipantIdentifier     0
ResultIdentifier          0
Answers                   2
EndDate                  15
dtype: int64

In [35]:
# Remove rows without valid EndDate value
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

# Select relevant subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [36]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4.75)).date() # trial day associated with sample (4:45am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 586408/586408 [00:58<00:00, 9964.65it/s] 


In [37]:
# save to csv
if run_num ==1:
    # run 1
    df.to_csv(save_path + 'run1_survey_results.csv', index=False)
if run_num ==2:
    # run 2
    df.to_csv(save_path + 'run2_survey_results.csv', index=False)

# Gap App

## Self Report

### Affect

#### Load Data

In [52]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_affect_wide' in globals():
    del(df_daily_affect_wide)
    print('deleted affect df')

deleted existing df
deleted affect df


In [53]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [54]:
df_affect = df.loc[df.ResultIdentifier.str.startswith('affect_')].reset_index(drop=True)
df_affect_am = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)
df_affect_pm = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (~df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)

#### Convert to Wide

In [56]:
df_affect_pm_wide = df_affect_pm.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_pm_wide = df_affect_pm_wide.rename_axis(None, axis=1)

df_affect_am_wide = df_affect_am.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_am_wide = df_affect_am_wide.rename_axis(None, axis=1)

In [57]:
# join
df_daily_affect_wide = df_affect_pm_wide.merge(df_affect_am_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

df_daily_affect_wide.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,1.0,4.0,5.0,4.0,1.0,3.0,4.0,4.0,...,,,,,,,,,,
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,1.0,2.0,1.0,1.0,1.0,3.0,3.0,3.0,...,2.0,2.0,1.0,3.0,4.0,4.0,4.0,5.0,4.0,3.0
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,3.0,1.0,4.0,5.0,1.0,4.0,2.0,5.0,...,,,,,,,,,,


In [59]:
type(df_daily_affect_wide.iloc[0,1])

str

In [60]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_affect_wide['trial_date'] = pd.to_datetime(df_daily_affect_wide['trial_date']).dt.date

# Join with affect df
df_daily_affect_wide = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

#### EDA Profiling

In [61]:
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:], title="Affect | Pandas Profiling Report")
profile.to_file(eda_reports_path + "affect_report.html")

Summarize dataset:   2%|▏         | 1/45 [00:00<00:01, 29.53it/s, Describe variable:affect_neg_ashamed]

Summarize dataset: 100%|██████████| 450/450 [00:21<00:00, 20.49it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 60.93it/s]


**NOTE**

There are a number of variables where the maximum values are well above 5, which is the maximum option that should be available. These glitch entries should be removed

#### Clean

##### Impossible Values

In [62]:
# Number of instances where the cell value is out of range (greater than 5)
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

157

In [63]:
# Replace all values below threshold with NaN
df_daily_affect_wide.iloc[:,2:] = np.where(df_daily_affect_wide.iloc[:,2:]>5, np.nan, df_daily_affect_wide.iloc[:,2:])

In [64]:
# Recheck for instances above 5
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

0

In [86]:
# Rerun profiling
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:],
                        title="Affect | Pandas Profiling Report",
                        infer_dtypes = False)
profile.to_file(eda_reports_path + "affect_report_clean.html")

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (982188103.py, line 4)

##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [67]:
# Drop 'trial_date' column
data = df_daily_affect_wide.drop(columns='trial_date', errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'variable', 'affect_neg_angry']]
result_df.columns = ['ParticipantIdentifier', 'ZeroVariance', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,ZeroVariance,Count
0,27329533-d0a4-4605-9da5-0eb857154cae,affect_neg_angry,4
1,329e2c06-a903-44ce-a409-8ed8c580b124,affect_neg_angry,25
2,d11241a0-932e-4931-83ee-f3d28f66875f,affect_neg_angry,4
3,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,affect_neg_ashamed,3
4,27329533-d0a4-4605-9da5-0eb857154cae,affect_neg_bored,4
...,...,...,...
182,90592e06-bcf6-4150-85b0-c5daf7e7569c,affect_pos_motivated_am,66
183,a8b5a9ea-b762-4f46-a431-6c530215c498,affect_pos_motivated_am,81
184,afbd4906-0513-42b1-91ce-d25065842f55,affect_pos_motivated_am,25
185,27f7805e-5951-47b4-9f42-4c6200001cc6,affect_pos_relaxedCalm_am,45


Some subjects had mulitple categories without any variance.

In [116]:
zero_var_cols = result_df.ParticipantIdentifier.value_counts().reset_index(name='affect_zeroVar_cols')
# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
zero_var_cols = zero_var_cols.merge(df_count, on='ParticipantIdentifier', how='left')

# add flag
zero_var_cols['affect_zeroVarCols_flag'] = False
zero_var_cols.loc[zero_var_cols.affect_zeroVar_cols > 2, 'affect_zeroVarCols_flag'] = True
zero_var_cols

Unnamed: 0,ParticipantIdentifier,affect_zeroVar_cols,total_count,affect_zeroVarCols_flag
0,27f7805e-5951-47b4-9f42-4c6200001cc6,15,45,True
1,d520094e-39dc-47da-b764-049277fa48ad,12,72,True
2,27329533-d0a4-4605-9da5-0eb857154cae,8,4,True
3,2f32cd19-e9c5-4aad-8999-6f4646169ab6,8,8,True
4,ff129772-aeab-4432-8136-8f94027b8504,8,16,True
5,38bcd1b2-f8bc-48ee-bff2-5bca24012983,8,36,True
6,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,7,3,True
7,3bb57dd9-1d69-471c-b769-b3323748bb9f,7,75,True
8,f889f1a4-9754-456e-ae08-092f992d3359,7,18,True
9,9330d6d9-c667-43be-b437-a3c988dd10d7,7,83,True


In [117]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(zero_var_cols.drop(columns=['total_count']), on='ParticipantIdentifier', how='left')

How many subjects had at least one column with no variance?

In [80]:
len(np.unique(result_df.ParticipantIdentifier))

49

Here is the subject who had 15 variables with no variance:

In [81]:
df_daily_affect_wide.loc[df_daily_affect_wide.ParticipantIdentifier == '27f7805e-5951-47b4-9f42-4c6200001cc6', :].dropna()

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
1191,27f7805e-5951-47b4-9f42-4c6200001cc6,2022-09-28,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,1.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0
1195,27f7805e-5951-47b4-9f42-4c6200001cc6,2022-10-02,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


But maybe variance within day is more important in terms of actually cleaning data. If someone enters the same value for every variable perhaps it is because they are not answering accurately and just trying to finish as quickly as possible.

In [120]:
# find rows with zero variance
idx = np.where(df_daily_affect_wide.drop(columns=['ParticipantIdentifier', 'trial_date']).var(axis=1) == 0)[0]
# calculate how many zero variance days per subject
df_zeroVar = df_daily_affect_wide.iloc[idx,:].groupby('ParticipantIdentifier').size().reset_index(name='zeroVar_count')

# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
df_zeroVar = df_zeroVar.merge(df_count, on='ParticipantIdentifier', how='left')
df_zeroVar['affect_pct_zeroVarRows'] = (df_zeroVar.zeroVar_count / df_zeroVar.total_count) * 100

# add flag
df_zeroVar['affect_zeroVarRows_flag'] = False
df_zeroVar.loc[df_zeroVar.affect_pct_zeroVarRows > 10, 'affect_zeroVarRows_flag'] = True
df_zeroVar

Unnamed: 0,ParticipantIdentifier,zeroVar_count,total_count,affect_pct_zeroVarRows,affect_zeroVarRows_flag
0,25ca39d7-4279-48fd-903f-d0927adadb77,8,25,32.0,True
1,412330b3-cc02-4030-96cd-f4cfdcc45fa6,3,83,3.614458,False
2,4217d9ff-07c0-42a5-9da9-f2e351b40709,2,44,4.545455,False
3,4965aa0d-a7a9-4d1c-b835-3f79a29e0d39,13,83,15.662651,True
4,98cb45a7-4057-4e81-b1d7-e7aede5e106e,3,30,10.0,False
5,b62eaadd-1819-41da-a70b-a46d4151db72,1,29,3.448276,False
6,bbd82a98-a1c9-4229-afc3-cc201067b909,3,64,4.6875,False
7,bf670311-c590-473a-98ab-d719ebf0f2ab,42,83,50.60241,True
8,c2097f36-4ca3-4537-856d-a649d1557553,1,83,1.204819,False


In [121]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(df_zeroVar.drop(columns=['zeroVar_count', 'total_count']), on='ParticipantIdentifier', how='left')

In [124]:
df_daily_affect_wide.sample(5)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag,affect_pct_zeroVarRows,affect_zeroVarRows_flag
5542,d973b82f-367b-4871-94a9-181421f870e8,2022-10-14,3.0,4.0,3.0,3.0,3.0,4.0,1.0,3.0,...,,,,,,,,,,
2346,bf670311-c590-473a-98ab-d719ebf0f2ab,2022-11-17,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,,,50.60241,True
2108,3223b573-6de0-4aeb-b005-dd2e467b1e62,2022-12-04,,,,,,,,,...,,,,,,,3.0,True,,
5421,f889f1a4-9754-456e-ae08-092f992d3359,2022-12-02,,,,,,,,,...,,,,,,,7.0,True,,
6337,6d9ce034-4fcd-407e-9e35-9dd9caafe1fe,2022-11-13,,,,,,,,,...,,,,,,,3.0,True,,


#### Save

In [65]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_affect_wide.to_csv(save_path + 'run1_affect.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_affect_wide.to_csv(save_path + 'run2_affect.csv', index=False)

### Daily General and Detail

#### Load Data

In [5]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [135]:
past24_general = [
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past48to24_gap',
    'DAILY_past48to24_gapCause',
    'DAILY_past24_ideal',
    'DAILY_past24_satisfaction',
    'DAILY_past24_change',
    'DAILY_past24_productivity',
    'DAILY_past24_procrastination',
    'DAILY_past24_punctuality',
    'DAILY_past24_mentalEffort',
    'DAILY_past24_physicalEffort',
    'DAILY_past24_values',
    'DAILY_past24_gap',
    'DAILY_past24_gapCause',
    'DAILY_past24_illness',
    'DAILY_past24_fatigue',
    'DAILY_past24_unusualEvents'
]

past24_categories = [
    'DAILY_past24_sleep',
    'DAILY_past24_occupation',
    'DAILY_past24_nonoccupation',
    'DAILY_past24_exercise',
    'DAILY_past24_leisureSolo',
    'DAILY_past24_leisureSoloMental',
    'DAILY_past24_leisureSoloPhysical',
    'DAILY_past24_leisureNonSolo',
    'DAILY_past24_leisureNonSoloMental',
    'DAILY_past24_leisureNonSoloPhysical',
    'DAILY_past24_diet',
    'DAILY_past24_socialMedia',
    'DAILY_past24_drinks'
]

next24_categories = [
    'DAILY_next24_sleep',
    'DAILY_next24_occupation',
    'DAILY_next24_nonoccupation',
    'DAILY_next24_leisureSolo',
    'DAILY_next24_leisureNonSolo',
    'DAILY_next24_exercise',
    'DAILY_next24_socialMedia',
    'DAILY_next24_drinks',
    'DAILY_next24_diet'
]

if run_num == 1:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week1',
        'DAILY_goal1_interaction_week2',
        'DAILY_goal1_interaction_month1',
        'DAILY_goal1_interaction_month2',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week1',
        'DAILY_goal2_interaction_week2',
        'DAILY_goal2_interaction_month1',
        'DAILY_goal2_interaction_month2',
        'DAILY_goal2_interaction_eachOther'
    ]

elif run_num == 2:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week',
        'DAILY_goal1_interaction_month',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week',
        'DAILY_goal2_interaction_month',
        'DAILY_goal2_interaction_eachOther'
    ]

non_numeric_cols = [
    'DAILY_goal1_set',
    'DAILY_goal2_set',
    'ParticipantIdentifier', 
    'trial_date',
    'DAILY_next24_diet',
    'DAILY_past48to24_gapCause',
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past24_gapCause'
]

In [136]:
df_daily_sr = df.loc[df.ResultIdentifier.isin(past24_general + 
                                              past24_categories + 
                                              next24_categories + 
                                              specific_goals)].reset_index(drop=True)

In [137]:
df_daily_sr.head(3)

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate,datetime,trial_date,time
0,90592e06-bcf6-4150-85b0-c5daf7e7569c,DAILY_survey_situation1_surveys,alone,2022-10-24T19:59:12-04:00,2022-10-24 19:59:12-04:00,2022-10-24,19:59:12
1,90592e06-bcf6-4150-85b0-c5daf7e7569c,DAILY_survey_missed,did_not_miss,2022-10-24T19:59:13-04:00,2022-10-24 19:59:13-04:00,2022-10-24,19:59:13
2,90592e06-bcf6-4150-85b0-c5daf7e7569c,DAILY_past24_ideal,13,2022-10-24T19:59:17-04:00,2022-10-24 19:59:17-04:00,2022-10-24,19:59:17


#### Convert to Wide

In [138]:
df_daily_sr_wide = df_daily_sr.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(x)).reset_index()
# get rid of name on index
df_daily_sr_wide = df_daily_sr_wide.rename_axis(None, axis=1)

In [139]:
# convert data to numeric where appropriate
df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols)]] = df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

In [140]:
# Break gap cause into two columns
df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
df_daily_sr_wide.drop(columns='drop_col', inplace=True)

# convert to numeric 0-1
cols = ['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

In [141]:
# REPEAT FOR MISSED DAY DATA
# Break gap cause into two columns
df_daily_sr_wide[['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']] = df_daily_sr_wide.DAILY_past48to24_gapCause.str.split("_", expand = True)

# convert to numeric 0-1
cols = ['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

In [142]:
df_daily_sr_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,DAILY_goal1_confidence,DAILY_goal1_consequences,DAILY_goal1_effort,DAILY_goal1_importance,DAILY_goal1_interaction_month,DAILY_goal1_interaction_week,DAILY_goal1_motivationExternal,DAILY_goal1_motivationInternal,...,DAILY_past24_values,DAILY_past48to24_gap,DAILY_past48to24_gapCause,DAILY_survey_missed,DAILY_survey_situation1_surveys,DAILY_survey_situation2_surveys,DAILY_past24_gapCause_internal,DAILY_past24_gapCause_external,DAILY_past48to24_gapCause_internal,DAILY_past48to24_gapCause_external
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,4.0,3.0,6.0,5.0,,,7.0,6.0,...,,,,did_not_miss,with_friend,False,0.9,0.1,,
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,5.0,4.0,7.0,7.0,,,6.0,7.0,...,,60.0,80_20,missed_busy,alone,,0.3,0.7,0.8,0.2
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,7.0,5.0,4.0,7.0,,,7.0,5.0,...,,,,did_not_miss,with_friend,False,1.0,0.0,,
3,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-03,5.0,7.0,7.0,7.0,,,7.0,4.0,...,,76.0,100_0,missed_feels,with_friend,False,1.0,0.0,1.0,0.0
4,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-04,5.0,7.0,7.0,7.0,,,7.0,7.0,...,,88.0,100_0,missed_busy,with_friend,False,1.0,0.0,1.0,0.0


#### Category Gap Calculation

Make sure that we are not calculating gaps where there was no PREDICTION MADE

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I am assuming that peoples' goals are directional in a way that MAY NOT BE ACCURATE for everyone.<br><br>
    For example, I am assuming that people want to sleep more and drink less - in other words they have a <b>gap</b> if they have <b>more</b> drinks than planned, but for <b>sleep</b> the gap calculation is reversed since we assume a gap means that you had <b>fewer</b> hours of sleep than planned.<br><br>
    While this may be accurate <i>in general</i> I would reasonably expect there to be exceptions.
</div>

In [143]:
# calculate diet gap
df_daily_sr_wide['DAILY_gap_diet'] = 100 - df_daily_sr_wide.DAILY_past24_diet

In [144]:
# take the predicted amount from the day before and subtract the actual amount...
for i in range(df_daily_sr_wide.shape[0]-1):
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_sleep'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_sleep'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_sleep']
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_occupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_occupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_occupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_nonoccupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_nonoccupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_nonoccupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureSolo'] - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureNonSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureNonSolo']  - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureNonSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_exercise'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_exercise'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_exercise']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_socialMedia'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_socialMedia'] - df_daily_sr_wide.loc[i, 'DAILY_next24_socialMedia'] # reversed 
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_drinks'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_drinks'] - df_daily_sr_wide.loc[i, 'DAILY_next24_drinks'] # reversed
    

In [158]:
# Join with complete ID/Date

# Convert trial_date to datetime.date
df_daily_sr_wide['trial_date'] = pd.to_datetime(df_daily_sr_wide['trial_date']).dt.date

# Join
df_daily_sr_wide = df_complete_idDate.merge(df_daily_sr_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

#### EDA Profiling

In [159]:
profile = ProfileReport(df_daily_sr_wide.iloc[:,2:], title="Daily Reports | Pandas Profiling Report")
profile.to_file(eda_reports_path + "daily_reports.html")

Summarize dataset: 100%|██████████| 4571/4571 [03:58<00:00, 19.15it/s, Completed]                                                                       
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.64s/it]
Render HTML: 100%|██████████| 1/1 [00:38<00:00, 38.26s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  2.65it/s]


## Custom Tasks


### N-Back

The n-back sequence was created as follows (where `n` indicates whether it is 2-back or 3-back)

```javascript
function constructSequence(n) {
    const ls = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let number = 0;
    let char = "";
    const alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
    const sequence = []
    for (let i = 0; i < SEQ_LEN; i++) {
        number = ls[Math.floor(Math.random() * ls.length)];
        if (i >= n && number <= 2) {
            char = sequence[i - n];
            //console.log("in if ===>", char, sequence, i, n)
        } else {
            char = alphabet[Math.floor(Math.random() * alphabet.length)];
            //console.log("in else ==>", char)
        }
        sequence.push(char)

    }
    return sequence;
}
```

Given that `ls` has a length of 11 this means that on average there is a 3/11 chance of having a match (for positions 3)


In [171]:
import json

#### Load Data

In [177]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Format Data

In [204]:
nback_df = df.loc[df.ResultIdentifier == 'task_custom_nBack_results'].reset_index(drop=True)

In [205]:
# we only need this as bids were being overwritten when the bid was NOT accepted with a 0 bid...
# this was fixed on Feb 7, but using the actual bid value will also continue to work...

nback_df_bids = df.loc[df.ResultIdentifier == 'task_custom_nBack_diffSelect'].reset_index(drop=True)

In [206]:
nback_df.Answers[5]

'{"bid":56,"randomNumber":98,"correctness":0.8181818181818182,"earnings":98,"mode":"hard","matched":6,"missed":1,"sequence":["X","Z","H","I","S","S","O","I","S","H","I","O","I","W","O","I","I","R","G","O","V","U","O","V","U"],"falseAlarm":3,"indexOfMatchClicked":[9,11,12,13,15,16,20,23,25]}'

In [207]:
# Parse json to create columns
for i in range(nback_df.shape[0]):
    nback_df.loc[i, 'task_nback_bid'] = json.loads(nback_df_bids.Answers[i])['bid']
    nback_df.loc[i, 'task_nback_rndNum'] = json.loads(nback_df.Answers[i])['randomNumber']    
    nback_df.loc[i, 'task_nback_mode'] = json.loads(nback_df.Answers[i])['mode']
    nback_df.loc[i, 'task_nback_matched'] = json.loads(nback_df.Answers[i])['matched']    
    nback_df.loc[i, 'task_nback_missed'] = json.loads(nback_df.Answers[i])['missed']    
    nback_df.loc[i, 'task_nback_falseAlarm'] = json.loads(nback_df.Answers[i])['falseAlarm']
    nback_df.loc[i, 'task_nback_trialCount'] = len(json.loads(nback_df.Answers[i])['sequence'])

In [208]:
nback_df = nback_df.drop(['ResultIdentifier', 'Answers', 'EndDate', 'datetime'], axis=1)
nback_df = nback_df.rename(columns={"time": "task_nback_time"})

#### Add Features

Adding the following metrics for Binary Classification:

1. **Accuracy**: 
   The proportion of correctly predicted classifications in the total predictions made.
   $$
   \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
   $$

2. **Precision** (or Positive Predictive Value):
   The proportion of positive identifications that were actually correct.
   $$
   \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
   $$

3. **Recall** (or Sensitivity or True Positive Rate):
   The proportion of actual positives that were identified correctly.
   $$
   \text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
   $$

4. **Specificity** (or True Negative Rate):
   The proportion of actual negatives that were identified correctly.
   $$
   \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}}
   $$

5. **False Alarm Rate** (or Fall-Out):
   The proportion of actual negatives that were incorrectly classified as positive.
   $$
   \text{False Alarm Rate} = \frac{\text{FP}}{\text{TN} + \text{FP}}
   $$

6. **F1 Score**:
   The harmonic mean of precision and recall, giving a balance between the two.
   $$
   \text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
   $$

7. **Matthews Correlation Coefficient (MCC)**:
   A metric that takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
   $$
   \text{MCC} = \frac{\text{TP} \times \text{TN} - \text{FP} \times \text{FN}}{\sqrt{(\text{TP} + \text{FP})(\text{TP} + \text{FN})(\text{TN} + \text{FP})(\text{TN} + \text{FN})}}
   $$

8. **Bias (C or criterion)**:
   A metric from signal detection theory that indicates the participant's response bias. A positive value indicates a bias toward saying "no" (


In [221]:
truePos = nback_df.task_nback_matched
trueNeg = nback_df.task_nback_trialCount - nback_df.task_nback_matched - nback_df.task_nback_missed - nback_df.task_nback_falseAlarm
falsePos = nback_df.task_nback_falseAlarm
falseNeg = nback_df.task_nback_missed

# proportion of correct classifications in total predictions made
nback_df['task_nback_accuracy'] = (truePos + trueNeg) / (truePos + trueNeg + falsePos + falseNeg)
# positive predictive value (hit rate)
nback_df['task_nback_precision'] = truePos / (truePos + falsePos)
# true positive rate (sensitivity)
nback_df['task_nback_recall'] = truePos / (truePos + falseNeg)
# true negative rate (false_alarm_rate)
nback_df['task_nback_specificity'] = trueNeg / (trueNeg + falsePos)
# Proportion of times the participant incorrectly indicates an n-back match when there wasn't one.
nback_df['task_nback_falseAlarmRate'] = falsePos / (trueNeg + falsePos)
# Harmonic mean of precision and recall, giving a balance between the two
nback_df['task_nback_F1'] = 2 * ((nback_df['task_nback_precision'] * nback_df['task_nback_recall'])/(nback_df['task_nback_precision'] + nback_df['task_nback_recall']))
# Matthews Correlation Coefficient (MCC):
# It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
nback_df['task_nback_MCC'] = ((truePos * trueNeg) - (falsePos * falseNeg)) / (np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [227]:
# metric from signal detection theory
# indicates the participant's response bias. 
# A positive value indicates a bias toward saying "no" (conservative)
# A negative value indicates a bias toward saying "yes" (liberal).

from scipy.stats import norm

def calculate_criterion(hit_rate, false_alarm_rate):
    # Calculate the Z scores for the hit rate and false alarm rate
    z_hit = norm.ppf(hit_rate)
    z_fa = norm.ppf(false_alarm_rate)
    
    # Calculate the criterion C
    C = -0.5 * (z_hit + z_fa)
    
    return C

nback_df['task_nback_bias'] = calculate_criterion(nback_df['task_nback_precision'], nback_df['task_nback_falseAlarmRate'])

  C = -0.5 * (z_hit + z_fa)


#### EDA Profiling

In [228]:
profile = ProfileReport(nback_df.iloc[:,3:], title="n-Back Task | Pandas Profiling Report")
profile.to_file(eda_reports_path + "nback_report.html")

  x = asanyarray(arr - arrmean)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  x = asanyarray(arr - arrmean)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')
Summarize dataset: 100%|██████████| 194/194 [00:08<00:00, 23.41it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 48.07it/s]


**NOTES**

Looking at the data it is clear that something wonky went on in some trials where people have matched values up to 67, and missed values of -56. 

Can calculate a super low probability number of matches and delete trials with any values above that - as well as any trials with negative "missed" values.

False alarm also has a max of 136

In [220]:
np.where(np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)).isna())

  result = getattr(ufunc, method)(*inputs, **kwargs)


(array([  17,   90,  100,  143,  306,  363,  424,  704,  705,  720,  735,
         932,  965, 1177, 1180, 1310, 1335, 1352, 1505, 1528, 1742, 1747,
        1865, 1946, 1962, 1998, 2134, 2177, 2239, 2412, 2492, 2530, 2914,
        3089, 3153, 3337, 3611, 3739, 3838, 3978, 4033]),)

In [225]:
nback_df.iloc[17]

ParticipantIdentifier        76acef3c-d659-4fdd-b258-3668a1597584
trial_date                                             2022-10-24
task_nback_time                                          11:15:02
task_nback_bid                                               65.0
task_nback_rndNum                                            80.0
task_nback_mode                                              hard
task_nback_matched                                           48.0
task_nback_missed                                           -42.0
task_nback_falseAlarm                                         0.0
task_nback_accuracy                                          2.68
task_nback_trialCount                                        25.0
task_nback_precision                                          1.0
task_nback_recall                                             8.0
task_nback_specificity                                        1.0
task_nback_F1                                            1.777778
task_nback

In [214]:
nback_df.head(10)

Unnamed: 0,ParticipantIdentifier,trial_date,task_nback_time,task_nback_bid,task_nback_rndNum,task_nback_mode,task_nback_matched,task_nback_missed,task_nback_falseAlarm,task_nback_accuracy,task_nback_trialCount,task_nback_precision,task_nback_recall,task_nback_specificity,task_nback_F1,task_nback_MCC
0,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,07:07:47,57.0,63.0,hard,5.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
1,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,07:08:51,6.0,100.0,hard,6.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
2,6b827de8-fe47-4007-aad3-202655b954e3,2022-10-24,07:22:00,0.0,0.0,easy,6.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
3,412330b3-cc02-4030-96cd-f4cfdcc45fa6,2022-10-24,07:28:35,0.0,0.0,easy,4.0,4.0,2.0,0.76,25.0,0.666667,0.5,0.882353,0.571429,0.41762
4,c2097f36-4ca3-4537-856d-a649d1557553,2022-10-24,08:52:18,0.0,0.0,easy,5.0,3.0,4.0,0.72,25.0,0.555556,0.625,0.764706,0.588235,0.378726
5,bf670311-c590-473a-98ab-d719ebf0f2ab,2022-10-24,08:58:26,56.0,98.0,hard,6.0,1.0,3.0,0.84,25.0,0.666667,0.857143,0.833333,0.75,0.645881
6,7d501571-5d23-4f09-9266-1644f4f71add,2022-10-24,08:59:10,66.0,64.0,easy,9.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
7,f889f1a4-9754-456e-ae08-092f992d3359,2022-10-24,09:00:14,13.0,14.0,hard,8.0,2.0,0.0,0.92,25.0,1.0,0.8,1.0,0.888889,0.840168
8,a33e1d38-6ee8-4da6-993b-a94a8ae7fc30,2022-10-24,09:06:27,25.0,89.0,hard,6.0,1.0,3.0,0.84,25.0,0.666667,0.857143,0.833333,0.75,0.645881
9,a9de00be-460d-4b74-bed3-bf013fe2052a,2022-10-24,09:11:25,6.0,95.0,hard,5.0,0.0,2.0,0.92,25.0,0.714286,1.0,0.9,0.833333,0.801784
