# Setup

## Imports

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import json
import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from importlib import reload
import utils

## Set Paths

In [2]:
# set run
run_num = 1

if run_num ==1:
    # run 1
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/'
    subjects_run1 = pd.read_csv(path + '../run1_subjects.csv')
    subjects = subjects_run1.ParticipantIdentifier
elif run_num ==2:
    # run 2
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_2/'
    subjects_run2 = pd.read_csv(path + '../run2_subjects.csv')
    subjects = subjects_run2.ParticipantIdentifier
    
eda_reports_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_4_outputs/EDA/'

# Active Data

## Import Data

In [3]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

In [4]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyQuestionResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

  0%|          | 0/87 [00:00<?, ?it/s]

100%|██████████| 87/87 [00:02<00:00, 29.97it/s]


In [5]:
df.shape

(597839, 8)

In [6]:
# select relevant columns
df = df[['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'EndDate']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate
0,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report1,19,2022-10-21T21:55:56-04:00
1,6338356d-f098-46ea-b270-10c6fff7e67e,WEEKLY_goal_report2,2,2022-10-21T21:55:59-04:00
2,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info1,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
3,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info2,"{""totalEarnings"":0}",2022-10-24T07:04:41-04:00
4,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,task_custom_bart_info3,"{""totalEarnings"":0}",2022-10-24T07:04:42-04:00


In [7]:
df.isna().sum()

ParticipantIdentifier     0
ResultIdentifier          0
Answers                   2
EndDate                  15
dtype: int64

In [8]:
# Remove rows without valid EndDate value
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

# Select relevant subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [9]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4.75)).date() # trial day associated with sample (4:45am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 586408/586408 [00:59<00:00, 9920.41it/s] 


In [10]:
df['trial_date'][0]

datetime.date(2022, 10, 21)

In [11]:
# save to csv
if run_num ==1:
    # run 1
    df.to_csv(save_path + 'run1_app_survey_results.csv', index=False)
if run_num ==2:
    # run 2
    df.to_csv(save_path + 'run2_app_survey_results.csv', index=False)

# Gap App

## Self Report

### Affect

#### Load Data

In [12]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_affect_wide' in globals():
    del(df_daily_affect_wide)
    print('deleted affect df')

deleted existing df


In [13]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

#### Select Data

In [14]:
df_affect = df.loc[df.ResultIdentifier.str.startswith('affect_')].reset_index(drop=True)
df_affect_am = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)
df_affect_pm = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (~df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)

#### Convert to Wide

In [15]:
df_affect_pm_wide = df_affect_pm.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_pm_wide = df_affect_pm_wide.rename_axis(None, axis=1)

df_affect_am_wide = df_affect_am.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_am_wide = df_affect_am_wide.rename_axis(None, axis=1)

In [16]:
# join
df_daily_affect_wide = df_affect_pm_wide.merge(df_affect_am_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

df_daily_affect_wide.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,1.0,4.0,5.0,4.0,1.0,3.0,4.0,4.0,...,,,,,,,,,,
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,1.0,2.0,1.0,1.0,1.0,3.0,3.0,3.0,...,2.0,2.0,1.0,3.0,4.0,4.0,4.0,5.0,4.0,3.0
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,3.0,1.0,4.0,5.0,1.0,4.0,2.0,5.0,...,,,,,,,,,,


#### EDA Profiling

In [None]:
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:], title=f"Affect Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"affect_report_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

**NOTE**

There are a number of variables where the maximum values are well above 5, which is the maximum option that should be available. These glitch entries should be removed

#### Clean

<div class="alert alert-block alert-info">
<b>🧹 Process:</b><br>
For our cleaning process we do the following:<br><br>

<ol>
    <li>Remove values greater than 5, given that the Likert scale only went to 5</li>
    <br>
    <li>Look for zero variance <b>columns</b> (affective measures) and <b>rows</b> (days) for subjects</li></h5>
    <br>
    👉 If the variance was zero for three or more columns then the subject was flagged (`affect_zeroVarCols_flag = True`)<br>
    <br>
    👉 If the variance was zero for more than 10% of a subjects completed days then the subject was flagged (`affect_zeroVarRows_flag = True`)
</ol>
</div>

##### Impossible Values

In [17]:
# Number of instances where the cell value is out of range (greater than 5)
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

157

In [18]:
# Replace all values below threshold with NaN
df_daily_affect_wide.iloc[:,2:] = np.where(df_daily_affect_wide.iloc[:,2:]>5, np.nan, df_daily_affect_wide.iloc[:,2:])

In [19]:
# Recheck for instances above 5
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

0

In [None]:
# Rerun profiling
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:],
                        title=f"Affect Run {run_num} - Clean | Pandas Profiling Report",
                        infer_dtypes = False)
profile.to_file(eda_reports_path + f"affect_report_clean_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [20]:
# Drop 'trial_date' column
data = df_daily_affect_wide.drop(columns='trial_date', errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'variable', 'affect_neg_angry']]
result_df.columns = ['ParticipantIdentifier', 'ZeroVariance', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,ZeroVariance,Count
0,27329533-d0a4-4605-9da5-0eb857154cae,affect_neg_angry,4
1,329e2c06-a903-44ce-a409-8ed8c580b124,affect_neg_angry,25
2,d11241a0-932e-4931-83ee-f3d28f66875f,affect_neg_angry,4
3,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,affect_neg_ashamed,3
4,27329533-d0a4-4605-9da5-0eb857154cae,affect_neg_bored,4
...,...,...,...
182,90592e06-bcf6-4150-85b0-c5daf7e7569c,affect_pos_motivated_am,66
183,a8b5a9ea-b762-4f46-a431-6c530215c498,affect_pos_motivated_am,81
184,afbd4906-0513-42b1-91ce-d25065842f55,affect_pos_motivated_am,25
185,27f7805e-5951-47b4-9f42-4c6200001cc6,affect_pos_relaxedCalm_am,45


Some subjects had mulitple categories without any variance.

In [21]:
zero_var_cols = result_df.ParticipantIdentifier.value_counts().reset_index(name='affect_zeroVar_cols')
# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
zero_var_cols = zero_var_cols.merge(df_count, on='ParticipantIdentifier', how='left')

# add flag
zero_var_cols['affect_zeroVarCols_flag'] = False
zero_var_cols.loc[zero_var_cols.affect_zeroVar_cols > 2, 'affect_zeroVarCols_flag'] = True
zero_var_cols

Unnamed: 0,ParticipantIdentifier,affect_zeroVar_cols,total_count,affect_zeroVarCols_flag
0,27f7805e-5951-47b4-9f42-4c6200001cc6,15,45,True
1,d520094e-39dc-47da-b764-049277fa48ad,12,72,True
2,27329533-d0a4-4605-9da5-0eb857154cae,8,4,True
3,2f32cd19-e9c5-4aad-8999-6f4646169ab6,8,8,True
4,ff129772-aeab-4432-8136-8f94027b8504,8,16,True
5,38bcd1b2-f8bc-48ee-bff2-5bca24012983,8,36,True
6,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,7,3,True
7,3bb57dd9-1d69-471c-b769-b3323748bb9f,7,75,True
8,f889f1a4-9754-456e-ae08-092f992d3359,7,18,True
9,9330d6d9-c667-43be-b437-a3c988dd10d7,7,83,True


In [22]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(zero_var_cols.drop(columns=['total_count']), on='ParticipantIdentifier', how='left')

How many subjects had at least one column with no variance?

In [23]:
len(np.unique(result_df.ParticipantIdentifier))

49

Here is the subject who had 15 variables with no variance (run 1):

In [24]:
df_daily_affect_wide.loc[df_daily_affect_wide.ParticipantIdentifier == '27f7805e-5951-47b4-9f42-4c6200001cc6', :].dropna()

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag
662,27f7805e-5951-47b4-9f42-4c6200001cc6,2022-09-28,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,15.0,True
666,27f7805e-5951-47b4-9f42-4c6200001cc6,2022-10-02,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,15.0,True


But maybe variance within day is more important in terms of actually cleaning data. If someone enters the same value for every variable perhaps it is because they are not answering accurately and just trying to finish as quickly as possible.

In [25]:
# find rows with zero variance
idx = np.where(df_daily_affect_wide.drop(columns=['ParticipantIdentifier', 'trial_date']).var(axis=1) == 0)[0]
# calculate how many zero variance days per subject
df_zeroVar = df_daily_affect_wide.iloc[idx,:].groupby('ParticipantIdentifier').size().reset_index(name='zeroVar_count')

# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
df_zeroVar = df_zeroVar.merge(df_count, on='ParticipantIdentifier', how='left')
df_zeroVar['affect_pct_zeroVarRows'] = (df_zeroVar.zeroVar_count / df_zeroVar.total_count) * 100

# add flag
df_zeroVar['affect_zeroVarRows_flag'] = False
df_zeroVar.loc[df_zeroVar.affect_pct_zeroVarRows > 10, 'affect_zeroVarRows_flag'] = True
df_zeroVar

Unnamed: 0,ParticipantIdentifier,zeroVar_count,total_count,affect_pct_zeroVarRows,affect_zeroVarRows_flag
0,25ca39d7-4279-48fd-903f-d0927adadb77,8,25,32.0,True
1,412330b3-cc02-4030-96cd-f4cfdcc45fa6,3,83,3.614458,False
2,4217d9ff-07c0-42a5-9da9-f2e351b40709,2,44,4.545455,False
3,4965aa0d-a7a9-4d1c-b835-3f79a29e0d39,13,83,15.662651,True
4,98cb45a7-4057-4e81-b1d7-e7aede5e106e,3,30,10.0,False
5,b62eaadd-1819-41da-a70b-a46d4151db72,1,29,3.448276,False
6,bbd82a98-a1c9-4229-afc3-cc201067b909,3,64,4.6875,False
7,bf670311-c590-473a-98ab-d719ebf0f2ab,42,83,50.60241,True
8,c2097f36-4ca3-4537-856d-a649d1557553,1,83,1.204819,False


In [26]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(df_zeroVar.drop(columns=['zeroVar_count', 'total_count']), on='ParticipantIdentifier', how='left')

In [27]:
df_daily_affect_wide.sample(5)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag,affect_pct_zeroVarRows,affect_zeroVarRows_flag
1154,3bb57dd9-1d69-471c-b769-b3323748bb9f,2022-11-29,2.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,...,,,,,,,7.0,True,,
853,3223b573-6de0-4aeb-b005-dd2e467b1e62,2022-11-11,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,...,,,,,,,3.0,True,,
4041,fed3a16b-c5a3-4a84-a111-32b7574e04b5,2022-11-23,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,4.0,2.0,4.0,2.0,1.0,4.0,,,,
2567,a33e1d38-6ee8-4da6-993b-a94a8ae7fc30,2022-09-30,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,...,2.0,3.0,3.0,3.0,3.0,3.0,,,,
755,2baee05a-5e5a-4436-8c25-2628d46d1e08,2022-11-15,2.0,2.0,2.0,2.0,4.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,3.0,4.0,,,,


#### Save

In [28]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_affect_wide['trial_date'] = pd.to_datetime(df_daily_affect_wide['trial_date']).dt.date

# Join with affect df
df_daily_affect_wide = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [29]:
df_daily_affect_wide.shape

(7820, 46)

In [30]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_affect_wide.to_csv(save_path + 'run1_affect.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_affect_wide.to_csv(save_path + 'run2_affect.csv', index=False)

### Daily General and Detail

#### Load Data

In [31]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_sr_wide' in globals():
    del(df_daily_sr_wide)
    print('deleted daily self report df')

deleted existing df


In [32]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

#### Select Data

In [33]:
past24_general = [
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past48to24_gap',
    'DAILY_past48to24_gapCause',
    'DAILY_past24_ideal',
    'DAILY_past24_satisfaction',
    'DAILY_past24_change',
    'DAILY_past24_productivity',
    'DAILY_past24_procrastination',
    'DAILY_past24_punctuality',
    'DAILY_past24_mentalEffort',
    'DAILY_past24_physicalEffort',
    'DAILY_past24_values',
    'DAILY_past24_gap',
    'DAILY_past24_gapCause',
    'DAILY_past24_illness',
    'DAILY_past24_fatigue',
    'DAILY_past24_unusualEvents'
]

past24_categories = [
    'DAILY_past24_sleep',
    'DAILY_past24_occupation',
    'DAILY_past24_nonoccupation',
    'DAILY_past24_exercise',
    'DAILY_past24_leisureSolo',
    'DAILY_past24_leisureSoloMental',
    'DAILY_past24_leisureSoloPhysical',
    'DAILY_past24_leisureNonSolo',
    'DAILY_past24_leisureNonSoloMental',
    'DAILY_past24_leisureNonSoloPhysical',
    'DAILY_past24_diet',
    'DAILY_past24_socialMedia',
    'DAILY_past24_drinks'
]

next24_categories = [
    'DAILY_next24_sleep',
    'DAILY_next24_occupation',
    'DAILY_next24_nonoccupation',
    'DAILY_next24_leisureSolo',
    'DAILY_next24_leisureNonSolo',
    'DAILY_next24_exercise',
    'DAILY_next24_socialMedia',
    'DAILY_next24_drinks',
    'DAILY_next24_diet'
]

monthly_goals = [
    'MONTHLY_ib_gap_change',
    'MONTHLY_ib_gap_change_app',
    'MONTHLY_goal_report1',
    'MONTHLY_goal_set1_importance',
    'MONTHLY_goal_set1_consequences',
    'MONTHLY_goal_set1_motivationInternal',
    'MONTHLY_goal_set1_motivationExternal',
    'MONTHLY_goal_set1_confidence',
    'MONTHLY_goal_set1_effort',
    'MONTHLY_goal_report2',
    'MONTHLY_goal_set2_importance',
    'MONTHLY_goal_set2_consequences',
    'MONTHLY_goal_set2_motivationInternal',
    'MONTHLY_goal_set2_motivationExternal',
    'MONTHLY_goal_set2_confidence',
    'MONTHLY_goal_set2_effort',
    'MONTHLY_goal_set2_interaction_eachOther'
]

monthly_ideals = [
    'IDEAL_weekday_sleep',
    'IDEAL_weekday_occupation',
    'IDEAL_weekday_nonoccupation',
    'IDEAL_weekday_leisureSolo',
    'IDEAL_weekday_leisureNonSolo',
    'IDEAL_weekday_exercise',
    'IDEAL_weekday_socialMedia',
    'IDEAL_weekday_drinks',
    'IDEAL_weekend_sleep',
    'IDEAL_weekend_occupation',
    'IDEAL_weekend_nonoccupation',
    'IDEAL_weekend_leisureSolo',
    'IDEAL_weekend_leisureNonSolo',
    'IDEAL_weekend_exercise',
    'IDEAL_weekend_socialMedia',
    'IDEAL_weekend_drinks'
]

if run_num == 1:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week',
        'DAILY_goal1_interaction_month',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week',
        'DAILY_goal2_interaction_month',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

elif run_num == 2:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week1',
        'DAILY_goal1_interaction_week2',
        'DAILY_goal1_interaction_month1',
        'DAILY_goal1_interaction_month2',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week1',
        'DAILY_goal2_interaction_week2',
        'DAILY_goal2_interaction_month1',
        'DAILY_goal2_interaction_month2',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month1',
        'WEEKLY_goal_set1_interaction_month2',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month1',
        'WEEKLY_goal_set2_interaction_month2',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

non_numeric_cols = [
    'IDEAL_values_monthly1',
    'IDEAL_values_monthly2',
    'IDEAL_values_monthly3',
    'DAILY_goal1_set',
    'DAILY_goal2_set',
    'WEEKLY_goal_set1',
    'WEEKLY_goal_set2',
    'MONTHLY_goal_set1',
    'MONTHLY_goal_set2',
    'DAILY_next24_diet',
    'DAILY_past48to24_gapCause',
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past24_gapCause',
    'ParticipantIdentifier',
    'trial_date'
]

In [34]:
df_daily_sr = df.loc[df.ResultIdentifier.isin(past24_general + 
                                              past24_categories + 
                                              next24_categories + 
                                              specific_goals +
                                              non_numeric_cols)].reset_index(drop=True)

#### Convert to Wide

In [35]:
# Convert values in 'Answers' column to numeric where possible, else leave as string
# df_daily_sr['Answers'] = pd.to_numeric(df_daily_sr['Answers'], errors='coerce').fillna(df_daily_sr['Answers'])

# Pivot the data
df_daily_sr_wide = df_daily_sr.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(map(str, x))).reset_index()
# get rid of name on index
df_daily_sr_wide = df_daily_sr_wide.rename_axis(None, axis=1)

In [36]:
# convert data to numeric where appropriate
df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols + ['ParticipantIdentifier', 'trial_date'])]] = df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

In [37]:
# Break gap cause into two columns
if run_num == 1:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

elif run_num == 2:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

# convert to numeric 0-1
cols = ['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

In [38]:
# REPEAT FOR MISSED DAY DATA
# Break gap cause into two columns
df_daily_sr_wide[['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']] = df_daily_sr_wide.DAILY_past48to24_gapCause.str.split("_", expand = True)

# convert to numeric 0-1
cols = ['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

#### Category Gap Calculation

⚡ Make sure that we are not calculating gaps where there was no PREDICTION MADE

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I am assuming that peoples' goals are directional in a way that MAY NOT BE ACCURATE for everyone.<br><br>
    For example, I am assuming that people want to sleep more and drink less - in other words they have a <b>gap</b> if they have <b>more</b> drinks than planned, but for <b>sleep</b> the gap calculation is reversed since we assume a gap means that you had <b>fewer</b> hours of sleep than planned.<br><br>
    While this may be accurate <i>in general</i> I would reasonably expect there to be exceptions.
</div>

In [39]:
# calculate diet gap (since it is originally a success measure)
df_daily_sr_wide['DAILY_gap_diet'] = 100 - df_daily_sr_wide.DAILY_past24_diet

In [40]:
# take the predicted amount from the day before and subtract the actual amount...
for i in range(df_daily_sr_wide.shape[0]-1):
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_sleep'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_sleep'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_sleep']
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_occupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_occupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_occupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_nonoccupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_nonoccupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_nonoccupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureSolo'] - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureNonSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureNonSolo']  - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureNonSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_exercise'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_exercise'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_exercise']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_socialMedia'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_socialMedia'] - df_daily_sr_wide.loc[i, 'DAILY_next24_socialMedia'] # reversed 
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_drinks'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_drinks'] - df_daily_sr_wide.loc[i, 'DAILY_next24_drinks'] # reversed
    

#### EDA Profiling

In [None]:
profile = ProfileReport(df_daily_sr_wide.iloc[:,2:], title=f"Daily Reports Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"daily_reports_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [41]:
# check for variance in numeric cols only, leaving ParticipantIdentifier for grouping
non_numeric_cols_alt = [item for item in non_numeric_cols if item != 'ParticipantIdentifier']

In [42]:
# Drop 'trial_date' column
data = df_daily_sr_wide.drop(columns=non_numeric_cols_alt, errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')
final_zero_variance_df = pd.DataFrame(final_zero_variance_df.groupby('ParticipantIdentifier').count()).reset_index().sort_values(by='variable', ascending=False)
final_zero_variance_df = final_zero_variance_df.rename(columns={'variable': 'ZeroVariance'})

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'ZeroVariance', 'DAILY_goal1_confidence']]
result_df.columns = ['ParticipantIdentifier', 'dailySR_zeroVar_cols', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count
0,27329533-d0a4-4605-9da5-0eb857154cae,13,4
1,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,10,3
2,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,10,3
3,329e2c06-a903-44ce-a409-8ed8c580b124,9,25
4,43adea65-466c-4891-8bd9-301c4d6560c8,6,50
...,...,...,...
75,8a5a92f3-472a-4a39-bfd8-f43f60fdcd27,1,27
76,a8b5a9ea-b762-4f46-a431-6c530215c498,1,80
77,38bcd1b2-f8bc-48ee-bff2-5bca24012983,1,33
78,92863208-10a4-443b-a225-18630552f5cc,1,10


In [43]:
# we can see that drinks have the most people with zero variance...
pd.DataFrame(grouped_variance[grouped_variance == 0].dropna(how='all').eq(0).sum()).reset_index(names='Variable').sort_values(by=0, ascending=False).reset_index(drop=True)

Unnamed: 0,Variable,0
0,DAILY_past24_drinks,49
1,DAILY_next24_drinks,23
2,DAILY_gap_drinks,21
3,DAILY_next24_sleep,12
4,DAILY_past48to24_gapCause_external,10
...,...,...
62,DAILY_past24_nonoccupation,0
63,DAILY_past24_physicalEffort,0
64,DAILY_past24_procrastination,0
65,DAILY_past24_productivity,0


In [44]:
# add flag for subjects with more than 5 zero variance columns
flag_threshold = 5

result_df['dailySR_zeroVarCols_flag'] = False
result_df.loc[result_df.dailySR_zeroVar_cols > flag_threshold, 'dailySR_zeroVarCols_flag'] = True
result_df.sort_values(by='dailySR_zeroVar_cols', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['dailySR_zeroVarCols_flag'] = False


Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count,dailySR_zeroVarCols_flag
0,27329533-d0a4-4605-9da5-0eb857154cae,13,4,True
2,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,10,3,True
1,7272fab8-c4e7-45bb-ba42-e9f6a06801c0,10,3,True
3,329e2c06-a903-44ce-a409-8ed8c580b124,9,25,True
4,43adea65-466c-4891-8bd9-301c4d6560c8,6,50,True
...,...,...,...,...
60,146e1ab8-c4ca-4a52-9261-e108b38eea53,1,77,False
59,6c1b2b3c-3c06-4238-830d-65b76904e8b5,1,69,False
58,ed16354a-961a-4e5e-83de-ce9a17b25a2f,1,37,False
57,d77924dd-6dcf-4288-bb01-186fa08cfd0b,1,59,False


In [45]:
# look at subject with most zero var columns
df_daily_sr_wide.loc[(df_daily_sr_wide['ParticipantIdentifier'] == '27329533-d0a4-4605-9da5-0eb857154cae') & (df_daily_sr_wide['DAILY_goal1_confidence'].notna())]

Unnamed: 0,ParticipantIdentifier,trial_date,DAILY_goal1_confidence,DAILY_goal1_consequences,DAILY_goal1_effort,DAILY_goal1_importance,DAILY_goal1_interaction_month,DAILY_goal1_interaction_week,DAILY_goal1_motivationExternal,DAILY_goal1_motivationInternal,...,DAILY_past48to24_gapCause_external,DAILY_gap_diet,DAILY_gap_sleep,DAILY_gap_occupation,DAILY_gap_nonoccupation,DAILY_gap_leisureSolo,DAILY_gap_leisureNonSolo,DAILY_gap_exercise,DAILY_gap_socialMedia,DAILY_gap_drinks
667,27329533-d0a4-4605-9da5-0eb857154cae,2022-10-03,4.0,7.0,7.0,7.0,5.0,5.0,5.0,5.0,...,0.8,,,,,,,,,
668,27329533-d0a4-4605-9da5-0eb857154cae,2022-10-04,4.0,7.0,7.0,7.0,4.0,5.0,3.0,4.0,...,,,1.0,0.0,2.0,,0.0,1.0,,
669,27329533-d0a4-4605-9da5-0eb857154cae,2022-10-05,3.0,7.0,7.0,7.0,5.0,5.0,7.0,4.0,...,,,1.0,2.0,,1.0,1.0,1.0,-28.0,
670,27329533-d0a4-4605-9da5-0eb857154cae,2022-10-06,3.0,7.0,7.0,7.0,5.0,5.0,7.0,3.0,...,,,0.0,7.0,,,-2.0,30.0,,


In [46]:
# merge with main sr df
df_daily_sr_wide = df_daily_sr_wide.merge(result_df.drop(columns=['Count']), on='ParticipantIdentifier', how='left')

In [47]:
# Add sr prefix
# df_daily_sr_wide.columns[2:]

df_daily_sr_wide.columns = ['ParticipantIdentifier', 'trial_date'] + ['sr_' + col for col in df_daily_sr_wide.columns[2:]]

#### Save

In [48]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_sr_wide['trial_date'] = pd.to_datetime(df_daily_sr_wide['trial_date']).dt.date

# Join with affect df
df_daily_sr_wide = df_complete_idDate.merge(df_daily_sr_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [49]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_sr_wide.to_csv(save_path + 'run1_selfReport.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_sr_wide.to_csv(save_path + 'run2_selfReport.csv', index=False)

### Daily/Weekly Social Support

**NOTE**: These were only collected in run 2

Weekly support is a 12 item scale scored on a 5 point Likert-scale (0-4).
It is based on the The [Interpersonal Support Evaluation List](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/social-support.html), using the [ISEL-12 version](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/isel_12_item.pdf).

Daily social support is a custom measure developed by [Leo Huang](https://www.leohuangneuro.com/about-me), [Cendri Hutcherson](https://www.linkedin.com/in/cendri-hutcherson-3327a161/?originalSubdomain=ca), and [Daniel J Wilson](https://github.com/danieljwilson).

The items fall under 3 categories 👇

![Example Image](../../3_3_6_inputs/images/ss_daily_items.jpg)

#### Load Weekly Data

In [50]:
ss_df = df.loc[df['ResultIdentifier'].str.startswith('ss_weekly_')].reset_index(drop=True)

#### Munge Data

In [51]:
# Make Answers numeric
ss_df['Answers'] = pd.to_numeric(ss_df['Answers'], errors='coerce')

# Reverse score the specified items
reverse_items = ['ss_weekly_1', 'ss_weekly_4', 'ss_weekly_5', 'ss_weekly_7', 'ss_weekly_10', 'ss_weekly_12']
ss_df.loc[ss_df['ResultIdentifier'].isin(reverse_items) & (ss_df['Answers'] != 0), 'Answers'] = 5 - ss_df['Answers']

In [52]:
# Calculate the total score for each participant
total_scores = ss_df.groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
total_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_totalScore'}, inplace=True)

# Calculate subscale scores
appraisal_items = ['ss_weekly_1', 'ss_weekly_2', 'ss_weekly_3', 'ss_weekly_4']
belonging_items = ['ss_weekly_5', 'ss_weekly_6', 'ss_weekly_7', 'ss_weekly_8']
tangible_items = ['ss_weekly_9', 'ss_weekly_10', 'ss_weekly_11', 'ss_weekly_12']

appraisal_scores = ss_df[ss_df['ResultIdentifier'].isin(appraisal_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
appraisal_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_appraisal'}, inplace=True)

belonging_scores = ss_df[ss_df['ResultIdentifier'].isin(belonging_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
belonging_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_belonging'}, inplace=True)

tangible_scores = ss_df[ss_df['ResultIdentifier'].isin(tangible_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
tangible_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_tangible'}, inplace=True)

In [53]:
# Merge
ss_df = total_scores.merge(appraisal_scores, on=['ParticipantIdentifier', 'trial_date']).merge(belonging_scores, on=['ParticipantIdentifier', 'trial_date']).merge(tangible_scores, on=['ParticipantIdentifier', 'trial_date'])

#### EDA

In [None]:
profile = ProfileReport(ss_df.iloc[:,2:], title=f"ISEL 12 Social Support Weekly Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"ss_weekly_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Save

In [54]:
# save to csv
if run_num ==1:
    # run 1
    print('No social support weekly measure for run 1...')
if run_num ==2:
    # run 2
    ss_df.to_csv(save_path + 'run2_ss_weekly.csv', index=False)

No social support weekly measure for run 1...


#### Load Daily Data

In [55]:
df_daily_ss = df.loc[df['ResultIdentifier'].str.startswith('ss_')].reset_index(drop=True)

# Remove weekly measures
df_daily_ss = df_daily_ss.loc[~df_daily_ss['ResultIdentifier'].str.contains('_weekly_')].reset_index(drop=True)

#### Convert to Wide

In [56]:
# Convert values in 'Answers' column to numeric where possible, else leave as string
# df_daily_sr['Answers'] = pd.to_numeric(df_daily_sr['Answers'], errors='coerce').fillna(df_daily_sr['Answers'])

# Pivot the data
df_daily_ss_wide = df_daily_ss.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(map(str, x))).reset_index()
# get rid of name on index
df_daily_ss_wide = df_daily_ss_wide.rename_axis(None, axis=1)

In [57]:
df_daily_ss_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date


In [58]:
# convert data to numeric where appropriate
df_daily_ss_wide.loc[:,df_daily_ss_wide.columns[~df_daily_ss_wide.columns.isin(non_numeric_cols + ['ParticipantIdentifier', 'trial_date'])]] = df_daily_ss_wide.loc[:,df_daily_ss_wide.columns[~df_daily_ss_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

#### EDA

In [None]:
profile = ProfileReport(df_daily_ss_wide.iloc[:,2:], title=f"Social Support Daily Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"ss_daily_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Save

In [59]:
# save to csv
if run_num ==1:
    # run 1
    print('No social support daily measure for run 1...')
if run_num ==2:
    # run 2
    df_daily_ss_wide.to_csv(save_path + 'run2_ss_daily.csv', index=False)

No social support daily measure for run 1...


## Custom Tasks


### Food Task

##### Load Data

In [60]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'food_df' in globals():
    del(food_df)
    print('deleted daily food_df')

deleted existing df


In [61]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')

# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

##### Select Data

In [62]:
food_df = df.loc[df['ResultIdentifier'].str.startswith('rating_')].reset_index(drop=True)
hunger =  df.loc[df['ResultIdentifier']=='Hunger_Screen'].reset_index(drop=True)
hunger.rename(columns={"Answers": "task_food_hunger_level"}, inplace=True)

#### Convert to Wide

In [63]:
# Convert the dataframe from long to wide format
food_df_wide = food_df.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                              columns='ResultIdentifier', 
                              values='Answers', 
                              aggfunc='first').reset_index()

# get rid of name on index
food_df_wide = food_df_wide.rename_axis(None, axis=1)

In [64]:
import json

# Filter out columns that start with "rating_"
rating_columns = [col for col in food_df_wide.columns if col.startswith('rating_')]

# Define a function to extract required information from the 'name' value
def extract_name_info(json_data):
    # Check if 'name' key exists in json_data and is of type string
    name_str = json_data.get('name', "")
    
    if not isinstance(name_str, str):
        return None, None

    # Extract task_food_item
    item_start = name_str.rfind('_') + 1
    item_end = name_str.rfind('.jpg')
    task_food_item = name_str[item_start:item_end] if item_start != -1 and item_end != -1 else None
    
    # Extract task_food_category
    category_start = name_str.rfind('/') + 1
    category_end = name_str.rfind('_')
    task_food_category = name_str[category_start:category_end] if category_start != -1 and category_end != -1 else None
    
    return task_food_item, task_food_category

# Modify the function to handle potential strings in columns
def parse_json(entry):
    try:
        return json.loads(entry)
    except (TypeError, json.JSONDecodeError):
        return {}

# Re-parse the JSON strings using the modified function
for col in rating_columns:
    food_df_wide[col] = food_df_wide[col].apply(parse_json)

# Re-extract the required values using the modified function
for col in rating_columns:
    # Extract required information
    food_df_wide[col + '_rating'] = food_df_wide[col].apply(lambda x: x.get('rating', None))
    food_df_wide[col + '_rt'] = food_df_wide[col].apply(lambda x: x.get('reactionTime', None))
    food_df_wide[col + '_item'], food_df_wide[col + '_category'] = zip(*food_df_wide[col].apply(extract_name_info))

# Drop the original "rating_" columns as they are not needed anymore
food_df_wide = food_df_wide.drop(columns=rating_columns)

# Convert all 'task_food_rating' columns to dtype int
rating_cols_to_convert = [col for col in food_df_wide.columns if '_rating' in col]

for col in rating_cols_to_convert:
    food_df_wide[col] = food_df_wide[col].astype(np.int64)  # Using 'Int64' to handle potential NaN values

# Rename columns that start with 'rating' to start with 'task_food'
food_df_wide.columns = ['task_food' + col[len('rating'):] if col.startswith('rating') else col for col in food_df_wide.columns]

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date


In [65]:
# add hunger value
food_df_wide = food_df_wide.merge(hunger[['ParticipantIdentifier', 'trial_date', 'task_food_hunger_level']],
                                  on=['ParticipantIdentifier', 'trial_date'],
                                  how='left')

# Convert hunger to int
food_df_wide['task_food_hunger_level'] = food_df_wide['task_food_hunger_level'].astype(np.float32)

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_food_hunger_level


#### EDA

In [None]:
profile = ProfileReport(food_df_wide.iloc[:,2:], title=f"Food Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_food_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We had numerous outlier RT values.

We used the $z$ transform method to flag outliers (in the column `task_food_rt_flag`) based on Berger and Kiefer ([2021](https://doi.org/10.3389/fpsyg.2021.675558)) where they tested multiple methods of removing outliers from rt data.

We also set the flag threshold to 3.

In [66]:
# clean outlier rt values

from scipy.stats import zscore

# Step 1: Filter out rt cols
rt_columns = [col for col in food_df_wide.columns if col.endswith('_rt')]

# Step 2: Compute the z-scores for these columns
for col in rt_columns:
    z_col_name = col + '_z'
    food_df_wide[z_col_name] = zscore(food_df_wide[col], nan_policy='omit')

# Step 3: Check each row for values above a threshold in the '_z' columns
threshold = 3  # Define a threshold value
z_columns = [col + '_z' for col in rt_columns]
food_df_wide['task_food_rt_flag'] = food_df_wide[z_columns].apply(lambda row: any(abs(val) > threshold for val in row), axis=1)

z_columns_to_drop = [col for col in food_df_wide.columns if col.endswith('_z')]
wide_df = food_df_wide.drop(columns=z_columns_to_drop)


#### Save

In [67]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
food_df_wide['trial_date'] = pd.to_datetime(food_df_wide['trial_date']).dt.date

# Join with affect df
food_df_wide = df_complete_idDate.merge(food_df_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [68]:
# Removing duplicate rows from the food_df dataframe based on 'ParticipantIdentifier' and 'trial_date'
food_df_wide = food_df_wide.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date'])
food_df_wide.shape

(7820, 4)

In [69]:
# save to csv
if run_num == 1:
    # run 1
    print('Task was not part of run 1...')
if run_num == 2:
    # run 2
    food_df_wide.to_csv(save_path + 'run2_task_food.csv', index=False)

Task was not part of run 1...


### N-Back

The n-back sequence was created as follows (where `n` indicates whether it is 2-back or 3-back)

```javascript
function constructSequence(n) {
    const ls = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let number = 0;
    let char = "";
    const alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
    const sequence = []
    for (let i = 0; i < SEQ_LEN; i++) {
        number = ls[Math.floor(Math.random() * ls.length)];
        if (i >= n && number <= 2) {
            char = sequence[i - n];
            //console.log("in if ===>", char, sequence, i, n)
        } else {
            char = alphabet[Math.floor(Math.random() * alphabet.length)];
            //console.log("in else ==>", char)
        }
        sequence.push(char)

    }
    return sequence;
}
```

Given that `ls` has a length of 11 this means that on average there is a 3/11 chance of having a match (for positions 3 and onward)


In [70]:
import json

#### Load Data

In [71]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

#### Format Data

In [72]:
nback_df = df.loc[df.ResultIdentifier == 'task_custom_nBack_results'].reset_index(drop=True)

In [73]:
# we only need this as bids were being overwritten when the bid was NOT accepted with a 0 bid...
# this was fixed on Feb 7, but using the actual bid value will also continue to work...

nback_df_bids = df.loc[df.ResultIdentifier == 'task_custom_nBack_diffSelect'].reset_index(drop=True)

In [74]:
nback_df.Answers[5]

'{"bid":56,"randomNumber":98,"correctness":0.8181818181818182,"earnings":98,"mode":"hard","matched":6,"missed":1,"sequence":["X","Z","H","I","S","S","O","I","S","H","I","O","I","W","O","I","I","R","G","O","V","U","O","V","U"],"falseAlarm":3,"indexOfMatchClicked":[9,11,12,13,15,16,20,23,25]}'

In [75]:
# Parse json to create columns
for i in range(nback_df.shape[0]):
    nback_df.loc[i, 'task_nback_bid'] = json.loads(nback_df_bids.Answers[i])['bid']
    nback_df.loc[i, 'task_nback_rndNum'] = json.loads(nback_df.Answers[i])['randomNumber']    
    nback_df.loc[i, 'task_nback_mode'] = json.loads(nback_df.Answers[i])['mode']
    nback_df.loc[i, 'task_nback_matched'] = json.loads(nback_df.Answers[i])['matched']    
    nback_df.loc[i, 'task_nback_missed'] = json.loads(nback_df.Answers[i])['missed']    
    nback_df.loc[i, 'task_nback_falseAlarm'] = json.loads(nback_df.Answers[i])['falseAlarm']
    nback_df.loc[i, 'task_nback_trialCount'] = len(json.loads(nback_df.Answers[i])['sequence'])

In [76]:
nback_df = nback_df.drop(['ResultIdentifier', 'Answers', 'EndDate', 'datetime'], axis=1)
nback_df = nback_df.rename(columns={"time": "task_nback_time"})

#### Add Features

Adding the following metrics for Binary Classification:

1. **Accuracy**: 
   The proportion of correctly predicted classifications in the total predictions made.
   $$
   \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
   $$

2. **Precision** (or Positive Predictive Value):
   The proportion of positive identifications that were actually correct.
   $$
   \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
   $$

3. **Recall** (or Sensitivity or True Positive Rate):
   The proportion of actual positives that were identified correctly.
   $$
   \text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
   $$

4. **Specificity** (or True Negative Rate):
   The proportion of actual negatives that were identified correctly.
   $$
   \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}}
   $$

5. **False Alarm Rate** (or Fall-Out):
   The proportion of actual negatives that were incorrectly classified as positive.
   $$
   \text{False Alarm Rate} = \frac{\text{FP}}{\text{TN} + \text{FP}}
   $$

6. **F1 Score**:
   The harmonic mean of precision and recall, giving a balance between the two.
   $$
   \text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
   $$

7. **Matthews Correlation Coefficient (MCC)**:
   A metric that takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
   $$
   \text{MCC} = \frac{\text{TP} \times \text{TN} - \text{FP} \times \text{FN}}{\sqrt{(\text{TP} + \text{FP})(\text{TP} + \text{FN})(\text{TN} + \text{FP})(\text{TN} + \text{FN})}}
   $$

8. **Bias (C or criterion)**:
   A metric from signal detection theory that indicates the participant's response bias. A positive value indicates a bias toward saying "no" (


In [77]:
truePos = nback_df.task_nback_matched
trueNeg = nback_df.task_nback_trialCount - nback_df.task_nback_matched - nback_df.task_nback_missed - nback_df.task_nback_falseAlarm
falsePos = nback_df.task_nback_falseAlarm
falseNeg = nback_df.task_nback_missed

# proportion of correct classifications in total predictions made
nback_df['task_nback_accuracy'] = (truePos + trueNeg) / (truePos + trueNeg + falsePos + falseNeg)
# positive predictive value (hit rate)
nback_df['task_nback_precision'] = truePos / (truePos + falsePos)
# true positive rate (sensitivity)
nback_df['task_nback_recall'] = truePos / (truePos + falseNeg)
# true negative rate (false_alarm_rate)
nback_df['task_nback_specificity'] = trueNeg / (trueNeg + falsePos)
# Proportion of times the participant incorrectly indicates an n-back match when there wasn't one.
nback_df['task_nback_falseAlarmRate'] = falsePos / (trueNeg + falsePos)
# Harmonic mean of precision and recall, giving a balance between the two
nback_df['task_nback_F1'] = 2 * ((nback_df['task_nback_precision'] * nback_df['task_nback_recall'])/(nback_df['task_nback_precision'] + nback_df['task_nback_recall']))
# Matthews Correlation Coefficient (MCC):
# It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
nback_df['task_nback_MCC'] = ((truePos * trueNeg) - (falsePos * falseNeg)) / (np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [78]:
# metric from signal detection theory
# indicates the participant's response bias. 
# A positive value indicates a bias toward saying "no" (conservative)
# A negative value indicates a bias toward saying "yes" (liberal).

from scipy.stats import norm

def calculate_criterion(hit_rate, false_alarm_rate):
    # Calculate the Z scores for the hit rate and false alarm rate
    z_hit = norm.ppf(hit_rate)
    z_fa = norm.ppf(false_alarm_rate)
    
    # Calculate the criterion C
    C = -0.5 * (z_hit + z_fa)
    
    return C

nback_df['task_nback_bias'] = calculate_criterion(nback_df['task_nback_precision'], nback_df['task_nback_falseAlarmRate'])

  C = -0.5 * (z_hit + z_fa)


In [79]:
# People that chose the easy mode (2-back instead of 3 back) were assigned a bid of ZERO
# However, this suggests that they would have done the easy task for nothing, which is NOT the case
# as they would not have done it for the max possible (100) points
# I arbitrarily assign subjects in the easy condition a bid of 200

nback_df.loc[nback_df.task_nback_bid == 0, 'task_nback_bid'] = 200

#### EDA Profiling

In [None]:
profile = ProfileReport(nback_df.iloc[:,3:], title= f"n-Back Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"nback_report_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  x = asanyarray(arr - arrmean)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

**NOTES**

Looking at the data it is clear that something wonky went on in some trials where people have matched values up to 67, and missed values of -56. 

Can calculate a super low probability number of matches and delete trials with any values above that - as well as any trials with negative "missed" values.

False alarm also has a max of 136

In [80]:
np.where(np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)).isna())

  result = getattr(ufunc, method)(*inputs, **kwargs)


(array([  17,   90,  100,  143,  306,  363,  424,  704,  705,  720,  735,
         932,  965, 1177, 1180, 1310, 1335, 1352, 1505, 1528, 1742, 1747,
        1865, 1946, 1962, 1998, 2134, 2177, 2239, 2412, 2492, 2530, 2914,
        3089, 3153, 3337, 3611, 3739, 3838, 3978, 4033]),)

In [81]:
nback_df.iloc[35]

ParticipantIdentifier        837ea159-60c6-4bde-b3ba-2c107844a9bb
trial_date                                             2022-10-24
task_nback_time                                          21:25:24
task_nback_bid                                               50.0
task_nback_rndNum                                            63.0
task_nback_mode                                              hard
task_nback_matched                                            4.0
task_nback_missed                                             0.0
task_nback_falseAlarm                                         4.0
task_nback_trialCount                                        25.0
task_nback_accuracy                                          0.84
task_nback_precision                                          0.5
task_nback_recall                                             1.0
task_nback_specificity                                   0.809524
task_nback_falseAlarmRate                                0.190476
task_nback

#### Clean

The cleaning process consists of removing any entries/rows where there is an impossible or extrememly improbable value in any of the `matched`, `missed` and `false_alarm` columns.

Impossible means that any of: 

1. $\text{matches} > 23$, given that there were only 23 possibile maches for the 3-back (hard mode).
⚡ However, I chose to eliminate any trials that had a likelihood of less than 1 in a million, which was $\text{matches} > 17$
2. $\text{misses} < 0$, given that it is impossible.

3. $\text{false alarm} > 25$, given that it is impossible.

In [82]:
nback_df.shape

(4105, 18)

In [83]:
nback_df = nback_df.loc[nback_df.task_nback_matched <18,]

In [84]:
nback_df.shape

(4075, 18)

In [85]:
nback_df = nback_df.loc[nback_df.task_nback_missed >= 0,]

In [86]:
nback_df.shape

(3939, 18)

In [87]:
nback_df = nback_df.loc[nback_df.task_nback_falseAlarm <= 25,]

In [88]:
nback_df.shape

(3937, 18)

In [None]:
# rerun EDA
profile = ProfileReport(nback_df.iloc[:,3:], title= f"n-Back Task Run {run_num} - cleaned | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"nback_report_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### GoNoGo

**Parameters**

- 50 trials
- Stimulus = 250ms
- ITI = 450ms
- go/no-go ratio 4:1 (40/10)
- NOTE THAT IF YOU PRESS THE BUTTON DURING THE ITI IT STILL COUNTS FOR THAT TRIAL! So the stimulus disappears but the trial is still “on”…each trial = 250 + 450 = 700ms

**Data**

For each trial

- `trialType` Go or NoGo trial
- `stim` Circle color
- `stimDuration` How long is stim on
- `iti`
- `RT` RT to click
- `correct` Correct/Error
    - Go trial = click
    - NoGo trial = no click

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    According to <a href="https://link.springer.com/article/10.3758/s13428-017-0923-5">this paper</a> optimal go/no-go ratios to maximize false alarms were predicted to occur for the shortest tested ITI (450 ms) and a go/no-go ratio near 4:1. 
    <br><br>These values are predicted to produce a mean of 6.4 to 8.7 false alarms per 150 trials (95% confidence interval of the mean)
    <br><br>Given that we ran 50 trials we would expect 1/3 of this range. We found our mean false alarm rate was in this range at 2.7 (or 8.1 for 150 trials).
</div>

#### Load Data

In [89]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'gng_df' in globals():
    del(gng_df)
    print('deleted gng_df')

deleted existing df


In [90]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

In [91]:
gng_df =  df.loc[df.ResultIdentifier == 'task_custom_gonogo'].reset_index(drop=True)

#### Format Data

In [92]:
json.loads(gng_df.Answers[0])

{'reactionTime': {'0': 453,
  '1': 405,
  '2': 412,
  '3': 401,
  '4': 375,
  '5': 382,
  '6': 407,
  '7': 403,
  '8': 410,
  '9': 467,
  '11': 536,
  '12': 510,
  '13': 500,
  '15': 447,
  '16': 405,
  '18': 428,
  '19': 458,
  '20': 423,
  '21': 446,
  '24': 399,
  '25': 423,
  '26': 447,
  '28': 526,
  '29': 539,
  '30': 571,
  '31': 553,
  '32': 494,
  '33': 457,
  '34': 497,
  '35': 488,
  '36': 436,
  '37': 434,
  '38': 425,
  '39': 457,
  '41': 478,
  '42': 435,
  '43': 466,
  '44': 423,
  '48': 420,
  '49': 391},
 'correctness': {'0': 0,
  '1': 0,
  '2': 0,
  '3': 0,
  '4': 0,
  '5': 0,
  '6': 0,
  '7': 0,
  '8': 0,
  '9': 0,
  '11': 0,
  '12': 0,
  '13': 0,
  '15': 0,
  '16': 0,
  '18': 0,
  '19': 0,
  '20': 0,
  '21': 0,
  '24': 0,
  '25': 0,
  '26': 0,
  '28': 0,
  '29': 0,
  '30': 0,
  '31': 0,
  '32': 0,
  '33': 0,
  '34': 0,
  '35': 0,
  '36': 0,
  '37': 0,
  '38': 0,
  '39': 0,
  '41': 0,
  '42': 0,
  '43': 0,
  '44': 0,
  '48': 0,
  '49': 0},
 'incorrectNoGo': 0,
 'corr

In [93]:
for i in range(gng_df.shape[0]):
    gng_df.loc[i, 'task_gng_incorrectNoGo'] = json.loads(gng_df.Answers[i])['incorrectNoGo']
    gng_df.loc[i, 'task_gng_incorrectGo'] = json.loads(gng_df.Answers[i])['incorrectGo']    
    gng_df.loc[i, 'task_gng_correctGo'] = json.loads(gng_df.Answers[i])['correctGo']
    gng_df.loc[i, 'task_gng_correctNoGo'] = json.loads(gng_df.Answers[i])['correctNoGo']    
    gng_df.loc[i, 'task_gng_avgRt'] = np.mean(list(json.loads(gng_df.Answers[i])['reactionTime'].values()))

In [94]:
gng_df['task_gng_time'] = gng_df.time

In [95]:
gng_df = gng_df[['ParticipantIdentifier', 'trial_date',
                 'task_gng_time',
                 'task_gng_incorrectNoGo',
                 'task_gng_incorrectGo',
                 'task_gng_correctGo',
                 'task_gng_correctNoGo',
                 'task_gng_avgRt'
                ]]

In [96]:
gng_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_gng_time,task_gng_incorrectNoGo,task_gng_incorrectGo,task_gng_correctGo,task_gng_correctNoGo,task_gng_avgRt
0,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,07:07:34,0.0,0.0,40.0,10.0,450.675
1,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,07:08:33,4.0,0.0,40.0,6.0,343.704545


#### EDA

In [None]:
profile = ProfileReport(gng_df.iloc[:,3:], title=f"Go-Nogo Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_gng_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We can see that there are 84 trials where the RT is 0. These trials are removed.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

Therefore all trials with RTs below 100 were also removed.

In [97]:
zero_trials = (gng_df.task_gng_avgRt == 0).sum()
sub100_trials = (gng_df.task_gng_avgRt < 100).sum()
trials_n = gng_df.shape[0]

print(f'{zero_trials}, or {(zero_trials/trials_n) *100:.1f}% of trials had RTs of 0.')
print(f'{sub100_trials}, or {(sub100_trials/trials_n) * 100:.1f}% of trials had RTs of less than 100.')

117, or 2.8% of trials had RTs of 0.
315, or 7.6% of trials had RTs of less than 100.


In [98]:
gng_df.iloc[:,3:].mean()

task_gng_incorrectNoGo      3.782504
task_gng_incorrectGo        5.640889
task_gng_correctGo         34.359111
task_gng_correctNoGo        4.615273
task_gng_avgRt            325.114669
dtype: float64

In [99]:
# Remove all trials with impossibly short rts
gng_df = gng_df.loc[gng_df.task_gng_avgRt >= 100,]

# Rerun EDA
profile = ProfileReport(gng_df.iloc[:,3:], title=f"Go-Nogo Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_gng_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [100]:
# Note the decrease in average incorrectGo (false alarm)
gng_df.iloc[:,3:].mean()

task_gng_incorrectNoGo      4.037405
task_gng_incorrectGo        3.050222
task_gng_correctGo         36.949778
task_gng_correctNoGo        4.373267
task_gng_avgRt            350.201768
dtype: float64

### BART

> The primary score used to measure BART performance is the average number of pumps on unexploded balloons, with higher scores indicative of greater risk-taking propensity (Bornovalova et al. 2005; Lejuez et al. 2002)

[Scoring Alternatives Paper](https://www.researchgate.net/publication/301645337_The_Multiple_Faces_of_Risk-Taking_Scoring_Alternatives_for_the_Balloon-Analogue_Risk_Task)

#### Load Data

In [101]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'bart_df' in globals():
    del(bart_df)
    print('deleted bart_df')

deleted existing df


In [102]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

#### Format Data

In [103]:
# Select rows with BART task
bart_df = df.loc[df.ResultIdentifier.str.contains('custom_bart')].reset_index(drop=True)
# Remove rows that are just directions
bart_df = bart_df.loc[~bart_df.ResultIdentifier.str.contains('_info')].reset_index(drop=True)
# Remove summary rows
bart_df = bart_df.loc[~bart_df.ResultIdentifier.str.contains('summary')].reset_index(drop=True)

In [104]:
json.loads(bart_df.Answers[0])

{'timingInMs': [487,
  150,
  185,
  2654,
  104,
  380,
  102,
  161,
  98,
  103,
  98,
  76,
  107,
  84,
  97,
  118,
  291],
 'thisRoundEarnings': 0,
 'numberOfPumps': 17,
 'totalEarnings': 0,
 'balloonPopsAt': 17}

In [105]:
# avg rt
def foo(x):
    try:
        return np.mean(json.loads(x)['timingInMs'][1:]) # start on second tap as people take longer on first...
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['avg_rt'] = v(bart_df.Answers)

# pop?
def foo(x):
    try:
        return json.loads(x)['thisRoundEarnings']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['thisRoundEarnings'] = v(bart_df.Answers)
    
# numberOfPumps
def foo(x):
    try:
        return json.loads(x)['numberOfPumps']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['pumps'] = v(bart_df.Answers)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [106]:
# only take unpopped trials
bart_df = bart_df.loc[bart_df.thisRoundEarnings>0]

In [107]:
pumps = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])['pumps'].sum()).reset_index()
pumps = pumps.rename(columns={'pumps': 'task_bart_total_pumps'})
unpopped = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])['pumps'].count()).reset_index()
unpopped = unpopped.rename(columns={'pumps': 'task_bart_unpopped_n'})
unpopped.head()             

Unnamed: 0,ParticipantIdentifier,trial_date,task_bart_unpopped_n
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,5
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,3
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,7
3,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-03,6
4,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-04,8


In [108]:
# get mean of pumps and mean of rt for each DAY
bart_df = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])[['avg_rt', 'pumps']].mean()).reset_index()
bart_df = bart_df.rename(columns={"avg_rt": "task_bart_avg_rt", 'pumps': 'task_bart_avg_pumps'})

# add other data
bart_df = bart_df.merge(pumps, how='left', on=['ParticipantIdentifier', 'trial_date'])
bart_df = bart_df.merge(unpopped, how='left', on=['ParticipantIdentifier', 'trial_date'])

# calculate score
bart_df['task_bart_score'] = bart_df.task_bart_total_pumps * 5

In [109]:
bart_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_bart_avg_rt,task_bart_avg_pumps,task_bart_total_pumps,task_bart_unpopped_n,task_bart_score
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,851.51869,16.8,84,5,420
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,258.585139,18.666667,56,3,280


#### EDA

In [None]:
profile = ProfileReport(bart_df.iloc[:,2:], title=f"BART Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_BART_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We can see that there are trials with VERY long average RTs. These should be removed before calculations are done.

We used the $z$ transform method to flag outliers based on Berger and Kiefer ([2021](https://doi.org/10.3389/fpsyg.2021.675558)) where they tested multiple methods of removing outliers from rt data.

We also set the flag threshold to 3.

---

We can also see that the `unpopped_n` (referring to trials where the balloon did not pop) values go up to 18 which is not possible since there are only 10 trials. Entries where there are more than 10 trials are removed.

In [110]:
bart_df.iloc[:,2:].mean()

task_bart_avg_rt         239.729548
task_bart_avg_pumps       14.196292
task_bart_total_pumps    103.367426
task_bart_unpopped_n       7.383665
task_bart_score          516.837130
dtype: float64

In [111]:
# clean outlier rt values

from scipy.stats import zscore

# Step 1: Filter out rt cols
rt_columns = [col for col in bart_df.columns if col.endswith('_rt')]

# Step 2: Compute the z-scores for these columns
for col in rt_columns:
    z_col_name = col + '_z'
    bart_df[z_col_name] = zscore(bart_df[col], nan_policy='omit')

In [112]:
# Trial removal stats - RT
threshold = 3  # Define a threshold value

trials_removed = (abs(bart_df.task_bart_avg_rt_z) > threshold).sum()
trials_n = bart_df.shape[0]

print(f'Setting a z-score threshold of 3 removes {trials_removed} entries, or {(trials_removed/trials_n) * 100:.1f}% of the total.')

# Remove trials
bart_df = bart_df.loc[abs(bart_df.task_bart_avg_rt_z) <=threshold,]
# Remove z-score column
z_columns_to_drop = [col for col in bart_df.columns if col.endswith('_z')]
bart_df = bart_df.drop(columns=z_columns_to_drop)

Setting a z-score threshold of 3 removes 11 entries, or 0.3% of the total.


In [113]:
# Trial removal stats - trial num
trials_removed = (bart_df.task_bart_unpopped_n > 10).sum()
trials_n = bart_df.shape[0]

print(f'There are {trials_removed} entries, or {(trials_removed/trials_n) * 100:.1f}% that have more than 10 trials.')

# Remove trials
bart_df = bart_df.loc[bart_df.task_bart_unpopped_n <=10,]

There are 2 entries, or 0.0% that have more than 10 trials.


In [96]:
# Rerun EDA
profile = ProfileReport(bart_df.iloc[:,2:], title=f"BART Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_BART_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### EmoStroop

#### Load Data

In [114]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'emoStroop_df' in globals():
    del(emoStroop_df)
    print('deleted emoStroop_df')

deleted existing df


In [115]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

In [116]:
emoStroop_df = df.loc[df.ResultIdentifier.str.contains('emoStroop_trial')].reset_index(drop=True)

#### Format Data

In [117]:
json.loads(emoStroop_df.Answers[4])

{'emotion': 'sad',
 'text': 'angry',
 'startTime': 3581,
 'endTime': 6115,
 'chosenEmotion': 'sad',
 'correctness': 'incorrect'}

In [118]:
# congruent
def foo(x):
    try:
        return json.loads(x)['emotion'] == json.loads(x)['text']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_congruent'] = v(emoStroop_df.Answers)

# rt
def foo(x):
    try:
        return json.loads(x)['endTime'] - json.loads(x)['startTime']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_rt'] = v(emoStroop_df.Answers)
    
# correct
def foo(x):
    try:
        return json.loads(x)['emotion'] == json.loads(x)['chosenEmotion']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_correct'] = v(emoStroop_df.Answers)


In [119]:
emoStroop_df = pd.DataFrame(emoStroop_df.groupby(['ParticipantIdentifier', 'trial_date', 'task_emoStroop_congruent'])
             [['task_emoStroop_rt', 'task_emoStroop_correct']].mean()).reset_index()


In [120]:
# convert to wide
emoStroop_df = emoStroop_df.pivot_table(index = ['ParticipantIdentifier', 'trial_date'],
                         columns = 'task_emoStroop_congruent',
                         values = ['task_emoStroop_rt', 'task_emoStroop_correct']).reset_index()

In [121]:
emoStroop_df.columns

MultiIndex([( 'ParticipantIdentifier',    ''),
            (            'trial_date',    ''),
            ('task_emoStroop_correct', False),
            ('task_emoStroop_correct',  True),
            (     'task_emoStroop_rt', False),
            (     'task_emoStroop_rt',  True)],
           names=[None, 'task_emoStroop_congruent'])

In [122]:
# create correct column names
new_cols = []
for i in range(emoStroop_df.shape[1]):
    new_cols.append(emoStroop_df.columns.get_level_values(0)[i] + 
                    str(emoStroop_df.columns.get_level_values(1)[i]))
# x.columns.get_level_values(0)[1] + str(x.columns.get_level_values(1)[1])

In [123]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_emoStroop_accuracy_incongruent',
    'task_emoStroop_accuracy_congruent',
    'task_emoStroop_rt_incongruent',
    'task_emoStroop_rt_congruent'
]

In [124]:
emoStroop_df.columns = emoStroop_df.columns.to_flat_index()

In [125]:
emoStroop_df.columns = new_cols

In [126]:
emoStroop_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_emoStroop_accuracy_incongruent,task_emoStroop_accuracy_congruent,task_emoStroop_rt_incongruent,task_emoStroop_rt_congruent
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,1.0,1.0,1883.555556,1849.666667
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,1.0,1.0,3137.909091,1959.5


#### EDA

In [None]:
profile = ProfileReport(emoStroop_df.iloc[:,2:], title=f"EmoStroop Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_emoStroop_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We can see that there are entries with VERY long average `rt_congruent` and `rt_incongruent` values. These should be removed before calculations are done.

We used the Median Absolute Deviation (MAD) with a threshold of 3 to remove rt outliers ([Leys et al., 2014](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

Use rule of thumb of 10 seconds...


In [127]:
emoStroop_df.to_csv('emostroop.csv', index=False)

In [128]:
# Filter rows where the absolute deviation from the median is above 3 times the MAD
emoStroop_df = emoStroop_df[emoStroop_df['task_emoStroop_rt_incongruent'] <= 10000]
emoStroop_df = emoStroop_df[emoStroop_df['task_emoStroop_rt_congruent'] <= 10000]

In [129]:
from scipy.stats import median_abs_deviation

# Original size
n_original = emoStroop_df.shape[0]

# Clean outlier rt values
threshold = 3  # Define a threshold value

# Compute the MAD using scipy
mad_i = median_abs_deviation(emoStroop_df['task_emoStroop_rt_incongruent'], nan_policy='omit')
mad_c = median_abs_deviation(emoStroop_df['task_emoStroop_rt_congruent'], nan_policy='omit')

# Compute the median value of the 'reaction time' column
median_val_i = emoStroop_df['task_emoStroop_rt_incongruent'].median()
median_val_c = emoStroop_df['task_emoStroop_rt_congruent'].median()

# Filter rows where the absolute deviation from the median is above 3 times the MAD
emoStroop_df = emoStroop_df[(emoStroop_df['task_emoStroop_rt_incongruent'] - median_val_i).abs() <= threshold * mad_i]
emoStroop_df = emoStroop_df[(emoStroop_df['task_emoStroop_rt_congruent'] - median_val_c).abs() <= threshold * mad_c]

n_clean = emoStroop_df.shape[0]

print(f'Cleaning using MAD with a threshold of {threshold} removed {n_original - n_clean} entries, or {((n_original - n_clean) / n_original) * 100:.1f}%.')

Cleaning using MAD with a threshold of 3 removed 860 entries, or 21.8%.


In [None]:
# Rerun EDA
profile = ProfileReport(emoStroop_df.iloc[:,2:], title=f"EmoStroop Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_emoStroop_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Motivation

#### Load Data

In [130]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'task_motivation' in globals():
    del(task_motivation)
    print('deleted task_motivation')

deleted existing df


In [131]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

In [132]:
task_motivation = df.loc[df.ResultIdentifier.str.contains('task_motivation')].reset_index(drop=True)

#### Format Data

In [133]:
task_motivation = task_motivation[['ParticipantIdentifier', 'trial_date', 'time', 'Answers']]

In [134]:
task_motivation.shape

(4296, 4)

In [135]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_motivation_time',
    'task_motivation_level'
]

In [136]:
task_motivation.columns = new_cols
task_motivation.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_motivation_time,task_motivation_level
0,6b827de8-fe47-4007-aad3-202655b954e3,2022-10-24,07:09:39,6
1,a33e1d38-6ee8-4da6-993b-a94a8ae7fc30,2022-10-24,08:01:46,8


#### EDA

In [120]:
profile = ProfileReport(task_motivation.iloc[:,2:], title=f"Task Motivation Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_motivation_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### NASA TLX

#### Load Data

In [137]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'nasa_tlx' in globals():
    del(nasa_tlx)
    print('deleted nasa_tlx')

deleted existing df


In [138]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_app_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_app_survey_results.csv')
    
# Convert the date column back to datetime.date
df['trial_date'] = pd.to_datetime(df['trial_date']).dt.date

In [139]:
nasa_tlx = df.loc[df.ResultIdentifier.str.contains('nasa_')].reset_index(drop=True)

#### Format Data

In [140]:
nasa_tlx.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate,datetime,trial_date,time
0,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,nasa_mental_demand,2,2022-10-24T07:09:03-04:00,2022-10-24 07:09:03-04:00,2022-10-24,07:09:03
1,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,nasa_temporal_demand,0,2022-10-24T07:09:04-04:00,2022-10-24 07:09:04-04:00,2022-10-24,07:09:04
2,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,nasa_performance,17,2022-10-24T07:09:06-04:00,2022-10-24 07:09:06-04:00,2022-10-24,07:09:06
3,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,nasa_effort,2,2022-10-24T07:09:10-04:00,2022-10-24 07:09:10-04:00,2022-10-24,07:09:10
4,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,nasa_frustration,0,2022-10-24T07:09:12-04:00,2022-10-24 07:09:12-04:00,2022-10-24,07:09:12


In [141]:
# convert to wide
nasa_tlx = nasa_tlx.pivot_table(index = ['ParticipantIdentifier', 'trial_date'],
                         columns = 'ResultIdentifier',
                         values = 'Answers').reset_index()

# remove index name
nasa_tlx = nasa_tlx.rename_axis(None, axis=1)

In [142]:
# tweak column names
list(nasa_tlx.iloc[:, 2:].add_prefix('task_').columns)

['task_nasa_distraction',
 'task_nasa_effort',
 'task_nasa_frustration',
 'task_nasa_luck',
 'task_nasa_mental_demand',
 'task_nasa_performance',
 'task_nasa_temporal_demand']

In [143]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_nasa_distraction',
    'task_nasa_effort',
    'task_nasa_frustration',
    'task_nasa_luck',
    'task_nasa_mental_demand',
    'task_nasa_performance',
    'task_nasa_temporal_demand'
]

In [144]:
nasa_tlx.columns = new_cols

nasa_tlx.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_nasa_distraction,task_nasa_effort,task_nasa_frustration,task_nasa_luck,task_nasa_mental_demand,task_nasa_performance,task_nasa_temporal_demand
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,6.0,18.0,6.0,0.0,14.0,15.0,7.0
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,5.0,15.0,0.0,7.0,11.0,13.0,6.0


#### EDA

In [None]:
profile = ProfileReport(nasa_tlx.iloc[:,2:], title=f"NASA TLX Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"NASA_tlx_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

There are maximimum values over 600 which doesn't make sense since the max is 20

In [145]:
nasa_tlx.to_csv('nasa.csv', index=False)

In [146]:
# Replace values greater than 20 in numeric columns with NaN
nasa_tlx.iloc[:,2:] = nasa_tlx.iloc[:,2:].where(lambda x: x <= 20, other=pd.NA)


In [None]:
# rerun EDA
profile = ProfileReport(nasa_tlx.iloc[:,2:], title=f"NASA TLX Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"NASA_tlx_run{run_num}_clean.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Join Custom Tasks

In [147]:
gng_df.shape

(3823, 8)

In [148]:
nback_df.shape

(3937, 18)

In [149]:
bart_df.shape

(4105, 7)

In [150]:
emoStroop_df.shape

(3087, 6)

In [151]:
task_motivation.shape

(4296, 4)

In [152]:
nasa_tlx.shape

(4095, 9)

In [153]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_affect_wide['trial_date'] = pd.to_datetime(df_daily_affect_wide['trial_date']).dt.date

# Join with affect df
df_complete = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [154]:
# Join app self report
df_complete = df_complete.merge(df_daily_sr_wide, how='left', on=['ParticipantIdentifier', 'trial_date'] )

if run_num == 2:
    # Join social support weekly
    df_complete = df_complete.merge(ss_df, how='left', on=['ParticipantIdentifier', 'trial_date'] )

    # join social support daily
    df_complete = df_complete.merge(df_daily_ss_wide, how='left', on=['ParticipantIdentifier', 'trial_date'] )

    # Join food task
    df_complete = df_complete.merge(food_df_wide, how='left', on=['ParticipantIdentifier', 'trial_date'] )

# Join motivation
df_complete = df_complete.merge(task_motivation, how='left', on=['ParticipantIdentifier', 'trial_date'] )
# Motivation had a number of duplicate entries where there were two entries on the same day a few seconds apart
df_complete = df_complete.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date'], keep='last')

# Join gng
df_complete = df_complete.merge(gng_df, how='left', on=['ParticipantIdentifier', 'trial_date'] )

In [155]:
df_complete = df_complete.merge(bart_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(emoStroop_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(nback_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(nasa_tlx, how='left', on=['ParticipantIdentifier', 'trial_date'])

# remove duplicates
df_complete = df_complete.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date'], keep='last')

# remove data outliers
if run_num == 2:
    df_complete = df_complete.loc[df_complete['trial_date']>= datetime.date(2023, 1, 30)]

In [156]:
df_complete.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,task_nback_F1,task_nback_MCC,task_nback_bias,task_nasa_distraction,task_nasa_effort,task_nasa_frustration,task_nasa_luck,task_nasa_mental_demand,task_nasa_performance,task_nasa_temporal_demand
0,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-27,3.0,3.0,1.0,5.0,5.0,2.0,4.0,5.0,...,0.666667,0.527102,0.310441,4.0,11.0,9.0,5.0,15.0,5.0,10.0
1,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-28,1.0,4.0,1.0,5.0,1.0,3.0,1.0,5.0,...,0.857143,0.801587,0.262824,8.0,11.0,12.0,12.0,19.0,9.0,11.0


## HK Tasks

### Trail Making

[RKStudio Documentation](https://rkstudio-support.careevolution.com/hc/en-us/articles/1500002201361-Trailmaking-Active-Task-Export-Format)

What is trailmaking task really measuring?

>The Trail Making Test is a neuropsychological test of visual attention and task switching. It consists of two parts in which the subject is instructed to connect a set of 25 dots as quickly as possible while still maintaining accuracy. The test can provide information about visual search speed, scanning, speed of processing, mental flexibility, as well as executive functioning.[[1](https://doi.apa.org/doiLanding?doi=10.1037%2F1040-3590.7.2.220)]

- visual attention
- task switching
- fluid intelligence/cognitive abilities

**Reference**

[1] [Salthouse, 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3141679/)

<div class="alert alert-block alert-warning">
<b>Note:</b><br>
<ul> 
    <li>We are using fewer "dots" (12)</li>
    <li>We are using both a number only and a letter/number version (e.g. 1-A-2-B-3-C...).</li>
<ul>
</div>

In [175]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'trailmaking_df' in globals():
    del(trailmaking_df)
    print('deleted trailmaking_df')

deleted existing df


In [176]:
print(f'Loading cohort {run_num}...')

Loading cohort 1...


In [177]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
days.sort()
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyTrailmakingResults')]
    versionInfo = [i for i in files if i.startswith('SurveyResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
            df_version = pd.read_csv(path + day + '/' + versionInfo[0])
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            temp_df_version = pd.read_csv(path + day + '/' + versionInfo[0])
            df = pd.concat([df,temp_df], axis=0)
            df_version = pd.concat([df_version,temp_df_version], axis=0)

 10%|█         | 9/87 [00:00<00:00, 86.31it/s]

100%|██████████| 87/87 [00:00<00:00, 87.04it/s]


In [178]:
# Identify the numeric and alphanumeric versions
df_version = df_version[['SurveyResultKey', 'SurveyName']]
df_version = df_version.loc[df_version.SurveyName.str.contains('_trail')][['SurveyResultKey', 'SurveyName']]
df = pd.merge(df, df_version, how='left', on='SurveyResultKey')

# rename SurveyName
d = {'task_hk_trail_making': 'task_hk_trailmaking_alphaNumeric', 'task_hk_trailmaking_a1': 'task_hk_trailmaking_numeric'}

df = df.replace({"SurveyName": d})

df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [179]:
# Select subjects from correct run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [180]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 8384/8384 [00:00<00:00, 9582.09it/s]


In [181]:
df.head(2)

Unnamed: 0,SurveyTrailmakingResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,NumberOfErrors,StartDate,EndDate,Taps,SurveyName,datetime,trial_date,time
0,0867718d-533e-ed11-aac0-0afb9334277d,e766718d-533e-ed11-aac0-0afb9334277d,c766718d-533e-ed11-aac0-0afb9334277d,2f32cd19-e9c5-4aad-8999-6f4646169ab6,0,2022-09-27T06:59:58-04:00,2022-09-27T07:00:10-04:00,"[{""TapTimestamp"":1.1142810583114624,""TapIndex""...",task_hk_trailmaking_numeric,2022-09-27 07:00:10-04:00,2022-09-27,07:00:10
1,bac0dc28-543e-ed11-aac0-0afb9334277d,a1c0dc28-543e-ed11-aac0-0afb9334277d,93c0dc28-543e-ed11-aac0-0afb9334277d,0ca43379-41b5-47fb-90ba-0a22e6bf5586,1,2022-09-27T07:04:18-04:00,2022-09-27T07:04:33-04:00,"[{""TapTimestamp"":1.8917070627212524,""TapIndex""...",task_hk_trailmaking_numeric,2022-09-27 07:04:33-04:00,2022-09-27,07:04:33


The important data is in `Taps` where we have:
- `TapTimestamp`
- `TapIndex`
- `TapIncorrect`

I want to get the last `TapTimestamp` to calculate total timing

In [182]:
# Taps is a string of a list of dictionaries
df.Taps[250]

'[{"TapTimestamp":2.0672110319137573,"TapIndex":0,"TapIncorrect":false},{"TapTimestamp":3.3330730199813843,"TapIndex":1,"TapIncorrect":false},{"TapTimestamp":4.3167099952697754,"TapIndex":2,"TapIncorrect":false},{"TapTimestamp":6.08289897441864,"TapIndex":3,"TapIncorrect":false},{"TapTimestamp":6.7010899782180786,"TapIndex":4,"TapIncorrect":false},{"TapTimestamp":7.3834370374679565,"TapIndex":5,"TapIncorrect":false},{"TapTimestamp":8.11682403087616,"TapIndex":6,"TapIncorrect":false},{"TapTimestamp":8.8669099807739258,"TapIndex":7,"TapIncorrect":false},{"TapTimestamp":9.3168929815292358,"TapIndex":8,"TapIncorrect":false},{"TapTimestamp":9.8836699724197388,"TapIndex":9,"TapIncorrect":false},{"TapTimestamp":10.367033958435059,"TapIndex":10,"TapIncorrect":false},{"TapTimestamp":10.850473046302795,"TapIndex":11,"TapIncorrect":false},{"TapTimestamp":11.283805012702942,"TapIndex":12,"TapIncorrect":false}]'

In [183]:
# Can convert to list of dicts and then access an individual dict
data = json.loads(df.Taps[0])
print(data[-1])
data[-1]['TapTimestamp']

{'TapTimestamp': 11.237926006317139, 'TapIndex': 12, 'TapIncorrect': False}


11.237926006317139

In [184]:
# convert string Taps to list of dicts
df['TapsList'] = df['Taps'].apply(json.loads)

# test if any lists are empty...
for i in range(len(df.TapsList)):
    if df.TapsList[i]:
        x = df.TapsList[i][-1]['TapTimestamp']
    else:
        print(i)

2187


In [185]:
if run_num == 1:
    # drop rows with empty lists
    df = df.drop(df.index[[
        2187
        ]])

if run_num == 2:
    # drop rows with empty lists
    df = df.drop(df.index[[
        1667,
        11010
        ]])

In [186]:
df.shape

(8383, 13)

In [187]:
# Add variance
def variance_of_differences(taps_str):
    # Convert string to list of dictionaries
    taps_list = json.loads(taps_str)
    
    # Extract TapTimestamp values
    timestamps = [tap['TapTimestamp'] for tap in taps_list]
    
    # Calculate the differences between consecutive timestamps
    differences = np.diff(timestamps)[1:] # diff between first two taps not relevant
    
    # Return variance of differences
    return np.var(differences)

# Apply the function to the TapsList column
df['variance_of_diff'] = df['Taps'].apply(variance_of_differences)

In [188]:
# assign new columns with final value from TapsList
# rename column
df = df.assign(task_trailmaking_time=lambda x: x.TapsList.apply(lambda x: x[-1]['TapTimestamp'] - x[1]['TapTimestamp']),
               task_trailmaking_errors=lambda x: x.NumberOfErrors)

# keep relevant columns
trailmaking_df = df[['ParticipantIdentifier', 'trial_date', 'time', 'SurveyName', 'task_trailmaking_time', 'task_trailmaking_errors', 'variance_of_diff']]

In [189]:
# CLEAN DATA (see cleaning heading for detail)
trailmaking_df = trailmaking_df.loc[(trailmaking_df['task_trailmaking_time'] < 30) & (trailmaking_df['task_trailmaking_time'] !=0) & (trailmaking_df['task_trailmaking_errors'] <=10)]

In [190]:
trailmaking_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,time,SurveyName,task_trailmaking_time,task_trailmaking_errors,variance_of_diff
0,2f32cd19-e9c5-4aad-8999-6f4646169ab6,2022-09-27,07:00:10,task_hk_trailmaking_numeric,9.556804,0,0.278111
1,0ca43379-41b5-47fb-90ba-0a22e6bf5586,2022-09-27,07:04:33,task_hk_trailmaking_numeric,10.499643,1,0.271957
3,412330b3-cc02-4030-96cd-f4cfdcc45fa6,2022-09-27,07:07:11,task_hk_trailmaking_numeric,11.782398,2,0.242148
4,0ca43379-41b5-47fb-90ba-0a22e6bf5586,2022-09-27,07:09:36,task_hk_trailmaking_alphaNumeric,12.533077,1,0.229768
5,3bb57dd9-1d69-471c-b769-b3323748bb9f,2022-09-27,07:23:09,task_hk_trailmaking_alphaNumeric,13.851104,0,0.59777


In [191]:
# make wide
trailmaking_df = trailmaking_df.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                                            columns='SurveyName', 
                                            values=['task_trailmaking_time', 'task_trailmaking_errors', 'variance_of_diff']).reset_index()

In [192]:
# flatten columns
trailmaking_df.columns = trailmaking_df.columns.to_series().str.join('_')

In [193]:
# rename columns
trailmaking_df.columns = ['trial_date', 'ParticipantIdentifier', 'task_trailmaking_alphaNumeric_errors', 'task_trailmaking_numeric_errors', 'task_trailmaking_alphaNumeric_time', 'task_trailmaking_numeric_time', 'task_trailmaking_alphaNumeric_var', 'task_trailmaking_numeric_var']

In [194]:
trailmaking_df.head()

Unnamed: 0,trial_date,ParticipantIdentifier,task_trailmaking_alphaNumeric_errors,task_trailmaking_numeric_errors,task_trailmaking_alphaNumeric_time,task_trailmaking_numeric_time,task_trailmaking_alphaNumeric_var,task_trailmaking_numeric_var
0,2022-09-27,099765a5-a9c9-4fff-b297-a39eab517267,1.0,2.0,17.782964,8.057159,1.611959,0.103868
1,2022-09-27,0ca43379-41b5-47fb-90ba-0a22e6bf5586,1.0,1.0,12.533077,10.499643,0.229768,0.271957
2,2022-09-27,146e1ab8-c4ca-4a52-9261-e108b38eea53,,0.0,,4.950253,,0.002129
3,2022-09-27,147400db-43d9-4155-8bf2-b85b8adf4315,1.0,0.0,16.690687,6.503019,0.407312,0.015242
4,2022-09-27,14b58072-ae3b-491e-a8ca-207f0d27ccf6,1.0,0.0,23.364913,7.081511,1.523848,0.15173


In [195]:
trailmaking_df.iloc[:,3:].mean()

task_trailmaking_numeric_errors       0.391064
task_trailmaking_alphaNumeric_time    8.404549
task_trailmaking_numeric_time         6.663877
task_trailmaking_alphaNumeric_var     0.274284
task_trailmaking_numeric_var          0.168276
dtype: float64

In [196]:
trailmaking_df.shape

(4184, 8)

#### EDA

In [None]:
profile = ProfileReport(trailmaking_df.iloc[:,2:], title=f"Trailmaking Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_trailmaking_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We had numerous outlier values.

**ERRORS**

Given that there were only 12 responses in a given trial and that the mean error rate was close to 0.5, maximum values such as 54 seem a bit unlikely and may indicate either someone was not trying or not understanding the task.

Given that we are dealing with a zero-inflated skewed distribution removing outliers appropriately is challenging.

I used an ad-hoc decision to remove those trials with more than 10 errors. This was total of 26 out fo the 6498 total trials.

**RTs**

We used median absolute deviation to remove outliers (see [Leys et al., 2013](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

We set the flag threshold to 3.

However this removed more than 10% of the obervations.

Therefore we decided to again use an ad-hoc approach setting the maximum RT threshold to 30 seconds which removes less than 1% of trials.

**Also** there were a number of trials with RTs of 0. These were removed.

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I went back and applied these cleaning heuristics to the data in long format so as not to remove both trials in the event they were not both outliers.
</div>

In [197]:
# Remove high error trials

threshold = 10

count = (trailmaking_df.task_trailmaking_numeric_errors > threshold).sum()
total_n =  len(trailmaking_df)

print(f'NUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

count = (trailmaking_df.task_trailmaking_alphaNumeric_errors > threshold).sum()

print(f'ALPHANUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')

NUMERIC
Setting a threshold of 10 removes 0 trials out of 4184 total numeric trials.
This is 0.0% of the trials.

ALPHANUMERIC
Setting a threshold of 10 removes 0 trials out of 4184 total alpha-numeric trials.
This is 0.0% of the trials.


In [198]:
# MAD approach to RT outlier removal

# Calculate MAD for the specified columns
def mad(series):
    median_value = series.median()
    return (series - median_value).abs().median()

# MAD-based outlier detection
def detect_outliers(series, threshold):
    median_value = series.median()
    mad_value = mad(series)
    
    # Detect outliers
    outliers = ((series - median_value).abs() > threshold * mad_value)
    return outliers

threshold = 3.0

# Detect outliers in the 'task_trailmaking_numeric_time' column
x = detect_outliers(trailmaking_df['task_trailmaking_numeric_time'], threshold=threshold).sum()
print(f'NUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.\n')

# Detect outliers in the 'task_trailmaking_alphaNumeric_time' column
x = detect_outliers(trailmaking_df['task_trailmaking_alphaNumeric_time'], threshold=threshold).sum()
print(f'ALPHANUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.')

NUMERIC:
Using a threshold of 3.0 with MAD we find 379 outliers in the 4184 observations.

ALPHANUMERIC:
Using a threshold of 3.0 with MAD we find 434 outliers in the 4184 observations.


In [199]:
# Ad hoc approach to RT outlier removal
threshold = 30

count = (trailmaking_df.task_trailmaking_numeric_time > threshold).sum()
total_n = trailmaking_df.shape[0]

print(f'NUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

count = (trailmaking_df.task_trailmaking_alphaNumeric_time > threshold).sum()

print(f'ALPHANUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')

NUMERIC
Setting a threshold of 30 removes 0 trials out of 4184 total numeric trials.
This is 0.0% of the trials.

ALPHANUMERIC
Setting a threshold of 30 removes 0 trials out of 4184 total alpha-numeric trials.
This is 0.0% of the trials.


### Stroop

[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#stroophttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#stroop)

[Scarpina & Tagini, 2017](https://www.frontiersin.org/articles/10.3389/fpsyg.2017.00557/full) on scoring in their paper The Stroop Color and Words Test.

>The Stroop Color and Word Test (SCWT) is a neuropsychological test extensively used to assess the ability to inhibit cognitive interference that occurs when the processing of a specific stimulus feature impedes the simultaneous processing of a second stimulus attribute, well-known as the Stroop Effect.

**Interpretation**
>While the SCWT is widelyused to measure the ability to inhibit cognitive interference; previous literature also reports itsapplication to measure other cognitive functions such as attention, processing speed, cognitive flexibility (Jensen and Rohwer, 1966), and working memory(Kane and Engle, 2003). Thus, it may be possible to use the SCWTto measure multiple cognitive functions.

>According to the review, the studies with Italian normativedata present different theoretical interpretations of the SCWTscores.Amato et al. (2006)andCaffarra et al. (2002)describe theSCWT score as a measure of the fronto-executive functioning,while others use it as an index of the attentional functioning(Barbarotto et al., 1998; Valgimigli et al., 2010) or of generalcognitive efficiency (Brugnolo et al., 2015). Slowing to a responseconflict would be due to a failure of selective attention or a lack inthe cognitive efficiency instead of a failure of response inhibition(Chafetz and Matthews, 2004); however, the performance inthe SCWT is not exclusively related to concentration, attentionor cognitive effectiveness, but it relies to a more specificexecutive-frontal domain. Indeed, subjects have to processselectively a specific visual feature blocking out continuouslythe automatic processing of reading (Zajano and Gorman, 1986;Shum et al., 1990), in order to solve correctly the task. The specificinvolvement of executive processes is supported by clinical data.Patients with anterior frontal lesions, and not with posteriorcerebral damages, report significant difficulties in maintaining aconsistent activation of the intended response (Valgimigli et al.,2010). Furthermore, Parkinson’s Disease patients, characterizedby executive dysfunction due to the disruption of dopaminergicpathway (Fera et al., 2007), reported difficulties in SCWT despiteunimpaired attentional abilities (Fera et al., 2007; Djamshidianet al., 2011).

#### Load Data

In [200]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'stroop_df' in globals():
    del(stroop_df)
    print('deleted existing stroop_df')

deleted existing df


In [201]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyStroopResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:00<00:00, 138.27it/s]


#### Format Data

In [202]:
# check for na dates...
df.isna().sum()

SurveyStroopResultKey        0
SurveyStepResultKey          0
SurveyResultKey              0
ParticipantIdentifier        0
StartTime                    0
EndTime                      0
ColorSelected                0
Color                        0
Text                         0
StroopStyle              85044
StartDate                    0
EndDate                      0
dtype: int64

In [203]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [204]:
# Select only subjects in correct cohort
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

# # temporarily removing spanish participant
# df = df.loc[df.ParticipantIdentifier!='35d11ffc-7034-4708-a086-cd4bd47b51fd'].reset_index(drop=True)

In [205]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 83447/83447 [00:08<00:00, 9701.26it/s] 


In [206]:
df.head(2)

Unnamed: 0,SurveyStroopResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,StartTime,EndTime,ColorSelected,Color,Text,StroopStyle,StartDate,EndDate,datetime,trial_date,time
0,89742a1e-8b53-ed11-aac1-0afb9334277d,4e742a1e-8b53-ed11-aac1-0afb9334277d,29742a1e-8b53-ed11-aac1-0afb9334277d,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,309203.635967,309204.352445,GREEN,GREEN,GREEN,,2022-10-24T07:00:26-04:00,2022-10-24T07:00:26-04:00,2022-10-24 07:00:26-04:00,2022-10-24,07:00:26
1,8f742a1e-8b53-ed11-aac1-0afb9334277d,4e742a1e-8b53-ed11-aac1-0afb9334277d,29742a1e-8b53-ed11-aac1-0afb9334277d,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,309204.855281,309205.381077,YELLOW,YELLOW,YELLOW,,2022-10-24T07:00:27-04:00,2022-10-24T07:00:27-04:00,2022-10-24 07:00:27-04:00,2022-10-24,07:00:27


In [207]:
# replace commas with dots
df['StartTime'] = df['StartTime'].replace(',', '.', regex=True)
df.StartTime = df.StartTime.astype('float')

df['EndTime'] = df['EndTime'].replace(',', '.', regex=True)
df.EndTime = df.EndTime.astype('float')

In [208]:
# create correct, congruous and time columns
df = df[['ParticipantIdentifier', 'trial_date', 'StartTime', 'EndTime', 'ColorSelected', 'Color', 'Text']]
df = df.assign(congruent=lambda x: x.Color == x.Text,
               correct=lambda x: x.Color == x.ColorSelected,
               rt=lambda x: (x.EndTime - x.StartTime)
              )
df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,StartTime,EndTime,ColorSelected,Color,Text,congruent,correct,rt
0,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,309203.635967,309204.352445,GREEN,GREEN,GREEN,True,True,0.716479
1,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,309204.855281,309205.381077,YELLOW,YELLOW,YELLOW,True,True,0.525796
2,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,309205.883879,309206.448313,BLUE,BLUE,BLUE,True,True,0.564434


In [209]:
# Add additional columns
# define function that returns a Series of all aggregations

def f(x):
    d = {}
    
    d['task_stroop_totalCorrectProp'] = x['correct'].sum()/len(x['correct'])
    d['task_stroop_congruentCorrectProp'] = len(x.loc[(x.congruent==True) & (x.correct==True)])/ (x.congruent == True).sum()
    d['task_stroop_incongruentCorrectProp'] = len(x.loc[(x.congruent==False) & (x.correct==True)])/ (x.congruent == False).sum()
    d['task_stroop_totalAvgRT'] = x['rt'].sum()/len(x['rt'])    
    d['task_stroop_congruentAvgRT'] = x.loc[x.congruent==True,'rt'].sum()/ (x.congruent == True).sum()
    d['task_stroop_incongruentAvgRT'] = x.loc[x.congruent==False,'rt'].sum()/ (x.congruent == False).sum()
    
    return pd.Series(d, index=['task_stroop_totalCorrectProp', 'task_stroop_congruentCorrectProp',
                               'task_stroop_incongruentCorrectProp', 'task_stroop_totalAvgRT',
                               'task_stroop_congruentAvgRT', 'task_stroop_incongruentAvgRT'
                              ])

# note that value_counts gives us the number of trues and falses for boolean columns
# then indexing into 0 for false and 1 for true

In [210]:
stroop_df = df.groupby(['ParticipantIdentifier', 'trial_date']).apply(f).reset_index()
stroop_df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,task_stroop_totalCorrectProp,task_stroop_congruentCorrectProp,task_stroop_incongruentCorrectProp,task_stroop_totalAvgRT,task_stroop_congruentAvgRT,task_stroop_incongruentAvgRT
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,1.0,1.0,1.0,1.566986,1.624938,1.393131
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,1.0,1.0,1.0,0.991799,0.940324,1.026115
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,1.0,1.0,1.0,0.99122,0.927535,1.109492


In [211]:
stroop_df.iloc[:,2:].mean()

task_stroop_totalCorrectProp          0.895491
task_stroop_congruentCorrectProp      0.966339
task_stroop_incongruentCorrectProp    0.822726
task_stroop_totalAvgRT                1.057124
task_stroop_congruentAvgRT            0.840355
task_stroop_incongruentAvgRT          1.245894
dtype: float64

#### EDA

In [None]:
profile = ProfileReport(stroop_df.iloc[:,2:], title=f"Stroop Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_stroop_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We had some clear RT outlier values given that the mean of the trials' average RT (even with outliers) was less than 1 second, and yet there where maxmum average RT that were multiple minutes.

**RTs**

We first used median absolute deviation to remove outliers (see [Leys et al., 2013](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

We set the flag threshold to 3.

However this would remove more than 10% of the obervations.

Therefore we decided to use an ad-hoc approach setting the maximum RT threshold to 3 seconds which removes less than 1% of trials.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

Therefore all trials with average RTs below 100 were also removed (this only removed one trial).

In [212]:
# MAD approach to RT outlier removal

# Calculate MAD for the specified columns
def mad(series):
    median_value = series.median()
    return (series - median_value).abs().median()

# MAD-based outlier detection
def detect_outliers(series, threshold):
    median_value = series.median()
    mad_value = mad(series)
    
    # Detect outliers
    outliers = ((series - median_value).abs() > threshold * mad_value)
    return outliers

threshold = 3.0

# Detect outliers in the 'task_trailmaking_numeric_time' column
x = detect_outliers(stroop_df['task_stroop_totalAvgRT'], threshold=threshold).sum()
print(f'NUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.\n')

# # Detect outliers in the 'task_trailmaking_alphaNumeric_time' column
# x = detect_outliers(trailmaking_df['task_trailmaking_alphaNumeric_time'], threshold=threshold).sum()
# print(f'ALPHANUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.')

NUMERIC:
Using a threshold of 3.0 with MAD we find 527 outliers in the 4184 observations.



In [213]:
# Ad hoc approach to RT outlier removal
threshold = 3

count = (stroop_df.task_stroop_totalAvgRT > threshold).sum()
total_n = stroop_df.shape[0]

print(f'CEILING')
print(f'Setting a threshold of {threshold}s removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

# Remove values based on these thresholds
stroop_df = stroop_df.loc[stroop_df.task_stroop_totalAvgRT <= threshold]

threshold = .1
count = (stroop_df.task_stroop_totalAvgRT < threshold).sum()

print(f'FLOOR')
print(f'Setting a threshold of {threshold}s removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')

stroop_df = stroop_df.loc[stroop_df.task_stroop_totalAvgRT > threshold]

CEILING
Setting a threshold of 3s removes 30 trials out of 4195 total numeric trials.
This is 0.7% of the trials.

FLOOR
Setting a threshold of 0.1s removes 0 trials out of 4195 total alpha-numeric trials.
This is 0.0% of the trials.


In [530]:
# Rerun EDA with clean df
profile = ProfileReport(stroop_df.iloc[:,2:], title=f"Stroop Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_stroop_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### PSAT

[RKStudio Documentation](https://rkstudio-support.careevolution.com/hc/en-us/articles/1500002352262-Paced-Serial-Addition-Test-PSAT-Active-Task-Export-Format)

#### Load Data

In [214]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'psat_df' in globals():
    del(psat_df)
    print('deleted existing psat_df')

deleted existing df


In [215]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyPSATResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:00<00:00, 428.54it/s]


#### Format Data

In [216]:
# check for na dates...
df.isna().sum()

SurveyPSATResultKey      0
SurveyStepResultKey      0
SurveyResultKey          0
ParticipantIdentifier    0
PresentationMode         0
InterStimulusInterval    0
StimulusDuration         0
Length                   0
TotalCorrect             0
TotalDyad                0
TotalTime                0
InitialDigit             0
StartDate                0
EndDate                  0
Samples                  0
dtype: int64

In [217]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [218]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [219]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

  0%|          | 0/4113 [00:00<?, ?it/s]

100%|██████████| 4113/4113 [00:00<00:00, 9685.05it/s]


In [220]:
df.head(2)

Unnamed: 0,SurveyPSATResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,PresentationMode,InterStimulusInterval,StimulusDuration,Length,TotalCorrect,TotalDyad,TotalTime,InitialDigit,StartDate,EndDate,Samples,datetime,trial_date,time
0,0440c97d-8b53-ed11-aac1-0afb9334277d,f43fc97d-8b53-ed11-aac1-0afb9334277d,ea3fc97d-8b53-ed11-aac1-0afb9334277d,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,Visual,3,1,30,30,29,36.382437,6,2022-10-24T07:03:37-04:00,2022-10-24T07:03:37-04:00,"[{""Answer"":15,""Correct"":true,""Time"":1.28411066...",2022-10-24 07:03:37-04:00,2022-10-24,07:03:37
1,7c2fccdd-8b53-ed11-aac1-0afb9334277d,6c2fccdd-8b53-ed11-aac1-0afb9334277d,602fccdd-8b53-ed11-aac1-0afb9334277d,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,Visual,3,1,30,23,18,46.628217,2,2022-10-24T07:06:10-04:00,2022-10-24T07:06:10-04:00,"[{""Answer"":6,""Correct"":true,""Time"":1.073050333...",2022-10-24 07:06:10-04:00,2022-10-24,07:06:10


---

The important data is:
- `Length` + `TotalCorrect` to determine accuracy
- `TotalTime` / `Length` to get time/trial 
    - (use this instead of `TotalTime` in case we change number of trials

In [221]:
# Replace commas with decimals (for European participants)
df['TotalTime'] = df['TotalTime'].replace(',', '.', regex=True)
df.TotalTime = df.TotalTime.astype('float')

In [222]:
# Calculate 'task_psat_accuracy' column
df['task_psat_accuracy'] = df['TotalCorrect'] / df['Length']

# Define a function to calculate 'task_psat_avgRT', 'task_psat_flag_3plusRT_n', 'task_psat_flag_sub100RT_n' and 'task_psat_varRT'
def calculate_avgRT_and_flag(samples, length):
    # Convert the string representation of the samples list into a list of dictionaries
    samples_list = json.loads(samples)
    
    # Extract all 'Time' values
    times = [item["Time"] for item in samples_list]
    
    # Count the number of 'Time' values greater than 3.0
    flag_count = sum(1 for time in times if time > 3.0)
    
    # Replace 'Time' values greater than 3.0 with 3.0
    times = [min(time, 3.0) for time in times]
    
    # Calculate the average 'Time' value
    avgRT = sum(times) / length
    
    # Flag 'Time' values less than 100ms
    min100_count = sum(1 for time in times if time < 0.1)
    
    # Calculate the variance of 'Time' values
    varRT = np.var(times)
    
    return avgRT, flag_count, varRT, min100_count

# Apply the function to the DataFrame and split the results into two new columns
df['task_psat_avgRT'], df['task_psat_flag_3plusRT_n'], df['task_psat_varRT'], df['task_psat_flag_sub100RT_n'] = zip(*df.apply(lambda row: calculate_avgRT_and_flag(row['Samples'], row['Length']), axis=1))

# keep relevant columns
psat_df = df[['ParticipantIdentifier', 'trial_date', 'task_psat_accuracy', 'task_psat_avgRT', 'task_psat_varRT', 'task_psat_flag_sub100RT_n', 'task_psat_flag_3plusRT_n']]
psat_df.head(2)


Unnamed: 0,ParticipantIdentifier,trial_date,task_psat_accuracy,task_psat_avgRT,task_psat_varRT,task_psat_flag_sub100RT_n,task_psat_flag_3plusRT_n
0,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,1.0,1.212748,0.263278,0,0
1,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,0.766667,1.554274,0.241998,0,0


#### EDA

In [None]:
profile = ProfileReport(psat_df.iloc[:,2:], title=f"PSAT Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_psat_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We had some clear RT outlier values given that the mean of the trials' average RT (even with outliers) was less than 1 second, and yet there where maxmum average RT that were multiple minutes.

Going back it appeared that there were glitches as on some trials there were response times above 3.0 seconds which should have been the in-app limit. 

I replaced any values above 3.0 with 3.0 before calculating the average RT time per trial.
I also created a column called `task_psat_flag_3plusRT_n` that indicated how many responses had an RT greater than 3.0 orginally, for each trial. For cohort 2, for example, 212 trials (out of 6418) had at least one RT greater than 3.0.

**RTs**

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

We therefore also counted up how many responses in each trial had RTs BELOW 100ms and indicated the numebr in the column `task_psat_flag_sub100RT_n`

### Tower of Hanoi

[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#towerhttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#tower)

>In the Tower of Hanoi task the user is asked to solve the classic Tower of Hanoi puzzle in a minimum number of moves. To solve the puzzle, the user must move the entire stack to the highlighted platform in as few moves as possible. This task measures the user’s problem solving skills. A Tower of Hanoi task finishes when the user completes the puzzle correctly or concedes that they cannot solve the puzzle.

>Data collected by this task is in the form of an ORKTowerOfHanoiResult object. It contains every move taken by the user and indicates whether the puzzle was successfully completed or not.

#### Load Data

In [223]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'toh_df' in globals():
    del(toh_df)
    print('deleted existing toh_df')

deleted existing df


In [224]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyTowerOfHanoiResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:00<00:00, 348.75it/s]


#### Format Data

In [225]:
# check for na dates...
df.isna().sum()

SurveyTowerOfHanoiResultKey    0
SurveyStepResultKey            0
SurveyResultKey                0
ParticipantIdentifier          0
PuzzleWasSolved                0
StartDate                      0
EndDate                        0
Moves                          0
dtype: int64

In [226]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [227]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [228]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()
    
df.head(3)

  0%|          | 0/4206 [00:00<?, ?it/s]

100%|██████████| 4206/4206 [00:00<00:00, 9675.50it/s]


Unnamed: 0,SurveyTowerOfHanoiResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,PuzzleWasSolved,StartDate,EndDate,Moves,datetime,trial_date,time
0,03111c2a-8b53-ed11-aac1-0afb9334277d,ef101c2a-8b53-ed11-aac1-0afb9334277d,dd101c2a-8b53-ed11-aac1-0afb9334277d,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,True,2022-10-24T07:00:57-04:00,2022-10-24T07:01:13-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2022-10-24 07:01:13-04:00,2022-10-24,07:01:13
1,da3ce753-8b53-ed11-aac1-0afb9334277d,ce3ce753-8b53-ed11-aac1-0afb9334277d,c53ce753-8b53-ed11-aac1-0afb9334277d,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,True,2022-10-24T07:01:31-04:00,2022-10-24T07:02:19-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2022-10-24 07:02:19-04:00,2022-10-24,07:02:19
2,ca1d92df-8c53-ed11-aac1-0afb9334277d,b41d92df-8c53-ed11-aac1-0afb9334277d,a31d92df-8c53-ed11-aac1-0afb9334277d,6b827de8-fe47-4007-aad3-202655b954e3,True,2022-10-24T07:12:32-04:00,2022-10-24T07:13:30-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2022-10-24 07:13:30-04:00,2022-10-24,07:13:30


The important data is in `Moves` where we have:
- `TapTimestamp`
- `TapIndex`
- `TapIncorrect`

I want to get the last `TapTimestamp` to calculate total timing

In [229]:
# Taps is a string of a list of dictionaries
df.Moves[0]

'[{"Timestamp":0.0,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":0.44844293594360352,"DonorTowerIndex":0,"RecipientTowerIndex":1},{"Timestamp":0.79825496673583984,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":1.2141859531402588,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":1.6481199264526367,"DonorTowerIndex":1,"RecipientTowerIndex":0},{"Timestamp":2.0084099769592285,"DonorTowerIndex":1,"RecipientTowerIndex":2},{"Timestamp":2.4056679010391235,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":2.8057299852371216,"DonorTowerIndex":0,"RecipientTowerIndex":1},{"Timestamp":3.2057470083236694,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":3.5718460083007812,"DonorTowerIndex":2,"RecipientTowerIndex":0},{"Timestamp":7.4691159725189209,"DonorTowerIndex":1,"RecipientTowerIndex":0},{"Timestamp":7.8433569669723511,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":8.24301290512085,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp"

In [230]:
# Can convert to list of dicts and then access an individual dict
data = json.loads(df.Moves[0])

# print last dict
print(data[-1])

# extract the time
data[-1]['Timestamp']

{'Timestamp': 15.945243000984192, 'DonorTowerIndex': 0, 'RecipientTowerIndex': 2}


15.945243000984192

In [231]:
# numbmer of moves
len(data)

31

In [232]:
# convert string Taps to list of dicts
df['MovesList'] = df['Moves'].apply(json.loads)

# test if any lists are empty...this means NO MOVES
drop_ix = []

for i in range(len(df.MovesList)):
    if df.MovesList[i]:
        x = df.MovesList[i][-1]['Timestamp']
    else:
        drop_ix.append(i)

In [233]:
# drop rows with empty move lists
df = df.drop(df.index[drop_ix]).reset_index(drop=True)

---

The important data is:
- `PuzzleWasSolved`
    - just to indicate completion
- Get total time required
    - `Timestamp` in last dictionary
- Get number of moves
    - 1 dict/move so get count of dicts)

In [234]:
# assign new columns
df = df.assign(task_hanoi_solved=lambda x: x.PuzzleWasSolved,
               task_hanoi_time=lambda x: x.MovesList.apply(lambda x: x[-1]['Timestamp']),
               task_hanoi_moves=[len(moves) for moves in df.MovesList] # maybe give this as a multiple on optimality (ideal = 1)?
              )

# keep relevant columns
toh_df = df[['ParticipantIdentifier', 'trial_date', 'task_hanoi_solved', 'task_hanoi_time', 'task_hanoi_moves']]

# add extra moves column
toh_df['task_hanoi_extraMoves'] = toh_df['task_hanoi_moves'] - 31
toh_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toh_df['task_hanoi_extraMoves'] = toh_df['task_hanoi_moves'] - 31


Unnamed: 0,ParticipantIdentifier,trial_date,task_hanoi_solved,task_hanoi_time,task_hanoi_moves,task_hanoi_extraMoves
0,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,True,15.945243,31,0
1,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,True,48.495506,48,17


In [235]:
toh_df.iloc[:, 2:].mean()

task_hanoi_solved         0.907474
task_hanoi_time          51.748870
task_hanoi_moves         46.648513
task_hanoi_extraMoves    15.648513
dtype: float64

In [273]:
# if the puzzle was not solved then extraMoves does not make sense
toh_df.loc[toh_df['task_hanoi_solved']==False, 'task_hanoi_extraMoves'] = np.nan

#### EDA

In [None]:
profile = ProfileReport(toh_df.iloc[:,2:], title=f"TOH Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_toh_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

**RTs**

While our mean time to solve the puzzle was about 48 seconds (without removing outliers), we have extreme values of over 12,000 seconds.

We chose to use an ad-hoc method of removing trials where we cut out any trials that took longer than 10 minutes. This only removed 13 trials (out of 6333) in cohort 2.

---

We also have values of zero (95 in cohort 2). Most of these actually indicate that the puzzle was solved, which is of course impossible.

All rows with a `task_hanoi_time` of zero are removed.

---

In [236]:
# Remove all trials with zero time
toh_df = toh_df.loc[toh_df['task_hanoi_time'] >0]

# Remove all trials with more than 10 minutes time
toh_df = toh_df.loc[toh_df['task_hanoi_time'] <=600]

# Rerun EDA
profile = ProfileReport(toh_df.iloc[:,2:], title=f"TOH Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_toh_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Reaction Time

This was **Not Working** for at least the first half of **Run 1**.

A new task has been created by RK Studio, called the **Normalized Reaction Time** task.

The data export format info is [here](https://support.mydatahelps.org/hc/en-us/articles/1500002230281-Normalized-Reaction-Time-Active-Task-Export-Format).

#### Load Data

In [237]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'rt_df' in globals():
    del(rt_df)
    print('deleted existing rt_df')

deleted existing df


In [238]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyNormalizedReactionTime')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:00<00:00, 515.34it/s]


#### Format Data

In [239]:
# check for na dates...
df.isna().sum()

SurveyNormalizedReactionTimeResultKey    0
SurveyStepResultKey                      0
SurveyResultKey                          0
ParticipantIdentifier                    0
ReactionDate                             0
StimulusStartDate                        0
TimerStartDate                           0
TimerEndDate                             0
CurrentInterval                          0
StartDate                                0
EndDate                                  0
dtype: int64

In [240]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [241]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [242]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 22917/22917 [00:02<00:00, 9414.92it/s]


In [243]:
df.head(2)

Unnamed: 0,SurveyNormalizedReactionTimeResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,ReactionDate,StimulusStartDate,TimerStartDate,TimerEndDate,CurrentInterval,StartDate,EndDate,datetime,trial_date,time
0,837212da-315c-ed11-aac1-0afb9334277d,707212da-315c-ed11-aac1-0afb9334277d,627212da-315c-ed11-aac1-0afb9334277d,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,27766.579012,27765.93828,27766.279159,27766.579085,6,2022-11-04T07:13:18-04:00,2022-11-04T07:13:18-04:00,2022-11-04 07:13:18-04:00,2022-11-04,07:13:18
1,847212da-315c-ed11-aac1-0afb9334277d,707212da-315c-ed11-aac1-0afb9334277d,627212da-315c-ed11-aac1-0afb9334277d,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,27774.257387,27773.614597,27773.940479,27774.257411,4,2022-11-04T07:13:26-04:00,2022-11-04T07:13:26-04:00,2022-11-04 07:13:26-04:00,2022-11-04,07:13:26


In [244]:
# replace commas with dots
df['ReactionDate'] = df['ReactionDate'].replace(',', '.', regex=True)
df.ReactionDate = df.ReactionDate.astype('float')

df['StimulusStartDate'] = df['StimulusStartDate'].replace(',', '.', regex=True)
df.StimulusStartDate = df.StimulusStartDate.astype('float')

In [245]:
df['task_rt_time'] = df.ReactionDate - df.StimulusStartDate

In [246]:
# keep relevant columns
rt_df = df[['ParticipantIdentifier', 'trial_date', 'task_rt_time']]
# If negative it was a missed/error trial
rt_df.loc[rt_df.task_rt_time <=0, 'task_rt_time'] = None

rt_df.head(10)

Unnamed: 0,ParticipantIdentifier,trial_date,task_rt_time
0,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.640732
1,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.64279
2,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.710502
3,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.656859
4,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.710456
5,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.744159
6,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.643986
7,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.694111
8,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.60833
9,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-11-04,0.659535


In [247]:
labels = [
    'task_rt_1',
    'task_rt_2',
    'task_rt_3',
    'task_rt_4',
]

In [248]:
oldSub = None
oldDay = None
i = 1
rt_df['label'] = None

for row in range(len(df)):
    sub = rt_df.loc[row,'ParticipantIdentifier']
    day = rt_df.loc[row, 'trial_date']
    if (sub == oldSub) & (day == oldDay):
        if i >3:
            continue
        else:
            rt_df.loc[row, 'label'] = labels[i]
            oldSub = sub
            oldDay = day
            i+=1
    else:
        rt_df.loc[row, 'label'] = labels[0]
        i = 1
        oldSub = sub
        oldDay = day
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_df['label'] = None


In [249]:
rt_df = rt_df.dropna(subset=['label'])

In [250]:
rt_df.duplicated(subset=['ParticipantIdentifier', 'trial_date', 'label']).sum()

1749

In [251]:
rt_df = rt_df.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date', 'label'], keep='last')

In [252]:
rt_df = rt_df.pivot(index=['ParticipantIdentifier', 'trial_date'], columns='label', values='task_rt_time').reset_index()

# Remove index name
rt_df = rt_df.rename_axis(None, axis=1)

rt_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_rt_1,task_rt_2,task_rt_3,task_rt_4
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-11-02,1.024102,0.47468,0.475108,0.651219
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-11-03,0.6767,0.659689,0.985539,
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-11-04,,0.551622,0.473626,0.509821
3,0501ba67-3406-4779-aff1-878a0e9f7885,2022-11-05,0.593008,0.883007,,0.599112
4,0501ba67-3406-4779-aff1-878a0e9f7885,2022-11-06,,,,0.508961


In [253]:
# Extract columns starting with "task_rt_"
task_rt_columns = [col for col in rt_df.columns if col.startswith("task_rt_")]

# Replace values in task_rt_columns that are below 0.1 with NaN
rt_df[task_rt_columns] = rt_df[task_rt_columns].applymap(lambda x: x if x >= 0.1 else float('nan'))

# Calculate 'task_rt_avgRT' column
rt_df['task_rt_avgRT'] = rt_df[task_rt_columns].mean(axis=1)

# Calculate 'task_rt_flag_plus2_n' column
rt_df['task_rt_flag_plus2_n'] = rt_df[task_rt_columns].apply(lambda row: sum(row > 2), axis=1)

#### EDA

In [None]:
profile = ProfileReport(rt_df.iloc[:,2:], title=f"RT Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_rt_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

**RTs**

We have some very long RTs (many minutes long). Clearly people are distracted for these trials. 

We leave these in but create a column called `task_rt_flag_plus2_n` that indicates how many trials had rts greater than 2 seconds.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic, so we remove those values and replace with NaN

In [571]:
# Rerun EDA filtering out flagged trials
profile = ProfileReport(rt_df.loc[rt_df['task_rt_flag_plus2_n']==0, "task_rt_1":], title=f"RT Task Run {run_num} - Cleaned | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_rt_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Spatial Span Memory


[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#spatialhttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#spatial)

> In the spatial memory task the user is asked to observe and then recall pattern sequences of increasing length in a game-like environment. The task collects data that can be used to assess visuospatial memory and executive function.

>The span (that is, the length of the pattern sequence) is automatically varied during the task, increasing after successful completion of a sequence, and decreasing after failures, in the range from minimumSpan to maximumSpan. The playSpeed property lets you control the speed of sequence playback, and the customTargetImage property lets you customize the shape of the tap target. The game finishes when either maxTests tests have been completed, or the user has made maxConsecutiveFailures errors in a row.

>The results collected are scores derived from the game, the details of the game, and the touch inputs made by the user.

#### Load Data

In [254]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'spatialSpan_df' in globals():
    del(spatialSpan_df)
    print('deleted existing spatialSpan_df')

deleted existing df


In [255]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveySpatialSpanMemoryResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 87/87 [00:00<00:00, 153.04it/s]


#### Format Data

In [256]:
# check for na dates...
df.isna().sum()

SurveySpatialSpanMemoryResultKey    0
SurveyStepResultKey                 0
SurveyResultKey                     0
ParticipantIdentifier               0
Score                               0
NumberOfGames                       0
NumberOfFailures                    0
StartDate                           0
EndDate                             0
GameRecords                         0
dtype: int64

In [257]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [258]:
# Select correct subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [259]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

  0%|          | 0/4117 [00:00<?, ?it/s]

100%|██████████| 4117/4117 [00:00<00:00, 8960.83it/s]


In [260]:
df.head(2)

Unnamed: 0,SurveySpatialSpanMemoryResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,Score,NumberOfGames,NumberOfFailures,StartDate,EndDate,GameRecords,datetime,trial_date,time
0,4c40ca9b-8b53-ed11-aac1-0afb9334277d,4040ca9b-8b53-ed11-aac1-0afb9334277d,3740ca9b-8b53-ed11-aac1-0afb9334277d,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,330,6,1,2022-10-24T07:03:32-04:00,2022-10-24T07:04:19-04:00,"[{""Seed"":1595374595,""Sequence"":[2,6,8],""GameSi...",2022-10-24 07:04:19-04:00,2022-10-24,07:04:19
1,999cc2a1-8b53-ed11-aac1-0afb9334277d,8c9cc2a1-8b53-ed11-aac1-0afb9334277d,829cc2a1-8b53-ed11-aac1-0afb9334277d,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,385,6,1,2022-10-24T07:03:42-04:00,2022-10-24T07:04:31-04:00,"[{""Seed"":0,""Sequence"":[6,2,1],""GameSize"":9,""Ga...",2022-10-24 07:04:31-04:00,2022-10-24,07:04:31


---

To capture performance we are using:
- `Score` 

**NB** | Might be worth checking out exactly how this is calculated, but for our purposes it seems to be a good proxy of how well you actually do on the task (e.g. you get a better score if you fail on the last attempt (to get 8 in a row) then if you fail on the second attempt and only make it to 6 in a row...)

In [261]:
# assign new column with accuracy value
df = df.assign(task_spatialSpan_score=lambda x: x.Score,
               task_spatialSpan_tod=lambda x: x.time
              )

# keep relevant columns
spatialSpan_df = df[['ParticipantIdentifier', 'trial_date', 'task_spatialSpan_score', 'task_spatialSpan_tod']]
spatialSpan_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_spatialSpan_score,task_spatialSpan_tod
0,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,330,07:04:19
1,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,385,07:04:31


#### EDA

In [None]:
profile = ProfileReport(spatialSpan_df.iloc[:,2:], title=f"Spatial Span Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_spatialSpan_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

No cleaning necessary

### Join Tasks

In [263]:
df_complete = df_complete.merge(trailmaking_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(stroop_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(psat_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(toh_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(rt_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(spatialSpan_df, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [278]:
# A lot of extra rows added...removing these
# Note that I have not looked closely into WHY there are the extra rows
df_complete = df_complete.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date'], keep='last')

In [279]:
df_complete.head()

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,task_rt_3,task_rt_4,task_rt_avgRT,task_rt_flag_plus2_n,task_spatialSpan_score,task_spatialSpan_tod,task_hanoi_solved,task_hanoi_time,task_hanoi_moves,task_hanoi_extraMoves
0,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-27,3.0,3.0,1.0,5.0,5.0,2.0,4.0,5.0,...,,,,,280.0,09:09:20,False,18.734397,4.0,
1,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-28,1.0,4.0,1.0,5.0,1.0,3.0,1.0,5.0,...,,,,,315.0,11:05:24,False,25.741186,6.0,
2,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-29,,,,,,,,,...,,,,,355.0,09:27:19,False,1.858064,2.0,
3,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-30,,,,,,,,,...,,,,,315.0,19:09:25,False,2.050316,2.0,
4,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-10-01,4.0,4.0,3.0,4.0,1.0,3.0,4.0,1.0,...,,,,,380.0,00:22:08,False,68.897515,13.0,


# Passive Data

## Health Kit

### Load Data

In [270]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

deleted existing df


In [271]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('HealthKitSamples')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

  df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(path + day + '/' + file)
  temp_df = pd.read_csv(p

### Format Data

In [272]:
df.head(3)

Unnamed: 0,HealthKitSampleKey,ParticipantIdentifier,StartDate,Date,Type,Value,Units,SourceIdentifier,SourceName,SourceVersion,DeviceName,DeviceModel,DeviceManufacturer,DeviceHardwareVersion,DeviceSoftwareVersion,DeviceFirmwareVersion,DeviceFDAIdentifier,DeviceLocalIdentifier,Metadata,InsertedDate
0,0c41e753-8b53-ed11-aac1-0afb9334277d,6e86c427-c679-4acd-aeb4-19ada9022b66,2022-10-24T06:12:01-04:00,2022-10-24T06:12:01-04:00,HeartRate,71,count/min,com.apple.health.B7883FAE-4E44-4B37-B142-2D969...,Tamera’s Apple Watch,9.0.2,Apple Watch,Watch,Apple Inc.,"Watch5,9",9.0.2,,,,"{""HKMetadataKeyHeartRateMotionContext"":""0""}",2022-10-24T11:02:29Z
1,0b41e753-8b53-ed11-aac1-0afb9334277d,6e86c427-c679-4acd-aeb4-19ada9022b66,2022-10-24T06:08:16-04:00,2022-10-24T06:08:16-04:00,HeartRate,71,count/min,com.apple.health.B7883FAE-4E44-4B37-B142-2D969...,Tamera’s Apple Watch,9.0.2,Apple Watch,Watch,Apple Inc.,"Watch5,9",9.0.2,,,,"{""HKMetadataKeyHeartRateMotionContext"":""0""}",2022-10-24T11:02:29Z
2,0a41e753-8b53-ed11-aac1-0afb9334277d,6e86c427-c679-4acd-aeb4-19ada9022b66,2022-10-24T06:04:35-04:00,2022-10-24T06:04:35-04:00,HeartRate,72,count/min,com.apple.health.B7883FAE-4E44-4B37-B142-2D969...,Tamera’s Apple Watch,9.0.2,Apple Watch,Watch,Apple Inc.,"Watch5,9",9.0.2,,,,"{""HKMetadataKeyHeartRateMotionContext"":""0""}",2022-10-24T11:02:29Z


In [273]:
df.columns

Index(['HealthKitSampleKey', 'ParticipantIdentifier', 'StartDate', 'Date',
       'Type', 'Value', 'Units', 'SourceIdentifier', 'SourceName',
       'SourceVersion', 'DeviceName', 'DeviceModel', 'DeviceManufacturer',
       'DeviceHardwareVersion', 'DeviceSoftwareVersion',
       'DeviceFirmwareVersion', 'DeviceFDAIdentifier', 'DeviceLocalIdentifier',
       'Metadata', 'InsertedDate'],
      dtype='object')

In [274]:
# rename Date to EndDate
df.rename(columns={'Date': 'EndDate'}, inplace=True)

In [275]:
keep_cols = [
    'ParticipantIdentifier', 
    'StartDate',
    'EndDate',
    'Type', 
    'Value', 
    'Units',       
    'SourceVersion', 
    'DeviceName', 
    'DeviceModel'
]

In [276]:
df = df[keep_cols]

In [277]:
# check for na dates...
df.isna().sum()

ParticipantIdentifier         0
StartDate                     0
EndDate                       0
Type                          0
Value                         0
Units                    110826
SourceVersion              6653
DeviceName               217698
DeviceModel              237851
dtype: int64

In [278]:
# Select appropriate subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [279]:
df.shape

(7560564, 9)

In [280]:
%%time

# vectorized version

# datetime col based on EndDate
def foo(x):
    try:
        return parser.parse(x)
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
df['datetime'] = v(df.StartDate)

# trial_date col, based on sleep before 6pm...
def foo(x):
    try:
        return (x + datetime.timedelta(hours = -4)).date()
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
df['trial_date'] = v(df.datetime)

# time col
def foo(x):
    try:
        return x.time()
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
df['time'] = v(df.datetime)

CPU times: user 2min 51s, sys: 246 ms, total: 2min 51s
Wall time: 2min 52s


In [281]:
np.unique(df.Type)

array(['ActiveEnergyBurned', 'AppleStandTime', 'AudioExposureEvent',
       'DailySteps', 'DistanceCycling', 'DistanceWalkingRunning',
       'EnvironmentalAudioExposure', 'HandwashingEvent',
       'HeadphoneAudioExposure', 'HeartRate', 'HeartRateVariability',
       'MindfulSession', 'RestingEnergyBurned', 'RestingHeartRate',
       'SleepAnalysisInterval', 'StandHourInterval', 'Steps',
       'WalkingHeartRateAverage', 'WalkingSpeed'], dtype=object)

In [282]:
df['trial_date'][0]

datetime.date(2022, 10, 24)

### Activity

We will take:

- `ActiveEnergyBurned`
- `RestingEnergyBurned`
- `DistanceWalkingRunning`
- `DistanceCycling`
- `AppleStandTime`
- `WalkingSpeed`
- `Steps`

---

**Note**: for some of these it makes sense to take the daily sum (e.g. `AppleStandTime`) while others make more sense as a daily average (e.g. `WalkingSpeed`)

In [283]:
type_vals_mean = ['WalkingSpeed']

type_vals_sum = ['ActiveEnergyBurned', 'RestingEnergyBurned', 'DistanceWalkingRunning',
                  'DistanceCycling', 'AppleStandTime', 'Steps']

df_activity_mean = df.loc[df.Type.isin(type_vals_mean)].reset_index(drop=True)
df_activity_sum = df.loc[df.Type.isin(type_vals_sum)].reset_index(drop=True)

df_activity_sum.head(3)

Unnamed: 0,ParticipantIdentifier,StartDate,EndDate,Type,Value,Units,SourceVersion,DeviceName,DeviceModel,datetime,trial_date,time
0,fed3a16b-c5a3-4a84-a111-32b7574e04b5,2022-10-24T06:01:08-04:00,2022-10-24T06:01:18-04:00,DistanceWalkingRunning,4.929999999993015,m,9.0.2,Apple Watch,Watch,2022-10-24 06:01:08-04:00,2022-10-24,06:01:08
1,fed3a16b-c5a3-4a84-a111-32b7574e04b5,2022-10-24T06:04:55-04:00,2022-10-24T06:05:44-04:00,DistanceWalkingRunning,18.27200002671452,m,9.0.2,Apple Watch,Watch,2022-10-24 06:04:55-04:00,2022-10-24,06:04:55
2,fed3a16b-c5a3-4a84-a111-32b7574e04b5,2022-10-24T06:06:04-04:00,2022-10-24T06:06:17-04:00,DistanceWalkingRunning,20.73000000001048,m,9.0.2,Apple Watch,Watch,2022-10-24 06:06:04-04:00,2022-10-24,06:06:04


In [284]:
# cast Value to numeric
df_activity_mean.Value = pd.to_numeric(df_activity_mean.Value)
df_activity_sum.Value = pd.to_numeric(df_activity_sum.Value)

# Calculate sums and means
df_activity_mean = pd.DataFrame(df_activity_mean.groupby(['trial_date','ParticipantIdentifier', 'Type'])['Value'].mean()).reset_index()
df_activity_sum = pd.DataFrame(df_activity_sum.groupby(['trial_date','ParticipantIdentifier', 'Type'])['Value'].sum()).reset_index()

# convert from long to wide
df_activity_mean = df_activity_mean.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                                                columns='Type', 
                                                values='Value').reset_index()
df_activity_sum = df_activity_sum.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                                              columns='Type', 
                                              values='Value').reset_index()


In [285]:
# Get rid of index name (set to "Type")
df_activity_mean.columns.name = None
df_activity_sum.columns.name = None

# Rename columns
df_activity_mean.rename(columns={'WalkingSpeed': 'passive_hk_activity_averageWalkingSpeed'
                                }, inplace=True)

df_activity_sum.rename(columns={'ActiveEnergyBurned': 'passive_hk_activity_activeEnergyBurned',
                                'RestingEnergyBurned': 'passive_hk_activity_restingEnergyBurned',
                                'Steps': 'passive_hk_activity_steps',
                                'DistanceWalkingRunning': 'passive_hk_activity_distanceWalkingRunning',
                                'DistanceCycling': 'passive_hk_activity_distanceCycling',
                                'AppleStandTime': 'passive_hk_activity_appleStandTime'
                       }, inplace=True)

In [286]:
df_activity_sum.head(3)

Unnamed: 0,trial_date,ParticipantIdentifier,passive_hk_activity_activeEnergyBurned,passive_hk_activity_appleStandTime,passive_hk_activity_distanceCycling,passive_hk_activity_distanceWalkingRunning,passive_hk_activity_restingEnergyBurned,passive_hk_activity_steps
0,2022-09-08,b032c7ad-4287-4c23-8078-f75d3da262b8,,,,346.66,,545.0
1,2022-09-09,147400db-43d9-4155-8bf2-b85b8adf4315,194.443,71.0,,4002.801667,948.472,6043.0
2,2022-09-09,25ca39d7-4279-48fd-903f-d0927adadb77,8.03,1.0,,51.543363,784.065,61.0


In [287]:
# join
df_activity = df_activity_sum.merge(df_activity_mean, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_activity.head(3)

Unnamed: 0,trial_date,ParticipantIdentifier,passive_hk_activity_activeEnergyBurned,passive_hk_activity_appleStandTime,passive_hk_activity_distanceCycling,passive_hk_activity_distanceWalkingRunning,passive_hk_activity_restingEnergyBurned,passive_hk_activity_steps,passive_hk_activity_averageWalkingSpeed
0,2022-09-08,b032c7ad-4287-4c23-8078-f75d3da262b8,,,,346.66,,545.0,0.895
1,2022-09-09,147400db-43d9-4155-8bf2-b85b8adf4315,194.443,71.0,,4002.801667,948.472,6043.0,1.009
2,2022-09-09,25ca39d7-4279-48fd-903f-d0927adadb77,8.03,1.0,,51.543363,784.065,61.0,


#### EDA

In [None]:
profile = ProfileReport(df_activity.iloc[:,2:], title=f"HK Activity Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"passive_hkActivity_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

No cleaning necessary

### Heart Rate

We will take:

- `RestingHeartRate`
- `WalkingHeartRateAverage`
- `HeartRateVariability`

Should we consider max heart rate?

In [288]:
type_vals = ['RestingHeartRate', 'WalkingHeartRateAverage', 'HeartRateVariability']

df_hr = df.loc[df.Type.isin(type_vals)].reset_index(drop=True)
df_hr.head(3)

Unnamed: 0,ParticipantIdentifier,StartDate,EndDate,Type,Value,Units,SourceVersion,DeviceName,DeviceModel,datetime,trial_date,time
0,0ca43379-41b5-47fb-90ba-0a22e6bf5586,2022-10-24T00:03:22-04:00,2022-10-24T10:51:30-04:00,RestingHeartRate,73,count/min,8.5.1,,,2022-10-24 00:03:22-04:00,2022-10-23,00:03:22
1,ff129772-aeab-4432-8136-8f94027b8504,2022-10-24T09:36:36-04:00,2022-10-24T11:07:55-04:00,RestingHeartRate,82,count/min,8.5.1,,,2022-10-24 09:36:36-04:00,2022-10-24,09:36:36
2,d11241a0-932e-4931-83ee-f3d28f66875f,2022-10-24T00:02:28-04:00,2022-10-24T10:35:58-04:00,RestingHeartRate,76,count/min,9.0.2,,,2022-10-24 00:02:28-04:00,2022-10-23,00:02:28


In [289]:
# cast Value to numeric
df_hr.Value = pd.to_numeric(df_hr.Value)

# Get mean for each type for each participant for each day
df_hr = pd.DataFrame(df_hr.groupby(['trial_date','ParticipantIdentifier', 'Type'])['Value'].mean()).reset_index()

# convert from long to wide
df_hr = df_hr.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                          columns='Type', 
                          values='Value').reset_index()

In [290]:
# Get rid of index name (set to "Type")
df_hr.columns.name = None

# Rename columns
df_hr.rename(columns={'HeartRateVariability': 'passive_hk_hr_variability',
                      'RestingHeartRate' : 'passive_hk_hr_resting',
                      'WalkingHeartRateAverage':'passive_hk_hr_averageWalking'
                     }, inplace=True)

df_hr.head(3)

Unnamed: 0,trial_date,ParticipantIdentifier,passive_hk_hr_variability,passive_hk_hr_resting,passive_hk_hr_averageWalking
0,2022-09-09,147400db-43d9-4155-8bf2-b85b8adf4315,27.943125,,
1,2022-09-10,147400db-43d9-4155-8bf2-b85b8adf4315,32.898518,71.0,95.0
2,2022-09-11,147400db-43d9-4155-8bf2-b85b8adf4315,31.765743,68.0,88.5


#### EDA

In [None]:
profile = ProfileReport(df_hr.iloc[:,2:], title=f"HK Heart Rate Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"passive_hkHeart_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

No cleaning necessary

### Sleep

Apple `HKCategoryValueSleepAnalysis` [documentation](https://developer.apple.com/documentation/healthkit/hkcategoryvaluesleepanalysis)

- `awake`: The user is awake.
- `inBed`: The user is in bed.
- `asleepCore`: The user is in light or intermediate sleep.
> This value corresponds to stage N2 of the American Academy of Sleep Medicine’s scoring model. Stage N2 is also referred to as light or intermediate sleep. It accounts for a major part of the time spent asleep. It also includes stage N1, which makes up only a small portion of the night.
- `asleepDeep`: The user is in deep sleep.
> This value corresponds to stage N3 of the American Academy of Sleep Medicine’s scoring model.
- `asleepREM`: The user is in REM sleep.
> This value corresponds to the rapid eye movement (REM) stage of the American Academy of Sleep Medicine’s scoring model.

---

Note that final dataframe is in **minutes**

#### Format Data

In [291]:
type_vals = ['SleepAnalysisInterval']

sleep_df = df.loc[df.Type.isin(type_vals)].reset_index(drop=True)
sleep_df.head(3)

Unnamed: 0,ParticipantIdentifier,StartDate,EndDate,Type,Value,Units,SourceVersion,DeviceName,DeviceModel,datetime,trial_date,time
0,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,2022-10-23T22:21:00-04:00,2022-10-23T22:31:55-04:00,SleepAnalysisInterval,InBed,,16.0.2,,,2022-10-23 22:21:00-04:00,2022-10-23,22:21:00
1,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,2022-10-23T22:35:33-04:00,2022-10-23T22:39:29-04:00,SleepAnalysisInterval,InBed,,16.0.2,,,2022-10-23 22:35:33-04:00,2022-10-23,22:35:33
2,2db37fd7-2694-4b46-80f9-9f4b58d57bfa,2022-10-23T22:43:53-04:00,2022-10-23T22:45:53-04:00,SleepAnalysisInterval,InBed,,16.0.2,,,2022-10-23 22:43:53-04:00,2022-10-23,22:43:53


---
Sleep is a bit tricky as:
1. Sleeping often bridges two days
2. There are people that sleep in late and/or nap

To try and deal with this we label all sleep that **starts** before 6pm as belonging to the **previous day**

In [292]:
# sleep date col
def foo(x):
    try:
        return (x + datetime.timedelta(hours = -18)).date()
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
sleep_df['trial_date'] = v(sleep_df.datetime)

---
Now we want to calculate the duration of each `InBed` and `Asleep` `Value`...


In [293]:
sleep_df['duration'] = [(datetime.datetime.fromisoformat(finish) - 
                         datetime.datetime.fromisoformat(start)).seconds/60
                        for start,finish in zip(sleep_df['StartDate'],sleep_df['EndDate'])]


In [294]:
# Get sum for each value (InBed, Asleep) for each participant for each SLEEP day
sleep_df = pd.DataFrame(sleep_df.groupby(['trial_date','ParticipantIdentifier', 'Value'])['duration'].sum()).reset_index()

# convert from long to wide
sleep_df = sleep_df.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                    columns='Value', 
                    values='duration').reset_index()

---
`SleepDay` was essentially a proxy for `StudyDay` but with additional hours (until 18h00) to account for long sleepers and naps.

So we will change it back to `SleepDay` to join later.

In [295]:
sleep_df.columns

Index(['trial_date', 'ParticipantIdentifier', 'Asleep', 'AsleepCore',
       'AsleepDeep', 'AsleepREM', 'Awake', 'InBed'],
      dtype='object', name='Value')

In [296]:
# Get rid of index name (set to "Value")
sleep_df.columns.name = None

# Rename columns
sleep_df.rename(columns={'Asleep': 'passive_hk_sleep_asleep',
                         'InBed' : 'passive_hk_sleep_inBed',
                         'AsleepCore': 'passive_hk_sleep_core',
                         'AsleepDeep': 'passive_hk_sleep_deep',
                         'AsleepREM': 'passive_hk_sleep_REM',
                         'Awake': 'passive_hk_sleep_awake'
                         
                  }, inplace=True)

sleep_df.head(3)

Unnamed: 0,trial_date,ParticipantIdentifier,passive_hk_sleep_asleep,passive_hk_sleep_core,passive_hk_sleep_deep,passive_hk_sleep_REM,passive_hk_sleep_awake,passive_hk_sleep_inBed
0,2022-09-08,b032c7ad-4287-4c23-8078-f75d3da262b8,,,,,,225.05
1,2022-09-09,b032c7ad-4287-4c23-8078-f75d3da262b8,,,,,,430.266667
2,2022-09-10,b032c7ad-4287-4c23-8078-f75d3da262b8,,,,,,438.116667


#### EDA

In [None]:
profile = ProfileReport(sleep_df.iloc[:,2:], title=f"HK Sleep Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"passive_hkSleep_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Clean

We have issues with a number of values.

---
`passive_hk_sleep_asleep` has a max of 6449 minutes, which would be over 100 hours (in run 2), so clearly not possible.

We did minimal cleaning on this, just removing entries where the amount of sleep time exceeded 24 hours.
- this is 35 entries in run 2

---
`passive_hk_sleep_inBed` also has a max of over 100 hours so we again remove entries over 24 hours.
- this is 50 entries in run 2


In [297]:
# remove values above 24 hours
sleep_df = sleep_df.loc[~(sleep_df.passive_hk_sleep_asleep > (24 * 60))]
sleep_df = sleep_df.loc[~(sleep_df.passive_hk_sleep_inBed > (24 * 60))]

# Rerun EDA
profile = ProfileReport(sleep_df.iloc[:,2:], title=f"HK Sleep Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"passive_hkSleep_run{run_num}_clean.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Join HK

In [298]:
import utils
reload(utils)

<module 'utils' from '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_3_data_analysis/01_data munge/utils.py'>

In [301]:
df_complete.shape

(7820, 214)

In [300]:
df_complete = df_complete.merge(df_activity, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(df_hr, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete = df_complete.merge(sleep_df, how='left', on=['ParticipantIdentifier', 'trial_date'])
df_complete.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,passive_hk_activity_averageWalkingSpeed,passive_hk_hr_variability,passive_hk_hr_resting,passive_hk_hr_averageWalking,passive_hk_sleep_asleep,passive_hk_sleep_core,passive_hk_sleep_deep,passive_hk_sleep_REM,passive_hk_sleep_awake,passive_hk_sleep_inBed
0,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-27,3.0,3.0,1.0,5.0,5.0,2.0,4.0,5.0,...,1.272807,,,,,,,,,
1,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-28,1.0,4.0,1.0,5.0,1.0,3.0,1.0,5.0,...,1.269057,,,,,,,,,
2,d92cdaf1-cc3e-4408-bcdf-7be602cc5c3b,2022-09-29,,,,,,,,,...,1.297391,,,,,,,,,


## Sensor Kit

Note that **SensorKit** processing is done in a separate notebook.

Here we aggregate the data by day

### Device Usage

In [302]:
sk_device = pd.read_csv(save_path + 'run' + str(run_num) + '_sk_deviceUsage.csv')

In [303]:
sk_device.head(2)

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_device_time,passive_sk_device_duration,passive_sk_device_total_unlocks,passive_sk_device_total_screen_wakes,passive_sk_device_total_unlock_duration,passive_sk_device_app_usage_books,passive_sk_device_app_usage_business,...,passive_sk_device_web_usage_productivity,passive_sk_device_web_usage_reference,passive_sk_device_web_usage_shopping,passive_sk_device_web_usage_social_networking,passive_sk_device_web_usage_sports,passive_sk_device_web_usage_stickers,passive_sk_device_web_usage_travel,passive_sk_device_web_usage_utilities,passive_sk_device_web_usage_weather,passive_sk_device_time_diff
0,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-29 13:45:00+01:00,2022-09-29,13:45:00,900,5,4,423.0,,,...,,,,,,,,,,
1,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-29 13:59:59+01:00,2022-09-29,13:59:59,900,3,6,65.0,,,...,,,,,,,,,,899.0


#### Aggregate by Day

In [304]:
sk_device = pd.DataFrame(sk_device.groupby(['ParticipantIdentifier', 'trial_date']).sum().loc[:, 'passive_sk_device_total_unlocks':'passive_sk_device_web_usage_weather']).reset_index()
sk_device.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,passive_sk_device_total_unlocks,passive_sk_device_total_screen_wakes,passive_sk_device_total_unlock_duration,passive_sk_device_app_usage_books,passive_sk_device_app_usage_business,passive_sk_device_app_usage_catalogs,passive_sk_device_app_usage_developer_tools,passive_sk_device_app_usage_education,...,passive_sk_device_web_usage_photo_and_video,passive_sk_device_web_usage_productivity,passive_sk_device_web_usage_reference,passive_sk_device_web_usage_shopping,passive_sk_device_web_usage_social_networking,passive_sk_device_web_usage_sports,passive_sk_device_web_usage_stickers,passive_sk_device_web_usage_travel,passive_sk_device_web_usage_utilities,passive_sk_device_web_usage_weather
0,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-29,94,154,6204.0,58.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-30,137,301,13606.0,139.0,53.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0


#### Join

In [305]:
df_complete = df_complete.merge(sk_device, how='left', on=['ParticipantIdentifier', 'trial_date'])

### Keyboard Data

In [306]:
sk_keyboard = pd.read_csv(save_path + 'run' + str(run_num) + '_sk_keyboard.csv')

In [307]:
sk_keyboard.columns

Index(['ParticipantIdentifier', 'date_time', 'trial_date',
       'passive_sk_keyboard_time', 'passive_sk_keyboard_total_words',
       'passive_sk_keyboard_total_autocorrections',
       'passive_sk_keyboard_total_typing_duration',
       'passive_sk_keyboard_total_emojis', 'passive_sk_keyboard_total_deletes',
       'passive_sk_keyboard_typing_speed', 'passive_sk_keyboard_total_pauses',
       'passive_sk_keyboard_total_typing_episodes',
       'passive_sk_keyboard_sentiment_emoji_absolutist',
       'passive_sk_keyboard_sentiment_emoji_down',
       'passive_sk_keyboard_sentiment_emoji_death',
       'passive_sk_keyboard_sentiment_emoji_anxiety',
       'passive_sk_keyboard_sentiment_emoji_anger',
       'passive_sk_keyboard_sentiment_emoji_health',
       'passive_sk_keyboard_sentiment_emoji_positive',
       'passive_sk_keyboard_sentiment_emoji_sad',
       'passive_sk_keyboard_sentiment_emoji_lowEnergy',
       'passive_sk_keyboard_sentiment_emoji_confused',
       'passive_sk_ke

In [308]:
sk_keyboard.head(2)

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_keyboard_time,passive_sk_keyboard_total_words,passive_sk_keyboard_total_autocorrections,passive_sk_keyboard_total_typing_duration,passive_sk_keyboard_total_emojis,passive_sk_keyboard_total_deletes,passive_sk_keyboard_typing_speed,...,passive_sk_keyboard_sentiment_word_absolutist,passive_sk_keyboard_sentiment_word_down,passive_sk_keyboard_sentiment_word_death,passive_sk_keyboard_sentiment_word_anxiety,passive_sk_keyboard_sentiment_word_anger,passive_sk_keyboard_sentiment_word_health,passive_sk_keyboard_sentiment_word_positive,passive_sk_keyboard_sentiment_word_sad,passive_sk_keyboard_sentiment_word_lowEnergy,passive_sk_keyboard_sentiment_word_confused
0,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-26 13:36:21-04:00,2022-09-26,13:36:21,28,0,51.0,8,16,3.106007,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-26 14:40:47-04:00,2022-09-26,14:40:47,26,0,27.0,3,15,4.629528,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Aggregate by Day

In [309]:
# Columns to sum (all columns starting from 'passive_sk_keyboard_total_words' except for 'passive_sk_keyboard_typing_speed')
columns_to_sum = [col for col in sk_keyboard.columns if col.startswith('passive_sk_') and col != 'passive_sk_keyboard_time' and col != 'passive_sk_keyboard_typing_speed']

# Creating a dictionary for aggregation
agg_dict = {col: 'sum' for col in columns_to_sum}
agg_dict['passive_sk_keyboard_typing_speed'] = 'mean'

# Apply the groupby with aggregation
sk_keyboard = pd.DataFrame(sk_keyboard.groupby(['trial_date', 'ParticipantIdentifier']).agg(agg_dict)).reset_index()
sk_keyboard.head()

Unnamed: 0,trial_date,ParticipantIdentifier,passive_sk_keyboard_total_words,passive_sk_keyboard_total_autocorrections,passive_sk_keyboard_total_typing_duration,passive_sk_keyboard_total_emojis,passive_sk_keyboard_total_deletes,passive_sk_keyboard_total_pauses,passive_sk_keyboard_total_typing_episodes,passive_sk_keyboard_sentiment_emoji_absolutist,...,passive_sk_keyboard_sentiment_word_down,passive_sk_keyboard_sentiment_word_death,passive_sk_keyboard_sentiment_word_anxiety,passive_sk_keyboard_sentiment_word_anger,passive_sk_keyboard_sentiment_word_health,passive_sk_keyboard_sentiment_word_positive,passive_sk_keyboard_sentiment_word_sad,passive_sk_keyboard_sentiment_word_lowEnergy,passive_sk_keyboard_sentiment_word_confused,passive_sk_keyboard_typing_speed
0,2022-09-23,099765a5-a9c9-4fff-b297-a39eab517267,357,0,374.019829,11,189,38.0,61.0,0.0,...,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,4.9162
1,2022-09-23,3b692f4d-2f21-4869-b87f-c134c918e18f,176,13,127.0,7,62,0.0,28.0,0.0,...,0.0,0.0,0.0,3.0,2.0,5.0,0.0,0.0,0.0,6.514057
2,2022-09-24,099765a5-a9c9-4fff-b297-a39eab517267,2298,0,2281.574644,29,907,203.0,237.0,0.0,...,6.0,0.0,3.0,1.0,5.0,11.0,0.0,0.0,0.0,5.02594
3,2022-09-24,156895d9-0f71-4844-92dd-3fb298f84f2b,61,16,118.0,0,46,12.0,43.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.406593
4,2022-09-24,1b9b62f1-095b-4819-92a0-ea8e7abee884,33,3,64.0,1,32,11.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.001927


#### Join

In [310]:
df_complete = df_complete.merge(sk_keyboard, how='left', on=['ParticipantIdentifier', 'trial_date'])

### Visits

In [311]:
sk_visits = pd.read_csv(save_path + 'run' + str(run_num) + '_sk_visits.csv')

In [312]:
sk_visits.head(2)

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_visits_time,passive_sk_visits_gym_startTime,passive_sk_visits_gym_endTime,passive_sk_visits_gym_elapsed,passive_sk_visits_gym_distance,passive_sk_visits_home_startTime,passive_sk_visits_home_endTime,...,passive_sk_visits_school_distance,passive_sk_visits_unknown_startTime,passive_sk_visits_unknown_endTime,passive_sk_visits_unknown_elapsed,passive_sk_visits_unknown_distance,passive_sk_visits_work_startTime,passive_sk_visits_work_endTime,passive_sk_visits_work_elapsed,passive_sk_visits_work_distance,passive_sk_visits_unknown_distance_flag
0,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-21 21:00:00-04:00,2022-09-21,21:00:00,,,,,2022-09-21 21:00:00-04:00,2022-09-21 23:59:59-04:00,...,,,,,,,,,,False
1,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-22 19:30:00-04:00,2022-09-22,19:30:00,,,,,2022-09-22 19:30:00-04:00,2022-09-22 23:59:59-04:00,...,,,,,,,,,,False


#### Aggregate by Day

In [313]:
sk_visits.columns

Index(['ParticipantIdentifier', 'date_time', 'trial_date',
       'passive_sk_visits_time', 'passive_sk_visits_gym_startTime',
       'passive_sk_visits_gym_endTime', 'passive_sk_visits_gym_elapsed',
       'passive_sk_visits_gym_distance', 'passive_sk_visits_home_startTime',
       'passive_sk_visits_home_endTime', 'passive_sk_visits_home_elapsed',
       'passive_sk_visits_home_distance', 'passive_sk_visits_school_startTime',
       'passive_sk_visits_school_endTime', 'passive_sk_visits_school_elapsed',
       'passive_sk_visits_school_distance',
       'passive_sk_visits_unknown_startTime',
       'passive_sk_visits_unknown_endTime',
       'passive_sk_visits_unknown_elapsed',
       'passive_sk_visits_unknown_distance',
       'passive_sk_visits_work_startTime', 'passive_sk_visits_work_endTime',
       'passive_sk_visits_work_elapsed', 'passive_sk_visits_work_distance',
       'passive_sk_visits_unknown_distance_flag'],
      dtype='object')

In [314]:
sk_visits.head()

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_visits_time,passive_sk_visits_gym_startTime,passive_sk_visits_gym_endTime,passive_sk_visits_gym_elapsed,passive_sk_visits_gym_distance,passive_sk_visits_home_startTime,passive_sk_visits_home_endTime,...,passive_sk_visits_school_distance,passive_sk_visits_unknown_startTime,passive_sk_visits_unknown_endTime,passive_sk_visits_unknown_elapsed,passive_sk_visits_unknown_distance,passive_sk_visits_work_startTime,passive_sk_visits_work_endTime,passive_sk_visits_work_elapsed,passive_sk_visits_work_distance,passive_sk_visits_unknown_distance_flag
0,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-21 21:00:00-04:00,2022-09-21,21:00:00,,,,,2022-09-21 21:00:00-04:00,2022-09-21 23:59:59-04:00,...,,,,,,,,,,False
1,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-22 19:30:00-04:00,2022-09-22,19:30:00,,,,,2022-09-22 19:30:00-04:00,2022-09-22 23:59:59-04:00,...,,,,,,,,,,False
2,14b58072-ae3b-491e-a8ca-207f0d27ccf6,2022-09-24 20:00:00-04:00,2022-09-24,20:00:00,,,,,2022-09-24 20:00:00-04:00,2022-09-24 23:59:59-04:00,...,,,,,,,,,,False
3,14b58072-ae3b-491e-a8ca-207f0d27ccf6,2022-09-25 20:45:00-04:00,2022-09-25,20:45:00,,,,,,,...,,2022-09-25 20:45:00-04:00,2022-09-25 21:00:00-04:00,0.25,3809.591619,,,,,False
4,14b58072-ae3b-491e-a8ca-207f0d27ccf6,2022-09-25 16:00:00-04:00,2022-09-25,16:00:00,,,,,,,...,,2022-09-25 16:00:00-04:00,2022-09-25 20:30:00-04:00,4.5,6780.37739,,,,,False


In [315]:
# we lose a fair bit of information when aggregating since all the start and end times have to go and we no longer know how many "unknown" locations may have been visited

# Split the column names based on their endings
distance_columns = [col for col in sk_visits.columns if col.endswith('_distance')]
elapsed_columns = [col for col in sk_visits.columns if col.endswith('_elapsed')]
flag_columns = [col for col in sk_visits.columns if col.endswith('_flag')]
unknown_start_time_column = 'passive_sk_visits_unknown_startTime' # This column exists as verified

# Aggregate the dataframe
sk_visits = sk_visits.groupby(['ParticipantIdentifier', 'trial_date']).agg({
    **{col: 'mean' for col in distance_columns},
    **{col: 'sum' for col in elapsed_columns},
    unknown_start_time_column: 'count',
    **{col: 'any' for col in flag_columns}
}).reset_index()

# rename column
sk_visits = sk_visits.rename(columns={'passive_sk_visits_unknown_startTime': 'passive_sk_visits_unknown_count'})

sk_visits.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,passive_sk_visits_gym_distance,passive_sk_visits_home_distance,passive_sk_visits_school_distance,passive_sk_visits_unknown_distance,passive_sk_visits_work_distance,passive_sk_visits_gym_elapsed,passive_sk_visits_home_elapsed,passive_sk_visits_school_elapsed,passive_sk_visits_unknown_elapsed,passive_sk_visits_work_elapsed,passive_sk_visits_unknown_count,passive_sk_visits_unknown_distance_flag
0,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-24,,0.0,,,,0.0,4.499722,0.0,0.0,0.0,0,False
1,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-25,,0.0,,,,0.0,18.75,0.0,0.0,0.0,0,False


#### Join

In [316]:
df_complete = df_complete.merge(sk_visits, how='left', on=['ParticipantIdentifier', 'trial_date'])

### Telephony

In [317]:
sk_telephony = pd.read_csv(save_path + 'run' + str(run_num) + '_sk_telephony_small.csv')

In [318]:
sk_telephony.head(2)

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_telephony_time,passive_sk_telephony_speakingRate,passive_sk_telephony_averagePauseDuration,passive_sk_telephony_speechDuration
0,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-24 07:44:25-04:00,2022-09-24,07:44:25,258.064516,0.2,0.94
1,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-24 07:44:25-04:00,2022-09-24,07:44:25,80.0,0.39,2.25


#### Aggregate by Day

In [319]:
# Grouping the dataframe
grouped_df = sk_telephony.groupby(['ParticipantIdentifier', 'trial_date'])

# Calculating the required statistics
sk_telephony = grouped_df.agg(
    passive_sk_telephony_speakingRate_mean=('passive_sk_telephony_speakingRate', 'mean'),
    passive_sk_telephony_speakingRate_variance=('passive_sk_telephony_speakingRate', 'var'),
    passive_sk_telephony_averagePauseDuration_mean=('passive_sk_telephony_averagePauseDuration', 'mean'),
    passive_sk_telephony_averagePauseDuration_variance=('passive_sk_telephony_averagePauseDuration', 'var'),
    passive_sk_telephony_speechDuration_sum=('passive_sk_telephony_speechDuration', 'sum')
).reset_index()


sk_telephony.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,passive_sk_telephony_speakingRate_mean,passive_sk_telephony_speakingRate_variance,passive_sk_telephony_averagePauseDuration_mean,passive_sk_telephony_averagePauseDuration_variance,passive_sk_telephony_speechDuration_sum
0,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-27,174.508127,0.0,0.236061,0.0,23.38
1,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-28,185.112941,4150.094405,0.101709,0.195204,1150.41


#### Join

In [320]:
df_complete = df_complete.merge(sk_telephony, how='left', on=['ParticipantIdentifier', 'trial_date'])

### Messages + Calls

In [321]:
sk_messagesCalls = pd.read_csv(save_path + 'run' + str(run_num) + '_sk_messagesCalls.csv')

In [322]:
sk_messagesCalls.head(2)

Unnamed: 0,ParticipantIdentifier,date_time,trial_date,passive_sk_messageCall_time,passive_sk_messageCall_timespan,passive_sk_messages_incoming,passive_sk_messages_outgoing,passive_sk_messages_uniqueContacts,passive_sk_calls_incoming,passive_sk_calls_outgoing,passive_sk_calls_duration,passive_sk_calls_uniqueContacts
0,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-25 23:59:59-04:00,2022-09-25,23:59:59,86400,,,,6.0,6.0,630.833643,5.0
1,099765a5-a9c9-4fff-b297-a39eab517267,2022-09-22 23:59:58-04:00,2022-09-22,23:59:58,86400,,,,4.0,4.0,346.256107,3.0


#### Aggregate by Day

In [323]:
# Dropping unnecessary columns
columns_to_drop = ['date_time', 'passive_sk_messageCall_time', 'passive_sk_messageCall_timespan']
sk_messagesCalls.drop(columns=columns_to_drop, inplace=True)


# Grouping the dataframe
grouped_message_call_df = sk_messagesCalls.groupby(['ParticipantIdentifier', 'trial_date'])

# Defining a custom aggregation function
def custom_agg(series):
    if series.name.endswith('_uniqueContacts'):
        return series.max()
    else:
        return series.sum()

# Applying the custom aggregation
sk_messagesCalls = grouped_message_call_df.agg(custom_agg).reset_index()
sk_messagesCalls.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,passive_sk_messages_incoming,passive_sk_messages_outgoing,passive_sk_messages_uniqueContacts,passive_sk_calls_incoming,passive_sk_calls_outgoing,passive_sk_calls_duration,passive_sk_calls_uniqueContacts
0,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-27,49.0,4.0,4.0,0.0,0.0,0.0,
1,01801252-3a7e-4f5f-8b6d-49e8da3902f3,2022-09-28,66.0,44.0,10.0,0.0,1.0,0.0,1.0


#### Join

In [324]:
df_complete = df_complete.merge(sk_messagesCalls, how='left', on=['ParticipantIdentifier', 'trial_date'])

# Save Full

In [326]:
# consolidate df to make it contiguous in memory
df_complete = df_complete.copy()

In [327]:
df_complete.shape

(7820, 327)

In [328]:
# save to csv
if run_num ==1:
    # run 1
    # Label cohort
    df_complete['cohort'] = 'cohort 1'
    df_complete.to_csv(save_path + 'run1_full_daily_df.csv', index=False)
if run_num ==2:
    # run 2
    # Label cohort
    df_complete['cohort'] = 'cohort 2'
    df_complete.to_csv(save_path + 'run2_full_daily_df.csv', index=False)

# Grades

In [341]:
run_num = 2

In [342]:
# Function to replace 'CR' with NaN
def replace_cr_with_nan(df):
    return df.replace('CR', np.nan)

# Function to calculate row-wise mean, ignoring NaN values
def rowwise_mean(df):
    return df.mean(axis=1, skipna=True)

if run_num == 1:
    # 1st Cohort
    cohort1 = pd.read_csv('/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/run1_subjects.csv')
        
    # Reading CSV file
    grades1 = pd.read_csv('/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/run1_grades.csv')  # Change path as needed
    grades1 = grades1[['ParticipantIdentifier', 'grades_avg']]

    # Merge with cohort1 data (assuming cohort1 is a DataFrame you have)
    grades1 = pd.merge(grades1, cohort1, on='ParticipantIdentifier')
    grades1 = grades1[['Student Number', 'grades_avg', 'ParticipantIdentifier']]

    # Drop rows with NA in the 'Student.Number' column
    grades1.dropna(subset=['Student Number'], inplace=True)

    # Label cohort
    grades1['cohort'] = 'cohort 1'

    # Convert grade avg
    grades1['grades_avg'] *= 100

elif run_num == 2:
    # 2nd Cohort
    cohort2 = pd.read_csv('/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/run2_subjects.csv')
    
    # Reading Excel file
    grades2 = pd.read_excel('/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/grades/ACADEMIC_ACTIVITIES_230509.xlsx')  # Change path as needed

    # Convert to wide format
    grades2 = grades2.pivot(index='PERSON_ID', columns='ACAD_ACT_CD', values='ENTERED_MARK')

    # Replace 'CR' with NaN and convert to numeric
    grades2 = replace_cr_with_nan(grades2).apply(pd.to_numeric, errors='coerce')

    # Calculate row average, ignoring NAs
    grades2['grades_avg'] = rowwise_mean(grades2)

    # Add Student Number
    grades2['Student Number'] = grades2.index

    # Select columns
    grades2 = grades2[['Student Number', 'grades_avg']]

    # Merge with cohort2 data (assuming cohort2 is a DataFrame you have)
    grades2 = pd.merge(grades2, cohort2[['Student Number', 'ParticipantIdentifier']], on='Student Number')

    # Drop rows with NA in the 'Student.Number' column
    grades2.dropna(subset=['Student Number'], inplace=True)

## Save

In [346]:
# save to csv
if run_num ==1:
    # run 1
    grades1 = grades1[['ParticipantIdentifier', 'grades_avg']]
    # Label cohort
    grades1['cohort'] = 'cohort 1'
    grades1.to_csv(save_path + 'run1_grades_df.csv', index=False)
if run_num ==2:
    # run 2
    grades2 = grades2[['ParticipantIdentifier', 'grades_avg']]
    # Label cohort
    grades2['cohort'] = 'cohort 2'
    grades2.to_csv(save_path + 'run2_grades_df.csv', index=False)