# Setup

## Imports

In [6]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import json
import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


## Set Paths

In [7]:
# set run
run_num = 2

if run_num ==1:
    # run 1
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/'
    subjects_run1 = pd.read_csv(path + '../run1_subjects.csv')
    subjects = subjects_run1.ParticipantIdentifier
elif run_num ==2:
    # run 2
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_2/'
    subjects_run2 = pd.read_csv(path + '../run2_subjects.csv')
    subjects = subjects_run2.ParticipantIdentifier
    
eda_reports_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_4_outputs/EDA/'

# Active Data

## Import Data

In [8]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

In [434]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyQuestionResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:05<00:00, 10.42it/s]


In [435]:
df.shape

(1124570, 8)

In [436]:
# select relevant columns
df = df[['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'EndDate']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,current_situation1_tasks,in_public,2023-04-06T07:15:45-04:00
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,current_situation2_tasks,False,2023-04-06T07:15:46-04:00
2,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,task_motivation,10,2023-04-06T07:15:48-04:00
3,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,affect_neg_frustrated_am,1,2023-04-06T07:15:58-04:00
4,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,affect_pos_relaxedCalm_am,4,2023-04-06T07:16:01-04:00


In [437]:
df.isna().sum()

ParticipantIdentifier     0
ResultIdentifier          0
Answers                  37
EndDate                   0
dtype: int64

In [438]:
# Remove rows without valid EndDate value
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

# Select relevant subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [439]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4.75)).date() # trial day associated with sample (4:45am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 1104412/1104412 [02:01<00:00, 9122.44it/s]


In [440]:
# save to csv
if run_num ==1:
    # run 1
    df.to_csv(save_path + 'run1_survey_results.csv', index=False)
if run_num ==2:
    # run 2
    df.to_csv(save_path + 'run2_survey_results.csv', index=False)

# Gap App

## Self Report

### Affect

#### Load Data

In [615]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_affect_wide' in globals():
    del(df_daily_affect_wide)
    print('deleted affect df')

deleted existing df
deleted affect df


In [616]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [617]:
df_affect = df.loc[df.ResultIdentifier.str.startswith('affect_')].reset_index(drop=True)
df_affect_am = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)
df_affect_pm = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (~df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)

#### Convert to Wide

In [618]:
df_affect_pm_wide = df_affect_pm.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_pm_wide = df_affect_pm_wide.rename_axis(None, axis=1)

df_affect_am_wide = df_affect_am.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_am_wide = df_affect_am_wide.rename_axis(None, axis=1)

In [619]:
# join
df_daily_affect_wide = df_affect_pm_wide.merge(df_affect_am_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

df_daily_affect_wide.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,4.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1.0,3.0,3.0,3.0,3.0,1.0,1.0,2.0,...,1.0,2.0,1.0,3.0,1.0,3.0,3.0,3.0,3.0,2.0


#### EDA Profiling

In [61]:
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:], title=f"Affect Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"affect_report_run{run_num}.html")

Summarize dataset:   2%|▏         | 1/45 [00:00<00:01, 29.53it/s, Describe variable:affect_neg_ashamed]

Summarize dataset: 100%|██████████| 450/450 [00:21<00:00, 20.49it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 60.93it/s]


**NOTE**

There are a number of variables where the maximum values are well above 5, which is the maximum option that should be available. These glitch entries should be removed

#### Clean

<div class="alert alert-block alert-info">
<b>🧹 Process:</b><br>
For our cleaning process we do the following:<br><br>

<ol>
    <li>Remove values greater than 5, given that the Likert scale only went to 5</li>
    <br>
    <li>Look for zero variance <b>columns</b> (affective measures) and <b>rows</b> (days) for subjects</li></h5>
    <br>
    👉 If the variance was zero for three or more columns then the subject was flagged (`affect_zeroVarCols_flag = True`)<br>
    <br>
    👉 If the variance was zero for more than 10% of a subjects completed days then the subject was flagged (`affect_zeroVarRows_flag = True`)
</ol>
</div>

##### Impossible Values

In [620]:
# Number of instances where the cell value is out of range (greater than 5)
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

106

In [621]:
# Replace all values below threshold with NaN
df_daily_affect_wide.iloc[:,2:] = np.where(df_daily_affect_wide.iloc[:,2:]>5, np.nan, df_daily_affect_wide.iloc[:,2:])

In [622]:
# Recheck for instances above 5
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

0

In [474]:
# Rerun profiling
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:],
                        title=f"Affect Run {run_num} | Pandas Profiling Report",
                        infer_dtypes = False)
profile.to_file(eda_reports_path + f"affect_report_clean_run{run_num}.html")

Summarize dataset: 100%|██████████| 1650/1650 [01:36<00:00, 17.13it/s, Completed]                                                         
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it]
Render HTML: 100%|██████████| 1/1 [00:12<00:00, 12.67s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 22.69it/s]


##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [623]:
# Drop 'trial_date' column
data = df_daily_affect_wide.drop(columns='trial_date', errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'variable', 'affect_neg_angry']]
result_df.columns = ['ParticipantIdentifier', 'ZeroVariance', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,ZeroVariance,Count
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_angry,60
1,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_ashamed,60
2,852c24f8-36d6-4bbd-b79f-7f6fe64d1275,affect_neg_ashamed,72
3,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_bored,60
4,739f2417-2416-4646-b108-e73bb870d326,affect_neg_bored,85
...,...,...,...
223,ec83dfe2-3df1-44dc-a1ef-3a199327c229,affect_pos_motivated_am,32
224,02f48bee-6e86-437c-9394-10ae57dadd14,affect_pos_relaxedCalm_am,79
225,4c1d752c-a092-433b-8a13-36a9677eeb1c,affect_pos_relaxedCalm_am,28
226,c57c38e4-e887-40d2-ab8b-8ae62f5dfaa8,affect_pos_relaxedCalm_am,82


Some subjects had mulitple categories without any variance.

In [624]:
zero_var_cols = result_df.ParticipantIdentifier.value_counts().reset_index(name='affect_zeroVar_cols')
# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
zero_var_cols = zero_var_cols.merge(df_count, on='ParticipantIdentifier', how='left')

# add flag
zero_var_cols['affect_zeroVarCols_flag'] = False
zero_var_cols.loc[zero_var_cols.affect_zeroVar_cols > 2, 'affect_zeroVarCols_flag'] = True
zero_var_cols

Unnamed: 0,ParticipantIdentifier,affect_zeroVar_cols,total_count,affect_zeroVarCols_flag
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,27,60,True
1,02f48bee-6e86-437c-9394-10ae57dadd14,20,79,True
2,852c24f8-36d6-4bbd-b79f-7f6fe64d1275,9,72,True
3,f8f71506-9382-40c7-99db-5c170b2a9abb,9,83,True
4,739f2417-2416-4646-b108-e73bb870d326,8,85,True
5,630ece82-994f-4aef-b2e3-46760583e453,8,81,True
6,fb6c8f5a-f92d-4af8-9f87-73ffd4e21f98,8,75,True
7,c2bfc053-7c57-4ec9-aa69-2fcba2aaba5d,8,66,True
8,e883a6d9-ec85-44eb-9366-9928c15fbe95,7,85,True
9,2ca5c7c8-3834-4c79-a416-ff7f9b9e8140,7,63,True


In [625]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(zero_var_cols.drop(columns=['total_count']), on='ParticipantIdentifier', how='left')

How many subjects had at least one column with no variance?

In [626]:
len(np.unique(result_df.ParticipantIdentifier))

51

Here is the subject who had 15 variables with no variance (run 1):

In [627]:
df_daily_affect_wide.loc[df_daily_affect_wide.ParticipantIdentifier == '27f7805e-5951-47b4-9f42-4c6200001cc6', :].dropna()

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag


But maybe variance within day is more important in terms of actually cleaning data. If someone enters the same value for every variable perhaps it is because they are not answering accurately and just trying to finish as quickly as possible.

In [628]:
# find rows with zero variance
idx = np.where(df_daily_affect_wide.drop(columns=['ParticipantIdentifier', 'trial_date']).var(axis=1) == 0)[0]
# calculate how many zero variance days per subject
df_zeroVar = df_daily_affect_wide.iloc[idx,:].groupby('ParticipantIdentifier').size().reset_index(name='zeroVar_count')

# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
df_zeroVar = df_zeroVar.merge(df_count, on='ParticipantIdentifier', how='left')
df_zeroVar['affect_pct_zeroVarRows'] = (df_zeroVar.zeroVar_count / df_zeroVar.total_count) * 100

# add flag
df_zeroVar['affect_zeroVarRows_flag'] = False
df_zeroVar.loc[df_zeroVar.affect_pct_zeroVarRows > 10, 'affect_zeroVarRows_flag'] = True
df_zeroVar

Unnamed: 0,ParticipantIdentifier,zeroVar_count,total_count,affect_pct_zeroVarRows,affect_zeroVarRows_flag
0,3e1d1276-0e73-4457-9911-f189b0ed0778,4,85,4.705882,False
1,783dd47a-1180-4965-874e-eb405ee6143e,4,53,7.54717,False
2,8c58f5aa-f20e-4c39-a38d-f9c9a44a6cee,7,84,8.333333,False
3,c61e40df-fa64-4037-838e-65d912521dc2,5,10,50.0,True
4,f55d6d94-8602-46cb-b3bd-53ea561eb296,4,70,5.714286,False


In [629]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(df_zeroVar.drop(columns=['zeroVar_count', 'total_count']), on='ParticipantIdentifier', how='left')

In [630]:
df_daily_affect_wide.sample(5)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag,affect_pct_zeroVarRows,affect_zeroVarRows_flag
843,0ff91d6a-e400-403c-bd87-4cd1803bc5e7,2023-04-22,3.0,1.0,4.0,5.0,1.0,4.0,2.0,3.0,...,,,,,,,,,,
4736,b50ef395-6d97-4314-b397-e5d755595dc2,2023-01-29,2.0,5.0,1.0,5.0,1.0,4.0,5.0,4.0,...,,,,,,,1.0,False,,
195,042d7595-3fdc-4cf9-b288-c4b7961916d8,2023-03-02,3.0,1.0,4.0,5.0,2.0,2.0,4.0,5.0,...,,,,,,,,,,
5566,dfef360e-27cb-4d35-bb4a-d6633803eb96,2023-04-14,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
764,0e14ee82-85d9-41c5-a27e-a0e9c4178117,2023-04-14,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,...,1.0,3.0,1.0,1.0,3.0,2.0,,,,


#### Save

In [631]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_affect_wide['trial_date'] = pd.to_datetime(df_daily_affect_wide['trial_date']).dt.date

# Join with affect df
df_daily_affect_wide = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [633]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_affect_wide.to_csv(save_path + 'run1_affect.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_affect_wide.to_csv(save_path + 'run2_affect.csv', index=False)

### Daily General and Detail

#### Load Data

In [714]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_sr_wide' in globals():
    del(df_daily_sr_wide)
    print('deleted daily self report df')

deleted existing df
deleted daily self report df


In [715]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [716]:
past24_general = [
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past48to24_gap',
    'DAILY_past48to24_gapCause',
    'DAILY_past24_ideal',
    'DAILY_past24_satisfaction',
    'DAILY_past24_change',
    'DAILY_past24_productivity',
    'DAILY_past24_procrastination',
    'DAILY_past24_punctuality',
    'DAILY_past24_mentalEffort',
    'DAILY_past24_physicalEffort',
    'DAILY_past24_values',
    'DAILY_past24_gap',
    'DAILY_past24_gapCause',
    'DAILY_past24_illness',
    'DAILY_past24_fatigue',
    'DAILY_past24_unusualEvents'
]

past24_categories = [
    'DAILY_past24_sleep',
    'DAILY_past24_occupation',
    'DAILY_past24_nonoccupation',
    'DAILY_past24_exercise',
    'DAILY_past24_leisureSolo',
    'DAILY_past24_leisureSoloMental',
    'DAILY_past24_leisureSoloPhysical',
    'DAILY_past24_leisureNonSolo',
    'DAILY_past24_leisureNonSoloMental',
    'DAILY_past24_leisureNonSoloPhysical',
    'DAILY_past24_diet',
    'DAILY_past24_socialMedia',
    'DAILY_past24_drinks'
]

next24_categories = [
    'DAILY_next24_sleep',
    'DAILY_next24_occupation',
    'DAILY_next24_nonoccupation',
    'DAILY_next24_leisureSolo',
    'DAILY_next24_leisureNonSolo',
    'DAILY_next24_exercise',
    'DAILY_next24_socialMedia',
    'DAILY_next24_drinks',
    'DAILY_next24_diet'
]

monthly_goals = [
    'MONTHLY_ib_gap_change',
    'MONTHLY_ib_gap_change_app',
    'MONTHLY_goal_report1',
    'MONTHLY_goal_set1_importance',
    'MONTHLY_goal_set1_consequences',
    'MONTHLY_goal_set1_motivationInternal',
    'MONTHLY_goal_set1_motivationExternal',
    'MONTHLY_goal_set1_confidence',
    'MONTHLY_goal_set1_effort',
    'MONTHLY_goal_report2',
    'MONTHLY_goal_set2_importance',
    'MONTHLY_goal_set2_consequences',
    'MONTHLY_goal_set2_motivationInternal',
    'MONTHLY_goal_set2_motivationExternal',
    'MONTHLY_goal_set2_confidence',
    'MONTHLY_goal_set2_effort',
    'MONTHLY_goal_set2_interaction_eachOther'
]

monthly_ideals = [
    'IDEAL_weekday_sleep',
    'IDEAL_weekday_occupation',
    'IDEAL_weekday_nonoccupation',
    'IDEAL_weekday_leisureSolo',
    'IDEAL_weekday_leisureNonSolo',
    'IDEAL_weekday_exercise',
    'IDEAL_weekday_socialMedia',
    'IDEAL_weekday_drinks',
    'IDEAL_weekend_sleep',
    'IDEAL_weekend_occupation',
    'IDEAL_weekend_nonoccupation',
    'IDEAL_weekend_leisureSolo',
    'IDEAL_weekend_leisureNonSolo',
    'IDEAL_weekend_exercise',
    'IDEAL_weekend_socialMedia',
    'IDEAL_weekend_drinks'
]

if run_num == 1:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week',
        'DAILY_goal1_interaction_month',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week',
        'DAILY_goal2_interaction_month',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

elif run_num == 2:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week1',
        'DAILY_goal1_interaction_week2',
        'DAILY_goal1_interaction_month1',
        'DAILY_goal1_interaction_month2',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week1',
        'DAILY_goal2_interaction_week2',
        'DAILY_goal2_interaction_month1',
        'DAILY_goal2_interaction_month2',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month1',
        'WEEKLY_goal_set1_interaction_month2',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month1',
        'WEEKLY_goal_set2_interaction_month2',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

non_numeric_cols = [
    'IDEAL_values_monthly1',
    'IDEAL_values_monthly2',
    'IDEAL_values_monthly3',
    'DAILY_goal1_set',
    'DAILY_goal2_set',
    'WEEKLY_goal_set1',
    'WEEKLY_goal_set2',
    'MONTHLY_goal_set1',
    'MONTHLY_goal_set2',
    'DAILY_next24_diet',
    'DAILY_past48to24_gapCause',
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past24_gapCause',
    'ParticipantIdentifier',
    'trial_date'
]

In [717]:
df_daily_sr = df.loc[df.ResultIdentifier.isin(past24_general + 
                                              past24_categories + 
                                              next24_categories + 
                                              specific_goals +
                                              non_numeric_cols)].reset_index(drop=True)

#### Convert to Wide

In [718]:
# Convert values in 'Answers' column to numeric where possible, else leave as string
# df_daily_sr['Answers'] = pd.to_numeric(df_daily_sr['Answers'], errors='coerce').fillna(df_daily_sr['Answers'])

# Pivot the data
df_daily_sr_wide = df_daily_sr.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(map(str, x))).reset_index()
# get rid of name on index
df_daily_sr_wide = df_daily_sr_wide.rename_axis(None, axis=1)

In [719]:
# convert data to numeric where appropriate
df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols + ['ParticipantIdentifier', 'trial_date'])]] = df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

In [720]:
# Break gap cause into two columns
if run_num == 1:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

elif run_num == 2:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

# convert to numeric 0-1
cols = ['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

In [721]:
# REPEAT FOR MISSED DAY DATA
# Break gap cause into two columns
df_daily_sr_wide[['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']] = df_daily_sr_wide.DAILY_past48to24_gapCause.str.split("_", expand = True)

# convert to numeric 0-1
cols = ['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

#### Category Gap Calculation

⚡ Make sure that we are not calculating gaps where there was no PREDICTION MADE

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I am assuming that peoples' goals are directional in a way that MAY NOT BE ACCURATE for everyone.<br><br>
    For example, I am assuming that people want to sleep more and drink less - in other words they have a <b>gap</b> if they have <b>more</b> drinks than planned, but for <b>sleep</b> the gap calculation is reversed since we assume a gap means that you had <b>fewer</b> hours of sleep than planned.<br><br>
    While this may be accurate <i>in general</i> I would reasonably expect there to be exceptions.
</div>

In [722]:
# calculate diet gap (since it is originally a success measure)
df_daily_sr_wide['DAILY_gap_diet'] = 100 - df_daily_sr_wide.DAILY_past24_diet

In [723]:
# take the predicted amount from the day before and subtract the actual amount...
for i in range(df_daily_sr_wide.shape[0]-1):
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_sleep'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_sleep'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_sleep']
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_occupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_occupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_occupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_nonoccupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_nonoccupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_nonoccupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureSolo'] - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureNonSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureNonSolo']  - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureNonSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_exercise'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_exercise'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_exercise']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_socialMedia'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_socialMedia'] - df_daily_sr_wide.loc[i, 'DAILY_next24_socialMedia'] # reversed 
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_drinks'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_drinks'] - df_daily_sr_wide.loc[i, 'DAILY_next24_drinks'] # reversed
    

#### EDA Profiling

In [679]:
profile = ProfileReport(df_daily_sr_wide.iloc[:,2:], title=f"Daily Reports Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"daily_reports_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 5281/5281 [04:37<00:00, 19.03it/s, Completed]                                                                             
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.49s/it]
Render HTML: 100%|██████████| 1/1 [00:45<00:00, 45.73s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  1.94it/s]


#### Clean

##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [724]:
# check for variance in numeric cols only, leaving ParticipantIdentifier for grouping
non_numeric_cols_alt = [item for item in non_numeric_cols if item != 'ParticipantIdentifier']

In [725]:
# Drop 'trial_date' column
data = df_daily_sr_wide.drop(columns=non_numeric_cols_alt, errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')
final_zero_variance_df = pd.DataFrame(final_zero_variance_df.groupby('ParticipantIdentifier').count()).reset_index().sort_values(by='variable', ascending=False)
final_zero_variance_df = final_zero_variance_df.rename(columns={'variable': 'ZeroVariance'})

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'ZeroVariance', 'DAILY_goal1_confidence']]
result_df.columns = ['ParticipantIdentifier', 'dailySR_zeroVar_cols', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,16,60
1,3e1d1276-0e73-4457-9911-f189b0ed0778,9,85
2,39ecc6ed-1f7e-4d59-8442-371fd16efb08,8,28
3,06af7782-cd70-4938-8e67-b6d98b34b665,5,75
4,fee5cd07-329a-4f07-bb1a-913dfa09e3b4,4,83
...,...,...,...
70,5350441c-7181-463e-9165-5611b5bcab10,1,85
71,aa94f196-94ac-4e0e-b66d-5e2c06f717b7,1,36
72,4e465685-8d64-4b22-8b6c-9409f9eb3c02,1,68
73,4f4440e7-3a38-4fa7-9271-9730806e441a,1,85


In [726]:
# we can see that drinks have the most people with zero variance...
pd.DataFrame(grouped_variance[grouped_variance == 0].dropna(how='all').eq(0).sum()).reset_index(names='Variable').sort_values(by=0, ascending=False).reset_index(drop=True)

Unnamed: 0,Variable,0
0,DAILY_past24_drinks,53
1,DAILY_gap_drinks,15
2,DAILY_next24_drinks,15
3,DAILY_next24_sleep,8
4,DAILY_next24_socialMedia,8
...,...,...
66,DAILY_past24_occupation,0
67,DAILY_past24_physicalEffort,0
68,DAILY_past24_procrastination,0
69,DAILY_past24_change,0


In [727]:
# add flag for subjects with more than 5 zero variance columns
flag_threshold = 5

result_df['dailySR_zeroVarCols_flag'] = False
result_df.loc[result_df.dailySR_zeroVar_cols > flag_threshold, 'dailySR_zeroVarCols_flag'] = True
result_df.sort_values(by='dailySR_zeroVar_cols', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['dailySR_zeroVarCols_flag'] = False


Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count,dailySR_zeroVarCols_flag
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,16,60,True
1,3e1d1276-0e73-4457-9911-f189b0ed0778,9,85,True
2,39ecc6ed-1f7e-4d59-8442-371fd16efb08,8,28,True
3,06af7782-cd70-4938-8e67-b6d98b34b665,5,75,False
4,fee5cd07-329a-4f07-bb1a-913dfa09e3b4,4,83,False
...,...,...,...,...
48,fc490430-6a41-4853-a2cf-ae0b15265cb6,1,65,False
47,042d7595-3fdc-4cf9-b288-c4b7961916d8,1,42,False
46,9d5c19e4-da3f-4760-b769-0ed24d80c917,1,84,False
45,98f7c7df-3bbf-44bf-99be-e2995f557e91,1,82,False


In [728]:
# look at subject with most zero var columns
df_daily_sr_wide.loc[(df_daily_sr_wide['ParticipantIdentifier'] == '27329533-d0a4-4605-9da5-0eb857154cae') & (df_daily_sr_wide['DAILY_goal1_confidence'].notna())]

Unnamed: 0,ParticipantIdentifier,trial_date,DAILY_goal1_confidence,DAILY_goal1_consequences,DAILY_goal1_effort,DAILY_goal1_importance,DAILY_goal1_interaction_month1,DAILY_goal1_interaction_month2,DAILY_goal1_interaction_week1,DAILY_goal1_interaction_week2,...,DAILY_past48to24_gapCause_external,DAILY_gap_diet,DAILY_gap_sleep,DAILY_gap_occupation,DAILY_gap_nonoccupation,DAILY_gap_leisureSolo,DAILY_gap_leisureNonSolo,DAILY_gap_exercise,DAILY_gap_socialMedia,DAILY_gap_drinks


In [729]:
# merge with main sr df
df_daily_sr_wide = df_daily_sr_wide.merge(result_df.drop(columns=['Count']), on='ParticipantIdentifier', how='left')

In [730]:
# Add sr prefix
# df_daily_sr_wide.columns[2:]

df_daily_sr_wide.columns = ['ParticipantIdentifier', 'trial_date'] + ['sr_' + col for col in df_daily_sr_wide.columns[2:]]

#### Save

In [731]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_sr_wide['trial_date'] = pd.to_datetime(df_daily_sr_wide['trial_date']).dt.date

# Join with affect df
df_daily_sr_wide = df_complete_idDate.merge(df_daily_sr_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [732]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_sr_wide.to_csv(save_path + 'run1_selfReport.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_sr_wide.to_csv(save_path + 'run2_selfReport.csv', index=False)

### Daily/Weekly Social Support

**NOTE**: These were only collected in run 2

Weekly support is a 12 item scale scored on a 5 point Likert-scale (0-4).
It is based on the The [Interpersonal Support Evaluation List](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/social-support.html), using the [ISEL-12 version](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/isel_12_item.pdf).

Daily social support is a custom measure developed by [Leo Huang](https://www.leohuangneuro.com/about-me), [Cendri Hutcherson](https://www.linkedin.com/in/cendri-hutcherson-3327a161/?originalSubdomain=ca), and [Daniel J Wilson](https://github.com/danieljwilson).

The items fall under 3 categories 👇

![Example Image](../../3_3_6_inputs/images/ss_daily_items.jpg)

#### Load Weekly Data

In [756]:
ss_df = df.loc[df['ResultIdentifier'].str.startswith('ss_weekly_')].reset_index(drop=True)

#### Munge Data

In [764]:
# Make Answers numeric
ss_df['Answers'] = pd.to_numeric(ss_df['Answers'], errors='coerce')

# Reverse score the specified items
reverse_items = ['ss_weekly_1', 'ss_weekly_4', 'ss_weekly_5', 'ss_weekly_7', 'ss_weekly_10', 'ss_weekly_12']
ss_df.loc[ss_df['ResultIdentifier'].isin(reverse_items) & (ss_df['Answers'] != 0), 'Answers'] = 5 - ss_df['Answers']

In [773]:
# Calculate the total score for each participant
total_scores = ss_df.groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
total_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_totalScore'}, inplace=True)

# Calculate subscale scores
appraisal_items = ['ss_weekly_1', 'ss_weekly_2', 'ss_weekly_3', 'ss_weekly_4']
belonging_items = ['ss_weekly_5', 'ss_weekly_6', 'ss_weekly_7', 'ss_weekly_8']
tangible_items = ['ss_weekly_9', 'ss_weekly_10', 'ss_weekly_11', 'ss_weekly_12']

appraisal_scores = ss_df[ss_df['ResultIdentifier'].isin(appraisal_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
appraisal_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_appraisal'}, inplace=True)

belonging_scores = ss_df[ss_df['ResultIdentifier'].isin(belonging_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
belonging_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_belonging'}, inplace=True)

tangible_scores = ss_df[ss_df['ResultIdentifier'].isin(tangible_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
tangible_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_tangible'}, inplace=True)

In [781]:
# Merge
ss_df = total_scores.merge(appraisal_scores, on=['ParticipantIdentifier', 'trial_date']).merge(belonging_scores, on=['ParticipantIdentifier', 'trial_date']).merge(tangible_scores, on=['ParticipantIdentifier', 'trial_date'])

#### EDA

In [782]:
profile = ProfileReport(ss_df.iloc[:,2:], title=f"ISEL 12 Social Support Weekly Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"ss_weekly_run{run_num}.html")

Summarize dataset: 100%|██████████| 29/29 [00:00<00:00, 30.28it/s, Completed]                                                       
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  5.78it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 219.48it/s]


#### Save

In [None]:
# save to csv
if run_num ==1:
    # run 1
    print('No social support weekly measure for run 1...')
if run_num ==2:
    # run 2
    ss_df.to_csv(save_path + 'run2_ss_weekly.csv', index=False)

#### Load Daily Data

In [806]:
df_daily_ss = df.loc[df['ResultIdentifier'].str.startswith('ss_')].reset_index(drop=True)

# Remove weekly measures
df_daily_ss = df_daily_ss.loc[~df_daily_ss['ResultIdentifier'].str.contains('_weekly_')].reset_index(drop=True)

#### Convert to Wide

In [807]:
# Convert values in 'Answers' column to numeric where possible, else leave as string
# df_daily_sr['Answers'] = pd.to_numeric(df_daily_sr['Answers'], errors='coerce').fillna(df_daily_sr['Answers'])

# Pivot the data
df_daily_ss_wide = df_daily_ss.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(map(str, x))).reset_index()
# get rid of name on index
df_daily_ss_wide = df_daily_ss_wide.rename_axis(None, axis=1)

In [812]:
df_daily_ss_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,ss_desired_appraisal,ss_desired_belonging,ss_desired_tangible,ss_received_appraisal,ss_received_appraisal_satisfaction,ss_received_belonging,ss_received_belonging_satisfaction,ss_received_tangible,ss_received_tangible_satisfaction,ss_sought_appraisal,ss_sought_belonging,ss_sought_tangible
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,1.0,1.0,0.0,0.0,,0.0,,2.0,4.0,0.0,0.0,0.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,0.0,0.0,0.0,0.0,,1.0,6.0,0.0,,0.0,0.0,0.0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,0.0,1.0,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,0.0,0.0,0.0,1.0,4.0,1.0,4.0,0.0,,0.0,0.0,0.0
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,0.0,0.0,0.0,2.0,5.0,2.0,5.0,1.0,2.0,0.0,0.0,0.0


In [810]:
# convert data to numeric where appropriate
df_daily_ss_wide.loc[:,df_daily_ss_wide.columns[~df_daily_ss_wide.columns.isin(non_numeric_cols + ['ParticipantIdentifier', 'trial_date'])]] = df_daily_ss_wide.loc[:,df_daily_ss_wide.columns[~df_daily_ss_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

#### EDA

In [814]:
profile = ProfileReport(df_daily_ss_wide.iloc[:,2:], title=f"Social Support Daily Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"ss_daily_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset: 100%|██████████| 166/166 [00:08<00:00, 19.90it/s, Completed]                                                                    
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 168.22it/s]


#### Save

In [813]:
# save to csv
if run_num ==1:
    # run 1
    print('No social support daily measure for run 1...')
if run_num ==2:
    # run 2
    df_daily_ss_wide.to_csv(save_path + 'run2_ss_daily.csv', index=False)

## Custom Tasks


### Food Task

##### Load Data

In [733]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'food_df' in globals():
    del(food_df)
    print('deleted daily food_df')

deleted existing df
deleted daily food_df


In [734]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

##### Select Data

In [735]:
food_df = df.loc[df['ResultIdentifier'].str.startswith('rating_')].reset_index(drop=True)
hunger =  df.loc[df['ResultIdentifier']=='Hunger_Screen'].reset_index(drop=True)
hunger.rename(columns={"Answers": "task_food_hunger_level"}, inplace=True)

#### Convert to Wide

In [736]:
# Convert the dataframe from long to wide format
food_df_wide = food_df.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                              columns='ResultIdentifier', 
                              values='Answers', 
                              aggfunc='first').reset_index()

# get rid of name on index
food_df_wide = food_df_wide.rename_axis(None, axis=1)

In [737]:
import json

# Filter out columns that start with "rating_"
rating_columns = [col for col in food_df_wide.columns if col.startswith('rating_')]

# Define a function to extract required information from the 'name' value
def extract_name_info(json_data):
    # Check if 'name' key exists in json_data and is of type string
    name_str = json_data.get('name', "")
    
    if not isinstance(name_str, str):
        return None, None

    # Extract task_food_item
    item_start = name_str.rfind('_') + 1
    item_end = name_str.rfind('.jpg')
    task_food_item = name_str[item_start:item_end] if item_start != -1 and item_end != -1 else None
    
    # Extract task_food_category
    category_start = name_str.rfind('/') + 1
    category_end = name_str.rfind('_')
    task_food_category = name_str[category_start:category_end] if category_start != -1 and category_end != -1 else None
    
    return task_food_item, task_food_category

# Modify the function to handle potential strings in columns
def parse_json(entry):
    try:
        return json.loads(entry)
    except (TypeError, json.JSONDecodeError):
        return {}

# Re-parse the JSON strings using the modified function
for col in rating_columns:
    food_df_wide[col] = food_df_wide[col].apply(parse_json)

# Re-extract the required values using the modified function
for col in rating_columns:
    # Extract required information
    food_df_wide[col + '_rating'] = food_df_wide[col].apply(lambda x: x.get('rating', None))
    food_df_wide[col + '_rt'] = food_df_wide[col].apply(lambda x: x.get('reactionTime', None))
    food_df_wide[col + '_item'], food_df_wide[col + '_category'] = zip(*food_df_wide[col].apply(extract_name_info))

# Drop the original "rating_" columns as they are not needed anymore
food_df_wide = food_df_wide.drop(columns=rating_columns)

# Convert all 'task_food_rating' columns to dtype int
rating_cols_to_convert = [col for col in food_df_wide.columns if '_rating' in col]

for col in rating_cols_to_convert:
    food_df_wide[col] = food_df_wide[col].astype(np.int64)  # Using 'Int64' to handle potential NaN values

# Rename columns that start with 'rating' to start with 'task_food'
food_df_wide.columns = ['task_food' + col[len('rating'):] if col.startswith('rating') else col for col in food_df_wide.columns]

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_food_1_rating,task_food_1_rt,task_food_1_item,task_food_1_category,task_food_10_rating,task_food_10_rt,task_food_10_item,task_food_10_category,...,task_food_7_item,task_food_7_category,task_food_8_rating,task_food_8_rt,task_food_8_item,task_food_8_category,task_food_9_rating,task_food_9_rt,task_food_9_item,task_food_9_category
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,-5,6712,carrotSalad,hu,3,2451,breakfastTacos,ht,...,grapeNuts,hu,-4,2477,pancakes,ut,1,2615,hardBoiledEgg,hu
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2,3104,bagel,ht,-1,2682,pizza,ut,...,eggMcMuffin,ut,-3,1964,cocaCola,ut,1,2298,viennoisChocolat,ut
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1,3448,fruitSalad,ht,-4,3877,ovaltine,hu,...,chilaquiles,ut,1,4292,hasbrowns,ut,0,7124,omelette2,ht
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,1,3462,crepe2,ut,-2,2375,fruitSalad,ht,...,croissant,ut,-3,2475,fruitLoops,ut,-5,1825,ovaltine,hu
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,-5,2245,sardines,hu,-4,1776,Kimchi,hu,...,pancakes,ut,-1,2327,fruitSalad,ht,-2,2176,shreddedWheat,hu


In [738]:
# add hunger value
food_df_wide = food_df_wide.merge(hunger[['ParticipantIdentifier', 'trial_date', 'task_food_hunger_level']],
                                  on=['ParticipantIdentifier', 'trial_date'],
                                  how='left')

# Convert hunger to int
food_df_wide['task_food_hunger_level'] = food_df_wide['task_food_hunger_level'].astype(np.float32)

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_food_1_rating,task_food_1_rt,task_food_1_item,task_food_1_category,task_food_10_rating,task_food_10_rt,task_food_10_item,task_food_10_category,...,task_food_7_category,task_food_8_rating,task_food_8_rt,task_food_8_item,task_food_8_category,task_food_9_rating,task_food_9_rt,task_food_9_item,task_food_9_category,task_food_hunger_level
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,-5,6712,carrotSalad,hu,3,2451,breakfastTacos,ht,...,hu,-4,2477,pancakes,ut,1,2615,hardBoiledEgg,hu,3.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2,3104,bagel,ht,-1,2682,pizza,ut,...,ut,-3,1964,cocaCola,ut,1,2298,viennoisChocolat,ut,3.0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1,3448,fruitSalad,ht,-4,3877,ovaltine,hu,...,ut,1,4292,hasbrowns,ut,0,7124,omelette2,ht,2.0
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,1,3462,crepe2,ut,-2,2375,fruitSalad,ht,...,ut,-3,2475,fruitLoops,ut,-5,1825,ovaltine,hu,0.0
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,-5,2245,sardines,hu,-4,1776,Kimchi,hu,...,ut,-1,2327,fruitSalad,ht,-2,2176,shreddedWheat,hu,0.0


#### EDA

In [739]:
profile = ProfileReport(food_df_wide.iloc[:,2:], title=f"Food Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_food_run{run_num}.html")

Summarize dataset: 100%|██████████| 1032/1032 [00:47<00:00, 21.55it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.50s/it]
Render HTML: 100%|██████████| 1/1 [00:07<00:00,  7.85s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 17.94it/s]


#### Clean

We had numerous outlier RT values.

We used the $z$ transform method to flag outliers (in the column `task_food_rt_flag`) based on Berger and Kiefer ([2021](https://doi.org/10.3389/fpsyg.2021.675558)) where they tested multiple methods of removing outliers from rt data.

We also set the flag threshold to 3.

In [740]:
# clean outlier rt values

from scipy.stats import zscore

# Step 1: Filter out rt cols
rt_columns = [col for col in food_df_wide.columns if col.endswith('_rt')]

# Step 2: Compute the z-scores for these columns
for col in rt_columns:
    z_col_name = col + '_z'
    food_df_wide[z_col_name] = zscore(food_df_wide[col], nan_policy='omit')

# Step 3: Check each row for values above a threshold in the '_z' columns
threshold = 3  # Define a threshold value
z_columns = [col + '_z' for col in rt_columns]
food_df_wide['task_food_rt_flag'] = food_df_wide[z_columns].apply(lambda row: any(abs(val) > threshold for val in row), axis=1)

z_columns_to_drop = [col for col in food_df_wide.columns if col.endswith('_z')]
wide_df = food_df_wide.drop(columns=z_columns_to_drop)


#### Save

In [741]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
food_df_wide['trial_date'] = pd.to_datetime(food_df_wide['trial_date']).dt.date

# Join with affect df
food_df_wide = df_complete_idDate.merge(food_df_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [742]:
# save to csv
if run_num == 1:
    # run 1
    print('Task was not part of run 1...')
if run_num == 2:
    # run 2
    food_df_wide.to_csv(save_path + 'run2_task_food.csv', index=False)

### N-Back

The n-back sequence was created as follows (where `n` indicates whether it is 2-back or 3-back)

```javascript
function constructSequence(n) {
    const ls = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let number = 0;
    let char = "";
    const alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
    const sequence = []
    for (let i = 0; i < SEQ_LEN; i++) {
        number = ls[Math.floor(Math.random() * ls.length)];
        if (i >= n && number <= 2) {
            char = sequence[i - n];
            //console.log("in if ===>", char, sequence, i, n)
        } else {
            char = alphabet[Math.floor(Math.random() * alphabet.length)];
            //console.log("in else ==>", char)
        }
        sequence.push(char)

    }
    return sequence;
}
```

Given that `ls` has a length of 11 this means that on average there is a 3/11 chance of having a match (for positions 3 and onward)


In [171]:
import json

#### Load Data

In [870]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Format Data

In [871]:
nback_df = df.loc[df.ResultIdentifier == 'task_custom_nBack_results'].reset_index(drop=True)

In [872]:
# we only need this as bids were being overwritten when the bid was NOT accepted with a 0 bid...
# this was fixed on Feb 7, but using the actual bid value will also continue to work...

nback_df_bids = df.loc[df.ResultIdentifier == 'task_custom_nBack_diffSelect'].reset_index(drop=True)

In [873]:
nback_df.Answers[5]

'{"bid":70,"randomNumber":74,"correctness":0.9545454545454546,"earnings":74,"mode":"hard","matched":4,"missed":0,"sequence":["T","F","A","O","K","A","C","G","R","Z","E","T","R","Y","T","S","Z","R","G","Z","B","G","A","W","B"],"falseAlarm":1,"indexOfMatchClicked":[6,15,18,20,22]}'

In [874]:
# Parse json to create columns
for i in range(nback_df.shape[0]):
    nback_df.loc[i, 'task_nback_bid'] = json.loads(nback_df_bids.Answers[i])['bid']
    nback_df.loc[i, 'task_nback_rndNum'] = json.loads(nback_df.Answers[i])['randomNumber']    
    nback_df.loc[i, 'task_nback_mode'] = json.loads(nback_df.Answers[i])['mode']
    nback_df.loc[i, 'task_nback_matched'] = json.loads(nback_df.Answers[i])['matched']    
    nback_df.loc[i, 'task_nback_missed'] = json.loads(nback_df.Answers[i])['missed']    
    nback_df.loc[i, 'task_nback_falseAlarm'] = json.loads(nback_df.Answers[i])['falseAlarm']
    nback_df.loc[i, 'task_nback_trialCount'] = len(json.loads(nback_df.Answers[i])['sequence'])

In [875]:
nback_df = nback_df.drop(['ResultIdentifier', 'Answers', 'EndDate', 'datetime'], axis=1)
nback_df = nback_df.rename(columns={"time": "task_nback_time"})

#### Add Features

Adding the following metrics for Binary Classification:

1. **Accuracy**: 
   The proportion of correctly predicted classifications in the total predictions made.
   $$
   \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
   $$

2. **Precision** (or Positive Predictive Value):
   The proportion of positive identifications that were actually correct.
   $$
   \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
   $$

3. **Recall** (or Sensitivity or True Positive Rate):
   The proportion of actual positives that were identified correctly.
   $$
   \text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
   $$

4. **Specificity** (or True Negative Rate):
   The proportion of actual negatives that were identified correctly.
   $$
   \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}}
   $$

5. **False Alarm Rate** (or Fall-Out):
   The proportion of actual negatives that were incorrectly classified as positive.
   $$
   \text{False Alarm Rate} = \frac{\text{FP}}{\text{TN} + \text{FP}}
   $$

6. **F1 Score**:
   The harmonic mean of precision and recall, giving a balance between the two.
   $$
   \text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
   $$

7. **Matthews Correlation Coefficient (MCC)**:
   A metric that takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
   $$
   \text{MCC} = \frac{\text{TP} \times \text{TN} - \text{FP} \times \text{FN}}{\sqrt{(\text{TP} + \text{FP})(\text{TP} + \text{FN})(\text{TN} + \text{FP})(\text{TN} + \text{FN})}}
   $$

8. **Bias (C or criterion)**:
   A metric from signal detection theory that indicates the participant's response bias. A positive value indicates a bias toward saying "no" (


In [876]:
truePos = nback_df.task_nback_matched
trueNeg = nback_df.task_nback_trialCount - nback_df.task_nback_matched - nback_df.task_nback_missed - nback_df.task_nback_falseAlarm
falsePos = nback_df.task_nback_falseAlarm
falseNeg = nback_df.task_nback_missed

# proportion of correct classifications in total predictions made
nback_df['task_nback_accuracy'] = (truePos + trueNeg) / (truePos + trueNeg + falsePos + falseNeg)
# positive predictive value (hit rate)
nback_df['task_nback_precision'] = truePos / (truePos + falsePos)
# true positive rate (sensitivity)
nback_df['task_nback_recall'] = truePos / (truePos + falseNeg)
# true negative rate (false_alarm_rate)
nback_df['task_nback_specificity'] = trueNeg / (trueNeg + falsePos)
# Proportion of times the participant incorrectly indicates an n-back match when there wasn't one.
nback_df['task_nback_falseAlarmRate'] = falsePos / (trueNeg + falsePos)
# Harmonic mean of precision and recall, giving a balance between the two
nback_df['task_nback_F1'] = 2 * ((nback_df['task_nback_precision'] * nback_df['task_nback_recall'])/(nback_df['task_nback_precision'] + nback_df['task_nback_recall']))
# Matthews Correlation Coefficient (MCC):
# It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
nback_df['task_nback_MCC'] = ((truePos * trueNeg) - (falsePos * falseNeg)) / (np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [877]:
# metric from signal detection theory
# indicates the participant's response bias. 
# A positive value indicates a bias toward saying "no" (conservative)
# A negative value indicates a bias toward saying "yes" (liberal).

from scipy.stats import norm

def calculate_criterion(hit_rate, false_alarm_rate):
    # Calculate the Z scores for the hit rate and false alarm rate
    z_hit = norm.ppf(hit_rate)
    z_fa = norm.ppf(false_alarm_rate)
    
    # Calculate the criterion C
    C = -0.5 * (z_hit + z_fa)
    
    return C

nback_df['task_nback_bias'] = calculate_criterion(nback_df['task_nback_precision'], nback_df['task_nback_falseAlarmRate'])

  C = -0.5 * (z_hit + z_fa)


In [878]:
# People that chose the easy mode (2-back instead of 3 back) were assigned a bid of ZERO
# However, this suggests that they would have done the easy task for nothing, which is NOT the case
# as they would not have done it for the max possible (100) points
# I arbitrarily assign subjects in the easy condition a bid of 200

nback_df.loc[nback_df.task_nback_bid == 0, 'task_nback_bid'] = 200

#### EDA Profiling

In [879]:
profile = ProfileReport(nback_df.iloc[:,3:], title= f"n-Back Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"nback_report_run{run_num}.html")

  x = asanyarray(arr - arrmean)
  x = asanyarray(arr - arrmean)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')
Summarize dataset: 100%|██████████| 194/194 [00:08<00:00, 22.21it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 69.57it/s]


**NOTES**

Looking at the data it is clear that something wonky went on in some trials where people have matched values up to 67, and missed values of -56. 

Can calculate a super low probability number of matches and delete trials with any values above that - as well as any trials with negative "missed" values.

False alarm also has a max of 136

In [880]:
np.where(np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)).isna())

  result = getattr(ufunc, method)(*inputs, **kwargs)


(array([  35,   37,   42,   70,   99,  104,  115,  121,  135,  160,  177,
         198,  409,  677,  743,  745,  823,  841, 1148, 1422, 2655, 2814,
        2899, 3027, 3121, 3278, 3333, 3337, 3354, 3364, 3390, 3396, 3416,
        3424, 3486, 3501, 3555, 3561, 3571, 3572, 3618, 3623, 3694, 3765,
        3769, 3817, 3819, 3979, 4016, 4020, 4029, 4048, 4063, 4105, 4165,
        4167, 4203, 4216, 4235, 4280, 4308, 4314, 4320, 4343, 4377, 4384,
        4405, 4435, 4444, 4454, 4520, 4582, 4610, 4620, 4676, 4730, 4738,
        4752, 4816, 4822, 4838, 4863, 4873, 4916, 4949, 4961, 4993, 5028,
        5150, 5160, 5162, 5176, 5204, 5212, 5213, 5219, 5279, 5294, 5348,
        5354, 5371, 5439, 5520, 5524, 5530, 5534, 5585, 5608, 5617, 5726,
        5804, 5807, 5808, 5812, 5870, 5877, 5878, 5890, 5941, 5972, 6006,
        6009, 6012, 6016, 6018, 6066, 6069, 6083, 6084, 6303, 6314]),)

In [881]:
nback_df.iloc[35]

ParticipantIdentifier        64b148b2-590e-4f87-bcca-c7f633421fb3
trial_date                                             2023-04-06
task_nback_time                                          23:02:29
task_nback_bid                                               90.0
task_nback_rndNum                                            82.0
task_nback_mode                                              hard
task_nback_matched                                          154.0
task_nback_missed                                          -144.0
task_nback_falseAlarm                                        17.0
task_nback_trialCount                                        25.0
task_nback_accuracy                                          6.08
task_nback_precision                                     0.900585
task_nback_recall                                            15.4
task_nback_specificity                                  -0.133333
task_nback_falseAlarmRate                                1.133333
task_nback

#### Clean

The cleaning process consists of removing any entries/rows where there is an impossible or extrememly improbable value in any of the `matched`, `missed` and `false_alarm` columns.

Impossible means that any of: 

1. $\text{matches} > 23$, given that there were only 23 possibile maches for the 3-back (hard mode).
⚡ However, I chose to eliminate any trials that had a likelihood of less than 1 in a million, which was $\text{matches} > 17$
2. $\text{misses} < 0$, given that it is impossible.

3. $\text{false alarm} > 25$, given that it is impossible.

In [882]:
nback_df.shape

(6332, 18)

In [883]:
nback_df = nback_df.loc[nback_df.task_nback_matched <18,]

In [884]:
nback_df.shape

(6179, 18)

In [885]:
nback_df = nback_df.loc[nback_df.task_nback_missed >= 0,]

In [886]:
nback_df.shape

(5971, 18)

In [887]:
nback_df = nback_df.loc[nback_df.task_nback_falseAlarm <= 25,]

In [888]:
nback_df.shape

(5971, 18)

In [889]:
# rerun EDA
profile = ProfileReport(nback_df.iloc[:,3:], title= f"n-Back Task Run {run_num} - cleaned | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"nback_report_run{run_num}_clean.html")

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')
Summarize dataset:   9%|▉         | 18/191 [00:00<00:00, 209.50it/s, scatter task_nback_bid, task_nback_bid]

Summarize dataset: 100%|██████████| 194/194 [00:09<00:00, 21.43it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 117.55it/s]


### GoNoGo

**Parameters**

- 50 trials
- Stimulus = 250ms
- ITI = 450ms
- go/no-go ratio 4:1 (40/10)
- NOTE THAT IF YOU PRESS THE BUTTON DURING THE ITI IT STILL COUNTS FOR THAT TRIAL! So the stimulus disappears but the trial is still “on”…each trial = 250 + 450 = 700ms

**Data**

For each trial

- `trialType` Go or NoGo trial
- `stim` Circle color
- `stimDuration` How long is stim on
- `iti`
- `RT` RT to click
- `correct` Correct/Error
    - Go trial = click
    - NoGo trial = no click

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    According to <a href="https://link.springer.com/article/10.3758/s13428-017-0923-5">this paper</a> optimal go/no-go ratios to maximize false alarms were predicted to occur for the shortest tested ITI (450 ms) and a go/no-go ratio near 4:1. 
    <br><br>These values are predicted to produce a mean of 6.4 to 8.7 false alarms per 150 trials (95% confidence interval of the mean)
    <br><br>Given that we ran 50 trials we would expect 1/3 of this range. We found our mean false alarm rate was in this range at 2.7 (or 8.1 for 150 trials).
</div>

#### Load Data

In [913]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'gng_df' in globals():
    del(gng_df)
    print('deleted gng_df')

deleted existing df
deleted gng_df


In [914]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

In [915]:
gng_df =  df.loc[df.ResultIdentifier == 'task_custom_gonogo'].reset_index(drop=True)

#### Format Data

In [916]:
json.loads(gng_df.Answers[0])

{'reactionTime': {'0': 413,
  '1': 397,
  '2': 378,
  '3': 444,
  '4': 409,
  '5': 394,
  '6': 426,
  '9': 455,
  '11': 386,
  '12': 435,
  '13': 433,
  '14': 414,
  '15': 414,
  '17': 478,
  '18': 461,
  '19': 474,
  '20': 440,
  '21': 456,
  '22': 488,
  '25': 433,
  '26': 399,
  '27': 397,
  '29': 379,
  '30': 444,
  '31': 443,
  '32': 408,
  '33': 390,
  '34': 440,
  '35': 369,
  '37': 437,
  '38': 486,
  '39': 451,
  '40': 415,
  '41': 398,
  '42': 396,
  '43': 394,
  '44': 425,
  '45': 8,
  '46': 358,
  '47': 406,
  '49': 453},
 'correctness': {'0': 0,
  '1': 0,
  '2': 0,
  '3': 0,
  '4': 0,
  '5': 0,
  '6': 0,
  '9': 0,
  '11': 0,
  '12': 0,
  '13': 0,
  '14': 0,
  '15': 0,
  '17': 0,
  '18': 0,
  '19': 0,
  '20': 0,
  '21': 0,
  '22': 0,
  '25': 0,
  '26': 0,
  '27': 0,
  '29': 0,
  '30': 0,
  '31': 0,
  '32': 0,
  '33': 0,
  '34': 0,
  '35': 1,
  '37': 0,
  '38': 0,
  '39': 0,
  '40': 0,
  '41': 0,
  '42': 0,
  '43': 0,
  '44': 0,
  '45': 0,
  '46': 0,
  '47': 0,
  '49': 0},
 

In [917]:
for i in range(gng_df.shape[0]):
    gng_df.loc[i, 'task_gng_incorrectNoGo'] = json.loads(gng_df.Answers[i])['incorrectNoGo']
    gng_df.loc[i, 'task_gng_incorrectGo'] = json.loads(gng_df.Answers[i])['incorrectGo']    
    gng_df.loc[i, 'task_gng_correctGo'] = json.loads(gng_df.Answers[i])['correctGo']
    gng_df.loc[i, 'task_gng_correctNoGo'] = json.loads(gng_df.Answers[i])['correctNoGo']    
    gng_df.loc[i, 'task_gng_avgRt'] = np.mean(list(json.loads(gng_df.Answers[i])['reactionTime'].values()))

In [918]:
gng_df['task_gng_time'] = gng_df.time

In [919]:
gng_df = gng_df[['ParticipantIdentifier', 'trial_date',
                 'task_gng_time',
                 'task_gng_incorrectNoGo',
                 'task_gng_incorrectGo',
                 'task_gng_correctGo',
                 'task_gng_correctNoGo',
                 'task_gng_avgRt'
                ]]

In [920]:
gng_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_gng_time,task_gng_incorrectNoGo,task_gng_incorrectGo,task_gng_correctGo,task_gng_correctNoGo,task_gng_avgRt
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,07:33:26,1.0,0.0,40.0,9.0,412.780488
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,07:39:38,2.0,1.0,39.0,8.0,483.142857


#### EDA

In [899]:
profile = ProfileReport(gng_df.iloc[:,3:], title=f"Go-Nogo Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_gng_run{run_num}.html")

Summarize dataset: 100%|██████████| 39/39 [00:01<00:00, 27.80it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 250.57it/s]


#### Clean

We can see that there are 84 trials where the RT is 0. These trials are removed.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

Therefore all trials with RTs below 100 were also removed.

In [921]:
zero_trials = (gng_df.task_gng_avgRt == 0).sum()
sub100_trials = (gng_df.task_gng_avgRt < 100).sum()
trials_n = gng_df.shape[0]

print(f'{zero_trials}, or {(zero_trials/trials_n) *100:.1f}% of trials had RTs of 0.')
print(f'{sub100_trials}, or {(sub100_trials/trials_n) * 100:.1f}% of trials had RTs of less than 100.')

84, or 1.3% of trials had RTs of 0.
154, or 2.4% of trials had RTs of less than 100.


In [None]:
gng_df.iloc[:,3:].mean()

task_gng_incorrectNoGo      4.045353
task_gng_incorrectGo        2.686218
task_gng_correctGo         37.313782
task_gng_correctNoGo        5.954647
task_gng_avgRt            352.961181
dtype: float64

In [926]:
# Remove all trials with impossibly short rts
gng_df = gng_df.loc[gng_df.task_gng_avgRt >= 100,]

# Rerun EDA
profile = ProfileReport(gng_df.iloc[:,3:], title=f"Go-Nogo Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_gng_run{run_num}_clean.html")

Summarize dataset: 100%|██████████| 39/39 [00:01<00:00, 29.89it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.08it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 256.22it/s]


In [925]:
# Note the decrease in average incorrectGo (false alarm)
gng_df.iloc[:,3:].mean()

task_gng_incorrectNoGo      4.045353
task_gng_incorrectGo        2.686218
task_gng_correctGo         37.313782
task_gng_correctNoGo        5.954647
task_gng_avgRt            352.961181
dtype: float64

### BART

> The primary score used to measure BART performance is the average number of pumps on unexploded balloons, with higher scores indicative of greater risk-taking propensity (Bornovalova et al. 2005; Lejuez et al. 2002)

[Scoring Alternatives Paper](https://www.researchgate.net/publication/301645337_The_Multiple_Faces_of_Risk-Taking_Scoring_Alternatives_for_the_Balloon-Analogue_Risk_Task)

#### Load Data

In [1062]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'bart_df' in globals():
    del(bart_df)
    print('deleted bart_df')

deleted existing df
deleted bart_df


In [1063]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Format Data

In [1064]:
# Select rows with BART task
bart_df = df.loc[df.ResultIdentifier.str.contains('custom_bart')].reset_index(drop=True)
# Remove rows that are just directions
bart_df = bart_df.loc[~bart_df.ResultIdentifier.str.contains('_info')].reset_index(drop=True)
# Remove summary rows
bart_df = bart_df.loc[~bart_df.ResultIdentifier.str.contains('summary')].reset_index(drop=True)

In [1065]:
json.loads(bart_df.Answers[0])

{'timingInMs': [283,
  134,
  148,
  149,
  151,
  134,
  132,
  134,
  150,
  183,
  150,
  183,
  150,
  183,
  200,
  383],
 'thisRoundEarnings': 80,
 'numberOfPumps': 16,
 'totalEarnings': 80,
 'balloonPopsAt': 19}

In [1066]:
# avg rt
def foo(x):
    try:
        return np.mean(json.loads(x)['timingInMs'][1:]) # start on second tap as people take longer on first...
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['avg_rt'] = v(bart_df.Answers)

# pop?
def foo(x):
    try:
        return json.loads(x)['thisRoundEarnings']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['thisRoundEarnings'] = v(bart_df.Answers)
    
# numberOfPumps
def foo(x):
    try:
        return json.loads(x)['numberOfPumps']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
bart_df['pumps'] = v(bart_df.Answers)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [1067]:
# only take unpopped trials
bart_df = bart_df.loc[bart_df.thisRoundEarnings>0]

In [1068]:
pumps = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])['pumps'].sum()).reset_index()
pumps = pumps.rename(columns={'pumps': 'task_bart_total_pumps'})
unpopped = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])['pumps'].count()).reset_index()
unpopped = unpopped.rename(columns={'pumps': 'task_bart_unpopped_n'})
unpopped.head()             

Unnamed: 0,ParticipantIdentifier,trial_date,task_bart_unpopped_n
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,9
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,9
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,7
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,6
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,6


In [1069]:
# get mean of pumps and mean of rt for each DAY
bart_df = pd.DataFrame(bart_df.groupby(['ParticipantIdentifier', 'trial_date'])[['avg_rt', 'pumps']].mean()).reset_index()
bart_df = bart_df.rename(columns={"avg_rt": "task_bart_avg_rt", 'pumps': 'task_bart_avg_pumps'})

# add other data
bart_df = bart_df.merge(pumps, how='left', on=['ParticipantIdentifier', 'trial_date'])
bart_df = bart_df.merge(unpopped, how='left', on=['ParticipantIdentifier', 'trial_date'])

# calculate score
bart_df['task_bart_score'] = bart_df.task_bart_total_pumps * 5

In [1070]:
bart_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_bart_avg_rt,task_bart_avg_pumps,task_bart_total_pumps,task_bart_unpopped_n,task_bart_score
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,234.530449,12.666667,114,9,570
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,191.290741,16.888889,152,9,760


#### EDA

In [985]:
profile = ProfileReport(bart_df.iloc[:,2:], title=f"BART Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_BART_run{run_num}.html")

Summarize dataset: 100%|██████████| 39/39 [00:01<00:00, 23.46it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 419.81it/s]


#### Clean

We can see that there are trials with VERY long average RTs. These should be removed before calculations are done.

We used the $z$ transform method to flag outliers based on Berger and Kiefer ([2021](https://doi.org/10.3389/fpsyg.2021.675558)) where they tested multiple methods of removing outliers from rt data.

We also set the flag threshold to 3.

---

We can also see that the `unpopped_n` (referring to trials where the balloon did not pop) values go up to 18 which is not possible since there are only 10 trials. Entries where there are more than 10 trials are removed.

In [1071]:
bart_df.iloc[:,2:].mean()

task_bart_avg_rt         263.456431
task_bart_avg_pumps       15.131685
task_bart_total_pumps    117.985638
task_bart_unpopped_n       7.794880
task_bart_score          589.928192
dtype: float64

In [1072]:
# clean outlier rt values

from scipy.stats import zscore

# Step 1: Filter out rt cols
rt_columns = [col for col in bart_df.columns if col.endswith('_rt')]

# Step 2: Compute the z-scores for these columns
for col in rt_columns:
    z_col_name = col + '_z'
    bart_df[z_col_name] = zscore(bart_df[col], nan_policy='omit')

In [1073]:
# Trial removal stats - RT
threshold = 3  # Define a threshold value

trials_removed = (abs(bart_df.task_bart_avg_rt_z) > threshold).sum()
trials_n = bart_df.shape[0]

print(f'Setting a z-score threshold of 3 removes {trials_removed} entries, or {(trials_removed/trials_n) * 100:.1f}% of the total.')

# Remove trials
bart_df = bart_df.loc[abs(bart_df.task_bart_avg_rt_z) <=threshold,]
# Remove z-score column
z_columns_to_drop = [col for col in bart_df.columns if col.endswith('_z')]
bart_df = bart_df.drop(columns=z_columns_to_drop)

Setting a z-scire threshold of 3 removes 43 entries, or 0.7% of the total.


In [1075]:
# Trial removal stats - trial num
trials_removed = (bart_df.task_bart_unpopped_n > 10).sum()
trials_n = bart_df.shape[0]

print(f'There are {trials_removed} entries, or {(trials_removed/trials_n) * 100:.1f}% that have more than 10 trials.')

# Remove trials
bart_df = bart_df.loc[bart_df.task_bart_unpopped_n <=10,]

There are 3 entries, or 0.0% that have more than 10 trials.


In [1076]:
# Rerun EDA
profile = ProfileReport(bart_df.iloc[:,2:], title=f"BART Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_BART_run{run_num}_clean.html")

Summarize dataset: 100%|██████████| 39/39 [00:01<00:00, 25.32it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 216.67it/s]


### EmoStroop

#### Load Data

In [1448]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'emoStroop_df' in globals():
    del(emoStroop_df)
    print('deleted emoStroop_df')

deleted existing df
deleted emoStroop_df


In [1449]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

In [1450]:
emoStroop_df = df.loc[df.ResultIdentifier.str.contains('emoStroop_trial')].reset_index(drop=True)

#### Format Data

In [1451]:
json.loads(emoStroop_df.Answers[4])

{'emotion': 'angry',
 'text': 'happy',
 'startTime': 5251,
 'endTime': 6338,
 'chosenEmotion': 'angry',
 'correctness': 'incorrect'}

In [1452]:
# congruent
def foo(x):
    try:
        return json.loads(x)['emotion'] == json.loads(x)['text']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_congruent'] = v(emoStroop_df.Answers)

# rt
def foo(x):
    try:
        return json.loads(x)['endTime'] - json.loads(x)['startTime']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_rt'] = v(emoStroop_df.Answers)
    
# correct
def foo(x):
    try:
        return json.loads(x)['emotion'] == json.loads(x)['chosenEmotion']
    except (ValueError, KeyError):
        return None

v = np.vectorize(foo)
emoStroop_df['task_emoStroop_correct'] = v(emoStroop_df.Answers)


In [1453]:
emoStroop_df = pd.DataFrame(emoStroop_df.groupby(['ParticipantIdentifier', 'trial_date', 'task_emoStroop_congruent'])
             [['task_emoStroop_rt', 'task_emoStroop_correct']].mean()).reset_index()


In [1454]:
# convert to wide
emoStroop_df = emoStroop_df.pivot_table(index = ['ParticipantIdentifier', 'trial_date'],
                         columns = 'task_emoStroop_congruent',
                         values = ['task_emoStroop_rt', 'task_emoStroop_correct']).reset_index()

In [1455]:
emoStroop_df.columns

MultiIndex([( 'ParticipantIdentifier',    ''),
            (            'trial_date',    ''),
            ('task_emoStroop_correct', False),
            ('task_emoStroop_correct',  True),
            (     'task_emoStroop_rt', False),
            (     'task_emoStroop_rt',  True)],
           names=[None, 'task_emoStroop_congruent'])

In [1456]:
# create correct column names
new_cols = []
for i in range(emoStroop_df.shape[1]):
    new_cols.append(emoStroop_df.columns.get_level_values(0)[i] + 
                    str(emoStroop_df.columns.get_level_values(1)[i]))
# x.columns.get_level_values(0)[1] + str(x.columns.get_level_values(1)[1])

In [1457]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_emoStroop_accuracy_incongruent',
    'task_emoStroop_accuracy_congruent',
    'task_emoStroop_rt_incongruent',
    'task_emoStroop_rt_congruent'
]

In [1458]:
emoStroop_df.columns = emoStroop_df.columns.to_flat_index()

In [1459]:
emoStroop_df.columns = new_cols

In [1460]:
emoStroop_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_emoStroop_accuracy_incongruent,task_emoStroop_accuracy_congruent,task_emoStroop_rt_incongruent,task_emoStroop_rt_congruent
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-17,0.888889,1.0,1301.444444,946.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,0.875,0.857143,1024.125,891.857143


#### EDA

In [1461]:
profile = ProfileReport(emoStroop_df.iloc[:,2:], title=f"EmoStroop Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_emoStroop_run{run_num}.html")

Summarize dataset: 100%|██████████| 30/30 [00:01<00:00, 16.92it/s, Completed]                                                                       
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  5.44it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 278.86it/s]


#### Clean

We can see that there are entries with VERY long average `rt_congruent` and `rt_incongruent` values. These should be removed before calculations are done.

We used the Median Absolute Deviation (MAD) with a threshold of 3 to remove rt outliers ([Leys et al., 2014](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

Use rule of thumb of 10 seconds...


In [1386]:
emoStroop_df.to_csv('emostroop.csv', index=False)

In [1464]:
# Filter rows where the absolute deviation from the median is above 3 times the MAD
emoStroop_df = emoStroop_df[emoStroop_df['task_emoStroop_rt_incongruent'] <= 10000]
emoStroop_df = emoStroop_df[emoStroop_df['task_emoStroop_rt_congruent'] <= 10000]

In [1389]:
from scipy.stats import median_abs_deviation

# Original size
n_original = emoStroop_df.shape[0]

# Clean outlier rt values
threshold = 3  # Define a threshold value

# Compute the MAD using scipy
mad_i = median_abs_deviation(emoStroop_df['task_emoStroop_rt_incongruent'], nan_policy='omit')
mad_c = median_abs_deviation(emoStroop_df['task_emoStroop_rt_congruent'], nan_policy='omit')

# Compute the median value of the 'reaction time' column
median_val_i = emoStroop_df['task_emoStroop_rt_incongruent'].median()
median_val_c = emoStroop_df['task_emoStroop_rt_congruent'].median()

# Filter rows where the absolute deviation from the median is above 3 times the MAD
emoStroop_df = emoStroop_df[(emoStroop_df['task_emoStroop_rt_incongruent'] - median_val_i).abs() <= threshold * mad_i]
emoStroop_df = emoStroop_df[(emoStroop_df['task_emoStroop_rt_congruent'] - median_val_c).abs() <= threshold * mad_c]

n_clean = emoStroop_df.shape[0]

print(f'Cleaning using MAD with a threshold of {threshold} removed {n_original - n_clean} entries, or {((n_original - n_clean) / n_original) * 100:.1f}%.')

Cleaning using MAD with a threshold of 3 removed 1540 entries, or 23.9%.


In [1465]:
# Rerun EDA
profile = ProfileReport(emoStroop_df.iloc[:,2:], title=f"EmoStroop Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_emoStroop_run{run_num}_clean.html")

Summarize dataset:  26%|██▌       | 7/27 [00:00<00:00, 135.31it/s, scatter task_emoStroop_accuracy_incongruent, task_emoStroop_accuracy_incongruent]

Summarize dataset: 100%|██████████| 29/29 [00:01<00:00, 21.75it/s, Completed]                                                                       
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.64it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 331.54it/s]


### Motivation

#### Load Data

In [1392]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'task_motivation' in globals():
    del(task_motivation)
    print('deleted task_motivation')

deleted existing df
deleted task_motivation


In [1393]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

In [1394]:
task_motivation = df.loc[df.ResultIdentifier.str.contains('task_motivation')].reset_index(drop=True)

#### Format Data

In [1395]:
task_motivation = task_motivation[['ParticipantIdentifier', 'trial_date', 'time', 'Answers']]

In [1396]:
task_motivation.shape

(6592, 4)

In [1397]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_motivation_time',
    'task_motivation_level'
]

In [1398]:
task_motivation.columns = new_cols
task_motivation.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_motivation_time,task_motivation_level
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,07:15:48,10
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,07:24:50,5


#### EDA

In [1399]:
profile = ProfileReport(task_motivation.iloc[:,2:], title=f"Task Motivation Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_motivation_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 12/12 [00:00<00:00, 24.57it/s, Completed]                                          
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  4.19it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 31.39it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 629.68it/s]


### NASA TLX

#### Load Data

In [1400]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'nasa_tlx' in globals():
    del(nasa_tlx)
    print('deleted nasa_tlx')

deleted existing df


In [1401]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

In [1405]:
nasa_tlx = df.loc[df.ResultIdentifier.str.contains('nasa_')].reset_index(drop=True)

#### Format Data

In [1406]:
nasa_tlx.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate,datetime,trial_date,time
0,0151d9f1-1644-4437-805e-02f5e244a690,nasa_mental_demand,11,2023-04-06T07:36:23-04:00,2023-04-06 07:36:23-04:00,2023-04-06,07:36:23
1,0151d9f1-1644-4437-805e-02f5e244a690,nasa_temporal_demand,8,2023-04-06T07:36:27-04:00,2023-04-06 07:36:27-04:00,2023-04-06,07:36:27
2,0151d9f1-1644-4437-805e-02f5e244a690,nasa_performance,10,2023-04-06T07:36:44-04:00,2023-04-06 07:36:44-04:00,2023-04-06,07:36:44
3,0151d9f1-1644-4437-805e-02f5e244a690,nasa_effort,10,2023-04-06T07:36:55-04:00,2023-04-06 07:36:55-04:00,2023-04-06,07:36:55
4,0151d9f1-1644-4437-805e-02f5e244a690,nasa_frustration,6,2023-04-06T07:37:01-04:00,2023-04-06 07:37:01-04:00,2023-04-06,07:37:01


In [1407]:
# convert to wide
nasa_tlx = nasa_tlx.pivot_table(index = ['ParticipantIdentifier', 'trial_date'],
                         columns = 'ResultIdentifier',
                         values = 'Answers').reset_index()

# remove index name
nasa_tlx = nasa_tlx.rename_axis(None, axis=1)

In [1408]:
# tweak column names
list(nasa_tlx.iloc[:, 2:].add_prefix('task_').columns)

['task_nasa_distraction',
 'task_nasa_effort',
 'task_nasa_frustration',
 'task_nasa_luck',
 'task_nasa_mental_demand',
 'task_nasa_performance',
 'task_nasa_temporal_demand']

In [1409]:
new_cols = [
    'ParticipantIdentifier',
    'trial_date',
    'task_nasa_distraction',
    'task_nasa_effort',
    'task_nasa_frustration',
    'task_nasa_luck',
    'task_nasa_mental_demand',
    'task_nasa_performance',
    'task_nasa_temporal_demand'
]

In [1410]:
nasa_tlx.columns = new_cols

nasa_tlx.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_nasa_distraction,task_nasa_effort,task_nasa_frustration,task_nasa_luck,task_nasa_mental_demand,task_nasa_performance,task_nasa_temporal_demand
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,12.0,13.0,11.0,13.0,11.0,12.0,13.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,12.0,14.0,0.0,12.0,8.0,10.0,6.0


#### EDA

In [1411]:
profile = ProfileReport(nasa_tlx.iloc[:,2:], title=f"NASA TLX Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"NASA_tlx_run{run_num}.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset: 100%|██████████| 66/66 [00:03<00:00, 21.58it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 271.46it/s]


#### Clean

There are maximimum values over 600 which doesn't make sense since the max is 20

In [1413]:
nasa_tlx.to_csv('nasa.csv', index=False)

In [1418]:
# Replace values greater than 20 in numeric columns with NaN
nasa_tlx.iloc[:,2:] = nasa_tlx.iloc[:,2:].where(lambda x: x <= 20, other=pd.NA)


In [1422]:
# rerun EDA
profile = ProfileReport(nasa_tlx.iloc[:,2:], title=f"NASA TLX Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"NASA_tlx_run{run_num}_clean.html")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset: 100%|██████████| 66/66 [00:03<00:00, 18.68it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 195.43it/s]


### Join Custom Tasks

In [None]:
gng_df.shape

(6394, 8)

In [None]:
nback_df.shape

(6332, 10)

In [None]:
bart_df.shape

(6406, 7)

In [None]:
emoStroop_df.shape

(6432, 6)

In [None]:
task_motivation.shape

(6592, 4)

In [None]:
nasa_tlx.shape

(6353, 9)

In [None]:
custom_tasks_df = task_motivation.merge(gng_df, how='outer', on=['ParticipantIdentifier', 'trial_date'])

In [None]:
custom_tasks_df = custom_tasks_df.merge(bart_df, how='outer', on=['ParticipantIdentifier', 'trial_date'])
custom_tasks_df = custom_tasks_df.merge(emoStroop_df, how='outer', on=['ParticipantIdentifier', 'trial_date'])
custom_tasks_df = custom_tasks_df.merge(nback_df, how='outer', on=['ParticipantIdentifier', 'trial_date'])
custom_tasks_df = custom_tasks_df.merge(nasa_tlx, how='outer', on=['ParticipantIdentifier', 'trial_date'])

In [None]:
custom_tasks_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_motivation_time,task_motivation_level,task_gng_time,task_gng_incorrectNoGo,task_gng_incorrectGo,task_gng_correctGo,task_gng_correctNoGo,task_gng_avgRt,...,task_nback_missed,task_nback_falseAlarm,task_nback_accuracy,task_nasa_distraction,task_nasa_effort,task_nasa_frustration,task_nasa_luck,task_nasa_mental_demand,task_nasa_performance,task_nasa_temporal_demand
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,07:15:48,10,07:39:38,2.0,1.0,39.0,8.0,483.142857,...,1.0,0.0,0.954545,6.0,11.0,8.0,5.0,5.0,7.0,4.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,07:24:50,5,07:33:26,1.0,0.0,40.0,9.0,412.780488,...,1.0,0.0,0.954545,8.0,10.0,6.0,5.0,11.0,10.0,8.0


## HK Tasks

### Trail Making

[RKStudio Documentation](https://rkstudio-support.careevolution.com/hc/en-us/articles/1500002201361-Trailmaking-Active-Task-Export-Format)

What is trailmaking task really measuring?

>The Trail Making Test is a neuropsychological test of visual attention and task switching. It consists of two parts in which the subject is instructed to connect a set of 25 dots as quickly as possible while still maintaining accuracy. The test can provide information about visual search speed, scanning, speed of processing, mental flexibility, as well as executive functioning.[[1](https://doi.apa.org/doiLanding?doi=10.1037%2F1040-3590.7.2.220)]

- visual attention
- task switching
- fluid intelligence/cognitive abilities

**Reference**

[1] [Salthouse, 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3141679/)

<div class="alert alert-block alert-warning">
<b>Note:</b><br>
<ul> 
    <li>We are using fewer "dots" (12)</li>
    <li>We are using both a number only and a letter/number version (e.g. 1-A-2-B-3-C...).</li>
<ul>
</div>

In [231]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'trailmaking_df' in globals():
    del(trailmaking_df)
    print('deleted trailmaking_df')

deleted existing df
deleted trailmaking_df


In [232]:
print(f'Loading cohort {run_num}...')

Loading cohort 2...


In [233]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
days.sort()
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyTrailmakingResults')]
    versionInfo = [i for i in files if i.startswith('SurveyResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
            df_version = pd.read_csv(path + day + '/' + versionInfo[0])
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            temp_df_version = pd.read_csv(path + day + '/' + versionInfo[0])
            df = pd.concat([df,temp_df], axis=0)
            df_version = pd.concat([df_version,temp_df_version], axis=0)

100%|██████████| 55/55 [00:01<00:00, 47.46it/s]


In [234]:
# Identify the numeric and alphanumeric versions
df_version = df_version[['SurveyResultKey', 'SurveyName']]
df_version = df_version.loc[df_version.SurveyName.str.contains('_trail')][['SurveyResultKey', 'SurveyName']]
df = pd.merge(df, df_version, how='left', on='SurveyResultKey')

# rename SurveyName
d = {'task_hk_trail_making': 'task_hk_trailmaking_alphaNumeric', 'task_hk_trailmaking_a1': 'task_hk_trailmaking_numeric'}

df = df.replace({"SurveyName": d})

df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [235]:
# Select subjects from correct run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [236]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 13031/13031 [00:01<00:00, 9486.80it/s]


In [237]:
df.head(2)

Unnamed: 0,SurveyTrailmakingResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,NumberOfErrors,StartDate,EndDate,Taps,SurveyName,datetime,trial_date,time
0,8b02e79a-1291-ed11-aac3-0afb9334277d,7b02e79a-1291-ed11-aac3-0afb9334277d,7002e79a-1291-ed11-aac3-0afb9334277d,4f4440e7-3a38-4fa7-9271-9730806e441a,1,2023-01-10T13:14:12-05:00,2023-01-10T13:14:26-05:00,"[{""TapTimestamp"":1.3185780048370361,""TapIndex""...",task_hk_trailmaking_numeric,2023-01-10 13:14:26-05:00,2023-01-10,13:14:26
1,96b7e6b8-1291-ed11-aac3-0afb9334277d,81b7e6b8-1291-ed11-aac3-0afb9334277d,73b7e6b8-1291-ed11-aac3-0afb9334277d,4f4440e7-3a38-4fa7-9271-9730806e441a,1,2023-01-10T13:14:54-05:00,2023-01-10T13:15:11-05:00,"[{""TapTimestamp"":1.0001180171966553,""TapIndex""...",task_hk_trailmaking_alphaNumeric,2023-01-10 13:15:11-05:00,2023-01-10,13:15:11


The important data is in `Taps` where we have:
- `TapTimestamp`
- `TapIndex`
- `TapIncorrect`

I want to get the last `TapTimestamp` to calculate total timing

In [238]:
# Taps is a string of a list of dictionaries
df.Taps[250]

'[{"TapTimestamp":1.0669429302215576,"TapIndex":0,"TapIncorrect":false},{"TapTimestamp":1.43839693069458,"TapIndex":2,"TapIncorrect":true},{"TapTimestamp":2.8393769264221191,"TapIndex":4,"TapIncorrect":true},{"TapTimestamp":3.4151489734649658,"TapIndex":0,"TapIncorrect":true},{"TapTimestamp":5.4343969821929932,"TapIndex":1,"TapIncorrect":false},{"TapTimestamp":6.535146951675415,"TapIndex":2,"TapIncorrect":false},{"TapTimestamp":7.366270899772644,"TapIndex":3,"TapIncorrect":false},{"TapTimestamp":7.936242938041687,"TapIndex":4,"TapIncorrect":false},{"TapTimestamp":8.4680279493331909,"TapIndex":5,"TapIncorrect":false},{"TapTimestamp":9.1123839616775513,"TapIndex":6,"TapIncorrect":false},{"TapTimestamp":9.9355719089508057,"TapIndex":7,"TapIncorrect":false},{"TapTimestamp":13.255127906799316,"TapIndex":9,"TapIncorrect":true},{"TapTimestamp":13.660766959190369,"TapIndex":8,"TapIncorrect":false},{"TapTimestamp":14.298128962516785,"TapIndex":9,"TapIncorrect":false},{"TapTimestamp":15.02449500

In [239]:
# Can convert to list of dicts and then access an individual dict
data = json.loads(df.Taps[0])
print(data[-1])
data[-1]['TapTimestamp']

{'TapTimestamp': 12.300704956054688, 'TapIndex': 12, 'TapIncorrect': False}


12.300704956054688

In [240]:
# convert string Taps to list of dicts
df['TapsList'] = df['Taps'].apply(json.loads)

# test if any lists are empty...
for i in range(len(df.TapsList)):
    if df.TapsList[i]:
        x = df.TapsList[i][-1]['TapTimestamp']
    else:
        print(i)

1667
11010


In [241]:
# drop rows with empty lists
df = df.drop(df.index[[
    1667,
    11010
    ]])

In [242]:
df.shape

(13029, 13)

In [244]:
# Add variance
def variance_of_differences(taps_str):
    # Convert string to list of dictionaries
    taps_list = json.loads(taps_str)
    
    # Extract TapTimestamp values
    timestamps = [tap['TapTimestamp'] for tap in taps_list]
    
    # Calculate the differences between consecutive timestamps
    differences = np.diff(timestamps)[1:] # diff between first two taps not relevant
    
    # Return variance of differences
    return np.var(differences)

# Apply the function to the TapsList column
df['variance_of_diff'] = df['Taps'].apply(variance_of_differences)

In [245]:
# assign new columns with final value from TapsList
# rename column
df = df.assign(task_trailmaking_time=lambda x: x.TapsList.apply(lambda x: x[-1]['TapTimestamp'] - x[1]['TapTimestamp']),
               task_trailmaking_errors=lambda x: x.NumberOfErrors)

# keep relevant columns
trailmaking_df = df[['ParticipantIdentifier', 'trial_date', 'time', 'SurveyName', 'task_trailmaking_time', 'task_trailmaking_errors', 'variance_of_diff']]

In [246]:
# CLEAN DATA (see cleaning heading for detail)
trailmaking_df = trailmaking_df.loc[(trailmaking_df['task_trailmaking_time'] < 30) & (trailmaking_df['task_trailmaking_time'] !=0) & (trailmaking_df['task_trailmaking_errors'] <=10)]

In [247]:
trailmaking_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,time,SurveyName,task_trailmaking_time,task_trailmaking_errors,variance_of_diff
0,4f4440e7-3a38-4fa7-9271-9730806e441a,2023-01-10,13:14:26,task_hk_trailmaking_numeric,8.969085,1,0.288832
1,4f4440e7-3a38-4fa7-9271-9730806e441a,2023-01-10,13:15:11,task_hk_trailmaking_alphaNumeric,14.234255,1,0.567672
2,1dd79a79-dd14-4932-b81b-16f95bbcd796,2023-01-10,16:53:17,task_hk_trailmaking_numeric,13.172253,0,0.514233
3,1dd79a79-dd14-4932-b81b-16f95bbcd796,2023-01-10,16:54:22,task_hk_trailmaking_alphaNumeric,13.54007,0,0.310045
4,68794228-9bbc-4199-b58c-192307df77f2,2023-01-10,23:01:39,task_hk_trailmaking_numeric,8.000827,0,0.202404


In [248]:
# make wide
trailmaking_df = trailmaking_df.pivot_table(index=['trial_date', 'ParticipantIdentifier'],
                                            columns='SurveyName', 
                                            values=['task_trailmaking_time', 'task_trailmaking_errors', 'variance_of_diff']).reset_index()

In [249]:
# flatten columns
trailmaking_df.columns = trailmaking_df.columns.to_series().str.join('_')

In [251]:
# rename columns
trailmaking_df.columns = ['trial_date', 'ParticipantIdentifier', 'task_trailmaking_alphaNumeric_errors', 'task_trailmaking_numeric_errors', 'task_trailmaking_alphaNumeric_time', 'task_trailmaking_numeric_time', 'task_trailmaking_alphaNumeric_var', 'task_trailmaking_numeric_var']

In [252]:
trailmaking_df.head()

Unnamed: 0,trial_date,ParticipantIdentifier,task_trailmaking_alphaNumeric_errors,task_trailmaking_numeric_errors,task_trailmaking_alphaNumeric_time,task_trailmaking_numeric_time,task_trailmaking_alphaNumeric_var,task_trailmaking_numeric_var
0,2023-01-10,1dd79a79-dd14-4932-b81b-16f95bbcd796,0.0,0.0,13.54007,13.172253,0.310045,0.514233
1,2023-01-10,4f4440e7-3a38-4fa7-9271-9730806e441a,1.0,1.0,14.234255,8.969085,0.567672,0.288832
2,2023-01-10,68794228-9bbc-4199-b58c-192307df77f2,0.0,0.0,16.918175,8.000827,0.572064,0.202404
3,2023-01-11,f8f71506-9382-40c7-99db-5c170b2a9abb,1.0,0.0,11.848368,6.397513,0.545629,0.114705
4,2023-01-13,5af5c134-5a74-416b-b6b2-4e5a29f55688,1.0,0.0,17.117682,4.998748,0.356313,0.023884


In [253]:
trailmaking_df.iloc[:,3:].mean()

task_trailmaking_numeric_errors       0.395709
task_trailmaking_alphaNumeric_time    7.686384
task_trailmaking_numeric_time         6.320767
task_trailmaking_alphaNumeric_var     0.234049
task_trailmaking_numeric_var          0.149666
dtype: float64

In [254]:
trailmaking_df.shape

(6406, 8)

#### EDA

In [255]:
profile = ProfileReport(trailmaking_df.iloc[:,2:], title=f"Trailmaking Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_trailmaking_run{run_num}.html")

Summarize dataset: 100%|██████████| 52/52 [00:02<00:00, 22.98it/s, Completed]                                                                         
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 193.30it/s]


#### Clean

We had numerous outlier values.

**ERRORS**

Given that there were only 12 responses in a given trial and that the mean error rate was close to 0.5, maximum values such as 54 seem a bit unlikely and may indicate either someone was not trying or not understanding the task.

Given that we are dealing with a zero-inflated skewed distribution removing outliers appropriately is challenging.

I used an ad-hoc decision to remove those trials with more than 10 errors. This was total of 26 out fo the 6498 total trials.

**RTs**

We used median absolute deviation to remove outliers (see [Leys et al., 2013](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

We set the flag threshold to 3.

However this removed more than 10% of the obervations.

Therefore we decided to again use an ad-hoc approach setting the maximum RT threshold to 30 seconds which removes less than 1% of trials.

**Also** there were a number of trials with RTs of 0. These were removed.

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I went back and applied these cleaning heuristics to the data in long format so as not to remove both trials in the event they were not both outliers.
</div>

In [228]:
# Remove high error trials

threshold = 10

count = (trailmaking_df.task_trailmaking_numeric_errors > threshold).sum()
total_n =  len(trailmaking_df)

print(f'NUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

count = (trailmaking_df.task_trailmaking_alphaNumeric_errors > threshold).sum()

print(f'ALPHANUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')

NUMERIC
Setting a threshold of 10 removes 0 trials out of 6491 total numeric trials.
This is 0.0% of the trials.

ALPHANUMERIC
Setting a threshold of 10 removes 0 trials out of 6491 total alpha-numeric trials.
This is 0.0% of the trials.


In [229]:
# MAD approach to RT outlier removal

# Calculate MAD for the specified columns
def mad(series):
    median_value = series.median()
    return (series - median_value).abs().median()

# MAD-based outlier detection
def detect_outliers(series, threshold):
    median_value = series.median()
    mad_value = mad(series)
    
    # Detect outliers
    outliers = ((series - median_value).abs() > threshold * mad_value)
    return outliers

threshold = 3.0

# Detect outliers in the 'task_trailmaking_numeric_time' column
x = detect_outliers(trailmaking_df['task_trailmaking_numeric_time'], threshold=threshold).sum()
print(f'NUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.\n')

# Detect outliers in the 'task_trailmaking_alphaNumeric_time' column
x = detect_outliers(trailmaking_df['task_trailmaking_alphaNumeric_time'], threshold=threshold).sum()
print(f'ALPHANUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.')

NUMERIC:
Using a threshold of 3.0 with MAD we find 590 outliers in the 6491 observations.

ALPHANUMERIC:
Using a threshold of 3.0 with MAD we find 826 outliers in the 6491 observations.


In [230]:
# Ad hoc approach to RT outlier removal
threshold = 30

count = (trailmaking_df.task_trailmaking_numeric_time > threshold).sum()
total_n = trailmaking_df.shape[0]

print(f'NUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

count = (trailmaking_df.task_trailmaking_alphaNumeric_time > threshold).sum()

print(f'ALPHANUMERIC')
print(f'Setting a threshold of {threshold} removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')



NUMERIC
Setting a threshold of 30 removes 0 trials out of 6491 total numeric trials.
This is 0.0% of the trials.

ALPHANUMERIC
Setting a threshold of 30 removes 0 trials out of 6491 total alpha-numeric trials.
This is 0.0% of the trials.


### Stroop

[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#stroophttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#stroop)

[Scarpina & Tagini, 2017](https://www.frontiersin.org/articles/10.3389/fpsyg.2017.00557/full) on scoring in their paper The Stroop Color and Words Test.

>The Stroop Color and Word Test (SCWT) is a neuropsychological test extensively used to assess the ability to inhibit cognitive interference that occurs when the processing of a specific stimulus feature impedes the simultaneous processing of a second stimulus attribute, well-known as the Stroop Effect.

**Interpretation**
>While the SCWT is widelyused to measure the ability to inhibit cognitive interference; previous literature also reports itsapplication to measure other cognitive functions such as attention, processing speed, cognitive flexibility (Jensen and Rohwer, 1966), and working memory(Kane and Engle, 2003). Thus, it may be possible to use the SCWTto measure multiple cognitive functions.

>According to the review, the studies with Italian normativedata present different theoretical interpretations of the SCWTscores.Amato et al. (2006)andCaffarra et al. (2002)describe theSCWT score as a measure of the fronto-executive functioning,while others use it as an index of the attentional functioning(Barbarotto et al., 1998; Valgimigli et al., 2010) or of generalcognitive efficiency (Brugnolo et al., 2015). Slowing to a responseconflict would be due to a failure of selective attention or a lack inthe cognitive efficiency instead of a failure of response inhibition(Chafetz and Matthews, 2004); however, the performance inthe SCWT is not exclusively related to concentration, attentionor cognitive effectiveness, but it relies to a more specificexecutive-frontal domain. Indeed, subjects have to processselectively a specific visual feature blocking out continuouslythe automatic processing of reading (Zajano and Gorman, 1986;Shum et al., 1990), in order to solve correctly the task. The specificinvolvement of executive processes is supported by clinical data.Patients with anterior frontal lesions, and not with posteriorcerebral damages, report significant difficulties in maintaining aconsistent activation of the intended response (Valgimigli et al.,2010). Furthermore, Parkinson’s Disease patients, characterizedby executive dysfunction due to the disruption of dopaminergicpathway (Fera et al., 2007), reported difficulties in SCWT despiteunimpaired attentional abilities (Fera et al., 2007; Djamshidianet al., 2011).

#### Load Data

In [321]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'stroop_df' in globals():
    del(stroop_df)
    print('deleted existing stroop_df')

deleted existing df
deleted existing stroop_df


In [322]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyStroopResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:00<00:00, 64.01it/s]


#### Format Data

In [323]:
# check for na dates...
df.isna().sum()

SurveyStroopResultKey         0
SurveyStepResultKey           0
SurveyResultKey               0
ParticipantIdentifier         0
StartTime                     0
EndTime                       0
ColorSelected                 0
Color                         0
Text                          0
StroopStyle              132596
StartDate                     0
EndDate                       0
dtype: int64

In [324]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [325]:
# Select only subjects in correct cohort
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

# # temporarily removing spanish participant
# df = df.loc[df.ParticipantIdentifier!='35d11ffc-7034-4708-a086-cd4bd47b51fd'].reset_index(drop=True)

In [326]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 130058/130058 [00:13<00:00, 9527.16it/s] 


In [327]:
df.head(2)

Unnamed: 0,SurveyStroopResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,StartTime,EndTime,ColorSelected,Color,Text,StroopStyle,StartDate,EndDate,datetime,trial_date,time
0,e71501c5-6cd4-ed11-aac6-0afb9334277d,c01501c5-6cd4-ed11-aac6-0afb9334277d,aa1501c5-6cd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,684851.7233056249,684853.0250275834,GREEN,GREEN,RED,,2023-04-06T07:18:17-04:00,2023-04-06T07:18:17-04:00,2023-04-06 07:18:17-04:00,2023-04-06,07:18:17
1,e91501c5-6cd4-ed11-aac6-0afb9334277d,c01501c5-6cd4-ed11-aac6-0afb9334277d,aa1501c5-6cd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,684853.5275298752,684854.273287625,YELLOW,YELLOW,YELLOW,,2023-04-06T07:18:18-04:00,2023-04-06T07:18:18-04:00,2023-04-06 07:18:18-04:00,2023-04-06,07:18:18


In [328]:
# replace commas with dots
df['StartTime'] = df['StartTime'].replace(',', '.', regex=True)
df.StartTime = df.StartTime.astype('float')

df['EndTime'] = df['EndTime'].replace(',', '.', regex=True)
df.EndTime = df.EndTime.astype('float')

In [329]:
# create correct, congruous and time columns
df = df[['ParticipantIdentifier', 'trial_date', 'StartTime', 'EndTime', 'ColorSelected', 'Color', 'Text']]
df = df.assign(congruent=lambda x: x.Color == x.Text,
               correct=lambda x: x.Color == x.ColorSelected,
               rt=lambda x: (x.EndTime - x.StartTime)
              )
df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,StartTime,EndTime,ColorSelected,Color,Text,congruent,correct,rt
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,684851.723306,684853.025028,GREEN,GREEN,RED,False,True,1.301722
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,684853.52753,684854.273288,YELLOW,YELLOW,YELLOW,True,True,0.745758
2,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,684854.775675,684855.522062,YELLOW,YELLOW,GREEN,False,True,0.746387


In [330]:
# Add additional columns
# define function that returns a Series of all aggregations

def f(x):
    d = {}
    
    d['task_stroop_totalCorrectProp'] = x['correct'].sum()/len(x['correct'])
    d['task_stroop_congruentCorrectProp'] = len(x.loc[(x.congruent==True) & (x.correct==True)])/ (x.congruent == True).sum()
    d['task_stroop_incongruentCorrectProp'] = len(x.loc[(x.congruent==False) & (x.correct==True)])/ (x.congruent == False).sum()
    d['task_stroop_totalAvgRT'] = x['rt'].sum()/len(x['rt'])    
    d['task_stroop_congruentAvgRT'] = x.loc[x.congruent==True,'rt'].sum()/ (x.congruent == True).sum()
    d['task_stroop_incongruentAvgRT'] = x.loc[x.congruent==False,'rt'].sum()/ (x.congruent == False).sum()
    
    return pd.Series(d, index=['task_stroop_totalCorrectProp', 'task_stroop_congruentCorrectProp',
                               'task_stroop_incongruentCorrectProp', 'task_stroop_totalAvgRT',
                               'task_stroop_congruentAvgRT', 'task_stroop_incongruentAvgRT'
                              ])

# note that value_counts gives us the number of trues and falses for boolean columns
# then indexing into 0 for false and 1 for true

In [331]:
stroop_df = df.groupby(['ParticipantIdentifier', 'trial_date']).apply(f).reset_index()
stroop_df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,task_stroop_totalCorrectProp,task_stroop_congruentCorrectProp,task_stroop_incongruentCorrectProp,task_stroop_totalAvgRT,task_stroop_congruentAvgRT,task_stroop_incongruentAvgRT
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-17,0.95,1.0,0.909091,0.767591,0.649432,0.864267
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,0.95,1.0,0.875,0.759695,0.670685,0.893211
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,1.0,1.0,1.0,0.765498,0.770194,0.764324


In [332]:
stroop_df.iloc[:,2:].mean()

task_stroop_totalCorrectProp          0.939470
task_stroop_congruentCorrectProp      0.977041
task_stroop_incongruentCorrectProp    0.901700
task_stroop_totalAvgRT                0.907755
task_stroop_congruentAvgRT            0.902239
task_stroop_incongruentAvgRT          0.915852
dtype: float64

#### EDA

In [333]:
profile = ProfileReport(stroop_df.iloc[:,2:], title=f"Stroop Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_stroop_run{run_num}.html")

Summarize dataset: 100%|██████████| 51/51 [00:01<00:00, 30.22it/s, Completed]                                                                     
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.91it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 197.66it/s]


#### Clean

We had some clear RT outlier values given that the mean of the trials' average RT (even with outliers) was less than 1 second, and yet there where maxmum average RT that were multiple minutes.

**RTs**

We first used median absolute deviation to remove outliers (see [Leys et al., 2013](https://www.sciencedirect.com/science/article/abs/pii/S0022103113000668)).

We set the flag threshold to 3.

However this would remove more than 10% of the obervations.

Therefore we decided to use an ad-hoc approach setting the maximum RT threshold to 3 seconds which removes less than 1% of trials.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

Therefore all trials with average RTs below 100 were also removed (this only removed one trial).

In [334]:
# MAD approach to RT outlier removal

# Calculate MAD for the specified columns
def mad(series):
    median_value = series.median()
    return (series - median_value).abs().median()

# MAD-based outlier detection
def detect_outliers(series, threshold):
    median_value = series.median()
    mad_value = mad(series)
    
    # Detect outliers
    outliers = ((series - median_value).abs() > threshold * mad_value)
    return outliers

threshold = 3.0

# Detect outliers in the 'task_trailmaking_numeric_time' column
x = detect_outliers(stroop_df['task_stroop_totalAvgRT'], threshold=threshold).sum()
print(f'NUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.\n')

# # Detect outliers in the 'task_trailmaking_alphaNumeric_time' column
# x = detect_outliers(trailmaking_df['task_trailmaking_alphaNumeric_time'], threshold=threshold).sum()
# print(f'ALPHANUMERIC:\nUsing a threshold of {threshold} with MAD we find {x} outliers in the {trailmaking_df.shape[0]} observations.')

NUMERIC:
Using a threshold of 3.0 with MAD we find 818 outliers in the 6406 observations.



In [336]:
# Ad hoc approach to RT outlier removal
threshold = 3

count = (stroop_df.task_stroop_totalAvgRT > threshold).sum()
total_n = stroop_df.shape[0]

print(f'CEILING')
print(f'Setting a threshold of {threshold}s removes {count} trials out of {total_n} total numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.\n')

# Remove values based on these thresholds
stroop_df = stroop_df.loc[stroop_df.task_stroop_totalAvgRT <= threshold]

threshold = .1
count = (stroop_df.task_stroop_totalAvgRT < threshold).sum()

print(f'FLOOR')
print(f'Setting a threshold of {threshold}s removes {count} trials out of {total_n} total alpha-numeric trials.')
print(f'This is {(count/total_n)*100:.1f}% of the trials.')

stroop_df = stroop_df.loc[stroop_df.task_stroop_totalAvgRT > threshold]

CEILING
Setting a threshold of 3s removes 0 trials out of 6484 total numeric trials.
This is 0.0% of the trials.

FLOOR
Setting a threshold of 0.1s removes 0 trials out of 6484 total alpha-numeric trials.
This is 0.0% of the trials.


In [337]:
# Rerun EDA with clean df
profile = ProfileReport(stroop_df.iloc[:,2:], title=f"Stroop Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_stroop_run{run_num}_clean.html")

Summarize dataset: 100%|██████████| 51/51 [00:01<00:00, 27.91it/s, Completed]                                                                     
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 458.90it/s]


### PSAT

[RKStudio Documentation](https://rkstudio-support.careevolution.com/hc/en-us/articles/1500002352262-Paced-Serial-Addition-Test-PSAT-Active-Task-Export-Format)

#### Load Data

In [418]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'psat_df' in globals():
    del(psat_df)
    print('deleted existing psat_df')

deleted existing df
deleted existing psat_df


In [419]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyPSATResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:00<00:00, 210.06it/s]


#### Format Data

In [420]:
# check for na dates...
df.isna().sum()

SurveyPSATResultKey      0
SurveyStepResultKey      0
SurveyResultKey          0
ParticipantIdentifier    0
PresentationMode         0
InterStimulusInterval    0
StimulusDuration         0
Length                   0
TotalCorrect             0
TotalDyad                0
TotalTime                0
InitialDigit             0
StartDate                0
EndDate                  0
Samples                  0
dtype: int64

In [421]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [422]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [423]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 6418/6418 [00:00<00:00, 9199.50it/s]


In [424]:
df.head(2)

Unnamed: 0,SurveyPSATResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,PresentationMode,InterStimulusInterval,StimulusDuration,Length,TotalCorrect,TotalDyad,TotalTime,InitialDigit,StartDate,EndDate,Samples,datetime,trial_date,time
0,f1280b3d-6dd4-ed11-aac6-0afb9334277d,d7280b3d-6dd4-ed11-aac6-0afb9334277d,c9280b3d-6dd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,Visual,3,1,30,29,27,48.8601155419601,2,2023-04-06T07:22:05-04:00,2023-04-06T07:22:05-04:00,"[{""Answer"":11,""Correct"":true,""Time"":1.38083220...",2023-04-06 07:22:05-04:00,2023-04-06,07:22:05
1,d180e650-6ed4-ed11-aac6-0afb9334277d,c280e650-6ed4-ed11-aac6-0afb9334277d,b480e650-6ed4-ed11-aac6-0afb9334277d,0151d9f1-1644-4437-805e-02f5e244a690,Visual,3,1,30,30,29,33.7597789999999,3,2023-04-06T07:29:40-04:00,2023-04-06T07:29:40-04:00,"[{""Answer"":4,""Correct"":true,""Time"":1.594179666...",2023-04-06 07:29:40-04:00,2023-04-06,07:29:40


---

The important data is:
- `Length` + `TotalCorrect` to determine accuracy
- `TotalTime` / `Length` to get time/trial 
    - (use this instead of `TotalTime` in case we change number of trials

In [425]:
# Replace commas with decimals (for European participants)
df['TotalTime'] = df['TotalTime'].replace(',', '.', regex=True)
df.TotalTime = df.TotalTime.astype('float')

In [426]:
# Calculate 'task_psat_accuracy' column
df['task_psat_accuracy'] = df['TotalCorrect'] / df['Length']

# Define a function to calculate 'task_psat_avgRT', 'task_psat_flag_3plusRT_n', 'task_psat_flag_sub100RT_n' and 'task_psat_varRT'
def calculate_avgRT_and_flag(samples, length):
    # Convert the string representation of the samples list into a list of dictionaries
    samples_list = json.loads(samples)
    
    # Extract all 'Time' values
    times = [item["Time"] for item in samples_list]
    
    # Count the number of 'Time' values greater than 3.0
    flag_count = sum(1 for time in times if time > 3.0)
    
    # Replace 'Time' values greater than 3.0 with 3.0
    times = [min(time, 3.0) for time in times]
    
    # Calculate the average 'Time' value
    avgRT = sum(times) / length
    
    # Flag 'Time' values less than 100ms
    min100_count = sum(1 for time in times if time < 0.1)
    
    # Calculate the variance of 'Time' values
    varRT = np.var(times)
    
    return avgRT, flag_count, varRT, min100_count

# Apply the function to the DataFrame and split the results into two new columns
df['task_psat_avgRT'], df['task_psat_flag_3plusRT_n'], df['task_psat_varRT'], df['task_psat_flag_sub100RT_n'] = zip(*df.apply(lambda row: calculate_avgRT_and_flag(row['Samples'], row['Length']), axis=1))

# keep relevant columns
psat_df = df[['ParticipantIdentifier', 'trial_date', 'task_psat_accuracy', 'task_psat_avgRT', 'task_psat_varRT', 'task_psat_flag_sub100RT_n', 'task_psat_flag_3plusRT_n']]
psat_df.head(2)


Unnamed: 0,ParticipantIdentifier,trial_date,task_psat_accuracy,task_psat_avgRT,task_psat_varRT,task_psat_flag_sub100RT_n,task_psat_flag_3plusRT_n
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,0.966667,1.43242,0.254876,0,1
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,1.0,1.125326,0.115776,0,0


#### EDA

In [427]:
profile = ProfileReport(psat_df.iloc[:,2:], title=f"PSAT Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_psat_run{run_num}.html")

Summarize dataset: 100%|██████████| 30/30 [00:00<00:00, 33.77it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  5.59it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 598.50it/s]


#### Clean

We had some clear RT outlier values given that the mean of the trials' average RT (even with outliers) was less than 1 second, and yet there where maxmum average RT that were multiple minutes.

Going back it appeared that there were glitches as on some trials there were response times above 3.0 seconds which should have been the in-app limit. 

I replaced any values above 3.0 with 3.0 before calculating the average RT time per trial.
I also created a column called `task_psat_flag_3plusRT_n` that indicated how many responses had an RT greater than 3.0 orginally, for each trial. For cohort 2, for example, 212 trials (out of 6418) had at least one RT greater than 3.0.

**RTs**

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic.

We therefore also counted up how many responses in each trial had RTs BELOW 100ms and indicated the numebr in the column `task_psat_flag_sub100RT_n`

### Tower of Hanoi

[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#towerhttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#tower)

>In the Tower of Hanoi task the user is asked to solve the classic Tower of Hanoi puzzle in a minimum number of moves. To solve the puzzle, the user must move the entire stack to the highlighted platform in as few moves as possible. This task measures the user’s problem solving skills. A Tower of Hanoi task finishes when the user completes the puzzle correctly or concedes that they cannot solve the puzzle.

>Data collected by this task is in the form of an ORKTowerOfHanoiResult object. It contains every move taken by the user and indicates whether the puzzle was successfully completed or not.

#### Load Data

In [491]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'toh_df' in globals():
    del(toh_df)
    print('deleted existing toh_df')

deleted existing df
deleted existing toh_df


In [492]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyTowerOfHanoiResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:00<00:00, 212.39it/s]


#### Format Data

In [493]:
# check for na dates...
df.isna().sum()

SurveyTowerOfHanoiResultKey    0
SurveyStepResultKey            0
SurveyResultKey                0
ParticipantIdentifier          0
PuzzleWasSolved                0
StartDate                      0
EndDate                        0
Moves                          0
dtype: int64

In [494]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [495]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [496]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()
    
df.head(3)

 29%|██▉       | 1883/6533 [00:00<00:00, 9422.60it/s]

100%|██████████| 6533/6533 [00:00<00:00, 9425.96it/s]


Unnamed: 0,SurveyTowerOfHanoiResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,PuzzleWasSolved,StartDate,EndDate,Moves,datetime,trial_date,time
0,86e6ebd6-6cd4-ed11-aac6-0afb9334277d,6ee6ebd6-6cd4-ed11-aac6-0afb9334277d,5ae6ebd6-6cd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,True,2023-04-06T07:18:50-04:00,2023-04-06T07:19:06-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2023-04-06 07:19:06-04:00,2023-04-06,07:19:06
1,381fe5fc-6dd4-ed11-aac6-0afb9334277d,1f1fe5fc-6dd4-ed11-aac6-0afb9334277d,081fe5fc-6dd4-ed11-aac6-0afb9334277d,0151d9f1-1644-4437-805e-02f5e244a690,True,2023-04-06T07:27:07-04:00,2023-04-06T07:27:21-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2023-04-06 07:27:21-04:00,2023-04-06,07:27:21
2,3c696e9e-71d4-ed11-aac6-0afb9334277d,29696e9e-71d4-ed11-aac6-0afb9334277d,18696e9e-71d4-ed11-aac6-0afb9334277d,35d11ffc-7034-4708-a086-cd4bd47b51fd,True,2023-04-06T07:52:20-04:00,2023-04-06T07:53:22-04:00,"[{""Timestamp"":0.0,""DonorTowerIndex"":0,""Recipie...",2023-04-06 07:53:22-04:00,2023-04-06,07:53:22


The important data is in `Moves` where we have:
- `TapTimestamp`
- `TapIndex`
- `TapIncorrect`

I want to get the last `TapTimestamp` to calculate total timing

In [497]:
# Taps is a string of a list of dictionaries
df.Moves[0]

'[{"Timestamp":0.0,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":1.0491399765014648,"DonorTowerIndex":0,"RecipientTowerIndex":1},{"Timestamp":1.4823609590530396,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":2.0486660003662109,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":2.53186297416687,"DonorTowerIndex":1,"RecipientTowerIndex":0},{"Timestamp":2.9984489679336548,"DonorTowerIndex":1,"RecipientTowerIndex":2},{"Timestamp":3.5813790559768677,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":4.1143800020217896,"DonorTowerIndex":0,"RecipientTowerIndex":1},{"Timestamp":4.563789963722229,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":5.097709059715271,"DonorTowerIndex":2,"RecipientTowerIndex":0},{"Timestamp":5.5808260440826416,"DonorTowerIndex":1,"RecipientTowerIndex":0},{"Timestamp":6.0962109565734863,"DonorTowerIndex":2,"RecipientTowerIndex":1},{"Timestamp":6.5959550142288208,"DonorTowerIndex":0,"RecipientTowerIndex":2},{"Timestamp":7.0

In [498]:
# Can convert to list of dicts and then access an individual dict
data = json.loads(df.Moves[0])

# print last dict
print(data[-1])

# extract the time
data[-1]['Timestamp']

{'Timestamp': 15.773602962493896, 'DonorTowerIndex': 0, 'RecipientTowerIndex': 2}


15.773602962493896

In [499]:
# numbmer of moves
len(data)

31

In [500]:
# convert string Taps to list of dicts
df['MovesList'] = df['Moves'].apply(json.loads)

# test if any lists are empty...this means NO MOVES
drop_ix = []

for i in range(len(df.MovesList)):
    if df.MovesList[i]:
        x = df.MovesList[i][-1]['Timestamp']
    else:
        drop_ix.append(i)

In [501]:
# drop rows with empty move lists
df = df.drop(df.index[drop_ix]).reset_index(drop=True)

---

The important data is:
- `PuzzleWasSolved`
    - just to indicate completion
- Get total time required
    - `Timestamp` in last dictionary
- Get number of moves
    - 1 dict/move so get count of dicts)

In [502]:
# assign new columns
df = df.assign(task_hanoi_solved=lambda x: x.PuzzleWasSolved,
               task_hanoi_time=lambda x: x.MovesList.apply(lambda x: x[-1]['Timestamp']),
               task_hanoi_moves=[len(moves) for moves in df.MovesList] # maybe give this as a multiple on optimality (ideal = 1)?
              )

# keep relevant columns
toh_df = df[['ParticipantIdentifier', 'trial_date', 'task_hanoi_solved', 'task_hanoi_time', 'task_hanoi_moves']]

# add extra moves column
toh_df['task_hanoi_extraMoves'] = toh_df['task_hanoi_moves'] - 31
toh_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toh_df['task_hanoi_extraMoves'] = toh_df['task_hanoi_moves'] - 31


Unnamed: 0,ParticipantIdentifier,trial_date,task_hanoi_solved,task_hanoi_time,task_hanoi_moves,task_hanoi_extraMoves
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,True,15.773603,31,0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,True,13.860316,31,0


In [506]:
toh_df.iloc[:, 2:].mean()

task_hanoi_solved         0.936546
task_hanoi_time          43.112464
task_hanoi_moves         42.777189
task_hanoi_extraMoves    11.777189
dtype: float64

#### EDA

In [504]:
profile = ProfileReport(toh_df.iloc[:,2:], title=f"TOH Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_toh_run{run_num}.html")

Summarize dataset: 100%|██████████| 22/22 [00:00<00:00, 42.98it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.04it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 307.32it/s]


#### Clean

**RTs**

While our mean time to solve the puzzle was about 48 seconds (without removing outliers), we have extreme values of over 12,000 seconds.

We chose to use an ad-hoc method of removing trials where we cut out any trials that took longer than 10 minutes. This only removed 13 trials (out of 6333) in cohort 2.

---

We also have values of zero (95 in cohort 2). Most of these actually indicate that the puzzle was solved, which is of course impossible.

All rows with a `task_hanoi_time` of zero are removed.

---

In [505]:
# Remove all trials with zero time
toh_df = toh_df.loc[toh_df['task_hanoi_time'] >0]

# Remove all trials with more than 10 minutes time
toh_df = toh_df.loc[toh_df['task_hanoi_time'] <=600]

# Rerun EDA
profile = ProfileReport(toh_df.iloc[:,2:], title=f"TOH Task Run {run_num} - Clean | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_toh_run{run_num}_clean.html")

Summarize dataset: 100%|██████████| 22/22 [00:00<00:00, 41.82it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.11it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 912.60it/s]


### Reaction Time

This was **Not Working** for at least the first half of **Run 1**.

A new task has been created by RK Studio, called the **Normalized Reaction Time** task.

The data export format info is [here](https://support.mydatahelps.org/hc/en-us/articles/1500002230281-Normalized-Reaction-Time-Active-Task-Export-Format).

#### Load Data

In [587]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    
if 'rt_df' in globals():
    del(rt_df)
    print('deleted existing rt_df')

deleted existing df
deleted existing rt_df


In [588]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyNormalizedReactionTime')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:00<00:00, 217.67it/s]


#### Format Data

In [589]:
# check for na dates...
df.isna().sum()

SurveyNormalizedReactionTimeResultKey    0
SurveyStepResultKey                      0
SurveyResultKey                          0
ParticipantIdentifier                    0
ReactionDate                             0
StimulusStartDate                        0
TimerStartDate                           0
TimerEndDate                             0
CurrentInterval                          0
StartDate                                0
EndDate                                  0
dtype: int64

In [590]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [591]:
# Select only subjects in current run
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [592]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 26119/26119 [00:02<00:00, 9860.60it/s] 


In [593]:
df.head(2)

Unnamed: 0,SurveyNormalizedReactionTimeResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,ReactionDate,StimulusStartDate,TimerStartDate,TimerEndDate,CurrentInterval,StartDate,EndDate,datetime,trial_date,time
0,bdea09a1-6cd4-ed11-aac6-0afb9334277d,9eea09a1-6cd4-ed11-aac6-0afb9334277d,6aea09a1-6cd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,684795.3356290417,684794.692473375,684795.0018499583,684795.3357035833,2,2023-04-06T07:17:19-04:00,2023-04-06T07:17:19-04:00,2023-04-06 07:17:19-04:00,2023-04-06,07:17:19
1,c0ea09a1-6cd4-ed11-aac6-0afb9334277d,9eea09a1-6cd4-ed11-aac6-0afb9334277d,6aea09a1-6cd4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,684805.0794604167,684804.3868509585,684804.74665975,684805.0794899584,6,2023-04-06T07:17:29-04:00,2023-04-06T07:17:29-04:00,2023-04-06 07:17:29-04:00,2023-04-06,07:17:29


In [594]:
# replace commas with dots
df['ReactionDate'] = df['ReactionDate'].replace(',', '.', regex=True)
df.ReactionDate = df.ReactionDate.astype('float')

df['StimulusStartDate'] = df['StimulusStartDate'].replace(',', '.', regex=True)
df.StimulusStartDate = df.StimulusStartDate.astype('float')

In [595]:
df['task_rt_time'] = df.ReactionDate - df.StimulusStartDate

In [596]:
# keep relevant columns
rt_df = df[['ParticipantIdentifier', 'trial_date', 'task_rt_time']]
# If negative it was a missed/error trial
rt_df.loc[rt_df.task_rt_time <=0, 'task_rt_time'] = None

rt_df.head(10)

Unnamed: 0,ParticipantIdentifier,trial_date,task_rt_time
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,0.643156
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,0.692609
2,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,0.728588
3,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,0.793508
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,0.430718
5,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,0.483464
6,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,0.498956
7,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,0.431452
8,35d11ffc-7034-4708-a086-cd4bd47b51fd,2023-04-06,0.725344
9,35d11ffc-7034-4708-a086-cd4bd47b51fd,2023-04-06,0.552103


In [597]:
labels = [
    'task_rt_1',
    'task_rt_2',
    'task_rt_3',
    'task_rt_4',
]

In [598]:
oldSub = None
oldDay = None
i = 1
rt_df['label'] = None

for row in range(len(df)):
    sub = rt_df.loc[row,'ParticipantIdentifier']
    day = rt_df.loc[row, 'trial_date']
    if (sub == oldSub) & (day == oldDay):
        if i >3:
            continue
        else:
            rt_df.loc[row, 'label'] = labels[i]
            oldSub = sub
            oldDay = day
            i+=1
    else:
        rt_df.loc[row, 'label'] = labels[0]
        i = 1
        oldSub = sub
        oldDay = day
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_df['label'] = None


In [599]:
rt_df = rt_df.dropna(subset=['label'])

In [600]:
rt_df.duplicated(subset=['ParticipantIdentifier', 'trial_date', 'label']).sum()

758

In [601]:
rt_df = rt_df.drop_duplicates(subset=['ParticipantIdentifier', 'trial_date', 'label'], keep='last')

In [602]:
rt_df = rt_df.pivot(index=['ParticipantIdentifier', 'trial_date'], columns='label', values='task_rt_time').reset_index()

# Remove index name
rt_df = rt_df.rename_axis(None, axis=1)

rt_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_rt_1,task_rt_2,task_rt_3,task_rt_4
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,0.43023,0.500456,0.563285,0.448408
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,0.414063,0.766173,0.466758,0.431114
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,0.426607,0.429335,0.428868,0.445612
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,0.496612,0.447787,0.481698,0.43187
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,0.431316,0.481745,,


In [603]:
# Extract columns starting with "task_rt_"
task_rt_columns = [col for col in rt_df.columns if col.startswith("task_rt_")]

# Replace values in task_rt_columns that are below 0.1 with NaN
rt_df[task_rt_columns] = rt_df[task_rt_columns].applymap(lambda x: x if x >= 0.1 else float('nan'))

# Calculate 'task_rt_avgRT' column
rt_df['task_rt_avgRT'] = rt_df[task_rt_columns].mean(axis=1)

# Calculate 'task_rt_flag_plus2_n' column
rt_df['task_rt_flag_plus2_n'] = rt_df[task_rt_columns].apply(lambda row: sum(row > 2), axis=1)

#### EDA

In [604]:
profile = ProfileReport(rt_df.iloc[:,2:], title=f"RT Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_rt_run{run_num}.html")

Summarize dataset: 100%|██████████| 41/41 [00:12<00:00,  3.35it/s, Completed]                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 300.67it/s]


In [605]:
rt_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_rt_1,task_rt_2,task_rt_3,task_rt_4,task_rt_avgRT,task_rt_flag_plus2_n
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,0.43023,0.500456,0.563285,0.448408,0.485595,0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,0.414063,0.766173,0.466758,0.431114,0.519527,0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,0.426607,0.429335,0.428868,0.445612,0.432606,0
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,0.496612,0.447787,0.481698,0.43187,0.464492,0
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,0.431316,0.481745,,,0.45653,0


#### Clean

**RTs**

We have some very long RTs (many minutes long). Clearly people are distracted for these trials. 

We leave these in but create a column called `task_rt_flag_plus2_n` that indicates how many trials had rts greater than 2 seconds.

We also know from the work of Luce (1984) and Whelan (2008) that RTs below 100ms are not realistic, so we remove those values and replace with NaN

In [606]:
# Rerun EDA filtering out flagged trials
profile = ProfileReport(rt_df.loc[rt_df['task_rt_flag_plus2_n']==0, "task_rt_1":], title=f"RT Task Run {run_num} - Cleaned | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_rt_run{run_num}_clean.html")

Summarize dataset: 100%|██████████| 41/41 [00:01<00:00, 25.12it/s, Completed]                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 764.69it/s]


### Spatial Span Memory


[RK Studio Documentation](http://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#spatialhttp://researchkit.org/docs/docs/ActiveTasks/ActiveTasks.html#spatial)

> In the spatial memory task the user is asked to observe and then recall pattern sequences of increasing length in a game-like environment. The task collects data that can be used to assess visuospatial memory and executive function.

>The span (that is, the length of the pattern sequence) is automatically varied during the task, increasing after successful completion of a sequence, and decreasing after failures, in the range from minimumSpan to maximumSpan. The playSpeed property lets you control the speed of sequence playback, and the customTargetImage property lets you customize the shape of the tap target. The game finishes when either maxTests tests have been completed, or the user has made maxConsecutiveFailures errors in a row.

>The results collected are scores derived from the game, the details of the game, and the touch inputs made by the user.

#### Load Data

In [None]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')
    

deleted existing df


In [None]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveySpatialSpanMemoryResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:00<00:00, 65.48it/s]


#### Format Data

In [None]:
# check for na dates...
df.isna().sum()

SurveySpatialSpanMemoryResultKey    0
SurveyStepResultKey                 0
SurveyResultKey                     0
ParticipantIdentifier               0
Score                               0
NumberOfGames                       0
NumberOfFailures                    0
StartDate                           0
EndDate                             0
GameRecords                         0
dtype: int64

In [None]:
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

In [None]:
# select only subjects in run 1
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [None]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 6430/6430 [00:01<00:00, 4453.78it/s]


In [None]:
df.head(2)

Unnamed: 0,SurveySpatialSpanMemoryResultKey,SurveyStepResultKey,SurveyResultKey,ParticipantIdentifier,Score,NumberOfGames,NumberOfFailures,StartDate,EndDate,GameRecords,datetime,trial_date,time
0,89a2d274-6ed4-ed11-aac6-0afb9334277d,7aa2d274-6ed4-ed11-aac6-0afb9334277d,6ca2d274-6ed4-ed11-aac6-0afb9334277d,0151d9f1-1644-4437-805e-02f5e244a690,540,6,0,2023-04-06T07:29:46-04:00,2023-04-06T07:30:42-04:00,"[{""Seed"":980801876,""Sequence"":[8,7,1],""GameSiz...",2023-04-06 07:30:42-04:00,2023-04-06,07:30:42
1,532b549c-70d4-ed11-aac6-0afb9334277d,3c2b549c-70d4-ed11-aac6-0afb9334277d,2c2b549c-70d4-ed11-aac6-0afb9334277d,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,385,6,1,2023-04-06T07:45:19-04:00,2023-04-06T07:46:11-04:00,"[{""Seed"":1347940271,""Sequence"":[2,3,8],""GameSi...",2023-04-06 07:46:11-04:00,2023-04-06,07:46:11


---

To capture performance we are using:
- `Score` 

**NB** | Might be worth checking out exactly how this is calculated, but for our purposes it seems to be a good proxy of how well you actually do on the task (e.g. you get a better score if you fail on the last attempt (to get 8 in a row) then if you fail on the second attempt and only make it to 6 in a row...)

In [None]:
# assign new column with accuracy value
df = df.assign(task_spatialSpan_score=lambda x: x.Score,
               task_spatialSpan_time=lambda x: x.time
              )

# keep relevant columns
spatialSpan_df = df[['ParticipantIdentifier', 'trial_date', 'task_spatialSpan_score', 'task_spatialSpan_time']]
spatialSpan_df.head(2)

Unnamed: 0,ParticipantIdentifier,trial_date,task_spatialSpan_score,task_spatialSpan_time
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-04-06,540,07:30:42
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,2023-04-06,385,07:46:11
