# Setup

## Imports

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


## Set Paths

In [432]:
# set run
run_num = 2

if run_num ==1:
    # run 1
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_1/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_1/'
    subjects_run1 = pd.read_csv(path + '../run1_subjects.csv')
    subjects = subjects_run1.ParticipantIdentifier
elif run_num ==2:
    # run 2
    path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_1_raw_data/run_2/app_data/'
    save_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_2_processed_data/run_2/'
    subjects_run2 = pd.read_csv(path + '../run2_subjects.csv')
    subjects = subjects_run2.ParticipantIdentifier
    
eda_reports_path = '/Users/djw/Documents/pCloud_synced/Academics/Projects/2020_thesis/thesis_experiments/3_experiments/3_3_experience_sampling/3_3_4_outputs/EDA/'

# Active Data

## Import Data

In [433]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

deleted existing df


In [434]:
# loop through all days
days = [i for i in os.listdir(path) if i.startswith('RK')]
for day in tqdm(days):
    files = os.listdir(path + day)
    surveyQuestions = [i for i in files if i.startswith('SurveyQuestionResults')]
    # there should be only one
    for file in surveyQuestions:
        if 'df' not in globals():
            df = pd.read_csv(path + day + '/' + file)
        else:
            temp_df = pd.read_csv(path + day + '/' + file)
            df = pd.concat([df,temp_df], axis=0)

100%|██████████| 55/55 [00:05<00:00, 10.42it/s]


In [435]:
df.shape

(1124570, 8)

In [436]:
# select relevant columns
df = df[['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'EndDate']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate
0,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,current_situation1_tasks,in_public,2023-04-06T07:15:45-04:00
1,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,current_situation2_tasks,False,2023-04-06T07:15:46-04:00
2,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,task_motivation,10,2023-04-06T07:15:48-04:00
3,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,affect_neg_frustrated_am,1,2023-04-06T07:15:58-04:00
4,5599c2a7-88a5-4cde-9f2a-41b4bd03d660,affect_pos_relaxedCalm_am,4,2023-04-06T07:16:01-04:00


In [437]:
df.isna().sum()

ParticipantIdentifier     0
ResultIdentifier          0
Answers                  37
EndDate                   0
dtype: int64

In [438]:
# Remove rows without valid EndDate value
df = df.dropna(subset=['EndDate']).reset_index(drop=True)

# Select relevant subjects
df = df.loc[df.ParticipantIdentifier.isin(subjects)].reset_index(drop=True)

In [439]:
# add trial date and time columns
for i in tqdm(range(df.shape[0])):
    dt = parser.parse(df.loc[i, 'EndDate'])
    df.loc[i, 'datetime'] = dt
    df.loc[i, 'trial_date'] = (dt + datetime.timedelta(hours = -4.75)).date() # trial day associated with sample (4:45am is when the day flips)
    df.loc[i, 'time'] = dt.time()

100%|██████████| 1104412/1104412 [02:01<00:00, 9122.44it/s]


In [440]:
# save to csv
if run_num ==1:
    # run 1
    df.to_csv(save_path + 'run1_survey_results.csv', index=False)
if run_num ==2:
    # run 2
    df.to_csv(save_path + 'run2_survey_results.csv', index=False)

# Gap App

## Self Report

### Affect

#### Load Data

In [615]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_affect_wide' in globals():
    del(df_daily_affect_wide)
    print('deleted affect df')

deleted existing df
deleted affect df


In [616]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [617]:
df_affect = df.loc[df.ResultIdentifier.str.startswith('affect_')].reset_index(drop=True)
df_affect_am = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)
df_affect_pm = df.loc[(df.ResultIdentifier.str.startswith('affect_')) & (~df.ResultIdentifier.str.endswith('am'))].reset_index(drop=True)

#### Convert to Wide

In [618]:
df_affect_pm_wide = df_affect_pm.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_pm_wide = df_affect_pm_wide.rename_axis(None, axis=1)

df_affect_am_wide = df_affect_am.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                    columns='ResultIdentifier', 
                    values='Answers').reset_index()
# get rid of name on index
df_affect_am_wide = df_affect_am_wide.rename_axis(None, axis=1)

In [619]:
# join
df_daily_affect_wide = df_affect_pm_wide.merge(df_affect_am_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

df_daily_affect_wide.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_neg_sad_am,affect_neg_stressed_am,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,4.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1.0,3.0,3.0,3.0,3.0,1.0,1.0,2.0,...,1.0,2.0,1.0,3.0,1.0,3.0,3.0,3.0,3.0,2.0


#### EDA Profiling

In [61]:
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:], title=f"Affect Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"affect_report_run{run_num}.html")

Summarize dataset:   2%|▏         | 1/45 [00:00<00:01, 29.53it/s, Describe variable:affect_neg_ashamed]

Summarize dataset: 100%|██████████| 450/450 [00:21<00:00, 20.49it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 60.93it/s]


**NOTE**

There are a number of variables where the maximum values are well above 5, which is the maximum option that should be available. These glitch entries should be removed

#### Clean

<div class="alert alert-block alert-info">
<b>🧹 Process:</b><br>
For our cleaning process we do the following:<br><br>

<ol>
    <li>Remove values greater than 5, given that the Likert scale only went to 5</li>
    <br>
    <li>Look for zero variance <b>columns</b> (affective measures) and <b>rows</b> (days) for subjects</li></h5>
    <br>
    👉 If the variance was zero for three or more columns then the subject was flagged (`affect_zeroVarCols_flag = True`)<br>
    <br>
    👉 If the variance was zero for more than 10% of a subjects completed days then the subject was flagged (`affect_zeroVarRows_flag = True`)
</ol>
</div>

##### Impossible Values

In [620]:
# Number of instances where the cell value is out of range (greater than 5)
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

106

In [621]:
# Replace all values below threshold with NaN
df_daily_affect_wide.iloc[:,2:] = np.where(df_daily_affect_wide.iloc[:,2:]>5, np.nan, df_daily_affect_wide.iloc[:,2:])

In [622]:
# Recheck for instances above 5
df_daily_affect_wide.iloc[:,2:][df_daily_affect_wide.iloc[:,2:] > 5].count().sum()

0

In [474]:
# Rerun profiling
profile = ProfileReport(df_daily_affect_wide.iloc[:,2:],
                        title=f"Affect Run {run_num} | Pandas Profiling Report",
                        infer_dtypes = False)
profile.to_file(eda_reports_path + f"affect_report_clean_run{run_num}.html")

Summarize dataset: 100%|██████████| 1650/1650 [01:36<00:00, 17.13it/s, Completed]                                                         
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it]
Render HTML: 100%|██████████| 1/1 [00:12<00:00, 12.67s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 22.69it/s]


##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [623]:
# Drop 'trial_date' column
data = df_daily_affect_wide.drop(columns='trial_date', errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'variable', 'affect_neg_angry']]
result_df.columns = ['ParticipantIdentifier', 'ZeroVariance', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,ZeroVariance,Count
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_angry,60
1,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_ashamed,60
2,852c24f8-36d6-4bbd-b79f-7f6fe64d1275,affect_neg_ashamed,72
3,1081454a-03ff-445c-9602-ac9fe9e3e5cf,affect_neg_bored,60
4,739f2417-2416-4646-b108-e73bb870d326,affect_neg_bored,85
...,...,...,...
223,ec83dfe2-3df1-44dc-a1ef-3a199327c229,affect_pos_motivated_am,32
224,02f48bee-6e86-437c-9394-10ae57dadd14,affect_pos_relaxedCalm_am,79
225,4c1d752c-a092-433b-8a13-36a9677eeb1c,affect_pos_relaxedCalm_am,28
226,c57c38e4-e887-40d2-ab8b-8ae62f5dfaa8,affect_pos_relaxedCalm_am,82


Some subjects had mulitple categories without any variance.

In [624]:
zero_var_cols = result_df.ParticipantIdentifier.value_counts().reset_index(name='affect_zeroVar_cols')
# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
zero_var_cols = zero_var_cols.merge(df_count, on='ParticipantIdentifier', how='left')

# add flag
zero_var_cols['affect_zeroVarCols_flag'] = False
zero_var_cols.loc[zero_var_cols.affect_zeroVar_cols > 2, 'affect_zeroVarCols_flag'] = True
zero_var_cols

Unnamed: 0,ParticipantIdentifier,affect_zeroVar_cols,total_count,affect_zeroVarCols_flag
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,27,60,True
1,02f48bee-6e86-437c-9394-10ae57dadd14,20,79,True
2,852c24f8-36d6-4bbd-b79f-7f6fe64d1275,9,72,True
3,f8f71506-9382-40c7-99db-5c170b2a9abb,9,83,True
4,739f2417-2416-4646-b108-e73bb870d326,8,85,True
5,630ece82-994f-4aef-b2e3-46760583e453,8,81,True
6,fb6c8f5a-f92d-4af8-9f87-73ffd4e21f98,8,75,True
7,c2bfc053-7c57-4ec9-aa69-2fcba2aaba5d,8,66,True
8,e883a6d9-ec85-44eb-9366-9928c15fbe95,7,85,True
9,2ca5c7c8-3834-4c79-a416-ff7f9b9e8140,7,63,True


In [625]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(zero_var_cols.drop(columns=['total_count']), on='ParticipantIdentifier', how='left')

How many subjects had at least one column with no variance?

In [626]:
len(np.unique(result_df.ParticipantIdentifier))

51

Here is the subject who had 15 variables with no variance (run 1):

In [627]:
df_daily_affect_wide.loc[df_daily_affect_wide.ParticipantIdentifier == '27f7805e-5951-47b4-9f42-4c6200001cc6', :].dropna()

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_amused_am,affect_pos_appreciated_am,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag


But maybe variance within day is more important in terms of actually cleaning data. If someone enters the same value for every variable perhaps it is because they are not answering accurately and just trying to finish as quickly as possible.

In [628]:
# find rows with zero variance
idx = np.where(df_daily_affect_wide.drop(columns=['ParticipantIdentifier', 'trial_date']).var(axis=1) == 0)[0]
# calculate how many zero variance days per subject
df_zeroVar = df_daily_affect_wide.iloc[idx,:].groupby('ParticipantIdentifier').size().reset_index(name='zeroVar_count')

# Remove rows where both morning and evening surveys have NaN values
df_count = df_daily_affect_wide[~((df_daily_affect_wide['affect_neg_angry'].isnull()) & (df_daily_affect_wide['affect_neg_angry_am'].isnull()))]
# Calculate how many completed days
df_count =  df_count.groupby('ParticipantIdentifier').size().reset_index(name='total_count')

# merge
df_zeroVar = df_zeroVar.merge(df_count, on='ParticipantIdentifier', how='left')
df_zeroVar['affect_pct_zeroVarRows'] = (df_zeroVar.zeroVar_count / df_zeroVar.total_count) * 100

# add flag
df_zeroVar['affect_zeroVarRows_flag'] = False
df_zeroVar.loc[df_zeroVar.affect_pct_zeroVarRows > 10, 'affect_zeroVarRows_flag'] = True
df_zeroVar

Unnamed: 0,ParticipantIdentifier,zeroVar_count,total_count,affect_pct_zeroVarRows,affect_zeroVarRows_flag
0,3e1d1276-0e73-4457-9911-f189b0ed0778,4,85,4.705882,False
1,783dd47a-1180-4965-874e-eb405ee6143e,4,53,7.54717,False
2,8c58f5aa-f20e-4c39-a38d-f9c9a44a6cee,7,84,8.333333,False
3,c61e40df-fa64-4037-838e-65d912521dc2,5,10,50.0,True
4,f55d6d94-8602-46cb-b3bd-53ea561eb296,4,70,5.714286,False


In [629]:
# merge with main affect df
df_daily_affect_wide = df_daily_affect_wide.merge(df_zeroVar.drop(columns=['zeroVar_count', 'total_count']), on='ParticipantIdentifier', how='left')

In [630]:
df_daily_affect_wide.sample(5)

Unnamed: 0,ParticipantIdentifier,trial_date,affect_neg_angry,affect_neg_ashamed,affect_neg_bored,affect_neg_depressed,affect_neg_embarrassed,affect_neg_frustrated,affect_neg_guilty,affect_neg_lazy,...,affect_pos_excited_am,affect_pos_focused_am,affect_pos_happy_am,affect_pos_hopeful_am,affect_pos_motivated_am,affect_pos_relaxedCalm_am,affect_zeroVar_cols,affect_zeroVarCols_flag,affect_pct_zeroVarRows,affect_zeroVarRows_flag
843,0ff91d6a-e400-403c-bd87-4cd1803bc5e7,2023-04-22,3.0,1.0,4.0,5.0,1.0,4.0,2.0,3.0,...,,,,,,,,,,
4736,b50ef395-6d97-4314-b397-e5d755595dc2,2023-01-29,2.0,5.0,1.0,5.0,1.0,4.0,5.0,4.0,...,,,,,,,1.0,False,,
195,042d7595-3fdc-4cf9-b288-c4b7961916d8,2023-03-02,3.0,1.0,4.0,5.0,2.0,2.0,4.0,5.0,...,,,,,,,,,,
5566,dfef360e-27cb-4d35-bb4a-d6633803eb96,2023-04-14,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
764,0e14ee82-85d9-41c5-a27e-a0e9c4178117,2023-04-14,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,...,1.0,3.0,1.0,1.0,3.0,2.0,,,,


#### Save

In [631]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_affect_wide['trial_date'] = pd.to_datetime(df_daily_affect_wide['trial_date']).dt.date

# Join with affect df
df_daily_affect_wide = df_complete_idDate.merge(df_daily_affect_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [633]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_affect_wide.to_csv(save_path + 'run1_affect.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_affect_wide.to_csv(save_path + 'run2_affect.csv', index=False)

### Daily General and Detail

#### Load Data

In [714]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'df_daily_sr_wide' in globals():
    del(df_daily_sr_wide)
    print('deleted daily self report df')

deleted existing df
deleted daily self report df


In [715]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Select Data

In [716]:
past24_general = [
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past48to24_gap',
    'DAILY_past48to24_gapCause',
    'DAILY_past24_ideal',
    'DAILY_past24_satisfaction',
    'DAILY_past24_change',
    'DAILY_past24_productivity',
    'DAILY_past24_procrastination',
    'DAILY_past24_punctuality',
    'DAILY_past24_mentalEffort',
    'DAILY_past24_physicalEffort',
    'DAILY_past24_values',
    'DAILY_past24_gap',
    'DAILY_past24_gapCause',
    'DAILY_past24_illness',
    'DAILY_past24_fatigue',
    'DAILY_past24_unusualEvents'
]

past24_categories = [
    'DAILY_past24_sleep',
    'DAILY_past24_occupation',
    'DAILY_past24_nonoccupation',
    'DAILY_past24_exercise',
    'DAILY_past24_leisureSolo',
    'DAILY_past24_leisureSoloMental',
    'DAILY_past24_leisureSoloPhysical',
    'DAILY_past24_leisureNonSolo',
    'DAILY_past24_leisureNonSoloMental',
    'DAILY_past24_leisureNonSoloPhysical',
    'DAILY_past24_diet',
    'DAILY_past24_socialMedia',
    'DAILY_past24_drinks'
]

next24_categories = [
    'DAILY_next24_sleep',
    'DAILY_next24_occupation',
    'DAILY_next24_nonoccupation',
    'DAILY_next24_leisureSolo',
    'DAILY_next24_leisureNonSolo',
    'DAILY_next24_exercise',
    'DAILY_next24_socialMedia',
    'DAILY_next24_drinks',
    'DAILY_next24_diet'
]

monthly_goals = [
    'MONTHLY_ib_gap_change',
    'MONTHLY_ib_gap_change_app',
    'MONTHLY_goal_report1',
    'MONTHLY_goal_set1_importance',
    'MONTHLY_goal_set1_consequences',
    'MONTHLY_goal_set1_motivationInternal',
    'MONTHLY_goal_set1_motivationExternal',
    'MONTHLY_goal_set1_confidence',
    'MONTHLY_goal_set1_effort',
    'MONTHLY_goal_report2',
    'MONTHLY_goal_set2_importance',
    'MONTHLY_goal_set2_consequences',
    'MONTHLY_goal_set2_motivationInternal',
    'MONTHLY_goal_set2_motivationExternal',
    'MONTHLY_goal_set2_confidence',
    'MONTHLY_goal_set2_effort',
    'MONTHLY_goal_set2_interaction_eachOther'
]

monthly_ideals = [
    'IDEAL_weekday_sleep',
    'IDEAL_weekday_occupation',
    'IDEAL_weekday_nonoccupation',
    'IDEAL_weekday_leisureSolo',
    'IDEAL_weekday_leisureNonSolo',
    'IDEAL_weekday_exercise',
    'IDEAL_weekday_socialMedia',
    'IDEAL_weekday_drinks',
    'IDEAL_weekend_sleep',
    'IDEAL_weekend_occupation',
    'IDEAL_weekend_nonoccupation',
    'IDEAL_weekend_leisureSolo',
    'IDEAL_weekend_leisureNonSolo',
    'IDEAL_weekend_exercise',
    'IDEAL_weekend_socialMedia',
    'IDEAL_weekend_drinks'
]

if run_num == 1:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week',
        'DAILY_goal1_interaction_month',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week',
        'DAILY_goal2_interaction_month',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

elif run_num == 2:
    specific_goals = [
        'DAILY_goal1_report',
        'DAILY_goal1_importance',
        'DAILY_goal1_consequences',
        'DAILY_goal1_motivationInternal',
        'DAILY_goal1_motivationExternal',
        'DAILY_goal1_confidence',
        'DAILY_goal1_effort',
        'DAILY_goal1_interaction_week1',
        'DAILY_goal1_interaction_week2',
        'DAILY_goal1_interaction_month1',
        'DAILY_goal1_interaction_month2',
        'DAILY_goal2_report',
        'DAILY_goal2_importance',
        'DAILY_goal2_consequences',
        'DAILY_goal2_motivationInternal',
        'DAILY_goal2_motivationExternal',
        'DAILY_goal2_confidence',
        'DAILY_goal2_effort',
        'DAILY_goal2_interaction_week1',
        'DAILY_goal2_interaction_week2',
        'DAILY_goal2_interaction_month1',
        'DAILY_goal2_interaction_month2',
        'DAILY_goal2_interaction_eachOther'
    ]
    
    weekly_goals = [
        'WEEKLY_goal_report1',
        'WEEKLY_goal_set1_importance',
        'WEEKLY_goal_set1_consequences',
        'WEEKLY_goal_set1_motivationInternal',
        'WEEKLY_goal_set1_motivationExternal',
        'WEEKLY_goal_set1_confidence',
        'WEEKLY_goal_set1_effort',
        'WEEKLY_goal_set1_interaction_month1',
        'WEEKLY_goal_set1_interaction_month2',
        'WEEKLY_goal_report2',
        'WEEKLY_goal_set2_importance',
        'WEEKLY_goal_set2_consequences',
        'WEEKLY_goal_set2_motivationInternal',
        'WEEKLY_goal_set2_motivationExternal',
        'WEEKLY_goal_set2_confidence',
        'WEEKLY_goal_set2_effort',
        'WEEKLY_goal_set2_interaction_month1',
        'WEEKLY_goal_set2_interaction_month2',
        'WEEKLY_goal_set2_interaction_eachOther'
    ]

non_numeric_cols = [
    'IDEAL_values_monthly1',
    'IDEAL_values_monthly2',
    'IDEAL_values_monthly3',
    'DAILY_goal1_set',
    'DAILY_goal2_set',
    'WEEKLY_goal_set1',
    'WEEKLY_goal_set2',
    'MONTHLY_goal_set1',
    'MONTHLY_goal_set2',
    'DAILY_next24_diet',
    'DAILY_past48to24_gapCause',
    'DAILY_survey_situation1_surveys',
    'DAILY_survey_situation2_surveys',
    'DAILY_survey_missed',
    'DAILY_past24_gapCause',
    'ParticipantIdentifier',
    'trial_date'
]

In [717]:
df_daily_sr = df.loc[df.ResultIdentifier.isin(past24_general + 
                                              past24_categories + 
                                              next24_categories + 
                                              specific_goals +
                                              non_numeric_cols)].reset_index(drop=True)

#### Convert to Wide

In [718]:
# Convert values in 'Answers' column to numeric where possible, else leave as string
# df_daily_sr['Answers'] = pd.to_numeric(df_daily_sr['Answers'], errors='coerce').fillna(df_daily_sr['Answers'])

# Pivot the data
df_daily_sr_wide = df_daily_sr.pivot_table(index=["ParticipantIdentifier", "trial_date"],
                                           columns='ResultIdentifier',
                                           values='Answers',
                                           aggfunc=lambda x: ' '.join(map(str, x))).reset_index()
# get rid of name on index
df_daily_sr_wide = df_daily_sr_wide.rename_axis(None, axis=1)

In [719]:
# convert data to numeric where appropriate
df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols + ['ParticipantIdentifier', 'trial_date'])]] = df_daily_sr_wide.loc[:,df_daily_sr_wide.columns[~df_daily_sr_wide.columns.isin(non_numeric_cols)]].apply(pd.to_numeric, errors='coerce')

In [720]:
# Break gap cause into two columns
if run_num == 1:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

elif run_num == 2:
    df_daily_sr_wide[['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external', 'drop_col', 'drop_col']] = df_daily_sr_wide.DAILY_past24_gapCause.str.split("_", expand = True)
    df_daily_sr_wide.drop(columns='drop_col', inplace=True)

# convert to numeric 0-1
cols = ['DAILY_past24_gapCause_internal', 'DAILY_past24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

In [721]:
# REPEAT FOR MISSED DAY DATA
# Break gap cause into two columns
df_daily_sr_wide[['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']] = df_daily_sr_wide.DAILY_past48to24_gapCause.str.split("_", expand = True)

# convert to numeric 0-1
cols = ['DAILY_past48to24_gapCause_internal', 'DAILY_past48to24_gapCause_external']
df_daily_sr_wide[cols] = df_daily_sr_wide[cols].apply(pd.to_numeric, errors = 'coerce')
df_daily_sr_wide[cols] = df_daily_sr_wide[cols]/100

#### Category Gap Calculation

⚡ Make sure that we are not calculating gaps where there was no PREDICTION MADE

<div class="alert alert-block alert-info">
<b>📝 Note:</b><br>
    I am assuming that peoples' goals are directional in a way that MAY NOT BE ACCURATE for everyone.<br><br>
    For example, I am assuming that people want to sleep more and drink less - in other words they have a <b>gap</b> if they have <b>more</b> drinks than planned, but for <b>sleep</b> the gap calculation is reversed since we assume a gap means that you had <b>fewer</b> hours of sleep than planned.<br><br>
    While this may be accurate <i>in general</i> I would reasonably expect there to be exceptions.
</div>

In [722]:
# calculate diet gap (since it is originally a success measure)
df_daily_sr_wide['DAILY_gap_diet'] = 100 - df_daily_sr_wide.DAILY_past24_diet

In [723]:
# take the predicted amount from the day before and subtract the actual amount...
for i in range(df_daily_sr_wide.shape[0]-1):
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_sleep'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_sleep'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_sleep']
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_occupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_occupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_occupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_nonoccupation'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_nonoccupation'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_nonoccupation']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureSolo'] - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_leisureNonSolo'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_leisureNonSolo']  - df_daily_sr_wide.loc[i, 'DAILY_next24_leisureNonSolo'] # reversed
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_exercise'] =  df_daily_sr_wide.loc[i, 'DAILY_next24_exercise'] - df_daily_sr_wide.loc[i+1, 'DAILY_past24_exercise']    
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_socialMedia'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_socialMedia'] - df_daily_sr_wide.loc[i, 'DAILY_next24_socialMedia'] # reversed 
    df_daily_sr_wide.loc[i+1, 'DAILY_gap_drinks'] =  df_daily_sr_wide.loc[i+1, 'DAILY_past24_drinks'] - df_daily_sr_wide.loc[i, 'DAILY_next24_drinks'] # reversed
    

#### EDA Profiling

In [679]:
profile = ProfileReport(df_daily_sr_wide.iloc[:,2:], title=f"Daily Reports Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"daily_reports_run{run_num}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 5281/5281 [04:37<00:00, 19.03it/s, Completed]                                                                             
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.49s/it]
Render HTML: 100%|██████████| 1/1 [00:45<00:00, 45.73s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  1.94it/s]


#### Clean

##### Variance

We can see that there are a number of participants who have no variance in a given category.

This is obviously a problem for some analyses...

In [724]:
# check for variance in numeric cols only, leaving ParticipantIdentifier for grouping
non_numeric_cols_alt = [item for item in non_numeric_cols if item != 'ParticipantIdentifier']

In [725]:
# Drop 'trial_date' column
data = df_daily_sr_wide.drop(columns=non_numeric_cols_alt, errors='ignore')

# Group by 'ParticipantIdentifier' and compute the variance
grouped_variance = data.groupby('ParticipantIdentifier').var()

# Filter the grouped_variance dataframe to only include columns with 0 variance for any participant
zero_variance_df = grouped_variance[grouped_variance == 0].dropna(how='all')

# Melt the dataframe to have ParticipantIdentifier, Column with 0 variance
melted_zero_variance_df = zero_variance_df.reset_index().melt(id_vars=['ParticipantIdentifier'], value_name='Variance')
final_zero_variance_df = melted_zero_variance_df.dropna(subset=['Variance']).drop(columns='Variance')
final_zero_variance_df = pd.DataFrame(final_zero_variance_df.groupby('ParticipantIdentifier').count()).reset_index().sort_values(by='variable', ascending=False)
final_zero_variance_df = final_zero_variance_df.rename(columns={'variable': 'ZeroVariance'})

# Count the number of values present in the variable column for each participant
value_counts = data.groupby('ParticipantIdentifier').count()

# Merge the value counts with the final_zero_variance_df
merged_df = final_zero_variance_df.merge(value_counts, on='ParticipantIdentifier', how='left')

# Extract only the relevant columns
result_df = merged_df[['ParticipantIdentifier', 'ZeroVariance', 'DAILY_goal1_confidence']]
result_df.columns = ['ParticipantIdentifier', 'dailySR_zeroVar_cols', 'Count']

result_df

Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,16,60
1,3e1d1276-0e73-4457-9911-f189b0ed0778,9,85
2,39ecc6ed-1f7e-4d59-8442-371fd16efb08,8,28
3,06af7782-cd70-4938-8e67-b6d98b34b665,5,75
4,fee5cd07-329a-4f07-bb1a-913dfa09e3b4,4,83
...,...,...,...
70,5350441c-7181-463e-9165-5611b5bcab10,1,85
71,aa94f196-94ac-4e0e-b66d-5e2c06f717b7,1,36
72,4e465685-8d64-4b22-8b6c-9409f9eb3c02,1,68
73,4f4440e7-3a38-4fa7-9271-9730806e441a,1,85


In [726]:
# we can see that drinks have the most people with zero variance...
pd.DataFrame(grouped_variance[grouped_variance == 0].dropna(how='all').eq(0).sum()).reset_index(names='Variable').sort_values(by=0, ascending=False).reset_index(drop=True)

Unnamed: 0,Variable,0
0,DAILY_past24_drinks,53
1,DAILY_gap_drinks,15
2,DAILY_next24_drinks,15
3,DAILY_next24_sleep,8
4,DAILY_next24_socialMedia,8
...,...,...
66,DAILY_past24_occupation,0
67,DAILY_past24_physicalEffort,0
68,DAILY_past24_procrastination,0
69,DAILY_past24_change,0


In [727]:
# add flag for subjects with more than 5 zero variance columns
flag_threshold = 5

result_df['dailySR_zeroVarCols_flag'] = False
result_df.loc[result_df.dailySR_zeroVar_cols > flag_threshold, 'dailySR_zeroVarCols_flag'] = True
result_df.sort_values(by='dailySR_zeroVar_cols', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['dailySR_zeroVarCols_flag'] = False


Unnamed: 0,ParticipantIdentifier,dailySR_zeroVar_cols,Count,dailySR_zeroVarCols_flag
0,1081454a-03ff-445c-9602-ac9fe9e3e5cf,16,60,True
1,3e1d1276-0e73-4457-9911-f189b0ed0778,9,85,True
2,39ecc6ed-1f7e-4d59-8442-371fd16efb08,8,28,True
3,06af7782-cd70-4938-8e67-b6d98b34b665,5,75,False
4,fee5cd07-329a-4f07-bb1a-913dfa09e3b4,4,83,False
...,...,...,...,...
48,fc490430-6a41-4853-a2cf-ae0b15265cb6,1,65,False
47,042d7595-3fdc-4cf9-b288-c4b7961916d8,1,42,False
46,9d5c19e4-da3f-4760-b769-0ed24d80c917,1,84,False
45,98f7c7df-3bbf-44bf-99be-e2995f557e91,1,82,False


In [728]:
# look at subject with most zero var columns
df_daily_sr_wide.loc[(df_daily_sr_wide['ParticipantIdentifier'] == '27329533-d0a4-4605-9da5-0eb857154cae') & (df_daily_sr_wide['DAILY_goal1_confidence'].notna())]

Unnamed: 0,ParticipantIdentifier,trial_date,DAILY_goal1_confidence,DAILY_goal1_consequences,DAILY_goal1_effort,DAILY_goal1_importance,DAILY_goal1_interaction_month1,DAILY_goal1_interaction_month2,DAILY_goal1_interaction_week1,DAILY_goal1_interaction_week2,...,DAILY_past48to24_gapCause_external,DAILY_gap_diet,DAILY_gap_sleep,DAILY_gap_occupation,DAILY_gap_nonoccupation,DAILY_gap_leisureSolo,DAILY_gap_leisureNonSolo,DAILY_gap_exercise,DAILY_gap_socialMedia,DAILY_gap_drinks


In [729]:
# merge with main sr df
df_daily_sr_wide = df_daily_sr_wide.merge(result_df.drop(columns=['Count']), on='ParticipantIdentifier', how='left')

In [730]:
# Add sr prefix
# df_daily_sr_wide.columns[2:]

df_daily_sr_wide.columns = ['ParticipantIdentifier', 'trial_date'] + ['sr_' + col for col in df_daily_sr_wide.columns[2:]]

#### Save

In [731]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
df_daily_sr_wide['trial_date'] = pd.to_datetime(df_daily_sr_wide['trial_date']).dt.date

# Join with affect df
df_daily_sr_wide = df_complete_idDate.merge(df_daily_sr_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [732]:
# save to csv
if run_num ==1:
    # run 1
    df_daily_sr_wide.to_csv(save_path + 'run1_selfReport.csv', index=False)
if run_num ==2:
    # run 2
    df_daily_sr_wide.to_csv(save_path + 'run2_selfReport.csv', index=False)

### Daily/Weekly Social Support

**NOTE**: These were only collected in run 2

Weekly support is a 12 item scale scored on a 5 point Likert-scale (0-4).
It is based on the The [Interpersonal Support Evaluation List](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/social-support.html), using the [ISEL-12 version](https://www.cmu.edu/common-cold-project/measures-by-study/psychological-and-social-constructs/social-relationships-loneliness-measures/isel_12_item.pdf).

Daily social support is a custom measure developed by [Leo Huang](https://www.leohuangneuro.com/about-me), [Cendri Hutcherson](https://www.linkedin.com/in/cendri-hutcherson-3327a161/?originalSubdomain=ca), and [Daniel J Wilson](https://github.com/danieljwilson).

The items fall under 3 categories 👇

![Example Image](../../3_3_6_inputs/images/ss_daily_items.jpg)

In [756]:
ss_df = df.loc[df['ResultIdentifier'].str.startswith('ss_weekly_')].reset_index(drop=True)

In [758]:
ss_df.to_csv('ss_df.csv', index=False)

In [762]:
ss_df.iloc[0, 2]

'4'

In [759]:
ss_df.head(14)

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate,datetime,trial_date,time
0,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_1,4,2023-01-29T19:56:52-05:00,2023-01-29 19:56:52-05:00,2023-01-29,19:56:52
1,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_2,1,2023-01-29T19:56:57-05:00,2023-01-29 19:56:57-05:00,2023-01-29,19:56:57
2,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_3,4,2023-01-29T19:56:59-05:00,2023-01-29 19:56:59-05:00,2023-01-29,19:56:59
3,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_4,4,2023-01-29T19:57:02-05:00,2023-01-29 19:57:02-05:00,2023-01-29,19:57:02
4,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_5,3,2023-01-29T19:57:09-05:00,2023-01-29 19:57:09-05:00,2023-01-29,19:57:09
5,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_6,4,2023-01-29T19:57:11-05:00,2023-01-29 19:57:11-05:00,2023-01-29,19:57:11
6,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_7,1,2023-01-29T19:57:14-05:00,2023-01-29 19:57:14-05:00,2023-01-29,19:57:14
7,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_8,2,2023-01-29T19:57:19-05:00,2023-01-29 19:57:19-05:00,2023-01-29,19:57:19
8,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_9,4,2023-01-29T19:57:22-05:00,2023-01-29 19:57:22-05:00,2023-01-29,19:57:22
9,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_10,4,2023-01-29T19:57:24-05:00,2023-01-29 19:57:24-05:00,2023-01-29,19:57:24


In [403]:
reverse_items = [
    '_1', '_4', '_5', '_7', 10, 12
    
]
ss_df.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,EndDate,datetime,trial_date,time
35850,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_1,4,2023-01-29T19:56:52-05:00,2023-01-29 19:56:52-05:00,2023-01-29,19:56:52
35856,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_2,1,2023-01-29T19:56:57-05:00,2023-01-29 19:56:57-05:00,2023-01-29,19:56:57
35861,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_3,4,2023-01-29T19:56:59-05:00,2023-01-29 19:56:59-05:00,2023-01-29,19:56:59
35865,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_4,4,2023-01-29T19:57:02-05:00,2023-01-29 19:57:02-05:00,2023-01-29,19:57:02
35873,ffe41956-4096-4732-8cfb-8192829bd482,ss_weekly_5,3,2023-01-29T19:57:09-05:00,2023-01-29 19:57:09-05:00,2023-01-29,19:57:09


In [764]:
# Make Answers numeric
ss_df['Answers'] = pd.to_numeric(ss_df['Answers'], errors='coerce')

# Reverse score the specified items
reverse_items = ['ss_weekly_1', 'ss_weekly_4', 'ss_weekly_5', 'ss_weekly_7', 'ss_weekly_10', 'ss_weekly_12']
ss_df.loc[ss_df['ResultIdentifier'].isin(reverse_items) & (ss_df['Answers'] != 0), 'Answers'] = 5 - ss_df['Answers']

In [770]:
# Calculate the total score for each participant
total_scores = ss_df.groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
total_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_totalScore'}, inplace=True)

In [773]:
# Calculate subscale scores
appraisal_items = ['ss_weekly_1', 'ss_weekly_2', 'ss_weekly_3', 'ss_weekly_4']
belonging_items = ['ss_weekly_5', 'ss_weekly_6', 'ss_weekly_7', 'ss_weekly_8']
tangible_items = ['ss_weekly_9', 'ss_weekly_10', 'ss_weekly_11', 'ss_weekly_12']

appraisal_scores = ss_df[ss_df['ResultIdentifier'].isin(appraisal_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
appraisal_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_appraisal'}, inplace=True)

belonging_scores = ss_df[ss_df['ResultIdentifier'].isin(belonging_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
belonging_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_belonging'}, inplace=True)

tangible_scores = ss_df[ss_df['ResultIdentifier'].isin(tangible_items)].groupby(['ParticipantIdentifier', 'trial_date'])['Answers'].sum().reset_index()
tangible_scores.rename(columns={'Answers': 'ss_weekly_ISEL12_tangible'}, inplace=True)

In [None]:
# Merge the scores into one dataframe
merged_scores = total_scores.merge(appraisal_scores, on='ParticipantIdentifier').merge(belonging_scores, on='ParticipantIdentifier').merge(tangible_scores, on='ParticipantIdentifier')

# Extract trial date for each participant
trial_dates = ss_df.groupby('ParticipantIdentifier')['trial_date'].first().reset_index()

# Merge trial date into the final dataframe
final_df = merged_scores.merge(trial_dates, on='ParticipantIdentifier')

# Keep only the required columns
final_df = final_df[['ParticipantIdentifier', 'trial_date', 'ss_weekly_ISEL12_totalScore', 'ss_weekly_ISEL12_appraisal', 'ss_weekly_ISEL12_belonging', 'ss_weekly_ISEL12_tangible']]

final_df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,ss_weekly_ISEL12_totalScore
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,28
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,21
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-12,26
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-19,29
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-26,25


## Custom Tasks


### Food Task

##### Load Data

In [733]:
# erase df if it already exists
if 'df' in globals():
    del(df)
    print('deleted existing df')

if 'food_df' in globals():
    del(food_df)
    print('deleted daily food_df')

deleted existing df
deleted daily food_df


In [734]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

##### Select Data

In [735]:
food_df = df.loc[df['ResultIdentifier'].str.startswith('rating_')].reset_index(drop=True)
hunger =  df.loc[df['ResultIdentifier']=='Hunger_Screen'].reset_index(drop=True)
hunger.rename(columns={"Answers": "task_food_hunger_level"}, inplace=True)

#### Convert to Wide

In [736]:
# Convert the dataframe from long to wide format
food_df_wide = food_df.pivot_table(index=["ParticipantIdentifier", "trial_date"], 
                              columns='ResultIdentifier', 
                              values='Answers', 
                              aggfunc='first').reset_index()

# get rid of name on index
food_df_wide = food_df_wide.rename_axis(None, axis=1)

In [737]:
import json

# Filter out columns that start with "rating_"
rating_columns = [col for col in food_df_wide.columns if col.startswith('rating_')]

# Define a function to extract required information from the 'name' value
def extract_name_info(json_data):
    # Check if 'name' key exists in json_data and is of type string
    name_str = json_data.get('name', "")
    
    if not isinstance(name_str, str):
        return None, None

    # Extract task_food_item
    item_start = name_str.rfind('_') + 1
    item_end = name_str.rfind('.jpg')
    task_food_item = name_str[item_start:item_end] if item_start != -1 and item_end != -1 else None
    
    # Extract task_food_category
    category_start = name_str.rfind('/') + 1
    category_end = name_str.rfind('_')
    task_food_category = name_str[category_start:category_end] if category_start != -1 and category_end != -1 else None
    
    return task_food_item, task_food_category

# Modify the function to handle potential strings in columns
def parse_json(entry):
    try:
        return json.loads(entry)
    except (TypeError, json.JSONDecodeError):
        return {}

# Re-parse the JSON strings using the modified function
for col in rating_columns:
    food_df_wide[col] = food_df_wide[col].apply(parse_json)

# Re-extract the required values using the modified function
for col in rating_columns:
    # Extract required information
    food_df_wide[col + '_rating'] = food_df_wide[col].apply(lambda x: x.get('rating', None))
    food_df_wide[col + '_rt'] = food_df_wide[col].apply(lambda x: x.get('reactionTime', None))
    food_df_wide[col + '_item'], food_df_wide[col + '_category'] = zip(*food_df_wide[col].apply(extract_name_info))

# Drop the original "rating_" columns as they are not needed anymore
food_df_wide = food_df_wide.drop(columns=rating_columns)

# Convert all 'task_food_rating' columns to dtype int
rating_cols_to_convert = [col for col in food_df_wide.columns if '_rating' in col]

for col in rating_cols_to_convert:
    food_df_wide[col] = food_df_wide[col].astype(np.int64)  # Using 'Int64' to handle potential NaN values

# Rename columns that start with 'rating' to start with 'task_food'
food_df_wide.columns = ['task_food' + col[len('rating'):] if col.startswith('rating') else col for col in food_df_wide.columns]

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_food_1_rating,task_food_1_rt,task_food_1_item,task_food_1_category,task_food_10_rating,task_food_10_rt,task_food_10_item,task_food_10_category,...,task_food_7_item,task_food_7_category,task_food_8_rating,task_food_8_rt,task_food_8_item,task_food_8_category,task_food_9_rating,task_food_9_rt,task_food_9_item,task_food_9_category
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,-5,6712,carrotSalad,hu,3,2451,breakfastTacos,ht,...,grapeNuts,hu,-4,2477,pancakes,ut,1,2615,hardBoiledEgg,hu
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2,3104,bagel,ht,-1,2682,pizza,ut,...,eggMcMuffin,ut,-3,1964,cocaCola,ut,1,2298,viennoisChocolat,ut
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1,3448,fruitSalad,ht,-4,3877,ovaltine,hu,...,chilaquiles,ut,1,4292,hasbrowns,ut,0,7124,omelette2,ht
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,1,3462,crepe2,ut,-2,2375,fruitSalad,ht,...,croissant,ut,-3,2475,fruitLoops,ut,-5,1825,ovaltine,hu
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,-5,2245,sardines,hu,-4,1776,Kimchi,hu,...,pancakes,ut,-1,2327,fruitSalad,ht,-2,2176,shreddedWheat,hu


In [738]:
# add hunger value
food_df_wide = food_df_wide.merge(hunger[['ParticipantIdentifier', 'trial_date', 'task_food_hunger_level']],
                                  on=['ParticipantIdentifier', 'trial_date'],
                                  how='left')

# Convert hunger to int
food_df_wide['task_food_hunger_level'] = food_df_wide['task_food_hunger_level'].astype(np.float32)

food_df_wide.head()

Unnamed: 0,ParticipantIdentifier,trial_date,task_food_1_rating,task_food_1_rt,task_food_1_item,task_food_1_category,task_food_10_rating,task_food_10_rt,task_food_10_item,task_food_10_category,...,task_food_7_category,task_food_8_rating,task_food_8_rt,task_food_8_item,task_food_8_category,task_food_9_rating,task_food_9_rt,task_food_9_item,task_food_9_category,task_food_hunger_level
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-30,-5,6712,carrotSalad,hu,3,2451,breakfastTacos,ht,...,hu,-4,2477,pancakes,ut,1,2615,hardBoiledEgg,hu,3.0
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-01-31,2,3104,bagel,ht,-1,2682,pizza,ut,...,ut,-3,1964,cocaCola,ut,1,2298,viennoisChocolat,ut,3.0
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,1,3448,fruitSalad,ht,-4,3877,ovaltine,hu,...,ut,1,4292,hasbrowns,ut,0,7124,omelette2,ht,2.0
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,1,3462,crepe2,ut,-2,2375,fruitSalad,ht,...,ut,-3,2475,fruitLoops,ut,-5,1825,ovaltine,hu,0.0
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,-5,2245,sardines,hu,-4,1776,Kimchi,hu,...,ut,-1,2327,fruitSalad,ht,-2,2176,shreddedWheat,hu,0.0


#### EDA

In [739]:
profile = ProfileReport(food_df_wide.iloc[:,2:], title=f"Food Task Run {run_num} | Pandas Profiling Report")
profile.to_file(eda_reports_path + f"task_food_run{run_num}.html")

Summarize dataset: 100%|██████████| 1032/1032 [00:47<00:00, 21.55it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.50s/it]
Render HTML: 100%|██████████| 1/1 [00:07<00:00,  7.85s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 17.94it/s]


#### Clean

We had numerous outlier RT values.

We used the $z$ transform method to flag outliers (in the column `task_food_rt_flag`) based on Berger and Kiefer ([2021](https://doi.org/10.3389/fpsyg.2021.675558)) where they tested multiple methods of removing outliers from rt data.

We also set the flag threshold to 3.

In [740]:
# clean outlier rt values

from scipy.stats import zscore

# Step 1: Filter out rt cols
rt_columns = [col for col in food_df_wide.columns if col.endswith('_rt')]

# Step 2: Compute the z-scores for these columns
for col in rt_columns:
    z_col_name = col + '_z'
    food_df_wide[z_col_name] = zscore(food_df_wide[col], nan_policy='omit')

# Step 3: Check each row for values above a threshold in the '_z' columns
threshold = 3  # Define a threshold value
z_columns = [col + '_z' for col in rt_columns]
food_df_wide['task_food_rt_flag'] = food_df_wide[z_columns].apply(lambda row: any(abs(val) > threshold for val in row), axis=1)

z_columns_to_drop = [col for col in food_df_wide.columns if col.endswith('_z')]
wide_df = food_df_wide.drop(columns=z_columns_to_drop)


#### Save

In [741]:
# Create df with one participant ID for every trial_date

# Create a series of dates from '2022-09-27' to '2022-12-20'
if run_num == 1:
    date_series = pd.date_range(start='2022-09-27', end='2022-12-20')
elif run_num == 2:
    date_series = pd.date_range(start='2023-01-30', end='2023-04-24')

ids_series = subjects

# Create a dataframe using a cartesian product of the two series
df_complete_idDate = pd.DataFrame({
    'ParticipantIdentifier': np.repeat(ids_series, len(date_series)),
    'trial_date': date_series.tolist() * len(ids_series)
}).reset_index(drop=True)

# Convert trial_date to datetime.date
df_complete_idDate['trial_date'] = pd.to_datetime(df_complete_idDate['trial_date']).dt.date
food_df_wide['trial_date'] = pd.to_datetime(food_df_wide['trial_date']).dt.date

# Join with affect df
food_df_wide = df_complete_idDate.merge(food_df_wide, how='left', on=['ParticipantIdentifier', 'trial_date'])

In [742]:
# save to csv
if run_num == 1:
    # run 1
    print('Task was not part of run 1...')
if run_num == 2:
    # run 2
    food_df_wide.to_csv(save_path + 'run2_task_food.csv', index=False)

### N-Back

The n-back sequence was created as follows (where `n` indicates whether it is 2-back or 3-back)

```javascript
function constructSequence(n) {
    const ls = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let number = 0;
    let char = "";
    const alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
    const sequence = []
    for (let i = 0; i < SEQ_LEN; i++) {
        number = ls[Math.floor(Math.random() * ls.length)];
        if (i >= n && number <= 2) {
            char = sequence[i - n];
            //console.log("in if ===>", char, sequence, i, n)
        } else {
            char = alphabet[Math.floor(Math.random() * alphabet.length)];
            //console.log("in else ==>", char)
        }
        sequence.push(char)

    }
    return sequence;
}
```

Given that `ls` has a length of 11 this means that on average there is a 3/11 chance of having a match (for positions 3)


In [171]:
import json

#### Load Data

In [177]:
if run_num ==1:
    # run 1
    df = pd.read_csv(save_path + 'run1_survey_results.csv')
if run_num ==2:
    # run 2
    df = pd.read_csv(save_path + 'run2_survey_results.csv')

#### Format Data

In [426]:
nback_df = df.loc[df.ResultIdentifier == 'task_custom_nBack_results'].reset_index(drop=True)

In [419]:
# we only need this as bids were being overwritten when the bid was NOT accepted with a 0 bid...
# this was fixed on Feb 7, but using the actual bid value will also continue to work...

nback_df_bids = df.loc[df.ResultIdentifier == 'task_custom_nBack_diffSelect'].reset_index(drop=True)

In [428]:
nback_df.Answers[5]

'{"bid":70,"randomNumber":74,"correctness":0.9545454545454546,"earnings":74,"mode":"hard","matched":4,"missed":0,"sequence":["T","F","A","O","K","A","C","G","R","Z","E","T","R","Y","T","S","Z","R","G","Z","B","G","A","W","B"],"falseAlarm":1,"indexOfMatchClicked":[6,15,18,20,22]}'

In [429]:
# Parse json to create columns
for i in range(nback_df.shape[0]):
    nback_df.loc[i, 'task_nback_bid'] = json.loads(nback_df_bids.Answers[i])['bid']
    nback_df.loc[i, 'task_nback_rndNum'] = json.loads(nback_df.Answers[i])['randomNumber']    
    nback_df.loc[i, 'task_nback_mode'] = json.loads(nback_df.Answers[i])['mode']
    nback_df.loc[i, 'task_nback_matched'] = json.loads(nback_df.Answers[i])['matched']    
    nback_df.loc[i, 'task_nback_missed'] = json.loads(nback_df.Answers[i])['missed']    
    nback_df.loc[i, 'task_nback_falseAlarm'] = json.loads(nback_df.Answers[i])['falseAlarm']
    nback_df.loc[i, 'task_nback_trialCount'] = len(json.loads(nback_df.Answers[i])['sequence'])

In [430]:
nback_df = nback_df.drop(['ResultIdentifier', 'Answers', 'EndDate', 'datetime'], axis=1)
nback_df = nback_df.rename(columns={"time": "task_nback_time"})

#### Add Features

Adding the following metrics for Binary Classification:

1. **Accuracy**: 
   The proportion of correctly predicted classifications in the total predictions made.
   $$
   \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
   $$

2. **Precision** (or Positive Predictive Value):
   The proportion of positive identifications that were actually correct.
   $$
   \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
   $$

3. **Recall** (or Sensitivity or True Positive Rate):
   The proportion of actual positives that were identified correctly.
   $$
   \text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
   $$

4. **Specificity** (or True Negative Rate):
   The proportion of actual negatives that were identified correctly.
   $$
   \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}}
   $$

5. **False Alarm Rate** (or Fall-Out):
   The proportion of actual negatives that were incorrectly classified as positive.
   $$
   \text{False Alarm Rate} = \frac{\text{FP}}{\text{TN} + \text{FP}}
   $$

6. **F1 Score**:
   The harmonic mean of precision and recall, giving a balance between the two.
   $$
   \text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
   $$

7. **Matthews Correlation Coefficient (MCC)**:
   A metric that takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
   $$
   \text{MCC} = \frac{\text{TP} \times \text{TN} - \text{FP} \times \text{FN}}{\sqrt{(\text{TP} + \text{FP})(\text{TP} + \text{FN})(\text{TN} + \text{FP})(\text{TN} + \text{FN})}}
   $$

8. **Bias (C or criterion)**:
   A metric from signal detection theory that indicates the participant's response bias. A positive value indicates a bias toward saying "no" (


In [221]:
truePos = nback_df.task_nback_matched
trueNeg = nback_df.task_nback_trialCount - nback_df.task_nback_matched - nback_df.task_nback_missed - nback_df.task_nback_falseAlarm
falsePos = nback_df.task_nback_falseAlarm
falseNeg = nback_df.task_nback_missed

# proportion of correct classifications in total predictions made
nback_df['task_nback_accuracy'] = (truePos + trueNeg) / (truePos + trueNeg + falsePos + falseNeg)
# positive predictive value (hit rate)
nback_df['task_nback_precision'] = truePos / (truePos + falsePos)
# true positive rate (sensitivity)
nback_df['task_nback_recall'] = truePos / (truePos + falseNeg)
# true negative rate (false_alarm_rate)
nback_df['task_nback_specificity'] = trueNeg / (trueNeg + falsePos)
# Proportion of times the participant incorrectly indicates an n-back match when there wasn't one.
nback_df['task_nback_falseAlarmRate'] = falsePos / (trueNeg + falsePos)
# Harmonic mean of precision and recall, giving a balance between the two
nback_df['task_nback_F1'] = 2 * ((nback_df['task_nback_precision'] * nback_df['task_nback_recall'])/(nback_df['task_nback_precision'] + nback_df['task_nback_recall']))
# Matthews Correlation Coefficient (MCC):
# It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes.
nback_df['task_nback_MCC'] = ((truePos * trueNeg) - (falsePos * falseNeg)) / (np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [227]:
# metric from signal detection theory
# indicates the participant's response bias. 
# A positive value indicates a bias toward saying "no" (conservative)
# A negative value indicates a bias toward saying "yes" (liberal).

from scipy.stats import norm

def calculate_criterion(hit_rate, false_alarm_rate):
    # Calculate the Z scores for the hit rate and false alarm rate
    z_hit = norm.ppf(hit_rate)
    z_fa = norm.ppf(false_alarm_rate)
    
    # Calculate the criterion C
    C = -0.5 * (z_hit + z_fa)
    
    return C

nback_df['task_nback_bias'] = calculate_criterion(nback_df['task_nback_precision'], nback_df['task_nback_falseAlarmRate'])

  C = -0.5 * (z_hit + z_fa)


#### EDA Profiling

In [228]:
profile = ProfileReport(nback_df.iloc[:,3:], title="n-Back Task | Pandas Profiling Report")
profile.to_file(eda_reports_path + "nback_report.html")

  x = asanyarray(arr - arrmean)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  x = asanyarray(arr - arrmean)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')
Summarize dataset: 100%|██████████| 194/194 [00:08<00:00, 23.41it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 48.07it/s]


**NOTES**

Looking at the data it is clear that something wonky went on in some trials where people have matched values up to 67, and missed values of -56. 

Can calculate a super low probability number of matches and delete trials with any values above that - as well as any trials with negative "missed" values.

False alarm also has a max of 136

In [220]:
np.where(np.sqrt((truePos + falsePos)*(truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg)).isna())

  result = getattr(ufunc, method)(*inputs, **kwargs)


(array([  17,   90,  100,  143,  306,  363,  424,  704,  705,  720,  735,
         932,  965, 1177, 1180, 1310, 1335, 1352, 1505, 1528, 1742, 1747,
        1865, 1946, 1962, 1998, 2134, 2177, 2239, 2412, 2492, 2530, 2914,
        3089, 3153, 3337, 3611, 3739, 3838, 3978, 4033]),)

In [225]:
nback_df.iloc[17]

ParticipantIdentifier        76acef3c-d659-4fdd-b258-3668a1597584
trial_date                                             2022-10-24
task_nback_time                                          11:15:02
task_nback_bid                                               65.0
task_nback_rndNum                                            80.0
task_nback_mode                                              hard
task_nback_matched                                           48.0
task_nback_missed                                           -42.0
task_nback_falseAlarm                                         0.0
task_nback_accuracy                                          2.68
task_nback_trialCount                                        25.0
task_nback_precision                                          1.0
task_nback_recall                                             8.0
task_nback_specificity                                        1.0
task_nback_F1                                            1.777778
task_nback

In [214]:
nback_df.head(10)

Unnamed: 0,ParticipantIdentifier,trial_date,task_nback_time,task_nback_bid,task_nback_rndNum,task_nback_mode,task_nback_matched,task_nback_missed,task_nback_falseAlarm,task_nback_accuracy,task_nback_trialCount,task_nback_precision,task_nback_recall,task_nback_specificity,task_nback_F1,task_nback_MCC
0,39d785c6-9e3d-44c2-88ec-2ce14ef923f9,2022-10-24,07:07:47,57.0,63.0,hard,5.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
1,c62ae7a2-6fe6-4fd5-9b9e-93773b08d8b9,2022-10-24,07:08:51,6.0,100.0,hard,6.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
2,6b827de8-fe47-4007-aad3-202655b954e3,2022-10-24,07:22:00,0.0,0.0,easy,6.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
3,412330b3-cc02-4030-96cd-f4cfdcc45fa6,2022-10-24,07:28:35,0.0,0.0,easy,4.0,4.0,2.0,0.76,25.0,0.666667,0.5,0.882353,0.571429,0.41762
4,c2097f36-4ca3-4537-856d-a649d1557553,2022-10-24,08:52:18,0.0,0.0,easy,5.0,3.0,4.0,0.72,25.0,0.555556,0.625,0.764706,0.588235,0.378726
5,bf670311-c590-473a-98ab-d719ebf0f2ab,2022-10-24,08:58:26,56.0,98.0,hard,6.0,1.0,3.0,0.84,25.0,0.666667,0.857143,0.833333,0.75,0.645881
6,7d501571-5d23-4f09-9266-1644f4f71add,2022-10-24,08:59:10,66.0,64.0,easy,9.0,0.0,0.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0
7,f889f1a4-9754-456e-ae08-092f992d3359,2022-10-24,09:00:14,13.0,14.0,hard,8.0,2.0,0.0,0.92,25.0,1.0,0.8,1.0,0.888889,0.840168
8,a33e1d38-6ee8-4da6-993b-a94a8ae7fc30,2022-10-24,09:06:27,25.0,89.0,hard,6.0,1.0,3.0,0.84,25.0,0.666667,0.857143,0.833333,0.75,0.645881
9,a9de00be-460d-4b74-bed3-bf013fe2052a,2022-10-24,09:11:25,6.0,95.0,hard,5.0,0.0,2.0,0.92,25.0,0.714286,1.0,0.9,0.833333,0.801784
