In [57]:
import pandas as pd
import numpy as np
import pickle
import datetime
import json
import os

### Required Files:
- 'SurveyTrailmakingResults'
- 'SurveyStroopResults'
- 'SurveyPSATResults'
- 'SurveyTowerOfHanoiResults'
- 'SurveyReactionTimeResults'
- 'SurveySpatialSpanMemoryResults'

### Tasks that are currently relevant
- Trail making
- Stroop
- PSAT
- Tower of Hanoi
- Reaction Time
- Spatial Span Memory

### Variable Information

In [58]:
## Input Directory
# Root folder handed to convert_survey_file_to_dataframe, which walks each
# entry inside it looking for a file whose name starts with a survey prefix.
directory = "../daily_data"

In [59]:
## Output Directory
# NOTE(review): no cell visible in this notebook reads `out_dir`; presumably it
# is the intended destination for exported tables -- confirm before relying on it.
out_dir = "../indv_table_exports/"

In [60]:
## Good Participants
# ParticipantIdentifier values to keep; the task-merge loop later in the
# notebook builds one merged table per identifier listed here.
good_subjects = ['01801252-3a7e-4f5f-8b6d-49e8da3902f3',
                 'd26d4b78-7fcf-488e-b687-2d1c93c47b74',
                 '531d7f6d-b880-4a0b-b467-80005a316f1c']

### Handy Functions

In [61]:
## Function to fix date from UTC to ET
def fix_date_to_ET(dataframe, date_col_name):
    """Add a date column derived from the UTC ``EndDate`` timestamps.

    Every ``EndDate`` value is parsed as UTC, converted to US/Eastern and
    reduced to a calendar date. Entries whose Eastern wall-clock time is
    before 05:00 are attributed to the previous calendar day.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an ``EndDate`` column of timestamps (strings or datetimes).
        It is modified in place; any index is supported.
    date_col_name : str
        Name of the column to create (or overwrite) with ``datetime.date``
        values.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``date_col_name`` filled in.
    """
    df = dataframe

    # Parse the whole column once. No explicit ``format`` is given: the hour
    # test below needs the time-of-day, which a date-only format would reject.
    end_eastern = pd.to_datetime(df['EndDate'], utc=True).dt.tz_convert('US/Eastern')

    # Drop the timezone so the 5-hour shift is done on wall-clock time; that
    # keeps "before 05:00 local" exact even across DST transitions.
    wall_clock = end_eastern.dt.tz_localize(None)
    shifted = wall_clock - pd.Timedelta(hours=5)

    # Assign positionally (plain array) so the result does not depend on the
    # frame's index labels being unique or 0..n-1.
    df[date_col_name] = shifted.dt.date.to_numpy()
    return df

def fix_columns_by_category(dataframe, categories):
    """Make sure every name in ``categories`` exists as a column.

    Any missing column is created and filled with the string ``'NaN'``.
    The frame is modified in place and also returned.
    """
    for column_name in categories:
        already_present = column_name in dataframe.columns.to_list()
        if already_present:
            continue
        dataframe[column_name] = 'NaN'
    return dataframe

def pivot_df(dataframe, pos: list, col, val):
    """Pivot ``dataframe`` with ``pivot_table`` and flatten the row index.

    ``pos`` supplies the index columns, ``col`` the column whose values become
    new column headers, and ``val`` the values to aggregate (pandas' default
    aggregation is used). The index is reset so ``pos`` come back as columns.
    """
    pivoted = dataframe.pivot_table(values=val, index=pos, columns=col)
    return pivoted.reset_index()

def get_time_diff(d2, d1):
    """Return the ``.days`` component of the difference ``d2 - d1``."""
    elapsed = d2 - d1
    return elapsed.days

def convert_survey_file_to_dataframe(directory, filename, date_col):
    """Load one survey CSV from every sub-folder of ``directory`` and stack them.

    For each sub-folder, the first file (in sorted name order) whose name
    starts with ``filename`` is read with ``pd.read_csv`` and passed through
    ``fix_date_to_ET`` to add the ``date_col`` column. Sub-folders without a
    matching file are skipped.

    Parameters
    ----------
    directory : str
        Folder containing one sub-folder per export.
    filename : str
        Prefix of the survey file to look for in each sub-folder.
    date_col : str
        Name of the date column that ``fix_date_to_ET`` should add.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all loaded frames (each frame keeps its own index).

    Raises
    ------
    ValueError
        If no sub-folder contains a file starting with ``filename``.
    """
    df_survey_list = []

    # Sorted so the row order (and which file wins when several match) does
    # not depend on the filesystem's arbitrary listing order.
    for folder in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder)

        # Stray files next to the export folders (e.g. .DS_Store) would make
        # os.listdir raise; only real directories are searched.
        if not os.path.isdir(folder_path):
            continue

        matches = [f_name for f_name in sorted(os.listdir(folder_path))
                   if f_name.startswith(filename)]
        if not matches:
            continue

        current_df = pd.read_csv(os.path.join(folder_path, matches[0]))
        current_df = fix_date_to_ET(current_df, date_col)
        df_survey_list.append(current_df)

    if not df_survey_list:
        # pd.concat([]) would raise a ValueError that hides the real cause.
        raise ValueError(
            "No file starting with '{}' found in any sub-folder of '{}'".format(
                filename, directory))

    df_survey = pd.concat(df_survey_list)
    return df_survey

### Trailmaking Task
The subject is instructed to connect a set of 25 dots as quickly as possible while still maintaining accuracy. 
It measures:
- visual attention
- task switching
- fluid intelligence/cognitive abilities

We are doing a version that is like the typical Version B:
- Salthouse, 2011

_We want to get the last TapTimestamp to calculate total timing_

In [62]:
## Load every 'SurveyTrailmakingResults' export into a single dataframe
df_tr_survey = convert_survey_file_to_dataframe(
    directory, 'SurveyTrailmakingResults', 'ActualDueDate')

## Decode the JSON string in Taps into a list of dicts (Wilson, D.)
## (stored on the survey frame itself)
df_tr_survey['TapsList'] = df_tr_survey['Taps'].apply(json.loads)

## Completion time = TapTimestamp of the final tap; errors copied as-is (Wilson, D.)
final_tap_times = df_tr_survey['TapsList'].apply(lambda taps: taps[-1]['TapTimestamp'])
df = df_tr_survey.assign(Task_Trailmaking_Time=final_tap_times,
                         Task_Trailmaking_Errors=df_tr_survey['NumberOfErrors'])

## Reduce to the identifying columns plus the two task measures
trail_columns = ['ParticipantIdentifier', 'ActualDueDate',
                 'Task_Trailmaking_Time', 'Task_Trailmaking_Errors']
df = df[trail_columns]
df_trail_making = df

### Stroop Task
- (Scarpina & Tagini, 2017)
- The Stroop Color and Word Test (SCWT) 
- Will calculate the descriptive stats behind the Stroop effect: proportion correct and average response time, overall and split by congruent vs. incongruent trials

In [63]:
## Function that returns a Series of all Stroop results aggregates (Wilson, D.)
def get_stroop_results(x):
    """Aggregate one group of Stroop trials into summary statistics.

    Parameters
    ----------
    x : pandas.DataFrame
        Trials with boolean ``Congruent`` and ``Correct`` columns and a
        numeric ``Time`` column.

    Returns
    -------
    pandas.Series
        Six values, in this order: proportion correct (total, congruent,
        incongruent) and average ``Time`` (total, congruent, incongruent).
        A statistic whose trial category is empty is NaN.
    """
    def _ratio(numerator, denominator):
        # NaN rather than an exception when a category has no trials.
        return numerator / denominator if denominator else float('nan')

    # Count trial types with explicit masks. Indexing value_counts() with
    # 0/1 is ambiguous on a boolean index (label vs. position depends on the
    # pandas version) and fails when only one trial type is present.
    is_congruent = x['Congruent'].eq(True)
    is_incongruent = x['Congruent'].eq(False)
    is_correct = x['Correct'].eq(True)

    n_total = len(x)
    n_congruent = int(is_congruent.sum())
    n_incongruent = int(is_incongruent.sum())

    d = {
        'Task_Stroop_TotalCorrectProp': _ratio(x['Correct'].sum(), n_total),
        'Task_Stroop_CongruentCorrectProp': _ratio(
            int((is_congruent & is_correct).sum()), n_congruent),
        'Task_Stroop_IncongruentCorrectProp': _ratio(
            int((is_incongruent & is_correct).sum()), n_incongruent),
        'Task_Stroop_TotalAvgRT': _ratio(x['Time'].sum(), n_total),
        'Task_Stroop_CongruentAvgRT': _ratio(
            x.loc[is_congruent, 'Time'].sum(), n_congruent),
        'Task_Stroop_IncongruentAvgRT': _ratio(
            x.loc[is_incongruent, 'Time'].sum(), n_incongruent),
    }

    return pd.Series(d, index=['Task_Stroop_TotalCorrectProp', 'Task_Stroop_CongruentCorrectProp',
                               'Task_Stroop_IncongruentCorrectProp', 'Task_Stroop_TotalAvgRT',
                               'Task_Stroop_CongruentAvgRT', 'Task_Stroop_IncongruentAvgRT'
                              ])

In [64]:
## Load every 'SurveyStroopResults' export into a single dataframe
df_stroop_survey = convert_survey_file_to_dataframe(
    directory, 'SurveyStroopResults', 'ActualDueDate')

## Restrict to the columns the Stroop aggregates need (Wilson, D.)
stroop_trials = df_stroop_survey[['ParticipantIdentifier', 'ActualDueDate', 'StartTime',
                                  'EndTime', 'ColorSelected', 'Color', 'Text']]

# Per-trial flags and duration (Wilson, D.):
#   Congruent - ink colour matches the word; Correct - selection matches the ink colour
stroop_trials = stroop_trials.assign(
    Congruent=stroop_trials['Color'] == stroop_trials['Text'],
    Correct=stroop_trials['Color'] == stroop_trials['ColorSelected'],
    Time=stroop_trials['EndTime'] - stroop_trials['StartTime'],
)

## One row of aggregates per (date, participant)
stroop_by_day = stroop_trials.groupby(['ActualDueDate', 'ParticipantIdentifier'])
df = stroop_by_day.apply(get_stroop_results).reset_index()
df_stroop = df

### PSAT Results
- Calculate accuracy (TotalCorrect/Length) and average time per item (TotalTime/Length)

In [65]:
## Load every 'SurveyPSATResults' export into a single dataframe
df_psat_survey = convert_survey_file_to_dataframe(
    directory, 'SurveyPSATResults', 'ActualDueDate')

## Per-session accuracy and average time, both normalised by Length (Wilson D.)
psat_length = df_psat_survey['Length']
df = df_psat_survey.assign(
    Task_PSAT_Accuracy=df_psat_survey['TotalCorrect'] / psat_length,
    Task_PSAT_AvgTime=df_psat_survey['TotalTime'] / psat_length,
)

## Reduce to the identifying columns plus the two task measures
psat_columns = ['ParticipantIdentifier', 'ActualDueDate',
                'Task_PSAT_Accuracy', 'Task_PSAT_AvgTime']
df_psat = df[psat_columns]

### Tower of Hanoi Task
- Moves is a string of a list of dictionaries
- Get total time required: Timestamp in last dictionary
- Get number of moves: one dict per move, so count the dicts

**Important data is:**
- PuzzleWasSolved: Indicate completion

In [66]:
## Get all TOH results till date in a dataframe
df_TOH_survey = convert_survey_file_to_dataframe(directory,
                                                'SurveyTowerOfHanoiResults',
                                                'ActualDueDate')
df = df_TOH_survey

## Convert string Moves to list of dicts (Wilson D.)
df['MovesList'] = df['Moves'].apply(json.loads)

## Assign new columns (Wilson D.)
# An attempt with no recorded moves has no last Timestamp; give it NaN time
# (and 0 moves) instead of letting moves[-1] raise IndexError.
df = df.assign(Task_Hanoi_Solved=lambda x: x.PuzzleWasSolved,
               Task_Hanoi_Time=lambda x: x.MovesList.apply(
                   lambda moves: moves[-1]['Timestamp'] if moves else float('nan')),
               Task_Hanoi_Moves=[len(moves) for moves in df.MovesList] # maybe give this as a multiple on optimality (ideal = 1)?
              )

## Keep relevant columns
df_TOH = df[['ParticipantIdentifier', 'ActualDueDate', 'Task_Hanoi_Solved', 'Task_Hanoi_Time', 'Task_Hanoi_Moves']]

### Reaction Time Results
- The important data is:
- ReactionTime
    - Use this to calculate both mean and standard deviation

In [67]:
## Load every 'SurveyReactionTimeResults' export into a single dataframe
df_rt_survey = convert_survey_file_to_dataframe(
    directory, 'SurveyReactionTimeResults', 'ActualDueDate')

## Mean and SD of ReactionTime per (date, participant), with task-prefixed names (Wilson D.)
rt_by_day = df_rt_survey.groupby(['ActualDueDate', 'ParticipantIdentifier'])['ReactionTime']
rt_column_names = {'mean': 'Task_RT_Mean', 'std': 'Task_RT_SD'}
df = rt_by_day.agg(['mean', 'std']).rename(columns=rt_column_names).reset_index()

## Final RT results dataframe
df_reaction_time = df

### Spatial Span Memory Results
- To capture performance we are using: Score

In [68]:
## Load every 'SurveySpatialSpanMemoryResults' export into a single dataframe
df_ssm_survey = convert_survey_file_to_dataframe(
    directory, 'SurveySpatialSpanMemoryResults', 'ActualDueDate')

## Copy Score into a task-prefixed column (Wilson D.), then keep only what is needed
ssm_columns = ['ParticipantIdentifier', 'ActualDueDate', 'Task_SSMemory_Score']
df = df_ssm_survey.assign(Task_SSMemory_Score=df_ssm_survey['Score'])
df = df[ssm_columns]
df_spatial_span = df

## Get Task Dataframe
For this, we need to get the following df of each good participant:
- df_trail_making
- df_stroop
- df_psat
- df_TOH
- df_reaction_time
- df_spatial_span
    
Merge these 6 dataframes on ParticipantIdentifier and ActualDueDate
- This will give Task Dataframe of one participant

_Do the same for all good participants and concatenate to get final df_

In [69]:
merge_keys = ['ParticipantIdentifier', 'ActualDueDate']
df_task_list = []
for item in good_subjects:
    ## Slice each task table down to the current good subject
    subject_only = [item]
    df_trail_making_temp = df_trail_making.loc[
        df_trail_making['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)
    df_stroop_temp = df_stroop.loc[
        df_stroop['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)
    df_psat_temp = df_psat.loc[
        df_psat['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)
    df_TOH_temp = df_TOH.loc[
        df_TOH['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)
    df_reaction_time_temp = df_reaction_time.loc[
        df_reaction_time['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)
    df_spatial_span_temp = df_spatial_span.loc[
        df_spatial_span['ParticipantIdentifier'].isin(subject_only)].reset_index(drop=True)

    ## Left-join the other tasks onto trail making, in a fixed order
    df_task_temp = df_trail_making_temp
    for task_frame in (df_stroop_temp, df_psat_temp, df_TOH_temp,
                       df_reaction_time_temp, df_spatial_span_temp):
        df_task_temp = df_task_temp.merge(task_frame, how='left', on=merge_keys)

    df_task_list.append(df_task_temp)

## Stack the per-subject tables into the final task dataframe
df_task = pd.concat(df_task_list)

### Export Task Dataframe

In [70]:
# Write the merged task table to 'tasks.csv' (a path relative to the current
# working directory), omitting the dataframe index.
# NOTE(review): `out_dir` is defined near the top of the notebook but is not
# used here -- confirm whether the export was meant to go there.
df_task.to_csv('tasks.csv', index=False)