In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

In [3]:
# Reload imported modules automatically before executing each cell, so edits
# to the project's .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
import os

def get_n_dir_up(path, n):
    """Return the ancestor of ``path`` obtained by applying ``os.path.dirname`` ``n`` times.

    With ``n == 0`` the path is returned unchanged.
    """
    ancestor = path
    for _step in range(n):
        ancestor = os.path.dirname(ancestor)
    return ancestor

# NOTE: "__file__" is a quoted string here, not the module attribute, so this
# resolves to <current working directory>/__file__. One dirname step on
# CUR_PATH therefore yields the working directory itself.
CUR_PATH= os.path.abspath("__file__")

In [5]:
# Make the directory reached by stripping two components from CUR_PATH
# importable (presumably where the project's `utils` package lives -- confirm).
# The membership check keeps sys.path free of duplicates when this cell is
# re-run in the same kernel.
_project_root = get_n_dir_up(CUR_PATH, 2)
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [6]:
import json

# Data folder: strip three path components from CUR_PATH, then descend into 'data'.
DEFAULT_DATA_FOLDER = os.path.join(
    get_n_dir_up(CUR_PATH, 3), 'data')

# Read the QA records with an explicit encoding so the result does not depend
# on the platform's locale default.
with open(os.path.join(DEFAULT_DATA_FOLDER, 'QA.json'), encoding='utf-8') as f:
    qa_records = json.load(f)

# Subject IDs listed under the 'invalid' key; used below to filter subjects.
invalid_subjs = qa_records['invalid']

In [7]:
DEFAULT_FEATURE_FOLDER = os.path.join(
    get_n_dir_up(CUR_PATH, 2), 'gaze_analysis', 'features')

# Valid subjects: entries of the feature folder whose name is all digits and
# whose integer value is not listed in invalid_subjs.
all_subjs = [
    subj
    for subj in os.listdir(DEFAULT_FEATURE_FOLDER)
    if subj.isdigit() and int(subj) not in invalid_subjs
]

## load the raw data

In [78]:
from utils.drawing_analysis import (
    parse_drawing_list_string,
    parse_click_data,
)

# Parser for the raw response-time field of each response mode, keyed by the
# value found in a trial table's 'mode' column.
mode_parsing = {
    'draw': parse_drawing_list_string,
    'click': parse_click_data,
}

def timelist_to_startend(timelist):
    """Collapse every attempt in ``timelist`` into a ``(start, end)`` tuple.

    * A non-list item is treated as a single instant: ``(item, item)``.
    * An empty list contributes nothing.
    * A non-empty list yields its first and last element; when the first
      element is itself a list, the start is that inner list's first value
      and the end is the last value of the final inner list.

    Returns the tuples in input order.
    """
    spans = []
    for attempt in timelist:
        if not isinstance(attempt, list):
            # bare value: start and end coincide
            spans.append((attempt, attempt))
            continue
        if len(attempt) == 0:
            # nothing recorded for this attempt
            continue
        first, last = attempt[0], attempt[-1]
        if isinstance(first, list):
            # nested attempt: look one level deeper for the outermost times
            first, last = first[0], last[-1]
        spans.append((first, last))
    return spans

def get_sorted_durations(stim1_startend, stim2_startend):
    """Merge both stimuli's ``(start, end)`` pairs into one timeline ordered by start.

    Each entry of the result is ``(start, end, label)`` where ``label`` is
    ``'1'`` or ``'2'`` depending on which argument the pair came from. The
    sort is stable, so on equal starts stimulus-1 entries stay ahead.
    """
    tagged = []
    for label, spans in (('1', stim1_startend), ('2', stim2_startend)):
        tagged.extend((begin, finish, label) for begin, finish in spans)
    return sorted(tagged, key=lambda span: span[0])

def find_first_chunk(sorted_timeline):
    """Return the label of the first timeline entry and the leading run sharing it.

    ``sorted_timeline`` holds entries whose last element is a stimulus label.
    The result is ``(label, entries)`` where ``entries`` are the consecutive
    items from the start of the timeline carrying that label. An empty
    timeline gives ``(None, [])``.
    """
    if len(sorted_timeline) == 0:
        return None, []

    leading_label = sorted_timeline[0][-1]

    # count how many entries (each the time points of one attempt) open the
    # timeline before the label changes for the first time
    run_length = 0
    for entry in sorted_timeline:
        if entry[-1] != leading_label:
            break
        run_length += 1

    return leading_label, list(sorted_timeline[:run_length])

def extract_timing_info(stim1_ts, stim2_ts):
    """Summarise the response timing of one trial from both stimuli's parsed times.

    Both arguments are passed through ``timelist_to_startend``; the resulting
    spans are merged with ``get_sorted_durations`` and the leading run is taken
    from ``find_first_chunk``.

    Returns an empty dict when neither stimulus has any span. Otherwise returns
    a dict whose keys all carry the ``resp_time_`` prefix:

    * ``start`` -- start of the first entry of the leading run
    * ``first_stim_id`` -- label (``'1'``/``'2'``) of the leading run
    * ``first_n_attempts`` -- number of entries in the leading run
    * ``first_chunk_duration`` -- end of the run's last entry minus its first start
    * ``active_start`` / ``active_end`` -- start of the first and end of the
      last entry of the merged timeline
    * ``active_duration`` -- ``active_end - active_start``
    * ``n_responses`` -- how many of the two stimuli have at least one span (1 or 2)
    """
    spans_1 = timelist_to_startend(stim1_ts)
    spans_2 = timelist_to_startend(stim2_ts)

    timeline = get_sorted_durations(spans_1, spans_2)
    first_label, first_run = find_first_chunk(timeline)

    # each stimulus with at least one span counts as one response
    n_responses = (len(spans_1) > 0) + (len(spans_2) > 0)
    if n_responses == 0:
        return {}

    run_onset = first_run[0][0]
    timeline_onset = timeline[0][0]
    timeline_offset = timeline[-1][1]

    # listed in the order the keys must appear in the result
    fields = (
        ('start', run_onset),
        ('first_stim_id', first_label),
        ('first_n_attempts', len(first_run)),
        ('first_chunk_duration', first_run[-1][1] - run_onset),
        ('active_start', timeline_onset),
        ('active_end', timeline_offset),
        ('active_duration', timeline_offset - timeline_onset),
        ('n_responses', n_responses),
    )
    return {f'resp_time_{name}': value for name, value in fields}


def parse_timing(df):
    """Derive per-trial response-timing columns from a raw trial table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``mode``, ``resp_1_time``, ``resp_2_time``,
        ``response.started`` and ``response.stopped``. Every ``mode`` value
        has to be a key of ``mode_parsing``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``, in the same order, on a fresh default index
        (``df``'s own index is not carried over). Contains whatever
        ``extract_timing_info`` returned for each trial (NaN where it returned
        nothing) plus ``resp_time_phase_duration``
        (``response.stopped - response.started``) and ``resp_time_confirm``
        (phase duration minus ``resp_time_start`` minus
        ``resp_time_active_duration``).

    Raises
    ------
    KeyError
        If a required column is missing or a ``mode`` value has no parser.
    """
    modes = df['mode'].tolist()
    stim1_raw = df['resp_1_time'].tolist()
    stim2_raw = df['resp_2_time'].tolist()

    records = []
    for mode, raw_1, raw_2 in zip(modes, stim1_raw, stim2_raw):
        try:
            parse = mode_parsing[mode]
        except KeyError:
            raise KeyError(
                f'unknown response mode {mode!r}; '
                f'expected one of {sorted(mode_parsing)}') from None
        records.append(extract_timing_info(parse(raw_1), parse(raw_2)))

    ts_all = pd.DataFrame(records)

    # some other information...
    phase_duration = df['response.stopped'] - df['response.started']
    ts_all['resp_time_phase_duration'] = phase_duration.tolist()

    # If no trial produced any timing info (or df is empty) the frame built
    # from the records has no timing columns at all; add the two needed below
    # as NaN so the subtraction yields NaN instead of raising KeyError.
    for required in ('resp_time_start', 'resp_time_active_duration'):
        if required not in ts_all.columns:
            ts_all[required] = float('nan')

    ts_all['resp_time_confirm'] = (
        ts_all['resp_time_phase_duration']
        - ts_all['resp_time_start']
        - ts_all['resp_time_active_duration']
    )

    return ts_all
    

In [79]:
DATA_RAW_FOLDER = os.path.join(DEFAULT_DATA_FOLDER, 'psychopy_raw', 'filtered')

# Subject ID -> full path of that subject's raw CSV. The ID is the part of the
# file name before the first underscore; non-CSV entries are ignored.
raw_file_mapping = {}
for fname in os.listdir(DATA_RAW_FOLDER):
    if not fname.endswith('.csv'):
        continue
    raw_file_mapping[fname.split('_')[0]] = os.path.join(DATA_RAW_FOLDER, fname)

DATA_PROCESSED_FOLDER = os.path.join(DEFAULT_DATA_FOLDER, 'behavior', 'subjects')

def load_subject_timing(subj_id):
    """Return one subject's processed behavior table with timing columns appended.

    Parameters
    ----------
    subj_id : str or int
        Subject identifier; converted to ``str`` before any lookup.

    Returns
    -------
    pandas.DataFrame
        The table read from ``<DATA_PROCESSED_FOLDER>/<subj_id>.csv`` with the
        columns produced by ``parse_timing`` on the subject's raw CSV placed to
        its right.

    Raises
    ------
    KeyError
        If no raw CSV is registered for the subject in ``raw_file_mapping``.
    """
    subj_id = str(subj_id)
    if subj_id not in raw_file_mapping:
        raise KeyError(
            f'no raw CSV found for subject {subj_id!r} in {DATA_RAW_FOLDER}')

    # raw_file_mapping already stores the full path of the raw file, so it is
    # read directly instead of being joined onto DATA_RAW_FOLDER a second time.
    raw_loaded = pd.read_csv(raw_file_mapping[subj_id], index_col=0)
    timing = parse_timing(raw_loaded)

    processed = pd.read_csv(
        os.path.join(DATA_PROCESSED_FOLDER, f'{subj_id}.csv'),
        index_col=0)

    # NOTE(review): concat with axis=1 matches rows by index label. `timing`
    # carries a fresh 0..n-1 index, so this assumes the processed file's first
    # column is the same 0..n-1 trial index -- confirm.
    combined = pd.concat([processed, timing], axis=1)
    return combined


In [80]:
# Spot-check the loader on a single subject before running it for everyone.
example_df = load_subject_timing('908')

In [81]:
example_df

Unnamed: 0,participant,mode,sample_stage,stim_sample_method,block,trial,trial_code,ITI,stim_1,stim_region_1,...,resp_time_start,resp_time_first_stim_id,resp_time_first_n_attempts,resp_time_first_chunk_duration,resp_time_active_start,resp_time_active_end,resp_time_active_duration,resp_time_n_responses,resp_time_phase_duration,resp_time_confirm
0,908,draw,0,mixed,0,0,0,1.466436,53.768502,1.0,...,3.542431,1,2,4.683243,3.542431,8.225674,4.683243,1,9.609472,1.383798
1,908,draw,0,mixed,0,1,0,2.577546,95.392638,2.0,...,2.662204,1,1,2.533496,2.662204,9.128985,6.466781,2,10.596558,1.467573
2,908,draw,0,mixed,0,2,0,1.757377,169.828842,3.0,...,1.579261,1,1,3.783250,1.579261,5.362511,3.783250,1,6.146195,0.783684
3,908,draw,0,mixed,0,3,0,1.932901,59.178761,1.0,...,1.828275,1,1,3.833325,1.828275,14.194901,12.366626,2,15.378250,1.183349
4,908,draw,0,mixed,0,4,0,1.512830,114.967475,2.0,...,1.995037,2,1,4.499797,1.995037,6.494834,4.499797,1,7.979138,1.484305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,908,draw,0,mixed,15,5,1,2.678928,60.466357,1.0,...,1.345336,1,2,5.066692,1.345336,9.728787,8.383452,2,10.229144,0.500357
156,908,draw,0,mixed,15,6,1,2.138273,25.967540,0.0,...,2.563664,1,1,0.149992,2.563664,4.846785,2.283121,2,7.146997,2.300212
157,908,draw,0,mixed,15,7,1,1.811123,147.764036,3.0,...,1.648113,1,1,0.349987,1.648113,3.581296,1.933182,2,5.198409,1.617113
158,908,draw,0,mixed,15,8,1,1.946824,149.938225,3.0,...,1.444275,2,1,0.666823,1.444275,2.111098,0.666823,1,2.911304,0.800206


## Store the DataFrame with extra information

In [82]:
# Destination folder for the per-subject CSVs that include the timing columns.
PROCESSED_DES = os.path.join(
    DEFAULT_DATA_FOLDER, 'behavior', 'subjects_extra',
)

def save_processed_files():
    """Write the timing-augmented table of every subject in ``all_subjs`` to disk.

    For each subject the frame returned by ``load_subject_timing`` is saved as
    ``<PROCESSED_DES>/<subject>.csv``. The destination folder is created when
    it does not exist yet.
    """
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the destination exists before the first write.
    os.makedirs(PROCESSED_DES, exist_ok=True)

    for subj in all_subjs:
        subj = str(subj)
        df = load_subject_timing(subj)
        des_file = os.path.join(PROCESSED_DES, f'{subj}.csv')
        df.to_csv(des_file)

In [None]:
# save_processed_files()