In [6]:
import pandas as pd
import numpy as np

from bisect import bisect_left

import json

import os
import re

In [7]:
# path to data folder; its sub-folders named by date (yyyy-mm-dd...) are the
# ones searched for trial files
path_data = '.'

# for filtering, list of dates in yyyy-mm-dd format; only csv files found in
# these date folders are kept. Set to None to skip the filtering step.
dates = ['2019-08-29']

In [8]:
def get_data_files(path_data, end='.csv'):
    """List the trial files found in the date folders of ``path_data``.

    Only direct sub-folders of ``path_data`` whose name starts with a
    ``yyyy-mm-dd`` date are searched. Inside them, a file is kept when its
    name ends with ``end``, starts with a three-digit subject number and
    contains ``trial<N>`` / ``Trial<N>``. Files that do not follow this
    naming scheme are skipped.

    Parameters
    ----------
    path_data : str
        Folder that contains the date folders.
    end : str, optional
        File name suffix to look for (default ``'.csv'``).

    Returns
    -------
    pandas.DataFrame
        One row per file with the string columns ``subject``, ``date``
        (the folder name), ``trial`` and ``filename``, ordered by folder
        name and then file name. The four columns are present even when no
        file is found, so the result can always be filtered and merged.
    """

    # set up regex (raw strings: '\d' in a normal string is an invalid escape)
    r_date = r'\d{4}-\d{2}-\d{2}'
    r_subject = r'^\d{3}'
    r_trial = r'[tT]rial\d+'

    # get folders
    date_folders = sorted(
        f for f in os.listdir(path_data)
        if os.path.isdir(os.path.join(path_data, f)) and re.match(r_date, f)
    )

    data = []

    for folder in date_folders:

        # get matching files from date folder
        files = sorted(f for f in os.listdir(os.path.join(path_data, folder)) if f.endswith(end))

        for file in files:

            # get subject and trial numbers
            match_subject = re.match(r_subject, file)
            match_trial = re.search(r_trial, file)
            if match_subject is None or match_trial is None:
                # not named like a trial file: nothing to extract, skip it
                continue

            subject = match_subject.group()
            trial = re.search(r'\d+', match_trial.group()).group()

            data.append({'subject': subject, 'date': folder, 'trial': trial, 'filename': file})

    # return dataframe (explicit columns so an empty result keeps its schema)
    return pd.DataFrame(data, columns=['subject', 'date', 'trial', 'filename'])

In [9]:
def find_closest_timestamp(timestamps, target, start=0):
    """Find the element of ``timestamps`` that is closest to ``target``.

    Parameters
    ----------
    timestamps : sequence
        Values sorted in ascending order (required by the binary search),
        indexable with integer positions ``0 .. len(timestamps) - 1``.
    target : number
        Value to look for.
    start : int, optional
        Position at which the binary search starts (default 0).

    Returns
    -------
    tuple
        ``(index, timestamp)`` where ``timestamp == timestamps[index]`` is
        the closest value. On a tie the earlier element wins. A target
        below the first element yields the first element, and a target above
        the last element yields the last element.

    Raises
    ------
    ValueError
        If ``timestamps`` is empty.
    """

    n_timestamps = len(timestamps)
    if n_timestamps == 0:
        raise ValueError('timestamps must not be empty')

    # position of the first element that is >= target
    index = bisect_left(timestamps, target, lo=start)

    # target lies past the last element: there is no "after" neighbour
    if index >= n_timestamps:
        return n_timestamps - 1, timestamps[n_timestamps - 1]

    # target lies before the first element: there is no "before" neighbour
    # (index - 1 would otherwise wrap around to the end of a list)
    if index == 0:
        return 0, timestamps[0]

    before = timestamps[index - 1]
    after = timestamps[index]

    if after - target < target - before:
        return index, after

    # the earlier neighbour is closer, so report its own position
    return index - 1, before


In [10]:
# Find the trials whose csv file is empty but that also have a txt and a json
# file, i.e. the trials whose csv can be rebuilt.
df_files_csv = get_data_files(path_data)

print('Found {} csv files in {} date folders'.format(len(df_files_csv), len(df_files_csv['date'].unique())))

# filter by date(s)
if dates is not None:
    df_files_csv = df_files_csv.loc[df_files_csv['date'].isin(dates)]

print('Keeping {} files after filtering for date(s)'.format(len(df_files_csv)))

# get all txt and json files
df_files_txt = get_data_files(path_data, end='.txt').rename(columns={'filename': 'filename_txt'})
df_files_json = get_data_files(path_data, end='.json').rename(columns={'filename': 'filename_json'})

# keep only trials that have all three types of files
merge_keys = ['subject', 'date', 'trial']
df_files = df_files_csv.merge(df_files_json, how='inner', on=merge_keys)
df_files = df_files.merge(df_files_txt, how='inner', on=merge_keys)

# look for trials with empty csv file
# (build a mask first instead of dropping rows from the frame being iterated)
csv_is_empty = []
for index, row in df_files.iterrows():
    path_csv = os.path.join(path_data, row['date'], row['filename'])
    try:
        # reading a single row is enough to know whether there is any data
        n_rows = pd.read_csv(path_csv, nrows=1).shape[0]
    except pd.errors.EmptyDataError:
        # zero-byte file: not even a header line, so it is empty as well
        n_rows = 0
    csv_is_empty.append(n_rows == 0)

# positional boolean mask, one entry per row of df_files
df_files = df_files.loc[np.asarray(csv_is_empty, dtype=bool)]

print('Found {} empty csv file(s) that can be fixed'.format(len(df_files)))

# Column names given to the txt recording when it is read, and the subset
# (in output order) that is written to the rebuilt csv file.
columns_all = [
    'i_sample',
    'Channel 1', 'Channel 2', 'Channel 3', 'Channel 4', 'Channel 5', 'Channel 6',
    'Channel 7', 'Channel 8', 'Channel 9', 'Channel 10', 'Channel 11',
    'Time_hours', 'Time',
]
columns = [
    'Time',
    'Channel 1', 'Channel 2', 'Channel 3', 'Channel 4',
    'Channel 5', 'Channel 6', 'Channel 7', 'Channel 8',
]

# Rebuild one csv file per remaining trial from its txt and json files.
for row_label, row in df_files.iterrows():

    subject, date, trial = row['subject'], row['date'], row['trial']
    filename = row['filename']
    filename_txt = row['filename_txt']
    filename_json = row['filename_json']

    # every file of this trial lives in the same date folder
    folder = os.path.join(path_data, date)

    # load txt file: the first 6 lines are skipped and the columns are named
    # explicitly (presumably those lines are a file header -- TODO confirm)
    df_trial = pd.read_csv(os.path.join(folder, filename_txt), names=columns_all, skiprows=6)
    df_trial = df_trial.loc[:, columns]

    # load json file: a sequence of entries with a 'time' and a 'cue' field
    with open(os.path.join(folder, filename_json)) as file_json:
        timestamps_json = json.load(file_json)

    df_trial['Direction'] = None
    timestamps = df_trial['Time']

    # Label the samples cue by cue. Each pass fills the rows between the
    # previous cue and the current one (label slice, both ends included)
    # with the direction of the *previous* cue. Rows labelled before the
    # first cue therefore get None, and rows after the last cue are never
    # labelled.
    start, direction = 0, None
    for cue in timestamps_json:

        target = cue['time']
        stop, closest_time = find_closest_timestamp(timestamps, target, start)
        df_trial.loc[start:stop, 'Direction'] = direction

        start, direction = stop, cue['cue']

    # remove every row that still has a missing value (unlabelled rows included)
    df_trial = df_trial.dropna(axis=0, how='any')

    filename_fixed = '{}-trial{}-{}-FIXED.csv'.format(subject, trial, date)
    df_trial.to_csv(os.path.join(folder, filename_fixed), index=False)

    print('\tFixed {}'.format(os.path.join(date, filename)))


Found 104 csv files in 16 date folders
Keeping 10 files after filtering for date(s)
Found 1 empty csv file(s) that can be fixed
	Fixed 2019-08-29/001-trial1-ssvep-2019-8-29-19-2-36-BAD.csv
