In [59]:
import pandas as pd
import os
import re


# Data

### Raw Data

In [60]:
# Data parameters
N_SUBJECTS = 11             # a folder for each subject
INVALID_SUBJECTS = [6, 7]   # these subjects have missing data and will be ignored
N_TRIALS = 42               # 6 repetitions of 7 gestures, the number of csv files in each subject folder
N_REPETITIONS = 6           # repetitions of gestures in each trial
N_GESTURES = 7              # number of gestures (which we will try to classify) 
# Instrumentation parameters
EEG_SAMPLING_RATE = 250     # Hz, or how many points are recorded per second
EMG_SAMPLING_RATE = 200     # Hz, or how many points are recorded per second

In [61]:
# import data from csv files into a list of dataframes with all informations
def import_data(path, transpose=False):

    dataframes = []
    # iterate over all subjects 
    valid_subjects_range = [i for i in range(1, N_SUBJECTS) if i not in INVALID_SUBJECTS]
    for subject in valid_subjects_range:
        folder = os.path.join(path, f'subject_{subject}')
        for file in os.listdir(folder):
            # grab file paths and names and ensure they are in valid format
            file_path = os.path.join(folder, file)
            assert os.path.isfile(file_path), f"File {file_path} is invalid!"    
            assert file.endswith('.csv'), f"File {file_path} is not a csv file!" 
            assert re.match(r"^S\d+_R\d+_G\d+\.csv$", file), f"Filename {file} does not match the expected pattern!"    # pattern: SX_RX_GX.csv, where X is a digit      
            
            # grab subject, repetition, and gesture numbers
            subject, repetition, gesture = re.search(r"S(\d+)_R(\d+)_G(\d+).csv", file).groups()

            # transpose the data if it's EEG data
            dataframe = pd.read_csv(file_path).transpose() if transpose else pd.read_csv(file_path)
            
            # If column names are missing, rename them to "channel_X" 
            # This happens for EEG data, but we ensure that the column names will be the same for all dataframes
            dataframe.columns = [f"channel_{i+1}" for i in range(len(dataframe.columns))]

            # append data with subject, trial, and gesture numbers
            dataframe_entry = {'subject': subject, 'repetition': repetition, 'gesture': gesture, 'data': dataframe}
            dataframes.append(dataframe_entry)

    return dataframes

raw_eeg_data = import_data('original EEG Data', transpose=True)
raw_emg_data = import_data('original EMG Data', transpose=False)


In [62]:
# !!! Warning: some trials have invalid collected data, be sure to ignore them!
# see output of this cell to see which
for eeg, emg in zip(raw_eeg_data, raw_emg_data):
        if len(eeg["data"])<1349:
            print(f"INVALID EEG DATA for Subject {eeg["subject"]}, Experiment R{eeg["repetition"]}_G{eeg["gesture"]}: data len are {len(eeg["data"])}-{len(emg["data"])}")
        elif len(emg["data"])<1500:
            print(f"INVALID EMG DATA for Subject {eeg["subject"]}, Experiment R{eeg["repetition"]}_G{eeg["gesture"]}: data len are {len(eeg["data"])}-{len(emg["data"])}")
        # else:
        #     print(f"Subject {eeg["subject"]}, Experiment R{eeg["repetition"]}_G{eeg["gesture"]}: data len are {len(eeg["data"])}-{len(emg["data"])}")


# NOTE: Some EEG samples have length of 1349 and some of 1350.
# Given that we later apply windowing clipping some of starting and ending points, this should not matter much

INVALID EEG DATA for Subject 4, Experiment R3_G3: data len are 633-1586


In [63]:
# inspect structure of a raw data sample
print(f"Number of EEG dataframes: {len(raw_eeg_data)}")
raw_eeg_data[0]["data"].info()
print("\n-----------------------------------\n")
print(f"Number of EMG dataframes: {len(raw_emg_data)}")
raw_emg_data[0]["data"].info()

Number of EEG dataframes: 336
<class 'pandas.core.frame.DataFrame'>
Index: 1349 entries, 0 to 1348
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   channel_1  1349 non-null   float64
 1   channel_2  1349 non-null   float64
 2   channel_3  1349 non-null   float64
 3   channel_4  1349 non-null   float64
 4   channel_5  1349 non-null   float64
 5   channel_6  1349 non-null   float64
 6   channel_7  1349 non-null   float64
 7   channel_8  1349 non-null   float64
dtypes: float64(8)
memory usage: 94.9+ KB

-----------------------------------

Number of EMG dataframes: 336
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586 entries, 0 to 1585
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   channel_1  1586 non-null   int64
 1   channel_2  1586 non-null   int64
 2   channel_3  1586 non-null   int64
 3   channel_4  1586 non-null   int64
 4   channel_5  1586 non-

### Preprocess Data

In [64]:
# Create windows of data by cutting the dataframes into smaller windows, starting from the middle
# This is done to account for start/stop delays in the data collection due to manual start/stop of the recording
def create_windows(dataframe:pd.DataFrame, window_size:int, step_size:int, n_windows:int=1):
    assert window_size <= len(dataframe), "Window size must be less than or equal to the length of the dataframe"
    assert step_size <= window_size, "Step size must be less than or equal to the window size"
    assert n_windows > 0 and n_windows%2 != 0, "Number of windows must be greater than 0 and odd"   
    assert (n_windows - 1) * step_size + window_size <= len(dataframe), "The number of windows and step size are too large for the dataframe"
    """
    dataframe: a pandas DataFrame to be cut into windows
    window_size: size of each window to be created
    step_size: the step size to move the window
    n_windows: number of windows to be created (take into account that the middle window is the center of the dataframe, therefore it must be an odd number)
    """

    windows = []
    half_window = window_size // 2      # Half of the window size to center it
    middle_point = len(dataframe)//2    # middle index of the dataframe

    # Create the middle (main) window
    start_idx = max(0, middle_point - half_window)
    end_idx = min(len(dataframe), middle_point + half_window)
    windows.append(dataframe.iloc[start_idx:end_idx])

    # Generate remaining windows moving outward both ways from the middle window
    left_idx, right_idx = middle_point - step_size, middle_point + step_size
    count = 1  # We've already created one window, the main one

    while count < n_windows:
       # Create another window on the left
       if left_idx - half_window >= 0:  
           windows.append(dataframe.iloc[left_idx - half_window:left_idx + half_window])
           count += 1
       # Create another window on the right
       if count < n_windows and right_idx + half_window <= len(dataframe):  
           windows.append(dataframe.iloc[right_idx - half_window:right_idx + half_window])
           count += 1

       left_idx -= step_size
       right_idx += step_size


    return windows

# Example usage: create example windows of 1s of data with 50% overlap:
# len(create_windows(raw_eeg_data[0]["data"], EEG_SAMPLING_RATE, int(EEG_SAMPLING_RATE/2), 9))     
# len(create_windows(raw_emg_data[0]["data"], EMG_SAMPLING_RATE, int(EMG_SAMPLING_RATE/2), 9))

In [65]:
assert len(raw_eeg_data) == len(raw_emg_data), "Number of EEG and EMG dataframes must be the same"
N_WINDOWS = 9   # number of windows to be created for each EEG and EMG dataframe

data_dicts = []
# iterate over all eeg and emg dataframes and create windows of data with class labels
for eeg_item, emg_item in zip(raw_eeg_data, raw_emg_data):
    # Create windows of data for EEG and EMG data
    try:
        eeg_windows = create_windows(eeg_item["data"], EEG_SAMPLING_RATE, int(EEG_SAMPLING_RATE/2), N_WINDOWS)
        emg_windows = create_windows(emg_item["data"], EMG_SAMPLING_RATE, int(EMG_SAMPLING_RATE/2), N_WINDOWS)
    except AssertionError:
        print(f"Invalid data lengths found!! SKIPPING subject {eeg["subject"]}, trial R{eeg["repetition"]}_G{eeg["gesture"]}")
        continue    # some data has invalid lenghts. Skip these entries to avoid invalid windows of data.

    # Append each EEG and EMG window to the dictionary to create samples
    count = 0
    for eeg_window, emg_window in zip(eeg_windows, emg_windows):
        # Create a unique ID for each sample
        sample_id = f"S{eeg_item['subject']}_R{eeg_item['repetition']}_G{eeg_item['gesture']}_N{str(count).zfill(2)}"
        sample = {"id": sample_id, "eeg": eeg_window, "emg": emg_window, "label": eeg_item["gesture"]}
        data_dicts.append(sample)
        count += 1
  


Invalid data lengths found!! SKIPPING subject 10, trial R5_G7


In [74]:
print(f"EEG: 8chX{len(data_dicts[0]["eeg"])}samples.")
print(f"EMG: 8chX{len(data_dicts[0]["emg"])}samples.")
ids = [sample['id'] for sample in data_dicts]
print(f"Indices:\n{ids}")

EEG: 8chX250samples.
EMG: 8chX200samples.
Indices:
['S1_R6_G7_N00', 'S1_R6_G7_N01', 'S1_R6_G7_N02', 'S1_R6_G7_N03', 'S1_R6_G7_N04', 'S1_R6_G7_N05', 'S1_R6_G7_N06', 'S1_R6_G7_N07', 'S1_R6_G7_N08', 'S1_R4_G2_N00', 'S1_R4_G2_N01', 'S1_R4_G2_N02', 'S1_R4_G2_N03', 'S1_R4_G2_N04', 'S1_R4_G2_N05', 'S1_R4_G2_N06', 'S1_R4_G2_N07', 'S1_R4_G2_N08', 'S1_R3_G1_N00', 'S1_R3_G1_N01', 'S1_R3_G1_N02', 'S1_R3_G1_N03', 'S1_R3_G1_N04', 'S1_R3_G1_N05', 'S1_R3_G1_N06', 'S1_R3_G1_N07', 'S1_R3_G1_N08', 'S1_R5_G6_N00', 'S1_R5_G6_N01', 'S1_R5_G6_N02', 'S1_R5_G6_N03', 'S1_R5_G6_N04', 'S1_R5_G6_N05', 'S1_R5_G6_N06', 'S1_R5_G6_N07', 'S1_R5_G6_N08', 'S1_R6_G2_N00', 'S1_R6_G2_N01', 'S1_R6_G2_N02', 'S1_R6_G2_N03', 'S1_R6_G2_N04', 'S1_R6_G2_N05', 'S1_R6_G2_N06', 'S1_R6_G2_N07', 'S1_R6_G2_N08', 'S1_R1_G7_N00', 'S1_R1_G7_N01', 'S1_R1_G7_N02', 'S1_R1_G7_N03', 'S1_R1_G7_N04', 'S1_R1_G7_N05', 'S1_R1_G7_N06', 'S1_R1_G7_N07', 'S1_R1_G7_N08', 'S1_R1_G4_N00', 'S1_R1_G4_N01', 'S1_R1_G4_N02', 'S1_R1_G4_N03', 'S1_R1_G4_N04', 'S1_

### Processed data files generation

In [68]:
# TODO: save the data samples to a file

In [None]:
# import pandas as pd

# # Save a DataFrame to Parquet
# df = pd.DataFrame(data_dicts[0])
# df.to_parquet('data.parquet')

# # Load a Parquet file back into a DataFrame
# loaded_df = pd.read_parquet('data.parquet')
# print(loaded_df)


ValueError: If using all scalar values, you must pass an index