# **Create optimization set and train- and test sets**

## Import libraries

In [6]:
import pandas as pd
import numpy as np
import random

## Read data

In [7]:
# Load the motor-imagery CSV and keep everything from the third column
# onwards (the first two columns are discarded).
df = pd.read_csv("data_csv/df_MI.csv").iloc[:, 2:]

## Remove rest phases + run column

In [8]:
# Keep only rows whose annotation is not 'rest', then discard the 'run' column.
df = df.loc[df['annotation'].ne('rest')].drop(columns='run')

## **Optimization set**

All samples from 5 randomly drawn subjects form the optimization set. Only subjects that have the complete set of left/right hand trials are eligible for the draw.

In [12]:
# Build the optimization set: every row of a few randomly drawn subjects,
# reshaped into per-trial feature and attribute arrays and saved to disk.
random.seed(5)  # fixed seed so the same subjects are drawn on every run

# Define common parameters
nr_runs_per_participant = 3
trials_per_run = 15          # left/right hand trials in one run
nr_optimize_subjects = 5     # subjects drawn for the optimization set
trial_length = 656           # rows that make up one trial
# Nr rows for subjects with all trials
expected_rows = nr_runs_per_participant * trials_per_run * trial_length

# Select subjects with all 15 left/right hand trials, i.e. subjects whose
# row count equals expected_rows exactly.
rows_per_subject = df['subject_ID'].value_counts()
complete_subject_ids = rows_per_subject[rows_per_subject == expected_rows].index
subset_full_run_subjects = df[df['subject_ID'].isin(complete_subject_ids)]
unique_subjects = list(subset_full_run_subjects['subject_ID'].unique())
optimize_df = df[df['subject_ID'].isin(random.sample(unique_subjects, nr_optimize_subjects))]

print(optimize_df["subject_ID"].unique())
print("Shape:", optimize_df.shape)
# Derived from the parameters above instead of repeating the literals.
print("Target length:", nr_optimize_subjects * expected_rows)

nr_participants = optimize_df["subject_ID"].nunique()
nr_trials = len(optimize_df) // trial_length

## FEATURE VECTOR

# (trials, rows per trial, channels) for the three selected channels.
features_subset = optimize_df[["C3..", "C4..", "Cz.."]].to_numpy()
features = features_subset.reshape(nr_trials, trial_length, features_subset.shape[1])

print("Shape of train samples (examples, len_example, features):", features.shape)

## ATTRIBUTE VECTOR

attributes_subset = optimize_df[['annotation', 'subject_ID']].to_numpy()
attributes = attributes_subset.reshape(nr_trials, trial_length, attributes_subset.shape[1])
# Only the first row of each trial is kept, giving one (annotation,
# subject_ID) pair per trial.
attributes = attributes[:, 0, :]

print("Shape of train samples (examples, len_example, attributes):", attributes.shape)
print("Subjects included for optimization:", optimize_df["subject_ID"].unique())

# Save
np.save('optimize_features.npy', features)
np.save('optimize_attributes.npy', attributes)

['S33' 'S44' 'S70' 'S87' 'S91']
Shape: (147600, 67)
Target length: 147600
Shape of train samples (examples, len_example, features): (225, 656, 3)
Shape of train samples (examples, len_example, attributes): (225, 2)
Subjects included for optimization: ['S33' 'S44' 'S70' 'S87' 'S91']


## **Train- and test splits for 3-ALL, 3-SUB and 15-SUB**

## Define common parameters

In [13]:
# Constants shared by the split cells.
trial_length = 656            # rows that make up one trial
nr_runs_per_participant = 3
nr_participants = df["subject_ID"].nunique()
# Number of whole trials in the full dataframe `df`.
nr_trials = df.shape[0] // trial_length

## Prepare 25-subject subset

Draw 25 random subjects from the dataframe to establish the 3-SUB and 15-SUB datasets.

In [None]:
# Draw 25 distinct subjects and keep only their rows (without the 'time' column).
# The draw uses np.random.choice, which reads NumPy's global RNG and is not
# affected by random.seed(); NumPy is therefore seeded explicitly so that the
# same 25 subjects are selected on every run.
random.seed(5)  # kept so the stdlib RNG state is unchanged from the original cell
np.random.seed(5)

unique_subject_ids = df['subject_ID'].unique()
selected_subject_ids = np.random.choice(unique_subject_ids, size=25, replace=False)
df_25_sub = df[df['subject_ID'].isin(selected_subject_ids)]
df_25_sub = df_25_sub.drop(columns=['time'])

print(df_25_sub['subject_ID'].nunique())
print(df_25_sub.shape)

## 3-ALL (all subjects, 3 channels)

Include all 103 subjects and only the three primary channels for left/right hand MI (Cz, C3 and C4)

In [15]:
# 3-ALL: split every subject's trials 90/10 into train and test sets and save
# the three channel columns (features) and (annotation, subject_ID) pairs
# (attributes) for each set.

# Local generator with a fixed seed so the split is identical on every run.
rng = np.random.default_rng(5)

subset = df[["C3..", "C4..", "Cz..", 'annotation', 'subject_ID']].to_numpy()
# (trials, rows per trial, columns). -1 lets NumPy derive the trial count from
# the data, so the cell does not depend on the current value of nr_trials.
samples = subset.reshape(-1, trial_length, subset.shape[1])
# NOTE(review): `subset` mixes channel and string columns, so the saved arrays
# are presumably object dtype — confirm loaders use allow_pickle=True.

# Subject of each trial, read from the trial's first row.
subject_ids = samples[:, 0, -1]
unique_subject_ids = np.unique(subject_ids)

train_samples = []
test_samples = []

for subject_id in unique_subject_ids:
    subject_examples = samples[subject_ids == subject_id]
    num_examples = subject_examples.shape[0]
    # 90 % of the subject's trials (rounded down) go to the train set ...
    num_to_select = int(np.floor(0.9 * num_examples))
    selected_indices = rng.choice(num_examples, num_to_select, replace=False)
    # ... and all remaining trials go to the test set.
    remaining_indices = np.setdiff1d(np.arange(num_examples), selected_indices)

    train_samples.append(subject_examples[selected_indices])
    test_samples.append(subject_examples[remaining_indices])

train_samples_subset_channels = np.vstack(train_samples)

# First three columns are the channels; the last two columns of each trial's
# first row are annotation and subject_ID.
features = train_samples_subset_channels[:, :, :3]
attributes = train_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

np.save('train_features_3_ALL.npy', features)
np.save('train_attributes_3_ALL.npy', attributes)

test_samples_subset_channels = np.vstack(test_samples)

features = test_samples_subset_channels[:, :, :3]
attributes = test_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

np.save('test_features_3_ALL.npy', features)
np.save('test_attributes_3_ALL.npy', attributes)


Shape of the samples (examples, len_example, features): (4021, 656, 3)
Shape of the samples (examples, len_example, attributes): (4021, 2)
Shape of the samples (examples, len_example, features): (515, 656, 3)
Shape of the samples (examples, len_example, attributes): (515, 2)


## 3-SUB (25 subjects, 3 channels)

Include only the 25-subject subset and the three primary channels for left/right hand MI (Cz, C3 and C4)

In [None]:
# 3-SUB: split each of the 25 selected subjects' trials 80/20 into train and
# test sets and save the three channel columns (features) and
# (annotation, subject_ID) pairs (attributes) for each set.

# Local generator with a fixed seed so the split is identical on every run.
rng = np.random.default_rng(5)

nr_features = 3 # Number of channels to include
subset = df_25_sub[["C3..", "C4..", "Cz..", 'annotation', 'subject_ID']].to_numpy()
# (trials, rows per trial, columns). The trial count must come from the
# 25-subject subset itself: reshaping with nr_trials (computed from the full
# dataframe) fails with a size mismatch, so -1 lets NumPy derive it.
samples = subset.reshape(-1, trial_length, subset.shape[1])
# NOTE(review): `subset` mixes channel and string columns, so the saved arrays
# are presumably object dtype — confirm loaders use allow_pickle=True.

# Subject of each trial, read from the trial's first row.
subject_ids = samples[:, 0, -1]
unique_subject_ids = np.unique(subject_ids)

train_samples = []
test_samples = []

for subject_id in unique_subject_ids:
    subject_examples = samples[subject_ids == subject_id]
    num_examples = subject_examples.shape[0]
    # 80 % of the subject's trials (rounded down) go to the train set ...
    num_to_select = int(np.floor(0.8 * num_examples))
    selected_indices = rng.choice(num_examples, num_to_select, replace=False)
    # ... and all remaining trials go to the test set.
    remaining_indices = np.setdiff1d(np.arange(num_examples), selected_indices)

    train_samples.append(subject_examples[selected_indices])
    test_samples.append(subject_examples[remaining_indices])

train_samples_subset_channels = np.vstack(train_samples)

# Leading nr_features columns are the channels; the last two columns of each
# trial's first row are annotation and subject_ID.
features = train_samples_subset_channels[:, :, :nr_features]
attributes = train_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

# np.save appends '.npy' when it is missing, so spelling the suffix out keeps
# the file names on disk unchanged while matching the other cells.
np.save('train_features_3_SUB.npy', features)
np.save('train_attributes_3_SUB.npy', attributes)

test_samples_subset_channels = np.vstack(test_samples)

features = test_samples_subset_channels[:, :, :nr_features]
attributes = test_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

np.save('test_features_3_SUB.npy', features)
np.save('test_attributes_3_SUB.npy', attributes)


## 15-SUB (25 subjects, 15 channels)

Include only the 25-subject subset, but with 15 region-of-interest (ROI) channels instead of three

In [None]:
# 15-SUB: split each of the 25 selected subjects' trials 80/20 into train and
# test sets and save the 15 ROI channel columns (features) and
# (annotation, subject_ID) pairs (attributes) for each set.

# Local generator with a fixed seed so the split is identical on every run.
rng = np.random.default_rng(5)

ROI_channels = ["C3..", "C4..", "Cz..", "Fcz.", "Fc3.", "Fc4.", "Cpz.", "Cp3.", "Cp4.", "P3..", "P4..", "P7..", "P8..", "T7..", "T8.."]
nr_features = len(ROI_channels)
columns = ROI_channels + ['annotation', 'subject_ID']
subset = df_25_sub[columns].to_numpy()
# (trials, rows per trial, columns). The trial count must come from the
# 25-subject subset itself: reshaping with nr_trials (computed from the full
# dataframe) fails with a size mismatch, so -1 lets NumPy derive it.
samples = subset.reshape(-1, trial_length, subset.shape[1])
# NOTE(review): `subset` mixes channel and string columns, so the saved arrays
# are presumably object dtype — confirm loaders use allow_pickle=True.

# Subject of each trial, read from the trial's first row.
subject_ids = samples[:, 0, -1]
unique_subject_ids = np.unique(subject_ids)

train_samples = []
test_samples = []

for subject_id in unique_subject_ids:
    subject_examples = samples[subject_ids == subject_id]
    num_examples = subject_examples.shape[0]
    # 80 % of the subject's trials (rounded down) go to the train set ...
    num_to_select = int(np.floor(0.8 * num_examples))
    selected_indices = rng.choice(num_examples, num_to_select, replace=False)
    # ... and all remaining trials go to the test set.
    remaining_indices = np.setdiff1d(np.arange(num_examples), selected_indices)

    train_samples.append(subject_examples[selected_indices])
    test_samples.append(subject_examples[remaining_indices])

train_samples_subset_channels = np.vstack(train_samples)

# Leading nr_features columns are the channels; the last two columns of each
# trial's first row are annotation and subject_ID.
features = train_samples_subset_channels[:, :, :nr_features]
attributes = train_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

np.save('train_features_15_SUB.npy', features)
np.save('train_attributes_15_SUB.npy', attributes)

test_samples_subset_channels = np.vstack(test_samples)

features = test_samples_subset_channels[:, :, :nr_features]
attributes = test_samples_subset_channels[:, 0, -2:]

print("Shape of the samples (examples, len_example, features):", features.shape)
print("Shape of the samples (examples, len_example, attributes):", attributes.shape)

# np.save appends '.npy' when it is missing, so spelling the suffix out keeps
# the file names on disk unchanged while matching the train files above.
np.save('test_features_15_SUB.npy', features)
np.save('test_attributes_15_SUB.npy', attributes)
