In [None]:
import os 
import numpy as np
import string
import torch
import re
from tqdm import tqdm
from scipy.io import loadmat
from utils import  TrainDataset

## Dataset
The dataset contains the EEG recordings of 15 subjects. For each subject, we have 15 different recordings, each one collected while the subject was watching a different movie clip. Each clip is associated with an emotional state among {sad: -1, neutral: 0, happy: 1}. Each EEG recording comprises 62 channels.

N.B. Recordings corresponding to the same movie have the same length, while recordings corresponding to different movies have (in general) different lengths. How do we handle this? It is not a problem, since we take sub-windows of the signals.

The data have been preprocessed by downsampling the signals to 200 Hz, segmenting them so that each segment corresponds to the length of its movie clip, and applying a 0-75 Hz band-pass filter. Since the recordings are about 4 minutes long and are now sampled at 200 Hz, they contain roughly 48k time points each.

In [None]:
# Load a single session file so its contents can be inspected in the cells below.
data = loadmat(file_name="data/Preprocessed_EEG/1_20131027.mat")

In [None]:
# Load the label file stored next to the recordings and display the shape of its "label" array.
labels = loadmat(file_name="data/Preprocessed_EEG/label.mat")
np.shape(labels["label"])

In [None]:
# Show which entries the sample file contains, then display the shape of the
# array stored under "djc_eeg3".
print(data.keys())
np.shape(data["djc_eeg3"])

In [None]:
# Survey how many time points each trial contains across every session file.
EEG_DIR = "data/Preprocessed_EEG"
N_TRIALS = 15  # number of "<prefix>_eeg<k>" arrays read from each file
# Trial arrays are stored under keys of the form "<prefix>_eeg<k>".
TRIAL_KEY = re.compile(r"^(?P<prefix>.+)_eeg\d+$")

# Session files start with a digit, which excludes label.mat. os.listdir
# returns entries in arbitrary order, so sort to keep the row order of
# `data_lengths` reproducible between runs and machines.
eeg_files = sorted(
    fname
    for fname in os.listdir(EEG_DIR)
    if fname[0] in string.digits and fname.endswith(".mat")
)
if not eeg_files:
    raise FileNotFoundError(f"No EEG session files found in {EEG_DIR!r}")

data_lengths = []
for eeg_file in tqdm(eeg_files):
    raw = loadmat(os.path.join(EEG_DIR, eeg_file))
    # Find the key prefix by matching the key names instead of relying on the
    # position of a key inside the dict, which also holds loadmat metadata
    # entries and is not guaranteed to be laid out the same way in every file.
    prefixes = {
        match.group("prefix")
        for match in map(TRIAL_KEY.match, raw.keys())
        if match
    }
    if len(prefixes) != 1:
        raise ValueError(
            f"Expected exactly one '<prefix>_eeg<k>' key prefix in {eeg_file}, "
            f"found {sorted(prefixes)}"
        )
    pattern = prefixes.pop()
    # Axis 1 of each trial array is used as the recording length.
    curr_lengths = [raw[f"{pattern}_eeg{i + 1}"].shape[1] for i in range(N_TRIALS)]
    data_lengths.append(curr_lengths)

# One row per session file, one column per trial.
data_lengths = np.asarray(data_lengths)
print(data_lengths.shape)
print(np.mean(data_lengths))          # overall mean length
print(np.mean(data_lengths, axis=0))  # mean length per trial (across files)
print(np.mean(data_lengths, axis=1))  # mean length per file (across trials)
print(np.unique(data_lengths, return_counts=True))  # distinct lengths and their counts
print(np.std(data_lengths, axis=0))   # per-trial spread across files

In [None]:
# Build the dataset object from the preprocessed recordings directory and the label file.
# NOTE(review): the meaning of the positional arguments 2000, 200 and False is not
# visible here — presumably a window length and a stride (in samples) plus a boolean
# option; confirm against the TrainDataset signature in utils.
dataset = TrainDataset("data/Preprocessed_EEG", "data/Preprocessed_EEG/label.mat", 2000, 200, False)