# Preprocess data, find minimum data length, and store in file

__Warning__: this notebook takes a long time to run.

The preprocessed recordings are stored in the folder `raw`.

## Set up

In [None]:
import os
import glob
import time
import mne
from mne.preprocessing import ICA

In [None]:
# Channel names, taken from the data manual.

# EEG channel names, in the order given by the manual.
EEG_CHS = (
    "Fp1 AF3 F3 F7 FC5 FC1 C3 T7 CP5 "
    "CP1 P3 P7 PO3 O1 Oz Pz Fp2 AF4 "
    "Fz F4 F8 FC6 FC2 Cz C4 T8 CP6 "
    "CP2 P4 P8 PO4 O2"
).split()

# Channel carrying the stimulus events.
STIM_CHS = ["Status"]

# Remaining channels: EXG1..EXG8 followed by the other listed names.
OTHER_CHS = ["EXG{}".format(n) for n in range(1, 9)] + [
    "GSR1", "GSR2", "Erg1", "Erg2", "Resp", "Temp",
]

### Make bdf file path list

In [None]:
# Collect the absolute path of the single bdf recording of every session.
dataset_path = "/net2/expData/affective_eeg/mahnob_dataset/Sessions"
meta_data_path = "session.xml"

# List of all session names. os.listdir returns entries in arbitrary order;
# sorting (lexicographically, as strings) makes the order of the lists built
# below reproducible from run to run.
all_session_nums = sorted(os.listdir(dataset_path))

raw_names = [] # Absolute paths to all bdf files
session_nums = [] # Sessions with bdf recordings

# The working directory is never changed by this cell: files are located
# with a full path pattern instead of chdir + relative glob, so an error in
# the loop cannot leave the notebook in another directory.
curr_dir = os.getcwd()

# From data manual, bdf file may not exist if "the trials is missing due to
# technical difficulties" (pg 15).
# Skip all sessions with no bdf recordings
for session in all_session_nums:
    session_path = os.path.join(dataset_path, session)

    # glob.escape protects against glob metacharacters in the directory
    # part; entries that are not directories simply produce no match.
    bdf_list = glob.glob(os.path.join(glob.escape(session_path), "*.bdf"))

    if len(bdf_list) > 1:
        raise ValueError("Cannot handle multiple bdf files in one session.")

    if len(bdf_list) == 1:
        session_nums.append(session)
        # The match already contains dataset_path and the session name
        raw_names.append(bdf_list[0])

# Print the (unchanged) working directory as a confirmation
print("Back to directory: ", os.getcwd())

### Find the shortest recording for time cutoff

In [None]:
%%capture
# Load every recording and find the shortest stimulus presentation (in whole
# seconds). Results: loaded_raws (preloaded Raw objects, same order as
# raw_names) and min_record_len.
start_time = time.time()

loaded_raws = []
# No assumed upper bound: start from "unknown" instead of a magic number,
# so the result is correct even if every recording is long.
min_record_len = None

# Entries of raw_names are already absolute paths, so they are used directly
for raw_path in raw_names:
    raw = mne.io.read_raw_bdf(raw_path, preload=True)
    events = mne.find_events(raw, stim_channel="Status")

    # Exactly two events are expected: stimulus start and stimulus stop
    if len(events) != 2:
        raise ValueError("Events other than stimuli start and stop are found.")

    # Convert the event sample indices (first column of the events array)
    # to seconds as index / sampling frequency; int() truncates each value
    # to whole seconds.
    sampling_freq = int(raw.info["sfreq"])
    start, end = (int(sample / sampling_freq) for sample in events[:, 0])

    duration = end - start
    if min_record_len is None or duration < min_record_len:
        min_record_len = duration

    loaded_raws.append(raw)

# Without any recording there is no meaningful cutoff; fail loudly instead
# of reporting a made-up value.
if min_record_len is None:
    raise ValueError("No bdf recordings found; cannot determine a time cutoff.")

In [None]:
# Report how long the scan took and the resulting cutoff.
elapsed = time.time() - start_time
print("Took %ss to finish." % elapsed)

shortest_msg = "Shortest stimuli presentation: {}s".format(min_record_len)
print(shortest_msg)

### Preprocess raw

In [None]:
%%capture
# Crop every loaded recording to the common duration, keep only the EEG
# channels, re-reference, band-pass filter, apply ICA and save the result
# as raw/session_<index>_raw.fif (index = position in loaded_raws).
start_time = time.time()

# exist_ok avoids the check-then-create race of exists() + makedirs()
os.makedirs("raw", exist_ok=True)

for i, loaded_raw in enumerate(loaded_raws):
    # Work on a copy: the steps below modify the object in place, and the
    # entries of loaded_raws must keep their "Status" channel so that this
    # cell can be re-run.
    raw = loaded_raw.copy()

    # Find stimuli start (first event), truncated to whole seconds
    events = mne.find_events(raw, stim_channel="Status")
    start_idx = events[0][0]
    sampling_freq = int(raw.info["sfreq"])
    start = int(start_idx / sampling_freq)
    end = start + min_record_len

    # Cut data based on duration
    raw.crop(tmin=start, tmax=end)

    # Drop channel, rereference, filter
    raw.drop_channels(OTHER_CHS + STIM_CHS)
    # Rereference by mean; copy=False because raw is already a private copy
    raw, _ = mne.set_eeg_reference(raw, copy=False)
    raw.filter(l_freq=1, h_freq=49)

    # ICA
    ica = ICA(n_components=15, random_state=413)
    ica.fit(raw)
    # NOTE(review): components 0 and 1 are excluded unconditionally, without
    # inspecting them -- confirm this is intended for every recording.
    ica.exclude = [0, 1]
    ica.apply(raw) # Project back

    # Store raw
    file_name = os.path.join("raw", "session_{:04}_raw.fif".format(i))
    raw.save(file_name, overwrite=True)

In [None]:
# Report how long the preprocessing took.
elapsed = time.time() - start_time
print("Took %ss to finish." % elapsed)