# DOWNLOAD DATA AND UNZIP IT

In [None]:
# Download data from https://voxellab.pl/EmoNeuroDB/

# emoneuro_challenge/data/raw contains the original data provided by the competition; it is our starting point.

# IMPORT LIBRARIES

In [None]:
import os
import numpy as np
import pandas as pd
from scipy import signal
from tqdm import tqdm

import pickle
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
import random

from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.backend import clear_session
from sklearn.decomposition import PCA
import gc
from keras import backend as K

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import warnings
warnings.filterwarnings('ignore')

# MODULE 1: Data Pre-processing

In [None]:
class Preprocess_data():
    """Convert raw EEG recordings (one CSV per trial) into FFT amplitude tables.

    Each recording is re-referenced to the mean of the mastoid channels,
    band-pass filtered (high-pass followed by low-pass Butterworth, both
    applied forward and backward with ``filtfilt``), trimmed by a fixed number
    of leading samples and finally transformed with an FFT per column.
    """

    # Channels that get re-referenced. This is the subset of the recorded
    # channels found in the 21-electrode International 10-20 layout
    # (https://upload.wikimedia.org/wikipedia/commons/thumb/7/70/21_electrodes_of_International_10-20_system_for_EEG.svg/1200px-21_electrodes_of_International_10-20_system_for_EEG.svg.png).
    # The full recorded set is:
    #   'P3', 'C3', 'F3', 'Fz', 'F4', 'C4', 'P4', 'Cz', 'CM', 'A1', 'Fp1', 'Fp2',
    #   'T3', 'T5', 'O1', 'O2', 'X3', 'X2', 'F7', 'F8', 'X1', 'A2', 'T6', 'T4'
    # Columns outside CHANNEL_NAMES are NOT removed: they are still filtered
    # and transformed, they are just not re-referenced.
    CHANNEL_NAMES = ['Fp1', 'Fp2', 'F7', 'F8', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2',
                     'F3', 'F4', 'T3', 'T4', 'T5', 'T6', 'Fz', 'Cz', 'A1', 'A2']

    # Reference location (must stay a list: it is used as a column selector).
    MASTOID_CHANNELS = ['A1', 'A2']

    def __init__(self, sample_freq=300, hp_freq=1, lp_freq=50, filter_delay_ms=40):
        """Store the signal-processing settings.

        Args:
            sample_freq: Sampling frequency of the recordings (Hz).
            hp_freq: High-pass cutoff frequency (Hz).
            lp_freq: Low-pass cutoff frequency (Hz).
            filter_delay_ms: Length (ms) of the leading segment discarded
                after filtering.
        """
        self.sample_freq = sample_freq
        self.hp_freq = hp_freq
        self.lp_freq = lp_freq
        self.filter_delay_ms = filter_delay_ms

    def Preprocess_Signal(self, df):
        """Re-reference, filter and Fourier-transform every column of ``df``.

        Args:
            df: DataFrame with one column per signal. It must contain every
                name in ``CHANNEL_NAMES``; any other column (e.g. ``Time``)
                is filtered and transformed as well, without re-referencing.
                ``df`` itself is not modified.

        Returns:
            dict mapping each column name of ``df`` to a tuple
            ``(frequencies, amplitudes)`` where ``amplitudes`` is the absolute
            value of the FFT. The ``frequencies`` array is computed once and
            shared by all entries.

        Raises:
            KeyError: if a required channel column is missing.
        """
        df_processed = df.copy()

        # The reference is computed from the ORIGINAL data and only once, so
        # that A1/A2 themselves are referenced against their raw mean.
        reference = df[self.MASTOID_CHANNELS].mean(axis=1)
        for sensor in self.CHANNEL_NAMES:
            df_processed[sensor] = df[sensor] - reference

        # 4th-order Butterworth filters; cutoffs are normalised by Nyquist.
        nyquist = self.sample_freq / 2
        b_hp, a_hp = signal.butter(N=4, Wn=self.hp_freq / nyquist, btype='high', analog=False)
        b_lp, a_lp = signal.butter(N=4, Wn=self.lp_freq / nyquist, btype='low', analog=False)

        # High-pass first, then low-pass, column by column.
        df_filtered = df_processed.apply(lambda col: signal.filtfilt(b_hp, a_hp, col))
        df_filtered = df_filtered.apply(lambda col: signal.filtfilt(b_lp, a_lp, col))

        # Discard the leading samples corresponding to the configured delay.
        filter_delay_samples = int(self.filter_delay_ms * self.sample_freq / 1000)
        df_filtered = df_filtered.iloc[filter_delay_samples:]

        # The frequency axis only depends on the length, so build it once.
        frequencies = np.fft.fftfreq(len(df_filtered), d=1 / self.sample_freq)
        fft_results = {}
        for col in df_filtered.columns:
            fft_results[col] = (frequencies, np.abs(np.fft.fft(df_filtered[col])))
        return fft_results

    @staticmethod
    def _list_csv_files(directory):
        """Return the ``.csv`` file names found in ``directory``, sorted alphabetically."""
        return sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

    def _spectrum_frame(self, df):
        """Build the per-file table: optional 'Time' column plus one '<column>_Amp' per signal column."""
        res = self.Preprocess_Signal(df)

        columns = {}
        if 'Time' in res:
            # Keep as many time stamps as there are spectrum rows.
            columns['Time'] = df.Time.values[:len(res['Time'][0])]
        for channel, (_, amplitudes) in res.items():
            if channel == 'Time':
                # The spectrum of the time axis carries no information.
                continue
            columns[channel + '_Amp'] = amplitudes
        return pd.DataFrame(columns)

    @staticmethod
    def _concat_frames(frames):
        """Stack the per-file tables with a single concat (empty DataFrame when there are none)."""
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    def generate_train_data(self, train_data_path, train_labels_file):
        """Process every labelled CSV recording found in ``train_data_path``.

        Args:
            train_data_path: Directory containing the recordings.
            train_labels_file: CSV file with (at least) the columns
                ``filename`` and ``class``.

        Returns:
            DataFrame with the amplitude columns of every file stacked in
            alphabetical file order, plus ``label`` (class from the labels
            file) and ``filename`` (file name up to the first dot). An empty
            DataFrame is returned when the directory holds no CSV file.

        Raises:
            IndexError: if a recording has no entry in the labels file.
        """
        labels = pd.read_csv(train_labels_file)

        frames = []
        for file_name in tqdm(self._list_csv_files(train_data_path)):
            matches = labels.loc[labels.filename == file_name, 'class'].values
            if len(matches) == 0:
                raise IndexError(
                    'No label found for {0!r} in {1!r}'.format(file_name, train_labels_file))

            df = pd.read_csv(os.path.join(train_data_path, file_name))

            new_df = self._spectrum_frame(df)
            new_df['label'] = matches[0]
            new_df['filename'] = file_name.split('.')[0]
            frames.append(new_df)

        return self._concat_frames(frames)

    def generate_test_data(self, data_dir):
        """Process every CSV recording found in ``data_dir`` (no labels).

        Args:
            data_dir: Directory containing the recordings.

        Returns:
            DataFrame with the amplitude columns of every file stacked in
            alphabetical file order, plus ``filename`` (file name up to the
            first dot). An empty DataFrame is returned when the directory
            holds no CSV file.
        """
        frames = []
        for csv_file in tqdm(self._list_csv_files(data_dir)):
            df = pd.read_csv(os.path.join(data_dir, csv_file))

            new_df = self._spectrum_frame(df)
            new_df['filename'] = csv_file.split('.')[0]
            frames.append(new_df)

        return self._concat_frames(frames)


In [None]:
# Run MODULE 1 on the three raw splits and tag every row with its origin in
# the 'train' column: 1 = training, 2 = validation, 0 = test.
ppd = Preprocess_data()


train_data_path = 'emoneuro_challenge/data/raw/train_raw_data/train/'
train_labels_file = 'emoneuro_challenge/data/raw/train_raw_data/train_labels.csv'

print('Processing training files')
df_train = ppd.generate_train_data(train_data_path, train_labels_file)
df_train['train'] = 1 # train

print('Processing validation files')
valid_data_path = 'emoneuro_challenge/data/raw/valid_raw_data/validation'
valida_labels_file = 'emoneuro_challenge/data/raw/valid_raw_data/validation_labels.csv'

# The validation split is labelled, so it goes through the same routine as
# the training split.
df_val = ppd.generate_train_data(valid_data_path, valida_labels_file)
df_val['train'] = 2 # validation

print('Processing test files')
test_data_path = 'emoneuro_challenge/data/raw/test_raw_data'
df_test = ppd.generate_test_data(test_data_path)
df_test['train'] = 0
# The test split has no labels; a placeholder keeps the column layout aligned
# with the train/validation table.
df_test['label'] = -1 # dummy label

# Training and validation rows live in a single table; the 'train' column
# tells them apart.
df_train_val = pd.concat([df_train, df_val], ignore_index=True)

In [None]:
# Optional: uncomment the lines below to save the intermediate data to disk (this may take some time).

#outpath = 'emoneuro_challenge/data/processed/stage_1/'

#if not os.path.exists(outpath):
#    os.makedirs(outpath)

#df_train_val.to_csv(os.path.join(outpath, 'train_val.csv'), index=False)
#df_test.to_csv(os.path.join(outpath, 'test.csv'), index=False)

# MODULE 2: NN INPUT GENERATED


In [None]:
class Generate_Datasets():
    """Turn the stacked amplitude tables into the arrays fed to the network.

    Every recording (identified by its ``filename``) becomes one sequence for
    the left-hemisphere branch and one for the right-hemisphere branch.
    """

    # Bookkeeping columns: never scaled, projected or fed to the network.
    META_COLS = ['filename', 'label', 'user_ID', 'train']

    # Channels feeding each branch; Fz and Cz are shared by both.
    LEFT_CHANNELS = ['Fp1', 'F7', 'C3', 'P3', 'O1',
                     'F3', 'T3', 'T5', 'Fz', 'Cz', 'A1']
    RIGHT_CHANNELS = ['Fp2', 'F8', 'C4', 'P4', 'O2',
                      'F4', 'T4', 'T6', 'Fz', 'Cz', 'A2']

    def __init__(self):
        pass

    @staticmethod
    def _bucket_mean(frame, bucket_div):
        """Average every ``bucket_div`` consecutive rows (no-op when ``bucket_div`` <= 1)."""
        if bucket_div <= 1:
            return frame
        buckets = np.arange(len(frame)) // bucket_div
        return frame.groupby(buckets).mean()

    def _transform_features(self, frame, transform):
        """Apply ``transform`` to the non-bookkeeping columns of ``frame``.

        ``frame`` must have a default ``RangeIndex`` so the transformed block
        lines up with the bookkeeping columns when they are put back together.
        """
        features = frame.drop(self.META_COLS, axis=1)
        transformed = pd.DataFrame(transform(features), columns=features.columns)
        return pd.concat([frame[self.META_COLS], transformed], axis=1)

    def extract_features(self, df, bucket_div, test=0):
        """Split ``df`` into one left and one right sequence per recording.

        Args:
            df: Table with ``filename``, ``label`` and the ``<channel>_Amp``
                columns of every channel in LEFT_CHANNELS / RIGHT_CHANNELS.
            bucket_div: When greater than 1, every ``bucket_div`` consecutive
                rows of a recording are averaged into one row.
            test: When 0 the labels are one-hot encoded with
                ``to_categorical``; otherwise they are returned as stored.

        Returns:
            ``(X_left, X_right, y)`` as numpy arrays; recordings appear in
            the order in which their ``filename`` first shows up in ``df``.
        """
        left_nn_chn = [name + '_Amp' for name in self.LEFT_CHANNELS]
        right_nn_chn = [name + '_Amp' for name in self.RIGHT_CHANNELS]

        X_left = []
        X_right = []
        y = []

        # sort=False keeps the order of first appearance, which is the order
        # callers rely on to pair predictions with file names.
        for _, video_data in df.groupby('filename', sort=False):
            # The label is assumed to be constant over the whole recording.
            y.append(video_data['label'].iloc[0])

            # Bucketing is applied to the data that is actually returned.
            X_left.append(self._bucket_mean(video_data[left_nn_chn], bucket_div).to_numpy())
            X_right.append(self._bucket_mean(video_data[right_nn_chn], bucket_div).to_numpy())

        X_left = np.array(X_left)
        X_right = np.array(X_right)
        y = np.array(y)
        if test == 0:
            y = to_categorical(y)
        return X_left, X_right, y

    def extract_features_train(self, df, train_IDs, val_IDs, bucket_div, scale=0, pca=0):
        """Build the training and validation arrays.

        Args:
            df: Table with the bookkeeping columns (``META_COLS``) and the
                amplitude columns. It is not modified.
            train_IDs: ``user_ID`` values forming the training set.
            val_IDs: ``user_ID`` values forming the validation set.
            bucket_div: Forwarded to ``extract_features``.
            scale: Truthy to standardise the amplitude columns.
            pca: Truthy to apply a PCA keeping as many components as there
                are amplitude columns; requires ``scale``.

        Returns:
            ``[X_left, X_right, y]`` for training, the same for validation,
            the fitted ``StandardScaler`` (or None) and the fitted ``PCA``
            (or None).

        Raises:
            ValueError: if ``pca`` is requested without ``scale``.
        """
        print('Starting training data massage...')

        if pca and not scale:
            raise ValueError('PCA requires the data to be scaled first (scale=True).')

        setOI = df[df['user_ID'].isin(list(train_IDs) + list(val_IDs))].reset_index(drop=True)

        scaler = None
        if scale:
            print('Scaling data...')
            # NOTE(review): the scaler is fitted on training AND validation
            # rows together - confirm this is intended.
            scaler = StandardScaler()
            setOI = self._transform_features(setOI, scaler.fit_transform)

        pca_ = None
        if pca:
            print('Applying PCA..')
            # Keep as many components as original features.
            n_features = len(setOI.columns.drop(self.META_COLS))
            pca_ = PCA(n_components=n_features)
            setOI = self._transform_features(setOI, pca_.fit_transform)

        # NOTE(review): the split is made by user_ID, so a user present in
        # both ID lists ends up in both sets - verify against the data.
        train_set = setOI[setOI['user_ID'].isin(train_IDs)].reset_index(drop=True)
        validation_set = setOI[setOI['user_ID'].isin(val_IDs)].reset_index(drop=True)

        X_left_train, X_right_train, y_train = self.extract_features(train_set, bucket_div)
        X_left_val, X_right_val, y_val = self.extract_features(validation_set, bucket_div)

        return [X_left_train, X_right_train, y_train], [X_left_val, X_right_val, y_val], scaler, pca_

    def extract_features_test(self, df, test_IDs, bucket_div, scale=0, pca=0):
        """Build the test arrays using already fitted transformers.

        Args:
            df: Table with the bookkeeping columns (``META_COLS``) and the
                amplitude columns. It is not modified.
            test_IDs: ``user_ID`` values to keep.
            bucket_div: Forwarded to ``extract_features``.
            scale: Fitted scaler exposing ``transform``, or a falsy value to
                skip scaling (and PCA).
            pca: Fitted PCA exposing ``transform``, or a falsy value to skip
                it. Only applied when ``scale`` is given.

        Returns:
            ``[X_left, X_right, y]`` where ``y`` holds the labels as stored
            in ``df`` (not one-hot encoded).
        """
        print('Starting test data massage...')

        setOI = df[df['user_ID'].isin(test_IDs)].reset_index(drop=True)

        if scale:
            print('Scaling data...')
            setOI = self._transform_features(setOI, scale.transform)
            if pca:
                print('Applying PCA..')
                setOI = self._transform_features(setOI, pca.transform)

        # Careful: use test=0 when validation data is fed here for experiments.
        X_left_test, X_right_test, y_test = self.extract_features(setOI, bucket_div, test=1)

        return [X_left_test, X_right_test, y_test]

    def generate_sets(self, df_train_val, df_test, outpath, scale_data, pca_data, bucket_div):
        """Create the train/validation/test arrays and pickle them.

        The result is written to ``<outpath>/dataset_<bucket_div>buckets.pkl``.
        Neither ``df_train_val`` nor ``df_test`` is modified.

        Args:
            df_train_val: Train + validation table; ``train`` is 1 for
                training rows and 2 for validation rows.
            df_test: Test table.
            outpath: Existing directory receiving the pickle file.
            scale_data: Whether to standardise the amplitude columns.
            pca_data: Whether to apply PCA (requires ``scale_data``).
            bucket_div: Row-averaging factor, see ``extract_features``.

        Returns:
            Path of the pickle file that was written.
        """
        # Work on a private copy so the caller's table does not gain a column.
        df_all = df_train_val.copy()
        # The user identifier is the part of the file name before the first '_'.
        df_all['user_ID'] = df_all.filename.str.split('_').str[0]

        train_IDs = list(df_all.loc[df_all.train == 1, 'user_ID'].unique())
        val_IDs = list(df_all.loc[df_all.train == 2, 'user_ID'].unique())

        df_test_ = df_test.copy()
        df_test_['user_ID'] = df_test_.filename.str.split('_').str[0]
        test_IDs = list(df_test_.user_ID.unique())
        # Restore the extension so the names match the files on disk.
        df_test_.filename = df_test_.filename + '.csv'
        test_filenames = list(df_test_.filename.unique())

        train, val, scaler, pca = self.extract_features_train(df_all,
                                                              train_IDs,
                                                              val_IDs,
                                                              bucket_div,
                                                              scale=scale_data,
                                                              pca=pca_data)

        # The transformers fitted above are re-used, never re-fitted, on test data.
        test = self.extract_features_test(df_test_, test_IDs, bucket_div, scale=scaler, pca=pca)

        filename = os.path.join(outpath, 'dataset_'+str(bucket_div)+'buckets.pkl')
        data = {'train': train,
                'val': val,
                'test': test,
                'pca_data': pca_data,
                'scale_data': scale_data,
                'bucket_div': bucket_div,
                'train_IDs': train_IDs,
                'val_IDs': val_IDs,
                'test_IDs': test_IDs,
                'test_filenames': test_filenames
                }
        with open(filename, 'wb') as f:
            pickle.dump(data, f)
        return filename

In [None]:
# Run MODULE 2: encode the labels and write one pickled dataset per bucket size.
scale_data=True
pca_data = False

gd = Generate_Datasets()

# Bucket sizes to generate; one output file is written per entry.
bucket_divs = [1]#, 2]

outpath = 'emoneuro_challenge/data/processed/stage_2'
if not os.path.exists(outpath):
    os.makedirs(outpath)

# NOTE: these two lines rebind the notebook globals; running the cell a second
# time raises a KeyError because 'Time' is already gone.
df_train_val = df_train_val.drop(['Time'], axis=1)
df_test = df_test.drop(['Time'], axis=1)

# The encoder is fitted on the train+validation labels and saved in the
# current working directory so predictions can be decoded later.
label_encoder = LabelEncoder()
df_train_val['label'] = label_encoder.fit_transform(df_train_val['label'])
joblib.dump(label_encoder, 'label_encoder.joblib')


for bucket_div in bucket_divs:
    gd.generate_sets(df_train_val, df_test, outpath, scale_data, pca_data, bucket_div)

# MODULE 3: CLASSIFICATION

In [None]:
def LSTM_2branches(input_data):
    """Build and compile the two-branch Conv1D + LSTM classifier.

    Both branches share the same architecture (but not their weights): one
    receives the input named 'input_signal_left', the other the input named
    'input_signal_right'. Their flattened outputs are concatenated and passed
    through a dense layer and a 6-way softmax.

    Args:
        input_data: Array whose 2nd and 3rd dimensions give the input shape
            of each branch; only ``input_data.shape`` is read.

    Returns:
        The compiled Keras model (categorical cross-entropy, Adam, accuracy).
    """
    branch_shape = (input_data.shape[1], input_data.shape[2])

    def build_branch(input_name):
        # Conv -> pool -> dropout -> LSTM -> dropout -> flatten, for one side.
        signal_in = Input(shape=branch_shape, name=input_name)
        features = Conv1D(filters=32, kernel_size=3, activation='relu')(signal_in)
        features = MaxPooling1D(pool_size=2)(features)
        features = Dropout(0.5)(features)
        features = Reshape((features.shape[1], features.shape[2]))(features)
        features = LSTM(64, return_sequences=True)(features)
        features = Dropout(0.3)(features)
        return signal_in, Flatten()(features)

    # Left first, then right: the layers are created in the same order as the
    # saved weight files expect.
    left_in, left_out = build_branch('input_signal_left')
    right_in, right_out = build_branch('input_signal_right')

    merged = Concatenate()([left_out, right_out])
    hidden = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(merged)
    probabilities = Dense(6, activation='softmax')(hidden)

    model = Model(inputs=[left_in, right_in], outputs=probabilities)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
class Classifier:
    """Train, restore and run the two-branch model for one experiment."""

    def __init__(self, id_experiment):
        """Remember the experiment identifier used to name the weight files."""
        self.id_experiment = id_experiment

    def _write_predictions(self, model, X_test, test_filenames, eval_path, out_name):
        """Predict with ``model``, decode the classes and save them as ``eval_path/out_name``."""
        label_encoder = joblib.load('label_encoder.joblib')

        os.makedirs(eval_path, exist_ok=True)

        print('Evaluating dataframes...')

        # X_test is [X_left, X_right, y]; only the two inputs are needed here.
        predictions = model.predict([X_test[0], X_test[1]])

        # Convert predicted class indices back to the original labels.
        predicted_classes = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

        result_df = pd.DataFrame({'filename': test_filenames, 'class': predicted_classes})

        print(result_df['class'].value_counts())
        result_df.to_csv(os.path.join(eval_path, out_name), index=False)
        return result_df

    def evaluate_challenge_model(self, id_model, train, val, bucket_div):
        """Train the model and reload the checkpoint with the best validation accuracy.

        Args:
            id_model: Model identifier, used in the weight file name.
            train: ``[X_left, X_right, y]`` for training.
            val: ``[X_left, X_right, y]`` for validation.
            bucket_div: Bucket size of the dataset, used in the weight file name.

        Returns:
            ``(eval_val, model)``: the result of ``model.evaluate`` on the
            VALIDATION data and the model with the best weights loaded.
        """
        print('Starting training...')

        X_train_left, X_train_right, y_train = train[0], train[1], train[2]
        X_val_left, X_val_right, y_val = val[0], val[1], val[2]

        model = LSTM_2branches(X_train_left)

        # Display the model summary
        model.summary()

        weights_dir = 'emoneuro_challenge/weights'
        os.makedirs(weights_dir, exist_ok=True)

        weights_filename = os.path.join(weights_dir, self.id_experiment+ '_'+id_model+'_bckt_'+str(bucket_div)+'_weights.h5')

        # Only the weights of the epoch with the best validation accuracy are kept.
        ckpt = ModelCheckpoint(weights_filename,
                            save_best_only=True, save_weights_only=True,
                            monitor='val_accuracy', verbose=0, mode='max')

        earlystopper = EarlyStopping(monitor='val_loss', patience=20)

        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=0.00001)

        # Model fitting
        model.fit([X_train_left, X_train_right], y_train, batch_size=64, validation_data=([X_val_left, X_val_right], y_val),
            epochs=200, verbose=1,
            callbacks=[ckpt, earlystopper, reduce_lr])

        # Load the best model
        model.load_weights(weights_filename)

        # The training metrics are only displayed; the validation metrics are returned.
        print('Evaluating best on train data...')
        model.evaluate([X_train_left, X_train_right], y_train)

        print('Evaluating best on val data...')
        eval_val = model.evaluate([X_val_left, X_val_right], y_val)

        return eval_val, model

    def run_restored_model(self, id_model, weight_file, test, test_filenames, bucket_div):
        """Rebuild the model, load ``weight_file`` and write the test predictions.

        The predictions are saved under
        'emoneuro_challenge/evaluation_from_saved_model'.

        Args:
            id_model: Model identifier, used in the output file name.
            weight_file: Path of the weight file to load.
            test: ``[X_left, X_right, y]`` for the test set.
            test_filenames: File names, in the same order as the test sequences.
            bucket_div: Bucket size of the dataset, used in the output file name.

        Returns:
            DataFrame with the columns ``filename`` and ``class``.
        """
        print('Starting restoring...')

        X_test = test

        model = LSTM_2branches(X_test[0])

        # Load the weights handed in by the caller (not a notebook global).
        model.load_weights(weight_file)

        # Display the model summary
        model.summary()

        result_df = self._write_predictions(
            model, X_test, test_filenames,
            'emoneuro_challenge/evaluation_from_saved_model',
            'eval_'+id_model+'bckt_'+str(bucket_div)+'.csv')

        # Release the model before anything else is built in this session.
        del model
        K.clear_session()
        gc.collect()

        return result_df

    def generate_evaluation_file(self, id_test, X_test, test_filenames, best_model, bucket_div):
        """Write the test predictions of an already trained model.

        The predictions are saved under
        'emoneuro_challenge/evaluation_from_trained_model'.

        Args:
            id_test: Identifier used as prefix of the output file name.
            X_test: ``[X_left, X_right, y]`` for the test set.
            test_filenames: File names, in the same order as the test sequences.
            best_model: Trained model used for prediction.
            bucket_div: Bucket size of the dataset, used in the output file name.

        Returns:
            DataFrame with the columns ``filename`` and ``class``.
        """
        return self._write_predictions(
            best_model, X_test, test_filenames,
            'emoneuro_challenge/evaluation_from_trained_model',
            id_test+'bckt_'+str(bucket_div)+'.csv')



In [None]:
# Loading a pre-trained solution: rebuild the network, load the published
# weights and write the predictions for the test set.

# The pickle is the one written by MODULE 2 for bucket size 1.
datapath = 'emoneuro_challenge/data/processed/stage_2/dataset_1buckets.pkl'
with open(datapath, 'rb') as f:
    data = pickle.load(f)



experiment_id = 'Dev_LSTM'


clf = Classifier(experiment_id)

# Used in the name of the prediction file.
model_id = 'model_1_2branch'


# Unpack the stored dataset and its metadata.
train = data['train']
val = data['val']
test = data['test']
pca_data=data['pca_data']
bucket_div = data['bucket_div']
train_IDs = data['train_IDs']
val_IDs = data['val_IDs']
test_IDs = data['test_IDs']
test_filenames = data['test_filenames']


print('2 branches with {0} Buckets'.format(bucket_div))
print('Train set: {0}'.format(train_IDs))
print('Val set: {0}'.format(val_IDs))
print('Test set: {0}'.format(test_IDs))

print('Model: ', model_id)

# Weight file to restore; it is expected to exist already.
weights_filename = os.path.join('emoneuro_challenge/weights', 'Dev_LSTM_2branches.h5')
clf.run_restored_model(model_id, weights_filename, test, test_filenames, bucket_div)


In [None]:
# If you are interested in training the model: train from scratch, then write
# the predictions of the best checkpoint for the test set.

datapath = 'emoneuro_challenge/data/processed/stage_2/dataset_1buckets.pkl'
with open(datapath, 'rb') as f:
    data = pickle.load(f)

# Result containers; they are only initialised here and are not filled in by
# the code visible in this cell.
train_dict = {}
train_dict['Experiment']=[]
train_dict['Test']=[]
train_dict['Train']=[]
train_dict['Val']=[]
train_dict['Val_Acc']=[]
train_dict['Val_Loss']=[]
train_dict['Test_Acc']=[]
train_dict['Test_Loss']=[]


test_dict = {}
test_dict['Experiment']=[]
test_dict['Train']=[]
test_dict['Val']=[]
test_dict['Val_Acc']=[]
test_dict['Val_Loss']=[]

# The experiment id becomes part of the name of the saved weight file.
experiment_id = 'Trained_LSTM_2b'

clf = Classifier(experiment_id)

model_id = 'model_1_2branch'


# Unpack the stored dataset and its metadata.
train = data['train']
val = data['val']
test = data['test']
bucket_div = data['bucket_div']
train_IDs = data['train_IDs']
val_IDs = data['val_IDs']
test_IDs = data['test_IDs']
test_filenames = data['test_filenames']


print('{0} Buckets'.format(bucket_div))
print('Train set: {0}'.format(train_IDs))
print('Val set: {0}'.format(val_IDs))
print('Test set: {0}'.format(test_IDs))

# Train, then predict the test set with the model returned by the training step.
eval_train, best_model = clf.evaluate_challenge_model(model_id, train, val, bucket_div)
res_df = clf.generate_evaluation_file(model_id, test, test_filenames, best_model, bucket_div)