In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Number of cross-validation folds the units are distributed over.
number_of_folds = 10
# When True, only rows with Time > 0 are kept before building the folds.
remove_baseline_time = True
# Thresholds on the "Time" column used to detect where one unit ends and the
# next begins: a new unit starts at a row whose Time is below lower_bound when
# the previous row's Time is above upper_bound.
lower_bound = 10
upper_bound = 550

In [3]:
# Read the negative, neutral, and positive recordings from the data folder.
# For every state, the per-band CSV files are joined side by side into one
# frame with a single shared "Time" column and band-suffixed channel columns.
# NOTE(review): the variables below are named subject_1_* although
# subject_id is "02" -- confirm which subject is intended.
subject_id = "02"
states = ["neg", "neu", "pos"]
bands = ["alpha", "beta", "theta"]
channels = ["TP9", "AF7", "AF8", "TP10"]

neg_neu_pos = []
for state in states:
    band_frames = []
    for band in bands:
        df_temp = pd.read_csv(f"data/{subject_id}{state}_filt_{band}.csv")
        # suffix each channel column with the band it was filtered for
        df_temp = df_temp.rename(columns={ch: f"{ch}_{band}" for ch in channels})
        # keep one "Time" column only: every band after the first drops its copy
        if band != bands[0]:
            df_temp = df_temp.drop(columns="Time")
        band_frames.append(df_temp)
    # join all bands in a single concat instead of growing a frame in the loop
    df = pd.concat(band_frames, axis=1)
    neg_neu_pos.append(df)

# the order follows `states`: negative, neutral, positive
subject_1_negative, subject_1_neutral, subject_1_positive = neg_neu_pos

In [4]:
# Encode the state as the target column "y":
# negative -> -1, neutral -> 0, positive -> 1
for labelled_frame, label in (
    (subject_1_negative, -1),
    (subject_1_neutral, 0),
    (subject_1_positive, 1),
):
    labelled_frame["y"] = label

In [5]:
# Stack the three labelled frames (negative, neutral, positive) on top of each
# other and renumber the rows from 0.
subject_1_data = pd.concat(
    objs=[subject_1_negative, subject_1_neutral, subject_1_positive],
    axis=0,
    ignore_index=True,
)

In [6]:
# Optionally discard the rows with Time <= 0 (the baseline period, going by the
# flag name).
if remove_baseline_time:
    subject_1_data = subject_1_data.loc[subject_1_data["Time"] > 0]
# reset_index returns a new frame, so the result must be assigned back;
# otherwise subject_1_data keeps the gapped index left behind by the filter
# while the displayed output suggests it was renumbered.
subject_1_data = subject_1_data.reset_index(drop=True)
subject_1_data

Unnamed: 0,Time,TP9_alpha,AF7_alpha,AF8_alpha,TP10_alpha,TP9_beta,AF7_beta,AF8_beta,TP10_beta,TP9_theta,AF7_theta,AF8_theta,TP10_theta,y
0,3.9062,-3.0137,0.4058,0.5754,-3.8169,2.7023,1.5699,4.0310,3.5601,1.0786,-3.1485,4.1479,-5.4435,-1
1,7.8125,-3.2715,-0.5018,0.2089,-4.0241,2.4943,-0.7785,3.8728,7.6806,0.6374,-3.6357,4.3252,-5.6220,-1
2,11.7188,-3.3096,-1.3771,-0.1593,-4.0076,1.7407,-2.5450,2.7630,10.6358,0.1160,-4.0726,4.4351,-5.8117,-1
3,15.6250,-3.0776,-2.1612,-0.4854,-3.7371,0.5460,-3.2586,0.8417,11.0240,-0.5015,-4.4453,4.4723,-6.0238,-1
4,19.5312,-2.5463,-2.8013,-0.7286,-3.2007,-0.8762,-2.9653,-1.5161,8.1334,-1.2290,-4.7407,4.4335,-6.2677,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35797,582.0312,6.3104,-1.4109,2.3836,5.4292,7.4722,-3.9437,-2.6890,15.2891,1.4775,-0.1774,1.4733,2.3734,1
35798,585.9375,7.7470,-1.4542,1.4851,4.6253,6.6585,-2.8115,-3.0971,9.1157,2.5591,-0.2375,1.3332,1.7398,1
35799,589.8438,8.6010,-1.3890,0.5167,3.5358,5.4510,-0.9086,-3.0196,1.3267,3.5198,-0.3016,1.1524,1.0071,1
35800,593.7500,8.8443,-1.2393,-0.4377,2.2147,3.9851,0.9928,-2.7807,-5.3306,4.3391,-0.3656,0.9343,0.1984,1


In [7]:
# k-fold cross-validation setup:
# one empty bucket per fold; the extracted units are dealt into them in order.
folds = [[] for _ in range(number_of_folds)]

def assign_units_to_folds(df, folds, lower_bound, upper_bound):
    """Split ``df`` into consecutive units and deal them round-robin into ``folds``.

    A new unit starts at row ``j`` whenever the "Time" value of row ``j`` is
    below ``lower_bound`` while the "Time" value of row ``j - 1`` is above
    ``upper_bound``. The units are appended to the fold lists in order:
    the first unit to ``folds[0]``, the second to ``folds[1]``, and so on,
    wrapping around after the last fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Time" column; rows are processed in positional order.
    folds : list of list
        Fold buckets; mutated in place (each unit is a positional slice of ``df``).
    lower_bound, upper_bound : number
        Thresholds that identify the jump in "Time" marking a unit boundary.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` has rows but ``folds`` is empty.
    """
    num_rows = len(df)
    if num_rows == 0:
        # nothing to distribute; do not register an empty unit
        return
    num_folds = len(folds)
    if num_folds == 0:
        raise IndexError("folds must contain at least one fold list")

    # Pull the column out once; indexing row by row through iloc is far slower.
    times = df["Time"].to_numpy()
    # is_start[k] describes row k + 1: True when it opens a new unit
    is_start = (times[1:] < lower_bound) & (times[:-1] > upper_bound)
    unit_starts = (is_start.nonzero()[0] + 1).tolist()

    # consecutive edges delimit the units; the last unit runs to the end of df
    edges = [0, *unit_starts, num_rows]
    for unit_number, (begin, end) in enumerate(zip(edges[:-1], edges[1:])):
        folds[unit_number % num_folds].append(df.iloc[begin:end])

In [8]:
# assign_units_to_folds appends into `folds` in place, so start from empty
# folds here; otherwise re-running this cell would add every unit a second time.
folds = [[] for _ in range(number_of_folds)]
assign_units_to_folds(subject_1_data, folds, lower_bound, upper_bound)

In [9]:
def concat_dataframes(fold_list, remove_columns_names):
    """Merge each fold's list of DataFrames into a single DataFrame.

    For every fold in ``fold_list`` the contained frames are stacked with a
    fresh 0..n-1 index, and the columns named in ``remove_columns_names`` are
    dropped from the result. Returns the list of merged frames, one per fold.
    """
    return [
        pd.concat(units, ignore_index=True).drop(columns=remove_columns_names)
        for units in fold_list
    ]

In [10]:
# Band-suffixed column names of the "TP9" and "TP10" channels.
columns_to_be_removed = [
    f"{channel}_{band}" for band in bands for channel in ("TP9", "TP10")
]
# NOTE(review): columns_to_be_removed is not passed below -- an empty list is
# used, so no columns are dropped from the folds. Confirm this is intended.
folds_concat = concat_dataframes(folds, [])

In [16]:
# Evaluate each selected classifier with k-fold cross-validation over
# folds_concat and record its mean accuracy in acc_res.
names = [
    "Adaboost",
    "RandomForest",
#     "GradientBoost",
    'Nearest Neighbors',
#     'LDA',
]

# Fixed seed so the randomised ensembles give the same numbers on every run.
random_seed = 42

# One factory per classifier name. Looking the name up here means a typo in
# `names` fails with a KeyError instead of silently reusing the previous model.
classifier_factories = {
    "Adaboost": lambda: AdaBoostClassifier(random_state=random_seed),
    "RandomForest": lambda: RandomForestClassifier(random_state=random_seed),
    "GradientBoost": lambda: GradientBoostingClassifier(random_state=random_seed),
    'Nearest Neighbors': lambda: KNeighborsClassifier(n_neighbors=3),
    "LDA": lambda: LinearDiscriminantAnalysis(),
}

acc_res = {}
for name in names:
    make_clf = classifier_factories[name]
    accuracy_lst = []
    # Rotate a local copy so the shared folds_concat list is never reordered
    # and every classifier is scored on exactly the same splits.
    rotation = list(folds_concat)
    for _ in range(number_of_folds):
        clf = make_clf()
        # the last fold of the current rotation is the test set, the rest train
        train_data = pd.concat(rotation[:-1], ignore_index=True)
        test_data = rotation[-1]
        # move the last fold to the front so the next iteration tests on another fold
        rotation = rotation[-1:] + rotation[:-1]
        # "y" is the label; every other column is used as a feature.
        # NOTE(review): this includes the "Time" column -- confirm it is meant
        # to be a feature.
        train_X = train_data.drop(columns="y")
        train_Y = train_data["y"]
        test_X = test_data.drop(columns="y")
        test_Y = test_data["y"]
        clf.fit(train_X, train_Y)
        y_predict = clf.predict(test_X)
        # documented argument order is (y_true, y_pred)
        accuracy = metrics.accuracy_score(test_Y, y_predict)
        accuracy_lst.append(accuracy)
    avg_acc = round(sum(accuracy_lst) / len(accuracy_lst),3)
    acc_res[name] = avg_acc
    print(f"{name} yields accuracy of {avg_acc}")

Adaboost yields accuracy of 0.344
RandomForest yields accuracy of 0.357
Nearest Neighbors yields accuracy of 0.357


In [17]:
# Display the mean cross-validation accuracy recorded for each classifier name.
acc_res

{'Adaboost': 0.344, 'RandomForest': 0.357, 'Nearest Neighbors': 0.357}