## Data Preprocessing

### Separate the input and output columns

In [16]:
import pandas as pd

data = pd.read_csv("area_under_curve.csv")
data.head()

Unnamed: 0,Delta_TP9,Delta_AF7,Delta_AF8,Delta_TP10,Theta_TP9,Theta_AF7,Theta_AF8,Theta_TP10,Alpha_TP9,Alpha_AF7,...,Alpha_TP10,Beta_TP9,Beta_AF7,Beta_AF8,Beta_TP10,Gamma_TP9,Gamma_AF7,Gamma_AF8,Gamma_TP10,Label
0,17099.267458,18719.659603,0.0,16531.085677,14241.372044,16014.591076,0.0,14005.573779,20210.156169,13390.938539,...,17939.091727,18599.079901,11146.12458,0.0,16501.495172,14527.987858,1724.751042,0.0,13411.244841,1.0
1,11198.7203,18034.494223,0.0,10638.094777,9873.838888,10651.599822,0.0,10103.227633,14258.112876,11266.480227,...,11566.18776,16072.507083,4980.861282,0.0,14331.406797,10389.427864,698.552269,0.0,11762.435847,0.0
2,11759.988085,13951.123996,19378.781415,12556.104976,13919.691015,10065.378732,11625.817268,11532.015437,17423.91887,11571.261819,...,16049.267047,22875.177018,7477.828144,11555.482578,19023.980067,21727.751956,1820.347673,8161.365913,18038.997118,1.0
3,15323.164331,17742.255086,26539.714903,13709.487935,12177.496766,11701.420319,19188.046574,12695.649223,16062.281032,11428.816573,...,14820.330323,20204.401936,7329.884952,13204.570131,17277.559869,14531.156166,-292.081546,4763.145038,13344.224953,0.0
4,11107.3057,9098.477752,0.0,13752.142985,12020.918779,4061.515408,0.0,10217.90727,18837.98411,7334.236651,...,13511.564101,18480.740151,4682.758088,0.0,14374.795867,14601.476676,534.096694,0.0,10550.715838,0.0


### Median imputation

In [17]:
def replace_zeros_with_median(df, label_column):
    """Function to replace 0s with median for each column based on the label"""
    
    for column in df.columns:
        # calculate median for each label group
        median_values = df.groupby(label_column)[column].median()

        # replace 0s with median based on the label of each row
        df[column] = df.apply(lambda row: median_values[row[label_column]] if row[column] == 0 else row[column], axis=1)

    return df

In [18]:
data = replace_zeros_with_median(data, "Label")
data.head()

Unnamed: 0,Delta_TP9,Delta_AF7,Delta_AF8,Delta_TP10,Theta_TP9,Theta_AF7,Theta_AF8,Theta_TP10,Alpha_TP9,Alpha_AF7,...,Alpha_TP10,Beta_TP9,Beta_AF7,Beta_AF8,Beta_TP10,Gamma_TP9,Gamma_AF7,Gamma_AF8,Gamma_TP10,Label
0,17099.267458,18719.659603,10611.265447,16531.085677,14241.372044,16014.591076,3761.048147,14005.573779,20210.156169,13390.938539,...,17939.091727,18599.079901,11146.12458,2903.246128,16501.495172,14527.987858,1724.751042,-232.061855,13411.244841,1.0
1,11198.7203,18034.494223,7233.162447,10638.094777,9873.838888,10651.599822,4152.945713,10103.227633,14258.112876,11266.480227,...,11566.18776,16072.507083,4980.861282,1057.621655,14331.406797,10389.427864,698.552269,0.0,11762.435847,0.0
2,11759.988085,13951.123996,19378.781415,12556.104976,13919.691015,10065.378732,11625.817268,11532.015437,17423.91887,11571.261819,...,16049.267047,22875.177018,7477.828144,11555.482578,19023.980067,21727.751956,1820.347673,8161.365913,18038.997118,1.0
3,15323.164331,17742.255086,26539.714903,13709.487935,12177.496766,11701.420319,19188.046574,12695.649223,16062.281032,11428.816573,...,14820.330323,20204.401936,7329.884952,13204.570131,17277.559869,14531.156166,-292.081546,4763.145038,13344.224953,0.0
4,11107.3057,9098.477752,7233.162447,13752.142985,12020.918779,4061.515408,4152.945713,10217.90727,18837.98411,7334.236651,...,13511.564101,18480.740151,4682.758088,1057.621655,14374.795867,14601.476676,534.096694,0.0,10550.715838,0.0


In [19]:
selected_columns = [
    'Delta_TP9', 'Delta_AF7', 'Delta_AF8', 'Delta_TP10',
    'Theta_TP9', 'Theta_AF7', 'Theta_AF8', 'Theta_TP10',
    'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
    'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10',
    'Gamma_TP9', 'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10'
]

X = data.loc[:, selected_columns]
X.head()

Unnamed: 0,Delta_TP9,Delta_AF7,Delta_AF8,Delta_TP10,Theta_TP9,Theta_AF7,Theta_AF8,Theta_TP10,Alpha_TP9,Alpha_AF7,Alpha_AF8,Alpha_TP10,Beta_TP9,Beta_AF7,Beta_AF8,Beta_TP10,Gamma_TP9,Gamma_AF7,Gamma_AF8,Gamma_TP10
0,17099.267458,18719.659603,10611.265447,16531.085677,14241.372044,16014.591076,3761.048147,14005.573779,20210.156169,13390.938539,5140.426448,17939.091727,18599.079901,11146.12458,2903.246128,16501.495172,14527.987858,1724.751042,-232.061855,13411.244841
1,11198.7203,18034.494223,7233.162447,10638.094777,9873.838888,10651.599822,4152.945713,10103.227633,14258.112876,11266.480227,3976.212778,11566.18776,16072.507083,4980.861282,1057.621655,14331.406797,10389.427864,698.552269,0.0,11762.435847
2,11759.988085,13951.123996,19378.781415,12556.104976,13919.691015,10065.378732,11625.817268,11532.015437,17423.91887,11571.261819,13469.210388,16049.267047,22875.177018,7477.828144,11555.482578,19023.980067,21727.751956,1820.347673,8161.365913,18038.997118
3,15323.164331,17742.255086,26539.714903,13709.487935,12177.496766,11701.420319,19188.046574,12695.649223,16062.281032,11428.816573,18221.013524,14820.330323,20204.401936,7329.884952,13204.570131,17277.559869,14531.156166,-292.081546,4763.145038,13344.224953
4,11107.3057,9098.477752,7233.162447,13752.142985,12020.918779,4061.515408,4152.945713,10217.90727,18837.98411,7334.236651,3976.212778,13511.564101,18480.740151,4682.758088,1057.621655,14374.795867,14601.476676,534.096694,0.0,10550.715838


In [20]:
# For y, we have either 0 or 1. 
# 0 = "Recall"
# 1 = "Maze"

y = data.iloc[:, -1]
y.head()

0    1.0
1    0.0
2    1.0
3    0.0
4    0.0
Name: Label, dtype: float64

### Removing rows with missing data

In [21]:
X_cleaned = X.dropna()
X_cleaned.head()

Unnamed: 0,Delta_TP9,Delta_AF7,Delta_AF8,Delta_TP10,Theta_TP9,Theta_AF7,Theta_AF8,Theta_TP10,Alpha_TP9,Alpha_AF7,Alpha_AF8,Alpha_TP10,Beta_TP9,Beta_AF7,Beta_AF8,Beta_TP10,Gamma_TP9,Gamma_AF7,Gamma_AF8,Gamma_TP10
0,17099.267458,18719.659603,10611.265447,16531.085677,14241.372044,16014.591076,3761.048147,14005.573779,20210.156169,13390.938539,5140.426448,17939.091727,18599.079901,11146.12458,2903.246128,16501.495172,14527.987858,1724.751042,-232.061855,13411.244841
1,11198.7203,18034.494223,7233.162447,10638.094777,9873.838888,10651.599822,4152.945713,10103.227633,14258.112876,11266.480227,3976.212778,11566.18776,16072.507083,4980.861282,1057.621655,14331.406797,10389.427864,698.552269,0.0,11762.435847
2,11759.988085,13951.123996,19378.781415,12556.104976,13919.691015,10065.378732,11625.817268,11532.015437,17423.91887,11571.261819,13469.210388,16049.267047,22875.177018,7477.828144,11555.482578,19023.980067,21727.751956,1820.347673,8161.365913,18038.997118
3,15323.164331,17742.255086,26539.714903,13709.487935,12177.496766,11701.420319,19188.046574,12695.649223,16062.281032,11428.816573,18221.013524,14820.330323,20204.401936,7329.884952,13204.570131,17277.559869,14531.156166,-292.081546,4763.145038,13344.224953
4,11107.3057,9098.477752,7233.162447,13752.142985,12020.918779,4061.515408,4152.945713,10217.90727,18837.98411,7334.236651,3976.212778,13511.564101,18480.740151,4682.758088,1057.621655,14374.795867,14601.476676,534.096694,0.0,10550.715838


### Remove outliers

In [22]:
Q1 = X_cleaned.quantile(0.25)
Q3 = X_cleaned.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 3.0 * IQR
upper_bound = Q3 + 3.0 * IQR

outliers = ((X_cleaned < lower_bound) | (X_cleaned > upper_bound)).any(axis=1)

X_cleaned = X_cleaned[~outliers]
y_cleaned = y[~outliers]

### Splitting the train and test data

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(18, 20) (5, 20) (18,) (5,)


### Data Augmentation

In [24]:
import numpy as np

augmentation_factor = 2
noise_factor = 0.5

X_train_augmented = np.empty((0, X_train.shape[1]))
y_train_augmented = np.tile(y_train, augmentation_factor)

for i in range(augmentation_factor):
    X_train_noisy = X_train + np.random.normal(0, noise_factor, size=X_train.shape)
    X_train_augmented = np.vstack((X_train_augmented, X_train_noisy))

y_train_augmented = np.tile(y_train, augmentation_factor)

### Feature Scaling

In [25]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train_augmented)

sc.feature_names_in_ = [str(i) for i in range(X_train_augmented.shape[1])]

X_train = sc.fit_transform(X_train_augmented)
y_train = y_train_augmented
X_test = sc.transform(X_test)



### KNN

In [26]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

knn_clf = KNeighborsClassifier(n_neighbors=5)

scores = cross_val_score(knn_clf, X_train, y_train, cv=5)

for i, score in enumerate(scores):
    print(f'Fold {i+1} Accuracy: {score:.3f}')

mean_accuracy = scores.mean()
print(f'Mean Accuracy: {mean_accuracy:.3f}')

knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_knn)}")

Fold 1 Accuracy: 0.625
Fold 2 Accuracy: 0.857
Fold 3 Accuracy: 1.000
Fold 4 Accuracy: 0.429
Fold 5 Accuracy: 1.000
Mean Accuracy: 0.782
Test Accuracy: 0.8


### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

reg_model = LogisticRegression(max_iter=1000)

reg_model.fit(X_train, y_train)
y_pred_reg = reg_model.predict(X_test)

print(f"Test Accuracy: {accuracy_score(y_test, y_pred_reg)}")

Test Accuracy: 0.6


### SVM

In [28]:
from sklearn.svm import SVC

svmrbf = SVC(kernel='rbf')
svmrbf.fit(X_train, y_train)
y_pred = svmrbf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.6


### Decision Tree Classifier

In [29]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.6


### Random Forest Classifier

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(class_weight='balanced')
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8
