CLPS 1291 Final Project

In [None]:
import pandas as pd
import numpy as np
import collections
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize, OneHotEncoder
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.svm import SVC  # For Support Vector Machine
from sklearn.neural_network import MLPClassifier  # For Multi-Layer Perceptron
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.metrics import accuracy_score, classification_report # For evaluating

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Folder on the mounted Drive that holds the EEG files read by this notebook.
base_path = "/content/drive/MyDrive/EEG/"
# Only the header row of columnLabels.csv is kept; read_all_subjects() passes it as
# `names=` when it reads the per-subject CSVs with header=None.
columns = pd.read_csv(base_path + "columnLabels.csv").columns

In [None]:
# Display the column names: subject/trial/condition/sample followed by the per-channel columns.
columns

Index(['subject', 'trial', 'condition', 'sample', 'Fp1', 'AF7', 'AF3', 'F1',
       'F3', 'F5', 'F7', 'FT7', 'FC5', 'FC3', 'FC1', 'C1', 'C3', 'C5', 'T7',
       'TP7', 'CP5', 'CP3', 'CP1', 'P1', 'P3', 'P5', 'P7', 'P9', 'PO7', 'PO3',
       'O1', 'Iz', 'Oz', 'POz', 'Pz', 'CPz', 'Fpz', 'Fp2', 'AF8', 'AF4', 'AFz',
       'Fz', 'F2', 'F4', 'F6', 'F8', 'FT8', 'FC6', 'FC4', 'FC2', 'FCz', 'Cz',
       'C2', 'C4', 'C6', 'T8', 'TP8', 'CP6', 'CP4', 'CP2', 'P2', 'P4', 'P6',
       'P8', 'P10', 'PO8', 'PO4', 'O2', 'VEOa', 'VEOb', 'HEOL', 'HEOR', 'Nose',
       'TP10'],
      dtype='object')

In [None]:
# Build a lookup from each value in the 'subject' column to the matching ' group' value.
demography = pd.read_csv(base_path + "demographic.csv")
subjects = demography['subject']
types = demography[' group']  # this column name starts with a space
subject_type = dict(zip(subjects.to_numpy(), types.to_numpy()))

In [None]:
def averaged_N_rows(a, n):
    """Average every ``n`` consecutive rows of a 2-D array.

    Parameters
    ----------
    a : numpy.ndarray
        Array of shape ``(rows, cols)``; ``rows`` must be a multiple of ``n``.
    n : int
        Number of consecutive rows folded into each output row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows // n, cols)`` holding the column-wise mean of
        each group of ``n`` rows.
    """
    n_rows = a.shape[0]
    assert n_rows % n == 0
    # View the rows as (groups, n, cols) and collapse the middle axis.
    grouped = a.reshape(n_rows // n, n, a.shape[1])
    return grouped.mean(axis=1)

In [None]:
def read_subject(data, subject_idx, samples_per_trial=9216, window=64):
    """Convert one subject's recording into scaled, down-sampled per-trial arrays.

    Trials whose row count differs from ``samples_per_trial`` are skipped. For
    every kept trial the metadata columns are dropped, each run of ``window``
    consecutive rows is averaged into one row, and the result is min-max
    scaled column by column using that trial's own minimum and maximum.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for a single subject; must contain the columns ``subject``,
        ``trial``, ``sample`` and ``condition`` plus the signal columns.
    subject_idx : hashable
        Key into the module-level ``subject_type`` mapping; the mapped value
        becomes the label of every kept trial.
    samples_per_trial : int, default 9216
        Exact number of rows a trial must have in order to be kept.
    window : int, default 64
        Number of consecutive rows averaged together; must divide
        ``samples_per_trial``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(trials, labels)``. ``trials`` has shape
        ``(kept_trials, samples_per_trial // window, signal_columns)`` and
        ``labels`` holds one entry per kept trial. When no trial is kept both
        arrays are empty with shape ``(0,)``.
    """
    meta_columns = ["subject", "trial", "sample", "condition"]
    trial_counts = collections.Counter(data.trial)
    scaler = MinMaxScaler()
    all_data = []
    labels = []
    for t, n_rows in trial_counts.items():
        if n_rows != samples_per_trial:
            continue
        signals = data[data.trial == t].drop(columns=meta_columns).values
        trial_data = averaged_N_rows(signals, window)
        # fit_transform refits the scaler each time, so trials are scaled independently.
        all_data.append(scaler.fit_transform(trial_data))
        labels.append(subject_type[subject_idx])
    return np.array(all_data), np.array(labels)

In [None]:
def read_all_subjects(data_dir="../eeg_data/"):
    """Load every ``*.csv`` entry under ``data_dir`` and stack the per-trial arrays.

    Each entry name must start with the integer subject id (``"<id>.csv"``).
    The entry may be a folder that contains a file of the same name (the
    layout the original code assumed) or the CSV file itself.

    Parameters
    ----------
    data_dir : str, default "../eeg_data/"
        Directory that holds the per-subject entries.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels)``: the outputs of ``read_subject`` for every subject,
        concatenated along the first axis.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` or a subject file does not exist.
    ValueError
        If an entry name does not start with an integer, or if there is
        nothing to concatenate.
    """
    # os.listdir returns entries in arbitrary order; sort so the stacked output is reproducible.
    subject_files = sorted(p for p in os.listdir(data_dir) if p.endswith(".csv"))
    all_subjects_data = []
    all_labels = []
    for subject in tqdm(subject_files):
        subject_idx = int(subject.split(".")[0])
        entry = os.path.join(data_dir, subject)
        # Folder "<id>.csv" holding "<id>.csv", or a plain "<id>.csv" file.
        csv_path = os.path.join(entry, subject) if os.path.isdir(entry) else entry
        # The per-subject files have no header row; column names come from `columns`.
        frame = pd.read_csv(csv_path, header=None, names=columns)
        trials, labels = read_subject(frame, subject_idx)
        all_subjects_data.append(trials)
        all_labels.append(labels)
    return np.concatenate(all_subjects_data, axis=0), np.concatenate(all_labels)

### Run if data has not been processed yet

In [None]:
# Preprocess every subject's raw CSV into stacked per-trial arrays and their labels.
all_subjects, all_labels = read_all_subjects()

FileNotFoundError: ignored

In [None]:
# Check the dimensions of the stacked per-trial array.
all_subjects.shape

(7092, 144, 70)

In [None]:
# Cache the processed arrays under base_path, which is where the
# "Directly load in processed data" cell reads them from.
np.save(base_path + "data_extracted.npy", all_subjects)
np.save(base_path + "labels.npy", all_labels)

### Directly load in processed data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the cached preprocessed arrays from Drive.
all_subjects = np.load(base_path + "data_extracted.npy")
all_labels = np.load(base_path + "labels.npy")
# One-hot copy of the labels (one column per distinct label value), used by the
# neural-network cells; the scikit-learn models use `all_labels` directly.
labelsOHE = OneHotEncoder().fit_transform(all_labels.reshape(-1, 1)).toarray()

In [None]:
# Collapse each trial's 2-D block into a single feature vector:
# (7092, 144, 70) -> (7092, 144*70)
flattened_subjects = np.reshape(all_subjects, (len(all_subjects), -1))

In [None]:
# Hold out 30% of the trials for testing; random_state fixes the shuffle.
# NOTE(review): each row is one trial, so trials from the same subject can land in
# both splits — consider a subject-grouped split if subject-level generalization matters.
X_train, X_test, y_train, y_test = train_test_split(flattened_subjects, all_labels, test_size=0.3, random_state=42)


Traditional ML model building

### SVM



In [None]:
# Initialize the SVM classifier with scikit-learn's default hyper-parameters.
# It is left unfitted here on purpose: fitting on the full dataset would show the
# held-out test rows to the model and only repeat work, since the training cell
# that follows fits it from scratch on X_train.
svm_model = SVC()


In [None]:
# Train the SVM on the training split only, so the test split stays unseen.
svm_model.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out test split with the trained SVM.
y_pred = svm_model.predict(X_test)

# Report overall accuracy plus per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6625939849624061
              precision    recall  f1-score   support

           0       0.68      0.30      0.41       849
           1       0.66      0.91      0.76      1279

    accuracy                           0.66      2128
   macro avg       0.67      0.60      0.59      2128
weighted avg       0.67      0.66      0.62      2128



### MLP

In [None]:
# Initialize the MLP classifier: two hidden layers (512 and 256 units), ReLU, Adam.
# random_state pins the weight initialisation and batch shuffling so reruns are comparable.
# The model is left unfitted here on purpose: fitting on the full dataset would show the
# held-out test rows to the model and only repeat work, since the training cell that
# follows fits it from scratch on X_train.
mlp_model = MLPClassifier(hidden_layer_sizes=(512, 256), activation='relu', solver='adam', max_iter=500, random_state=42)


In [None]:
# Train the MLP on the training split only, so the test split stays unseen.
mlp_model.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out test split with the trained MLP.
y_pred = mlp_model.predict(X_test)

# Report overall accuracy plus per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


### Model Building

In [None]:
class CNNModel(nn.Module):
    """Three strided 3x3 convolutions followed by two fully connected layers.

    The shape comments below are written height x width x channels and trace a
    single 144x70 one-channel input. The final layer emits two sigmoid
    activations, one per output unit.
    """

    def __init__(self):
        super().__init__()
        # input: 144x70x1, output: 71x34x16
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3,3), stride=2)
        self.act1 = nn.LeakyReLU()
        # NOTE(review): drop1 is registered here but never applied in forward().
        self.drop1 = nn.Dropout(0.3)

        # input: 71x34x16, output: 35x16x32
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3,3), stride=2)
        self.act2 = nn.LeakyReLU()

        # input: 35x16x32, output:17x7x32
        self.conv3 = nn.Conv2d(32, 32, kernel_size=(3,3), stride=2)
        self.act3 = nn.LeakyReLU()

        self.flat = nn.Flatten()

        # 3808 = 17 * 7 * 32, the flattened size of the conv3 output
        self.fc5 = nn.Linear(3808, 512)
        self.act5 = nn.ReLU()
        self.drop5 = nn.Dropout(0.5)

        # two output units, each squashed to (0, 1) by the sigmoid
        self.fc6 = nn.Linear(512, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the conv stack, flatten, then the two dense layers; returns a (batch, 2) tensor."""
        x = self.act1(self.conv1(x))
        x = self.act2(self.conv2(x))
        x = self.act3(self.conv3(x))
        x = self.flat(x)
        x = self.act5(self.fc5(x))
        # dropout is only active in train() mode
        x = self.drop5(x)
        # input 512, output 2
        x = self.sigmoid(self.fc6(x))
        return x


# Instantiate the CNN on the device chosen at the top of the notebook.
model = CNNModel().to(device)
# Define the loss function and optimizer
# BCELoss takes probabilities, which the model's final sigmoid supplies.
criterion = nn.BCELoss()

# NOTE(review): despite its name this is plain SGD, not Adam. The name is kept
# because the training loop refers to `adam_optimizer`.
adam_optimizer = optim.SGD(model.parameters(), lr=0.05)

# Print the model architecture
print(model)

CNNModel(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2))
  (act1): LeakyReLU(negative_slope=0.01)
  (drop1): Dropout(p=0.3, inplace=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
  (act2): LeakyReLU(negative_slope=0.01)
  (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))
  (act3): LeakyReLU(negative_slope=0.01)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc5): Linear(in_features=3808, out_features=512, bias=True)
  (act5): ReLU()
  (drop5): Dropout(p=0.5, inplace=False)
  (fc6): Linear(in_features=512, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
# Insert a singleton channel axis after the batch axis, giving Conv2d the
# (batch, channels, height, width) layout it expects.
subjects = torch.Tensor(all_subjects).unsqueeze(1)
# One-hot targets as a float tensor.
labels = torch.Tensor(labelsOHE)

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Re-split for the neural networks (75% train / 25% test). This overwrites the
# X_train / X_test / y_train / y_test created earlier for the scikit-learn models;
# the inputs here are the 4-D tensors and the one-hot labels.
X_train, X_test, y_train, y_test = train_test_split(subjects, labels, test_size=0.25, random_state=42)

In [None]:
# Wrap the tensors so they can be iterated in mini-batches of 32.
train_dataset = TensorDataset(X_train, y_train)
# Reshuffle the training samples every epoch so SGD does not see the same batches on each pass.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
# Evaluation order does not affect the metrics, so the test loader keeps a fixed order.
test_dataloader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Define training parameters
epochs = 30
val_accuracies = []  # one validation accuracy per epoch

# Keep the model on the device selected at the top of the notebook instead of
# assuming CUDA is available.
model = model.to(device)
for epoch in range(epochs):
    # The validation phase below switches to eval mode; switch back so that
    # dropout is active again for every training epoch, not just the first.
    model.train()
    total_loss = 0

    for (batch_X, batch_y) in train_dataloader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        # Zero the gradients
        adam_optimizer.zero_grad()
        # Forward pass
        outputs = model(batch_X)
        # Compute the loss
        loss = criterion(outputs, batch_y)
        # Backward pass
        loss.backward()
        # Update weights
        adam_optimizer.step()
        total_loss += loss.item()
    # Print average loss for the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

    # Validation: accumulate over the whole test loader so the reported loss and
    # accuracy describe every sample rather than only the last batch.
    model.eval()
    val_loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for (batch_X, batch_y) in test_dataloader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            val_outputs = model(batch_X)
            n_batch = batch_y.shape[0]
            # criterion returns a per-batch mean; weight it by the batch size so
            # a shorter final batch does not skew the average.
            val_loss_sum += criterion(val_outputs, batch_y).item() * n_batch

            # Targets are one-hot encoded, so compare the arg-max positions.
            val_preds = val_outputs.argmax(dim=1)
            yv = batch_y.argmax(dim=1)
            n_correct += (val_preds == yv).sum().item()
            n_seen += n_batch

    # max(..., 1) avoids a division by zero if the test loader is empty.
    val_loss = val_loss_sum / max(n_seen, 1)
    val_acc = n_correct / max(n_seen, 1)
    val_accuracies.append(val_acc)

    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

Epoch 1, Loss: 0.6773
Validation Loss: 0.6134, Validation Accuracy: 0.6008
Epoch 2, Loss: 0.6744
Validation Loss: 0.6124, Validation Accuracy: 0.6008
Epoch 3, Loss: 0.6742
Validation Loss: 0.6118, Validation Accuracy: 0.6008
Epoch 4, Loss: 0.6741
Validation Loss: 0.6113, Validation Accuracy: 0.6008
Epoch 5, Loss: 0.6740
Validation Loss: 0.6107, Validation Accuracy: 0.6008
Epoch 6, Loss: 0.6738
Validation Loss: 0.6100, Validation Accuracy: 0.6008
Epoch 7, Loss: 0.6736
Validation Loss: 0.6092, Validation Accuracy: 0.6008
Epoch 8, Loss: 0.6733
Validation Loss: 0.6081, Validation Accuracy: 0.6008
Epoch 9, Loss: 0.6730
Validation Loss: 0.6066, Validation Accuracy: 0.6008
Epoch 10, Loss: 0.6724
Validation Loss: 0.6044, Validation Accuracy: 0.6008
Epoch 11, Loss: 0.6718
Validation Loss: 0.6014, Validation Accuracy: 0.6008
Epoch 12, Loss: 0.6709
Validation Loss: 0.5976, Validation Accuracy: 0.6008
Epoch 13, Loss: 0.6700
Validation Loss: 0.5929, Validation Accuracy: 0.6008
Epoch 14, Loss: 0.668

In [None]:
import tensorflow as tf

In [None]:
# Randomly initialised ResNet50 backbone (no classification top) for
# single-channel 144x70 inputs, ending in global average pooling.
# `classes` and `classifier_activation` are omitted: Keras only uses them when
# include_top=True, so with the top removed they had no effect.
resnet50 = tf.keras.applications.resnet50.ResNet50(
  include_top=False,
  weights=None,
  input_tensor=None,
  input_shape=(144, 70, 1),
  pooling="avg",
)

In [None]:
# Print the backbone's layer-by-layer summary.
resnet50.summary()

In [None]:
# Dense classification head stacked on the ResNet50 backbone, trained with SGD
# and binary cross-entropy.
# NOTE(review): this model is compiled but never fitted — the next cell rebuilds
# `resnet50` and `model` (with learning_rate=0.02) before calling fit(). Assigning
# `model` here also replaces the PyTorch CNN bound to the same name.
model = tf.keras.models.Sequential([
    resnet50,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(2, activation="sigmoid")
])
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.05)
model.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(), metrics=["accuracy"])

In [None]:
# Randomly initialised ResNet50 backbone (no classification top) for
# single-channel 144x70 inputs, ending in global average pooling.
# `classes` and `classifier_activation` are omitted: Keras only uses them when
# include_top=True, so with the top removed they had no effect.
resnet50 = tf.keras.applications.resnet50.ResNet50(
  include_top=False,
  weights=None,
  input_tensor=None,
  input_shape=(144, 70, 1),
  pooling="avg",
)
# Dense head that maps the pooled backbone features to two sigmoid outputs.
model = tf.keras.models.Sequential([
    resnet50,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(2, activation="sigmoid")
])
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.02)
model.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(), metrics=["accuracy"])

# X_train is channels-first (N, 1, 144, 70) for PyTorch, while the Keras model was
# declared channels-last. Moving the axis explicitly, rather than squeezing it away,
# matches input_shape and keeps the batch axis intact even for a single sample.
# 20% of the training rows are held out by Keras for validation.
model.fit(np.moveaxis(X_train.numpy(), 1, -1), y_train.numpy(), epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cc140659510>