In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier

import joblib
import json

**IMPORTING DATA**
- Scaled full data
- Scaled PCA data
- Scaled LDA data

In [2]:
DATA_PATH = os.path.join("../data/processed/")

In [3]:
# Loading scaled data
X_train_scaled = pd.read_csv(os.path.join(DATA_PATH, "X_train_scaled.csv"))
X_test_scaled = pd.read_csv(os.path.join(DATA_PATH, "X_test_scaled.csv"))

# Loading pca data
X_train_pca = pd.read_csv(os.path.join(DATA_PATH, "X_train_pca.csv"))
X_test_pca = pd.read_csv(os.path.join(DATA_PATH, "X_test_pca.csv"))

# Loading lda data
X_train_lda = pd.read_csv(os.path.join(DATA_PATH, "X_train_lda.csv"))
X_test_lda = pd.read_csv(os.path.join(DATA_PATH, "X_test_lda.csv"))

# Loading data labels
y_train = pd.read_csv(os.path.join(DATA_PATH, "y_train.csv")).to_numpy().ravel()
y_test = pd.read_csv(os.path.join(DATA_PATH, "y_test.csv")).to_numpy().ravel()

**MLP ARCHITECTURE**
- reference to paper: <https://pubmed.ncbi.nlm.nih.gov/36236658/>
- MLP has 3 hidden layers:
    1) (n_features + n_classes)/2 neurons
    2) 32 neurons
    3) 16 neurons

In [4]:
n_features = len(X_test_scaled.columns)
n_classes = 5
first_layer = (n_features + n_classes) // 2

**DEFINING HYPER-PARAMETERS FOR TUNING**

In [5]:
param_grid = {
    'hidden_layer_sizes': [(first_layer, 32, 16)], # MLP architecture from mentioned paper
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter' : [750]
}

**TRAINING MLP ON SCALED DATA**

In [19]:
# Initialize classifier
mlp = MLPClassifier()

# Initialize GridSearchCV
grid_search_scaled = GridSearchCV(mlp, param_grid, cv=KFold(n_splits=5, shuffle=True), scoring='accuracy', verbose=3, n_jobs=-1)

# Fit the model
grid_search_scaled.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 3/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=500, solver=adam;, score=0.899 total time= 2.0min
[CV 4/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=500, solver=adam;, score=0.902 total time= 2.1min
[CV 1/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=500, solver=adam;, score=0.890 total time= 2.2min
[CV 5/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=500, solver=adam;, score=0.905 total time= 2.2min
[CV 2/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=500, solver=adam;, score=0.898 total time= 2.4min
[CV 1/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=500, solver=adam;, sco

In [49]:
best_params = grid_search_scaled.best_params_
print("Best Parameters:", best_params)

with open('../models/mlp/scaled.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model
best_mlp = grid_search_scaled.best_estimator_

joblib.dump(best_mlp, "../models/mlp/scaled.pkl")

# Print mean validation score for the best model
best_score = grid_search_scaled.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_mlp_scaled = best_mlp.predict(X_test_scaled)

# Calculate accuracy on the test set
accuracy_scaled = accuracy_score(y_test, y_pred_mlp_scaled)
print("Test Accuracy:", accuracy_scaled)

Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (496, 32, 16), 'learning_rate': 'adaptive', 'max_iter': 500, 'solver': 'adam'}
Best Model Validation Score: 0.9048565512674133
Test Accuracy: 0.5459183673469388


In [31]:
pd.DataFrame(y_pred_mlp_scaled, columns=["y_pred"]).to_csv("../results/mlp/scaled.csv")

**TRAINING MLP ON PCA DATA**

In [6]:
# Initialize classifier
mlp = MLPClassifier()

# Initialize GridSearchCV
grid_search_pca = GridSearchCV(mlp, param_grid, cv=KFold(n_splits=5, shuffle=True), scoring='accuracy', verbose=3, n_jobs=-1)

# Fit the model
grid_search_pca.fit(X_train_pca, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[CV 3/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.851 total time= 1.3min
[CV 5/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.857 total time= 1.3min
[CV 2/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.867 total time= 1.3min
[CV 4/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.862 total time= 1.3min
[CV 1/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.832 total time= 1.3min
[CV 2/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=adam;, score=0.867 total time= 1.2min
[CV 3/5] END activation=tanh, alph

In [9]:
best_params = grid_search_pca.best_params_
print("Best Parameters:", best_params)

with open('../models/mlp/pca.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model
best_mlp = grid_search_pca.best_estimator_

joblib.dump(best_mlp, "../models/mlp/pca.pkl")

# Print mean validation score for the best model
best_score = grid_search_pca.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_mlp_pca = best_mlp.predict(X_test_pca)

# Calculate accuracy on the test set
accuracy_pca = accuracy_score(y_test, y_pred_mlp_pca)
print("Test Accuracy:", accuracy_pca)

Best Parameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (496, 32, 16), 'learning_rate': 'adaptive', 'max_iter': 750, 'solver': 'adam'}
Best Model Validation Score: 0.8886132716698197
Test Accuracy: 0.5752551020408163


In [10]:
pd.DataFrame(y_pred_mlp_pca, columns=["y_pred"]).to_csv("../results/mlp/pca.csv")

**TRAINING MLP ON LDA DATA**

In [None]:
# NOTE: Few models could not converge

In [28]:
# Initialize classifier
mlp = MLPClassifier()

# Initialize GridSearchCV
grid_search_lda = GridSearchCV(mlp, param_grid, cv=KFold(n_splits=5, shuffle=True), scoring='accuracy', verbose=3, n_jobs=-1)

# Fit the model
grid_search_lda.fit(X_train_lda, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.915 total time=  29.4s
[CV 3/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.906 total time=  36.2s
[CV 2/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.909 total time=  39.4s
[CV 4/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.923 total time=  30.8s
[CV 4/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.905 total time= 1.1min
[CV 3/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0



[CV 1/5] END activation=tanh, alpha=0.001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=adam;, score=0.893 total time= 2.1min
[CV 4/5] END activation=tanh, alpha=0.001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=adam;, score=0.891 total time= 1.9min
[CV 4/5] END activation=tanh, alpha=0.001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=sgd;, score=0.922 total time=  45.3s
[CV 1/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.916 total time=  35.6s
[CV 5/5] END activation=tanh, alpha=0.001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=sgd;, score=0.915 total time=  43.2s
[CV 3/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.891 total time=  32.4s




[CV 5/5] END activation=tanh, alpha=0.001, hidden_layer_sizes=(496, 32, 16), learning_rate=adaptive, max_iter=750, solver=adam;, score=0.877 total time= 2.1min
[CV 5/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.900 total time=  27.4s
[CV 1/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.911 total time=  28.4s
[CV 2/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=adam;, score=0.891 total time= 1.1min
[CV 2/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.906 total time=  36.0s
[CV 4/5] END activation=tanh, alpha=0.01, hidden_layer_sizes=(496, 32, 16), learning_rate=constant, max_iter=750, solver=sgd;, score=0.923 total time=  34.7s
[CV 3/5] END activation=tanh, alpha=0.01, hidden

In [52]:
best_params = grid_search_lda.best_params_
print("Best Parameters:", best_params)

with open('../models/mlp/lda.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model
best_mlp = grid_search_lda.best_estimator_

joblib.dump(best_mlp, "../models/mlp/lda.pkl")

# Print mean validation score for the best model
best_score = grid_search_lda.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_mlp_lda = best_mlp.predict(X_test_lda)

# Calculate accuracy on the test set
accuracy_lda = accuracy_score(y_test, y_pred_mlp_lda)
print("Test Accuracy:", accuracy_lda)

Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (496, 32, 16), 'learning_rate': 'constant', 'max_iter': 750, 'solver': 'sgd'}
Best Model Validation Score: 0.9131459520953641
Test Accuracy: 0.5165816326530612


In [39]:
pd.DataFrame(y_pred_mlp_lda, columns=["y_pred"]).to_csv("../results/mlp/lda.csv")