In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

import joblib
import json

**IMPORTING DATA**
- Scaled full data
- Scaled PCA data
- Scaled LDA data

In [3]:
# Folder (relative to this notebook) that holds the preprocessed CSV files.
DATA_PATH = "../data/processed/"

In [4]:
def _read_processed(filename):
    """Read one CSV file located under DATA_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_PATH, filename))


# Feature matrices, one train/test pair per representation
# (scaled full data, PCA data, LDA data).
X_train_scaled = _read_processed("X_train_scaled.csv")
X_test_scaled = _read_processed("X_test_scaled.csv")

X_train_pca = _read_processed("X_train_pca.csv")
X_test_pca = _read_processed("X_test_pca.csv")

X_train_lda = _read_processed("X_train_lda.csv")
X_test_lda = _read_processed("X_test_lda.csv")

# Labels, flattened to 1-D arrays so they can be passed straight to fit/score.
y_train = _read_processed("y_train.csv").to_numpy().ravel()
y_test = _read_processed("y_test.csv").to_numpy().ravel()

In [5]:
# Hyperparameter search space reused by each grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200, 500],  # number of trees in the forest
    criterion=['gini'],                # split criterion
    max_depth=[None, 10, 20, 30],      # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # minimum samples required to split an internal node
)

**TRAINING RANDOM FOREST ON SCALED DATA**

In [19]:
# Initialize classifier.
# random_state is fixed so the forest's random sampling is repeatable across runs.
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV: 5-fold shuffled cross-validation over param_grid,
# scored by accuracy, on all available cores. The fold shuffle is seeded as
# well, so every run evaluates the candidates on the same splits.
grid_search_scaled = GridSearchCV(
    rf,
    param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    verbose=3,
    n_jobs=-1,
)

# Fit the model
grid_search_scaled.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 5/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=50;, score=0.835 total time=   7.6s
[CV 4/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=50;, score=0.850 total time=   8.0s
[CV 3/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=50;, score=0.833 total time=   8.1s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=50;, score=0.826 total time=   8.1s
[CV 1/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=50;, score=0.829 total time=   8.4s
[CV 3/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=100;, score=0.846 total time=  14.7s
[CV 1/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=100;, score=0.854 total time=  15.1s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_split=2, n_estimators=100;, score=0.838 total time=  15.8s
[CV 5/5] END cr

In [20]:
# Create the output folder up front so this cell also runs on a fresh checkout
# (open() and joblib.dump() do not create missing directories).
os.makedirs("../models/rf", exist_ok=True)

best_params = grid_search_scaled.best_params_
print("Best Parameters:", best_params)

# Persist the winning hyperparameters.
# NOTE(review): the content is JSON although the file is named ".csv"; the path
# is left unchanged in case other code reads it.
with open('../models/rf/scaled.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model found by the search and save it
best_rf = grid_search_scaled.best_estimator_

joblib.dump(best_rf, "../models/rf/scaled.pkl")

# Print mean validation score for the best model
best_score = grid_search_scaled.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_rf_scaled = best_rf.predict(X_test_scaled)

# Calculate accuracy on the test set
# NOTE(review): in the recorded run the CV score (~0.87) is far above the test
# accuracy (~0.61); worth checking for leakage or a train/test distribution shift.
accuracy_scaled = accuracy_score(y_test, y_pred_rf_scaled)
print("Test Accuracy:", accuracy_scaled)

Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best Model Validation Score: 0.8715406672794547
Test Accuracy: 0.6090561224489796


In [21]:
# Save the test-set predictions; the results folder is created first because
# to_csv() does not create missing directories.
os.makedirs("../results/rf", exist_ok=True)
pd.DataFrame(y_pred_rf_scaled, columns=["y_pred"]).to_csv("../results/rf/scaled.csv")

**TRAINING RANDOM FOREST ON PCA DATA**

In [22]:
# Initialize classifier.
# random_state is fixed so the forest's random sampling is repeatable across runs.
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV: 5-fold shuffled cross-validation over param_grid,
# scored by accuracy, on all available cores. The fold shuffle is seeded as
# well, so every run evaluates the candidates on the same splits.
grid_search_pca = GridSearchCV(
    rf,
    param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

# Fit the model
grid_search_pca.fit(X_train_pca, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [23]:
# Create the output folder up front so this cell also runs on a fresh checkout
# (open() and joblib.dump() do not create missing directories).
os.makedirs("../models/rf", exist_ok=True)

best_params = grid_search_pca.best_params_
print("Best Parameters:", best_params)

# Persist the winning hyperparameters.
# NOTE(review): the content is JSON although the file is named ".csv"; the path
# is left unchanged in case other code reads it.
with open('../models/rf/pca.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model found by the search and save it
best_rf = grid_search_pca.best_estimator_

joblib.dump(best_rf, "../models/rf/pca.pkl")

# Print mean validation score for the best model
best_score = grid_search_pca.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_rf_pca = best_rf.predict(X_test_pca)

# Calculate accuracy on the test set
# NOTE(review): in the recorded run the CV score (~0.77) is far above the test
# accuracy (~0.55); worth checking for leakage or a train/test distribution shift.
accuracy_pca = accuracy_score(y_test, y_pred_rf_pca)
print("Test Accuracy:", accuracy_pca)

Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 500}
Best Model Validation Score: 0.7666177535410492
Test Accuracy: 0.5535714285714286


In [24]:
# Save the test-set predictions; the results folder is created first because
# to_csv() does not create missing directories.
os.makedirs("../results/rf", exist_ok=True)
pd.DataFrame(y_pred_rf_pca, columns=["y_pred"]).to_csv("../results/rf/pca.csv")

**TRAINING RANDOM FOREST ON LDA DATA**

In [25]:
# Initialize classifier.
# random_state is fixed so the forest's random sampling is repeatable across runs.
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV: 5-fold shuffled cross-validation over param_grid,
# scored by accuracy, on all available cores. The fold shuffle is seeded as
# well, so every run evaluates the candidates on the same splits.
grid_search_lda = GridSearchCV(
    rf,
    param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

# Fit the model
grid_search_lda.fit(X_train_lda, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [28]:
# Create the output folder up front so this cell also runs on a fresh checkout
# (open() and joblib.dump() do not create missing directories).
os.makedirs("../models/rf", exist_ok=True)

best_params = grid_search_lda.best_params_
print("Best Parameters:", best_params)

# Persist the winning hyperparameters.
# NOTE(review): the content is JSON although the file is named ".csv"; the path
# is left unchanged in case other code reads it.
with open('../models/rf/lda.csv', 'w') as f:
    json.dump(best_params, f)

# Get the best model found by the search and save it
best_rf = grid_search_lda.best_estimator_

joblib.dump(best_rf, "../models/rf/lda.pkl")

# Print mean validation score for the best model
best_score = grid_search_lda.best_score_
print("Best Model Validation Score:", best_score)

# Predict on the test set
y_pred_rf_lda = best_rf.predict(X_test_lda)

# Calculate accuracy on the test set
# NOTE(review): in the recorded run the CV score (~0.91) is far above the test
# accuracy (~0.52). If the LDA projection was fitted on the whole training set
# before cross-validation, the fold scores would be optimistic; confirm how
# X_train_lda was produced.
accuracy_lda = accuracy_score(y_test, y_pred_rf_lda)
print("Test Accuracy:", accuracy_lda)

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}
Best Model Validation Score: 0.9080055398236654
Test Accuracy: 0.5210459183673469


In [29]:
# Save the test-set predictions; the results folder is created first because
# to_csv() does not create missing directories.
os.makedirs("../results/rf", exist_ok=True)
pd.DataFrame(y_pred_rf_lda, columns=["y_pred"]).to_csv("../results/rf/lda.csv")