In [1]:
# mounting drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# JUST IMPORTS
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 
import csv
import argparse as ap
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle
import warnings
from sklearn.metrics import confusion_matrix
import numpy as np


***Evaluation Function***

Defining a modified evaluation function in order to do base comparisons of different models.

In [3]:
import warnings
from sklearn.metrics import confusion_matrix

def metrics(lbl, pred):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        tn, fp, fn, tp = confusion_matrix(lbl, pred).ravel()
        accuracy = (tn + tp) / (tn + tp + fn + fp)
    return accuracy



***Randomized Search Cross Validation***

Here, we begin hyperparameter tuning by using a randomized search. This will be used to determine ranges in which to perform grid search. 

In [4]:

# Hyperparamater Tuning for Random Forest Classifier 

# loading the data
url_data = "https://raw.githubusercontent.com/dlezcan1/machine-learning-fall-2021-final-project/main/data/egfr_erbB1_train_pca.csv"
url_labels = "https://raw.githubusercontent.com/dlezcan1/machine-learning-fall-2021-final-project/main/data/egfr_erbB1_train_pca_labels.csv"
data = pd.read_csv(url_data, error_bad_lines=False)
labels = pd.read_csv(url_labels, error_bad_lines=False)
  
# Split training data into training and validation datasets.
feat_train, feat_val, lbl_train, lbl_val = train_test_split(data, labels, test_size=0.2, random_state=20, stratify=labels)

# initializing all possibilities for the hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(feat_train, lbl_train)
new_pred = rf_random.predict(feat_val)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


  self.best_estimator_.fit(X, y, **fit_params)


Next, we will visualize the best model accuracy, the chosen hyperparameter values, and compare them to the accuracy of the original model. 



In [5]:
# printing the accuracy of the best model vs new model

# original model
orig_model = RandomForestClassifier()
orig_model.fit(feat_train, lbl_train)
orig_pred = orig_model.predict(feat_val)

# get best parameters
print("Best Parameters: ", rf_random.best_params_)

# get accuracies
new_model_acc = metrics(lbl_val, new_pred)
orig_model_acc = metrics(lbl_val, orig_pred)

# print accuracies
print("Original Model Accuracy: ", orig_model_acc)
print("New Model Accuracy: ", new_model_acc)
print("Improvement in Accuracy: ", new_model_acc - orig_model_acc)

  """


Best Parameters:  {'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': False}
Original Model Accuracy:  0.8571428571428571
New Model Accuracy:  0.8670520231213873
Improvement in Accuracy:  0.009909165978530199


***Grid Search Cross Validation***

Next, we will choose a range of values near the best randomized search 

In [7]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [ 70, 80, 90], 
    'max_features': ['auto'],
    'min_samples_leaf': [1], # 1, 2, 3 # increased runtime so much that I ended up leaving this out
    'min_samples_split': [8, 10, 12],
    'n_estimators': [ 900, 1000, 1100],
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(feat_train, lbl_train)
best_pred = grid_search.predict(feat_val)


Fitting 2 folds for each of 27 candidates, totalling 54 fits


  self.best_estimator_.fit(X, y, **fit_params)


Next, we will visualize the best model accuracy, the chosen hyperparameter values, and compare them to the accuracy of the original model. 


In [8]:
# printing the accuracy of the best model vs new model

# get best parameters
print("Best Parameters: ", grid_search.best_params_)

# get accuracies
best_model_acc = metrics(lbl_val, best_pred)

# print
print("Original Model Accuracy: ", orig_model_acc)
print("New Model Accuracy: ", best_model_acc)
print("Improvement in Accuracy: ", best_model_acc - orig_model_acc)

Best Parameters:  {'bootstrap': False, 'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 1100}
Original Model Accuracy:  0.8571428571428571
New Model Accuracy:  0.8587943848059455
Improvement in Accuracy:  0.0016515276630884035
