In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
pd.set_option("display.max_columns", 85)

# Random Forest Classifier

In [2]:
# Load the raw (unscaled) dataset.
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv")

# Ignore the contiguous run of columns from "ACC" through "UVM" (both included).
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df = df.drop(columns=df.columns[cols])

# Ignore a few more columns, then discard every row that still contains a NaN.
df = df.drop(columns=["TTT_freq", "oldest_phylostratum_factor", "Unnamed: 0"])
df = df.dropna()

# Arrange the features: "occ_total_sum" first, "oldest_phylostratum" second,
# and every remaining non-response column (the confounders) after them.
resp = df["response"].to_numpy()
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"].to_numpy()
conf = df.drop(columns=["response", "occ_total_sum", "oldest_phylostratum"]).to_numpy()

X = np.c_[occ, age, conf]
Y = resp

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print("Feature matrix shape:", X_train.shape, X_test.shape)

Feature matrix shape: (14536, 83) (3634, 83)


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

# Column indices of the confounder variables (highly collinear). The transformer
# below no longer reads this global; it is kept for any code that still refers to it.
conf_cols = np.arange(2, X.shape[1])


class ConfounderPCA(BaseEstimator, TransformerMixin):
    """Apply PCA to the confounder columns and pass the leading columns through.

    The first ``n_passthrough`` columns of the input are copied unchanged;
    every remaining column is replaced by its PCA projection.

    Parameters
    ----------
    explained_variance : float, default=.95
        Forwarded to ``PCA(n_components=...)``.
    n_passthrough : int, default=2
        Number of leading columns that are left untouched. The default matches
        the feature layout built in this notebook (two columns, then confounders).

    Attributes
    ----------
    pca : PCA
        The fitted PCA object; created by ``fit``.
    """

    def __init__(self, explained_variance=.95, n_passthrough=2):
        # Only store the hyper-parameters here. Building the PCA in __init__
        # would freeze it at construction time, so a later
        # set_params(explained_variance=...) would silently have no effect.
        self.explained_variance = explained_variance
        self.n_passthrough = n_passthrough

    def fit(self, X, y=None):
        """Fit the PCA on the confounder columns of ``X`` and return ``self``."""
        self.pca = PCA(n_components=self.explained_variance)
        self.pca.fit(X[:, self.n_passthrough:])
        return self

    def transform(self, X, y=None):
        """Return the pass-through columns followed by the PCA components."""
        X_conf_pca = self.pca.transform(X[:, self.n_passthrough:])
        return np.c_[X[:, :self.n_passthrough], X_conf_pca]

In [4]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """Area under the precision-recall curve.

    Remember to wrap it with ``make_scorer(auprc, needs_proba=True)`` so that
    it receives scores/probabilities rather than hard class predictions.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_scores : array-like
        Scores for the positive class.
    **kwargs
        Accepted for call-compatibility and ignored.

    Returns
    -------
    float
        Area under the curve with recall on the x-axis and precision on the y-axis.
    """
    precisions, recalls, _ = precision_recall_curve(y_true, y_scores)
    # Integrate precision over recall. The previous version added the areas of
    # precision-vs-threshold and recall-vs-threshold, which is a different quantity.
    return auc(recalls, precisions)

# [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

The goal of this notebook is to run an exhaustive grid search for the best hyper-parameters of the random forest classifier. The search takes about 3 hours on my machine. The complete history of the search can be found in `rf_gs_results.csv` and its companion Excel file. 

The parameters which optimized ROC-AUC were:
- `max_depth=10`
- `max_features=2`
- `min_samples_split=20`
- `n_estimators=200` 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scorers computed for every candidate; the keys become the metric names in the results.
# (Precision, recall and f1 were tried as well but are currently switched off.)
my_metrics = dict(
    ROC_AUC="roc_auc",
    AUPRC=make_scorer(auprc, needs_proba=True),  # custom scorer defined earlier in the notebook
)

# Values to try for each hyper-parameter of the "rf" pipeline step.
param_grid = dict(
    rf__bootstrap=[True, False],
    rf__max_depth=[None, 10, 20, 30],
    rf__max_features=[2, 3],
    rf__min_samples_leaf=[1, 2, 5, 10, 20],
    rf__min_samples_split=[2, 5, 10, 20],
    rf__n_estimators=[100, 200, 300, 500],
)

# Model under tuning: standardise -> PCA on the confounder columns -> random forest.
rf_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", ConfounderPCA()),
    ("rf", RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Exhaustive search with 3-fold cross-validation. Any failing fit raises
# immediately, and the final refit uses the candidate with the best ROC_AUC.
grid_search = GridSearchCV(
    rf_clf,
    param_grid,
    scoring=my_metrics,
    refit="ROC_AUC",
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1280 candidates, totalling 3840 fits


GridSearchCV(cv=3, error_score='raise',
             estimator=Pipeline(steps=[('pca', ConfounderPCA()),
                                       ('rf',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__bootstrap': [True, False],
                         'rf__max_depth': [None, 10, 20, 30],
                         'rf__max_features': [2, 3],
                         'rf__min_samples_leaf': [1, 2, 5, 10, 20],
                         'rf__min_samples_split': [2, 5, 10, 20],
                         'rf__n_estimators': [100, 200, 300, 500]},
             refit='ROC_AUC',
             scoring={'AUPRC': make_scorer(auprc, needs_proba=True),
                      'ROC_AUC': 'roc_auc'},
             verbose=1)

In [6]:
# Persist the complete search history. The output folder is created first so
# that a missing directory cannot throw away the results of the multi-hour search.
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)

gs_df = pd.DataFrame(grid_search.cv_results_)
gs_df.to_csv(results_dir / "rf_gs_results.csv")

In [7]:
# Display the pipeline that the grid search refit with the best ROC_AUC candidate.
grid_search.best_estimator_
# Non-default random-forest settings in the recorded output of this cell:
# max_depth=10,
# max_features=2,
# min_samples_split=20,
# n_estimators=200,

Pipeline(steps=[('pca', ConfounderPCA()),
                ('rf',
                 RandomForestClassifier(max_depth=10, max_features=2,
                                        min_samples_split=20, n_estimators=200,
                                        n_jobs=-1, random_state=42))])

In [8]:
# NOTE(review): this whole cell is disabled (commented-out) code kept for reference.
# It compared a default forest against a hand-picked one using 10-fold cross-validation.
# The hand-picked settings below differ from the best estimator recorded for the grid
# search earlier in this notebook, and neither pipeline here includes the scaler step
# used there -- confirm both points before re-enabling.

# from sklearn.model_selection import cross_validate

# my_metrics = {
#     "ROC_AUC": "roc_auc",
#     "Precision": "precision",
#     "Recall": "recall",
#     "f1-score": "f1",
#     "AUPRC": make_scorer(auprc, needs_proba=True,),  # custom: area under precision-recall-curve
# }

# default_rf_clf = Pipeline([
#     ("pca", ConfounderPCA()),
#     ("rf", RandomForestClassifier(random_state=42, n_jobs=-1))
# ])

# best_rf_clf = Pipeline([
#     ("pca", ConfounderPCA()),
#     ("rf", RandomForestClassifier(
#         bootstrap=False,
#         max_depth=20,
#         max_features=3,
#         min_samples_leaf=10,
#         n_estimators=300,
#         random_state=42,
#         n_jobs=-1, )
#     )
# ])

# scores_default = cross_validate(default_rf_clf, X_train, y_train, cv=10, scoring=my_metrics)
# scores_best = cross_validate(best_rf_clf, X_train, y_train, cv=10, scoring=my_metrics)

In [9]:
# pd.DataFrame(scores_default)

In [10]:
# pd.DataFrame(scores_best)

# [Balanced Random Forest](https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html)

Balancing seems to consistently produce higher roc-auc scores for this dataset.

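Best hyperparameters (as originally noted; these differ from the `best_estimator_` output below and may come from an earlier run):
- `max_depth=10`
- `max_features=2`
- `min_samples_leaf=2`
- `min_samples_split=20`
- `n_estimators=500`

Best hyperparameters according to the recorded `best_estimator_` output below:
- `bootstrap=False`
- `max_depth=20`
- `max_features=3`
- `min_samples_split=20`
- `n_estimators=500`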

In [6]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scorers computed for every candidate; the keys become the metric names in the results.
# (Precision, recall and f1 were tried as well but are currently switched off.)
my_metrics = dict(
    ROC_AUC="roc_auc",
    AUPRC=make_scorer(auprc, needs_proba=True),  # custom scorer defined earlier in the notebook
)

# Values to try for each hyper-parameter of the "rf" pipeline step.
param_grid = dict(
    rf__bootstrap=[True, False],
    rf__max_depth=[None, 10, 20, 30],
    rf__max_features=[2, 3],
    rf__min_samples_leaf=[1, 2, 5, 10, 20],
    rf__min_samples_split=[2, 5, 10, 20],
    rf__n_estimators=[100, 200, 300, 500],
)

# Model under tuning: standardise -> PCA on the confounder columns -> balanced random forest.
brf_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", ConfounderPCA()),
    ("rf", BalancedRandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Exhaustive search with 3-fold cross-validation. Any failing fit raises
# immediately, and the final refit uses the candidate with the best ROC_AUC.
bal_grid_search = GridSearchCV(
    brf_clf,
    param_grid,
    scoring=my_metrics,
    refit="ROC_AUC",
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
)

# Run the search on the training split.
bal_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1280 candidates, totalling 3840 fits


GridSearchCV(cv=3, error_score='raise',
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', ConfounderPCA()),
                                       ('rf',
                                        BalancedRandomForestClassifier(n_jobs=-1,
                                                                       random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__bootstrap': [True, False],
                         'rf__max_depth': [None, 10, 20, 30],
                         'rf__max_features': [2, 3],
                         'rf__min_samples_leaf': [1, 2, 5, 10, 20],
                         'rf__min_samples_split': [2, 5, 10, 20],
                         'rf__n_estimators': [100, 200, 300, 500]},
             refit='ROC_AUC',
             scoring={'AUPRC': make_scorer(auprc, needs_proba=True),
                      'ROC_AUC': 'roc_auc'},
             verbose=1)

In [7]:
# Persist the complete search history of the balanced forest. The output folder
# is created first so that a missing directory cannot throw away the results.
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)

bgs_df = pd.DataFrame(bal_grid_search.cv_results_)
bgs_df.to_csv(results_dir / "brf_gs_results.csv")

In [8]:
# Display the pipeline that the balanced grid search refit with the best ROC_AUC candidate.
bal_grid_search.best_estimator_

# Non-default balanced-forest settings in the recorded output of this cell:
# bootstrap=False, max_depth=20,
# max_features=3,
# min_samples_split=20,
# n_estimators=500, n_jobs=-1,
# random_state=42)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', ConfounderPCA()),
                ('rf',
                 BalancedRandomForestClassifier(bootstrap=False, max_depth=20,
                                                max_features=3,
                                                min_samples_split=20,
                                                n_estimators=500, n_jobs=-1,
                                                random_state=42))])