In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 85)
sns.set_theme(context="paper", font_scale=1.5, style="ticks", rc={"axes.grid": True})

### Data

In [2]:
# Read Data from CSV (NEW DATA, NOT SCALED)
df = pd.read_csv("../data/abnormal_writeout_noscale.data.csv", index_col=0)

# trascurare da ACC a UVM
start_drop = df.columns.get_loc("ACC")
end_drop = df.columns.get_loc("UVM")
cols = np.arange(start_drop, end_drop + 1)
df.drop(df.columns[cols], axis=1, inplace=True)

# trascurare alcune colonne
df.drop("TTT_freq", axis=1, inplace=True)
df.drop("oldest_phylostratum_factor", axis=1, inplace=True)
df.drop("gc_cds", axis=1, inplace=True) # ! New

# Drop NaNs
df.dropna(inplace=True)

# Sort features
resp = df["response"]
occ = df["occ_total_sum"]
age = df["oldest_phylostratum"]
conf = df.drop(labels=["response", "occ_total_sum", "oldest_phylostratum"], axis=1)

# Collect Features and Labels
features_df = pd.DataFrame()
features_df["occ_total_sum"] = occ
features_df["oldest_phylostratum"] = age
features_df = pd.concat([features_df, conf], axis=1)

X = features_df.to_numpy()
y = df["response"].to_numpy()

### Custom PCA

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Columns of confounder variables (highly colinear)
conf_index = 2
conf_cols = np.arange(2, X.shape[1])  

class ConfounderPCA(BaseEstimator, TransformerMixin):
    """ 
    Custom PCA transformer for this dataset.
    Applies PCA only to the many collinear confounder 
    variables.
    """
    def __init__(self, confcols, n_components=None, apply_PCA=True):
        self.n_components = n_components
        self.apply_PCA = apply_PCA
        self.confcols = confcols
        if self.apply_PCA:
            self.pca = PCA(n_components=self.n_components)

    def fit(self, X, y=None):
        if self.apply_PCA:
            self.pca.fit(X[:, self.confcols])
        return self

    def transform(self, X, y=None):
        if self.apply_PCA:
            X_conf_pca = self.pca.transform(X[:, self.confcols])
            return np.c_[X[:, :2], X_conf_pca]
        else:
            return X

### Custom Scoring: Area Under Precision Recall Curve

In [4]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve


def auprc(y_true, y_scores, **kwargs):
    """ Remember to use make_scorer(auprc, needs_proba=True,) """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # results is area under x=Recall and y=Precision curve. 
    return auc(recalls, precisions)

### Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (14536, 82) (14536,)
Testing set shape: (3634, 82) (3634,)


# [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

The goal of this notebook is to optimize the random forest classifier, a model with many hyperparameters. Because the hyperparameter space is so large, I chose a randomized search approach.

In [39]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the parameter grid
param_grid = {
    "rf__bootstrap": [True, False],
    "rf__max_depth": [None, 10, 20, 30],
    "rf__max_features": ['auto', 'sqrt'],
    "rf__min_samples_leaf": [1, 2, 5, 10, 20,],
    "rf__min_samples_split": [2, 5, 10, 20],
    "rf__n_estimators": [100, 200, 300, 500, 1000],
    "pca__n_components": [0.5, 0.6, 0.7, 0.9, 0.95, 0.99, None],
    "pca__apply_PCA": [True, False],
}

rf_clf = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", ConfounderPCA(confcols=np.arange(2, X.shape[1]))),
        ("rf", BalancedRandomForestClassifier(random_state=42, n_jobs=-1)),
    ]
)

In [40]:
# Define search
ran_search = RandomizedSearchCV(
    rf_clf, 
    param_distributions=param_grid, 
    n_iter=100, cv=3, n_jobs=-1, 
    verbose=1,
    random_state=42, 
    scoring="roc_auc"
)

# Search

rs_results = ran_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [42]:
rs_df = pd.DataFrame(rs_results.cv_results_)
rs_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__n_estimators,param_rf__min_samples_split,param_rf__min_samples_leaf,param_rf__max_features,param_rf__max_depth,param_rf__bootstrap,param_pca__n_components,param_pca__apply_PCA,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,6.450221,0.029039,0.112958,0.001128,100,20,20,sqrt,20.0,False,0.7,False,"{'rf__n_estimators': 100, 'rf__min_samples_spl...",0.687427,0.685309,0.673485,0.682074,0.006134,24
1,7.072739,0.33421,0.124212,0.0045,100,2,10,auto,,False,0.5,True,"{'rf__n_estimators': 100, 'rf__min_samples_spl...",0.678886,0.672701,0.673467,0.675018,0.002753,84
2,4.013316,0.323373,0.129473,0.0116,100,10,20,sqrt,20.0,True,0.9,True,"{'rf__n_estimators': 100, 'rf__min_samples_spl...",0.676699,0.676745,0.668838,0.674094,0.003717,90
3,3.939204,0.464821,0.111908,0.002759,100,20,10,sqrt,30.0,True,,False,"{'rf__n_estimators': 100, 'rf__min_samples_spl...",0.67656,0.684935,0.672397,0.677964,0.005214,64
4,40.212489,0.374139,0.699936,0.031049,1000,2,10,sqrt,30.0,True,0.5,False,"{'rf__n_estimators': 1000, 'rf__min_samples_sp...",0.684982,0.688956,0.674711,0.682883,0.006002,16


In [43]:
rs_results.best_params_

{'rf__n_estimators': 500,
 'rf__min_samples_split': 5,
 'rf__min_samples_leaf': 10,
 'rf__max_features': 'sqrt',
 'rf__max_depth': 30,
 'rf__bootstrap': False,
 'pca__n_components': 0.9,
 'pca__apply_PCA': False}

In [48]:
rs_df.groupby(by='param_pca__n_components').mean()

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
param_pca__n_components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.5,20.064156,0.277331,0.348628,0.030154,0.682453,0.683314,0.672585,0.67945,0.00512,47.166667
0.6,16.709124,0.309289,0.254225,0.02318,0.680257,0.681791,0.671273,0.677774,0.004925,61.384615
0.7,13.533876,0.190966,0.201499,0.023234,0.681242,0.683827,0.672821,0.679297,0.004884,49.0
0.9,16.018306,0.192539,0.23262,0.022917,0.681246,0.682082,0.672884,0.678738,0.004492,51.4375
0.95,27.25395,0.274837,0.349879,0.03269,0.681115,0.682885,0.672046,0.678682,0.004834,45.090909
0.99,20.385668,0.179369,0.322847,0.02217,0.67989,0.681652,0.671911,0.677817,0.004467,59.076923


## Gridsearch

In [9]:
from sklearn.model_selection import GridSearchCV

# Define search
gs = GridSearchCV(
    rf_clf, 
    param_grid=param_grid, 
    cv=3, n_jobs=-1, 
    verbose=1,
    scoring="roc_auc"
)

# Search

gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [34]:
pd.DataFrame(gs_results.cv_results_).to_csv("./results/rf_gs.csv")

In [37]:
gs_data = pd.DataFrame(gs_results.cv_results_)
gs_data.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__apply_PCA,param_pca__n_components,param_rf__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,6.669488,0.043308,0.175162,0.037076,True,0.5,,"{'pca__apply_PCA': True, 'pca__n_components': ...",0.664689,0.656231,0.663606,0.661509,0.003758,31
1,4.758661,0.230081,0.120528,0.001336,True,0.5,10.0,"{'pca__apply_PCA': True, 'pca__n_components': ...",0.676188,0.672681,0.670279,0.673049,0.002426,19
2,5.387558,0.254806,0.121892,0.001012,True,0.5,20.0,"{'pca__apply_PCA': True, 'pca__n_components': ...",0.663767,0.658679,0.657976,0.660141,0.00258,43
3,5.655724,0.235081,0.125896,0.005839,True,0.5,30.0,"{'pca__apply_PCA': True, 'pca__n_components': ...",0.665587,0.654633,0.662155,0.660791,0.004575,37
4,6.255743,0.263199,0.143448,0.010125,True,0.7,,"{'pca__apply_PCA': True, 'pca__n_components': ...",0.664689,0.656231,0.663606,0.661509,0.003758,31


In [38]:
gs_data.groupby(by='param_pca__n_components').mean()

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
param_pca__n_components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.5,5.467549,0.13921,0.123487,0.006395,0.671436,0.671373,0.664409,0.669072,0.005151,22.375
0.7,6.451854,0.239094,0.143105,0.019473,0.671436,0.671373,0.664409,0.669072,0.005151,22.75
0.9,5.836493,0.184592,0.11769,0.002667,0.671436,0.671373,0.664409,0.669073,0.005151,22.5
0.95,5.045318,0.104051,0.120627,0.006564,0.671436,0.671373,0.664409,0.669072,0.005151,22.875
0.99,5.012952,0.08637,0.121743,0.004977,0.671436,0.671373,0.664409,0.669072,0.005151,22.125
