In [29]:
import pickle
from time import time
from typing import List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

In [2]:
df = pd.read_csv("input/bioresponse/train.csv")
df.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
target_name = "Activity"
y = df.loc[:, target_name]
X = df.drop(target_name, axis="columns")

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=1,
                                                    )

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3000, 1776), (751, 1776), (3000,), (751,))

In [5]:
kwargs_rf = {"random_state": 1}
grid_rf = {"n_estimators": np.arange(50, 500),
           "max_features": ["auto", "sqrt", 0.33],
           "min_samples_leaf": np.arange(1, 15),
           }

In [6]:
rf_base = RandomForestClassifier(**kwargs_rf)

In [7]:
n_settings_to_sample = 300
n_jobs = 2

start_time = time()

#Instantiate CV model
rf_cv = RandomizedSearchCV(estimator=rf_base,
                           param_distributions=grid_rf,
                           n_iter=n_settings_to_sample,
                           random_state=1,
                           n_jobs=n_jobs,
                           )

#fit model performing CV
rf_cv.fit(X_train, y_train)

end_time = time()
tot_time = end_time - start_time
print(f"CV trying {n_settings_to_sample} settings fitted in {tot_time / 3600} hours"
      f" with {n_jobs} jobs!"
      )

CV trying 300 settings fitted in 2.5820587002568773 hours with 2 jobs!


In [11]:
best_params = rf_cv.best_params_
best_params

{'n_estimators': 205, 'min_samples_leaf': 3, 'max_features': 0.33}

In [24]:
df_tried_params = pd.DataFrame(rf_cv.cv_results_.get("params"))
df_tried_params.head()

Unnamed: 0,n_estimators,min_samples_leaf,max_features
0,344,11,0.33
1,498,4,0.33
2,66,10,0.33
3,265,9,auto
4,436,2,sqrt


In [27]:
out_name = "output/better_rf_1_cv.pickle"
with open(out_name, "wb") as f:
    pickle.dump(rf_cv, f) #whole CV model is being dumped, not only best_estimator_! For further studies

In [38]:
def various_metrics_binary_classification(model, metrics: List, X: pd.DataFrame, y_true: pd.Series, identifier: str = None):
    
    if not (isinstance(metrics, List) and any(metrics)):
        raise ValueError("First argument should be a non-empty list!")
        
    pred_proba = model.predict_proba(X)[:, 1]
    
    results = {}
    for metric in metrics:
        result = metric(y_true, pred_proba)
        label_metric = metric.__name__ if not identifier else f"{metric.__name__}_{identifier}"
        results[label_metric] = result
        
    return results

metrics = [roc_auc_score, log_loss]
results_metrics_train = various_metrics_binary_classification(rf_cv, metrics, X_train, y_train, identifier="train")
results_metrics_test = various_metrics_binary_classification(rf_cv, metrics, X_test, y_test, identifier="test")
results_metrics = {**results_metrics_train, **results_metrics_test}
results_metrics

{'roc_auc_score_train': 0.9987689530863688,
 'log_loss_train': 0.1884336270550344,
 'roc_auc_score_test': 0.8729358322381578,
 'log_loss_test': 0.44510011383177567}

In [39]:
pd.Series(results_metrics)

roc_auc_score_train    0.998769
log_loss_train         0.188434
roc_auc_score_test     0.872936
log_loss_test          0.445100
dtype: float64

My note: it is only slightly better on test set that previous model!