# Make a Random Forest model using bagging

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score

# Single seed reused by the train/test split, the forest and the randomized
# search so that repeated runs give the same results.
RSEED = 42

### Load the data

In [None]:
# Load the CSV at data/cleaned_data.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')

### Prepare Data for training

In [None]:
# Separate the 'state' column (target) from the remaining columns (features).
X = df.drop(columns='state')
y = df['state']

# Hold out a test set; stratifying on y keeps the class proportions
# comparable between the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Baseline forest: library defaults apart from the fixed seed and using all cores.
rfc = RandomForestClassifier(
    n_jobs=-1,
    random_state=RSEED,
)
rfc.fit(X=X_train, y=y_train)


In [None]:
# Class predictions of the baseline forest on the held-out test set.
y_pred_rfc = rfc.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the baseline forest on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred_rfc))

In [None]:
# Confusion matrix of the baseline forest on the test set.
# fmt='d' prints the cell counts as plain integers (seaborn's default '.2g'
# would render larger counts in scientific notation) and matches the other
# confusion-matrix plots in this notebook.
sns.heatmap(confusion_matrix(y_test, y_pred_rfc), annot=True, fmt='d')

## Perform Randomized Search

In [None]:
# Search space for the randomized hyperparameter search.
params_rf = {
    "n_estimators": range(10, 150, 10),
    "criterion": ['gini', 'entropy', 'log_loss'],
    "min_samples_leaf": range(1, 20),
    "max_features": ['sqrt', 'log2'],
    "class_weight": ['balanced', 'balanced_subsample'],
    # Floats are fractions of the training set drawn for each tree.
    # The last entry must be the float 1.0: an integer 1 would be interpreted
    # as "draw exactly one sample per tree", not "use all samples".
    "max_samples": [0.1, 0.25, 0.5, 0.75, 1.0],
}

# Randomized search: evaluates 600 sampled parameter combinations with
# 5-fold cross-validation, ranked by precision.
# NOTE(review): scoring='precision' assumes a binary target whose positive
# label is 1 -- confirm this holds for the 'state' column.
gs_rf = RandomizedSearchCV(
    rfc,
    n_iter=600,
    param_distributions=params_rf,
    cv=5,
    n_jobs=-1,
    verbose=5,
    scoring='precision',
    random_state=RSEED,
)

# Fit the search object on the training data.
gs_rf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score (rounded to 3 decimals) and the
# parameter combination that achieved it, followed by a separator line.
for label, value in (
    ('Best score:', round(gs_rf.best_score_, 3)),
    ('Best parameters:', gs_rf.best_params_),
):
    print(label, value)
print('----------' * 8)

In [None]:
# Test-set predictions of the refitted best model from the randomized search:
# hard class labels, and column 1 of the predicted probability matrix.
y_pred_best_random_rf = gs_rf.predict(X_test)
y_pred_proba_random_rf = gs_rf.best_estimator_.predict_proba(X_test)[:, 1]

In [None]:
# Text report for the randomized-search model, then its confusion matrix
# plotted with integer cell annotations.
print(
    '----------' * 6,
    'Classification report:',
    classification_report(y_true=y_test, y_pred=y_pred_best_random_rf),
    '----------' * 6,
    'Confusion matrix:',
    sep='\n',
)
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_pred_best_random_rf),
    annot=True,
    fmt='d',
)
print('----------' * 6)


### More targeted grid search

In [None]:
# Narrower parameter grid that is searched exhaustively.
params_rf = dict(
    n_estimators=range(70, 110, 10),
    criterion=['gini', 'entropy', 'log_loss'],
    min_samples_leaf=range(7, 15, 2),
    max_features=['sqrt', 'log2'],
    class_weight=['balanced', 'balanced_subsample'],
    max_samples=[0.6, 0.7, 0.8],
)

# Exhaustive grid search with 5-fold cross-validation, ranked by precision.
gs_rf_grid = GridSearchCV(
    estimator=rfc,
    param_grid=params_rf,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit the search object on the training data.
gs_rf_grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score (rounded to 3 decimals) and the
# winning parameter combination of the grid search, then a separator line.
for label, value in (
    ('Best score:', round(gs_rf_grid.best_score_, 3)),
    ('Best parameters:', gs_rf_grid.best_params_),
):
    print(label, value)
print('----------' * 8)

In [None]:
# Test-set predictions of the refitted best model from the grid search:
# hard class labels, and column 1 of the predicted probability matrix.
y_pred_best_grid_rf = gs_rf_grid.predict(X_test)
y_pred_proba_grid_rf = gs_rf_grid.best_estimator_.predict_proba(X_test)[:, 1]

In [None]:
# Text report for the grid-search model, then its confusion matrix plotted
# with integer cell annotations.
print(
    '----------' * 6,
    'Classification report:',
    classification_report(y_true=y_test, y_pred=y_pred_best_grid_rf),
    '----------' * 6,
    'Confusion matrix:',
    sep='\n',
)
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_pred_best_grid_rf),
    annot=True,
    fmt='d',
)
print('----------' * 6)

## Grid search 2: larger max_samples (class_weight still included)

In [None]:
# Parameter grid for the second exhaustive search; max_samples is limited to
# 0.9 and 1.0 here and class_weight is still part of the grid.
params_rf = dict(
    n_estimators=range(80, 111, 10),
    criterion=['gini', 'entropy', 'log_loss'],
    min_samples_leaf=range(7, 12, 2),
    max_features=['sqrt', 'log2'],
    class_weight=['balanced', 'balanced_subsample'],
    max_samples=[0.9, 1.0],
)

# Exhaustive grid search with 5-fold cross-validation, ranked by precision.
gs_rf_grid_new = GridSearchCV(
    estimator=rfc,
    param_grid=params_rf,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit the search object on the training data.
gs_rf_grid_new.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score (rounded to 3 decimals) and the
# winning parameter combination of the second grid search, then a separator.
for label, value in (
    ('Best score:', round(gs_rf_grid_new.best_score_, 3)),
    ('Best parameters:', gs_rf_grid_new.best_params_),
):
    print(label, value)
print('----------' * 8)

In [None]:
# Test-set predictions of the refitted best model from the second grid search:
# hard class labels, and column 1 of the predicted probability matrix.
y_pred_best_grid_new_rf = gs_rf_grid_new.predict(X_test)
y_pred_proba_grid_new_rf = gs_rf_grid_new.best_estimator_.predict_proba(X_test)[:, 1]

In [None]:
# Text report for the second grid-search model, then its confusion matrix
# plotted with integer cell annotations.
print(
    '----------' * 6,
    'Classification report:',
    classification_report(y_true=y_test, y_pred=y_pred_best_grid_new_rf),
    '----------' * 6,
    'Confusion matrix:',
    sep='\n',
)
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_pred_best_grid_new_rf),
    annot=True,
    fmt='d',
)
print('----------' * 6)

### Grid search 3: without class_weight

In [None]:
# Parameter grid for the third exhaustive search; class_weight is not part
# of this grid.
params_rf = dict(
    n_estimators=range(80, 111, 10),
    criterion=['gini', 'entropy', 'log_loss'],
    min_samples_leaf=range(7, 12, 2),
    max_features=['sqrt', 'log2'],
    max_samples=[0.9, 1.0],
)

In [None]:
# Exhaustive grid search over the current params_rf with 5-fold
# cross-validation, ranked by precision.
gs_rf_grid_wo_weight = GridSearchCV(
    estimator=rfc,
    param_grid=params_rf,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit the search object on the training data.
gs_rf_grid_wo_weight.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score (rounded to 3 decimals) and the
# winning parameter combination of the third grid search, then a separator.
for label, value in (
    ('Best score:', round(gs_rf_grid_wo_weight.best_score_, 3)),
    ('Best parameters:', gs_rf_grid_wo_weight.best_params_),
):
    print(label, value)
print('----------' * 8)

In [None]:
# Test-set predictions of the refitted best model from the third grid search:
# hard class labels, and column 1 of the predicted probability matrix.
y_pred_best_grid_wo_weight_rf = gs_rf_grid_wo_weight.predict(X_test)
y_pred_proba_grid_wo_weight_rf = gs_rf_grid_wo_weight.best_estimator_.predict_proba(X_test)[:, 1]

In [None]:
# Text report for the third grid-search model, then its confusion matrix
# plotted with integer cell annotations.
print(
    '----------' * 6,
    'Classification report:',
    classification_report(y_true=y_test, y_pred=y_pred_best_grid_wo_weight_rf),
    '----------' * 6,
    'Confusion matrix:',
    sep='\n',
)
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_pred_best_grid_wo_weight_rf),
    annot=True,
    fmt='d',
)
print('----------' * 6)

In [None]:
import numpy as np

# Feature importances of the best forest from the first grid search.
importances = gs_rf_grid.best_estimator_.feature_importances_

# Spread of each feature's importance across the individual trees:
# one row per tree, standard deviation taken down the rows.
std = np.asarray(
    [tree.feature_importances_ for tree in gs_rf_grid.best_estimator_.estimators_]
).std(axis=0)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the best forest from the first grid search,
# measured on the held-out test split. This notebook only creates train and
# test partitions, so there is no separate validation set to pass here.
r = permutation_importance(gs_rf_grid.best_estimator_, X_test, y_test,
                           n_repeats=30,
                           random_state=0)

# Walk the features from most to least important and print only those whose
# mean importance exceeds zero by more than two standard deviations.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        # Feature names are the columns of the feature frame; a DataFrame
        # has no `feature_names` attribute.
        print(f"{X_test.columns[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")