# Random Forest Classifier
* In this notebook we are going to train a model using Random Forest, experiment with various params using grid search and try to come up with best `Random Forest` version.

## Install Libraries

In [1]:
# %pip install scikit-learn

## Import Libraries

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,recall_score,precision_score,precision_recall_curve
import seaborn as sns


# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import preprocessing
from src.utils import common
from src.utils.training import refit_strategy

## Initialize Directories

In [3]:
data_root_dir = Path("..", "data/")
models_root_dir = Path("..", "models/")

## Read Data

In [4]:
X_train = pd.read_csv(Path(data_root_dir,"X_train.csv"))
y_train = pd.read_csv(Path(data_root_dir,"y_train.csv"))

## Training Default Model

In [5]:
# import sklearn


# sklearn.metrics.get_scorer_names() 

In [6]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline


default_random_forest_model = RandomForestClassifier(random_state=42)

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", default_random_forest_model)
])

scoring = ["recall", "precision", "f1"]

default_random_forest_scores = cross_validate(
    estimator=model_pipeline, 
    X=X_train, 
    y=y_train.values.ravel(), 
    cv=3, scoring=scoring,
    n_jobs=-1, verbose=2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


[CV] END .................................................... total time=   3.3s
[CV] END .................................................... total time=   3.3s
[CV] END .................................................... total time=   3.3s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.7s finished


In [7]:
default_random_forest_scores

{'fit_time': array([2.47265124, 2.36221004, 2.36410451]),
 'score_time': array([0.80415249, 0.90820003, 0.88891435]),
 'test_recall': array([0.88475666, 0.87511478, 0.87396694]),
 'test_precision': array([0.84946   , 0.84094419, 0.85454545]),
 'test_f1': array([0.86674913, 0.85768928, 0.86414709])}

In [8]:
cv_scores = default_random_forest_scores


In [9]:
mean_recall,mean_precision,mean_f1 = common.calculate_mean_from_cv(default_random_forest_scores)

Mean Recall: 0.877946127946128, Mean Precision: 0.877946127946128,Mean F1: 0.8628618320927531


In [10]:
# commenting this code out to avoid overwriting the metrics file. 
common.update_models_metrics("Random Forest", "v0", mean_recall,mean_precision,mean_f1)

Unnamed: 0,model,version,recall,precision,f1,file
0,Logistic Regression,v0,0.885369,0.885369,0.870934,
1,Logistic Regression,v1,0.885675,0.85711,0.87114,logistic_regression_v1.joblib
2,Logistic Regression,v2,0.886746,0.857321,0.871766,logistic_regression_v2.joblib
3,Linear SVC,v0,0.887511,0.887511,0.871667,
4,Linear SVC,v1,0.887435,0.8564,0.871625,linear_svc_v1.joblib
5,Linear SVC,v2,0.887894,0.856845,0.872076,linear_svc_v2.joblib
6,Random Forest,v0,0.87787,0.87787,0.862945,
7,Random Forest,v1,0.881772,0.848675,0.864893,random_forest_v1.joblib
8,Random Forest,v0,0.877946,0.877946,0.862862,


## GridSearch CV v1

In [11]:
## checking params
# preprocessing.pipeline.get_params()

In [12]:
from sklearn.model_selection import GridSearchCV


model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", RandomForestClassifier(random_state=42))
])

scoring = ["recall", "precision", "f1"]

param_grid = {
    "prediction__n_estimators": [250,300,350,400,500],
    "prediction__criterion": ["gini", "entropy", "log_loss"],
    "prediction__max_features": ["sqrt", "log2", None]
}

grid_search = GridSearchCV(model_pipeline, param_grid,
                           scoring=scoring, cv=3, n_jobs=-1, refit=refit_strategy)
grid_search.fit(X_train, y_train.values.ravel())

In [13]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Scores : {grid_search.best_index_}")
average_recall,average_precision,average_f1 = common.read_best_mean_grid_search_metrics(grid_search.cv_results_,grid_search.best_index_)

Best Parameters: {'prediction__criterion': 'log_loss', 'prediction__max_features': 'sqrt', 'prediction__n_estimators': 400}
Best Scores : 33
Mean Recall: 0.882843587389042, Mean Precision: 0.8490424292899217,Mean F1: 0.8655937899834956


In [14]:
# commenting this code out to avoid overwriting the metrics file.
_, file_name = common.save_model(
    "Random Forest", "v1", grid_search.best_estimator_)
common.update_models_metrics("Random Forest", "v1", average_recall,
                             average_precision, average_f1, file_name=file_name)
common.update_model_params(
    "RandomForest", "v1", grid_search.best_params_)

[{'name': 'LogisticRegression',
  'version': 'v1',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l1',
   'prediction__solver': 'liblinear'}},
 {'name': 'LogisticRegression',
  'version': 'v2',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l2',
   'prediction__solver': 'saga',
   'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal',
   'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal',
   'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal',
   'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal',
   'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot',
   'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}},
 {'name': 'LinearSVC',
  'version': 'v1',
  'params': {'prediction__C': 10, 'prediction__penalty': 'l1'}},
 {'name': 'LinearSVC',
  'versi

## GridSearch CV v2

In [15]:
preprocessing.pipeline.get_params(deep=True)

{'force_int_remainder_cols': True,
 'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('preprocess_gender',
   Pipeline(steps=[('default_cat_pipeline',
                    Pipeline(steps=[('fill_empty_strings',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function fill_empty_strings_fn at 0x7fa884208c20>)),
                                    ('strip_spaces',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function strip_spaces_fn at 0x7fa884208f40>)),
                                    ('to_lower_case',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function to_lower_case_fn at 0x7fa8842a3240>)),
                

In [16]:

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", RandomForestClassifier(random_state=42))
])

scoring = ["recall", "precision", "f1"]

## experiment between onehot and ordinal encoding of various features.

param_grid = {
    "preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding":["onehot", "ordinal"],
    "prediction__n_estimators": [100,150,200,250,300,350,400,500],
    "prediction__criterion": ["gini", "entropy", "log_loss"],
    "prediction__max_features": ["sqrt", "log2", None]
}

grid_search = GridSearchCV(model_pipeline, param_grid, scoring=scoring, cv=3,n_jobs=-1,refit=refit_strategy)
grid_search.fit(X_train, y_train.values.ravel())


In [19]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Scores : {grid_search.best_index_}")
average_recall,average_precision,average_f1 = common.read_best_mean_grid_search_metrics(grid_search.cv_results_,grid_search.best_index_)

Best Parameters: {'prediction__criterion': 'gini', 'prediction__max_features': 'sqrt', 'prediction__n_estimators': 400, 'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'onehot', 'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal', 'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal', 'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal', 'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'ordinal', 'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'onehot'}
Best Scores : 414
Mean Recall: 0.8835322926232018, Mean Precision: 0.8503987121482556,Mean F1: 0.8666254719048757


In [20]:
# commenting this code out to avoid overwriting the metrics file.
_, file_name = common.save_model(
    "Random Forest", "v2", grid_search.best_estimator_)
common.update_models_metrics("Random Forest", "v2", average_recall,
                             average_precision, average_f1, file_name=file_name)
common.update_model_params(
    "RandomForest", "v2", grid_search.best_params_)

[{'name': 'LogisticRegression',
  'version': 'v1',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l1',
   'prediction__solver': 'liblinear'}},
 {'name': 'LogisticRegression',
  'version': 'v2',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l2',
   'prediction__solver': 'saga',
   'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal',
   'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal',
   'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal',
   'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal',
   'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot',
   'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}},
 {'name': 'LinearSVC',
  'version': 'v1',
  'params': {'prediction__C': 10, 'prediction__penalty': 'l1'}},
 {'name': 'LinearSVC',
  'versi

Observations:
* In general there is not significan't improvement between 2 versions of Random Forest and also between Random Forest and previous Linear Models.