# Random Forest Model

## Creating a predictive model using Bagging Ensemble method


Import the necessary libraries


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    accuracy_score,
)
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pickle

Create the scoring function based on the fbeta score where beta = 2


In [2]:
def f2_func(y_true, y_pred):
    f2_score = fbeta_score(y_true, y_pred, beta=2, average="weighted")
    return f2_score


def my_f2_scorer():
    return make_scorer(f2_func)

Load pre-seperated data from CSV files


In [3]:
# Import cleaned train and test data
X_train = pd.read_csv("train_X_In-Car-Rec.csv")
y_train = pd.read_csv("train_y_In-Car-Rec.csv")
X_test = pd.read_csv("test_X_In-Car-Rec.csv")
y_test = pd.read_csv("test_y_In-Car-Rec.csv")

Creating generic random forest model to determine some base metrics


In [4]:
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train.values.ravel())

In [5]:
print("Max Tree depths: ", max(list([t.get_depth() for t in classifier.estimators_])))
print(
    "Max Tree number of leaves: ",
    max(list([t.get_n_leaves() for t in classifier.estimators_])),
)

Max Tree depths:  38
Max Tree number of leaves:  3138


## Hyperparamaterizing and Fitting

Determine parameters to start filter through the randomized search


In [6]:
# Defining hyperparameters for tuning
param_grid = {
    "criterion": ["entropy"],
    "n_estimators": [100, 150, 200, 300],
    "max_depth": [10, 15, 20, 25, 30, 35],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4, 6, 8, 10],
}
randomized_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=param_grid,
    n_iter=1000,
    cv=5,
    scoring=my_f2_scorer(),
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

Going through the randomized search to find the best parameters.


In [7]:
# Fitting the Model to the training data using the finalized parameters:
randomized_search.fit(X_train, y_train.values.ravel())


# Output the best f2_weighted score
best_f2_weighted_score = randomized_search.best_score_
print(
    f"The best f2_weighted score from RandomizedSearchCV is: {best_f2_weighted_score:.4f}"
)

# Output the best parameters
best_parameters = randomized_search.best_params_
print("The best parameters from RandomSearchCV are:")
for param, value in best_parameters.items():
    print(f"{param}: {value}")

Fitting 5 folds for each of 432 candidates, totalling 2160 fits




The best f2_weighted score from RandomizedSearchCV is: 0.7499
The best parameters from RandomSearchCV are:
n_estimators: 200
min_samples_split: 2
min_samples_leaf: 1
max_depth: 30
criterion: entropy


### Finalizing Model


In [9]:
# Creating the final pipeline with preprocessing and the classifier
finalmodel = RandomForestClassifier()
finalmodel.set_params(**best_parameters)

In [10]:
# Train the final pipeline
finalmodel.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


Predicting on the Test dataset


In [11]:
# Predict on the test set
y_pred = finalmodel.predict(X_test)

Displaying the confusion matrix, and our FBeta Score, Beta = 2 for the best Random Forest Classifier


In [12]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision
precision = precision_score(y_test, y_pred, average="weighted")
print(f"\nPrecision (weighted): {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average="weighted")
print(f"Recall (weighted): {recall:.4f}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate f2_score on the test data
f2_score = fbeta_score(y_test, y_pred, average="weighted", beta=2)
print(f"F2Score for the MLP Classifier Model is: " + str(f2_score))

Confusion Matrix:
[[ 727  351]
 [ 223 1236]]

Precision (weighted): 0.7731
Recall (weighted): 0.7737
Accuracy: 0.7737
F2Score for the MLP Classifier Model is: 0.7723182163641071


## Pickling the Model


In [3]:
# final pipeline
pipeline_RandForest = Pipeline(
    [
        (
            "randomForest",
            RandomForestClassifier(
                n_estimators=200,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=30,
                criterion="entropy",
            ),
        )
    ]
)

In [4]:
# Specify the filename where you want to save the model
filename = "RandomForest_Model.pkl"

# Export the model to the file using pickle.dump
with open(filename, "wb") as file:
    pickle.dump(pipeline_RandForest, file)