# Random Forest

# Random Forest

Random Forest is a `supervised learning algorithm`. As its name suggests, it builds a "forest" and introduces randomness into how that forest is built. The "forest" is an ensemble of Decision Trees, most of the time trained with the "bagging" method. The general idea of the bagging method is that combining several learning models improves the overall result.

In simple terms: Random Forest builds multiple decision trees and merges their predictions to get a more accurate and stable result.


In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    classification_report,
    confusion_matrix,
    accuracy_score,
    mean_absolute_error,
)

In [58]:
# Load seaborn's "tips" example dataset and preview its first ten rows.
df = sns.load_dataset("tips")
df.head(n=10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [59]:
# Summarise the frame: row count, column dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [60]:
# Replace every categorical/string column with integer codes so the
# scikit-learn estimators below can consume it.
# One LabelEncoder is fitted per column and kept in `encoders`, so the
# integer -> label mapping of each column stays available afterwards via
# `encoders[col].classes_` or `encoders[col].inverse_transform(...)`.
# NOTE: `df` is overwritten in place; the original labels are only
# recoverable through `encoders`.
encoders = {}
for col in df.select_dtypes(include=["category", "object"]).columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Print how many rows fall into each encoded value of the categorical columns.
for col in ("day", "sex", "smoker", "time"):
    print(df[col].value_counts())

day
1    87
2    76
3    62
0    19
Name: count, dtype: int64
sex
1    157
0     87
Name: count, dtype: int64
smoker
0    151
1     93
Name: count, dtype: int64
time
0    176
1     68
Name: count, dtype: int64


In [61]:
# Classification task: `sex` is the target, every other column is a feature.
y = df["sex"]
x = df.drop(columns="sex")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)



In [62]:
# Fit a forest of 200 trees using entropy as the split criterion (fixed seed),
# then predict the target for the held-out rows.
model = RandomForestClassifier(n_estimators=200, random_state=42, criterion="entropy")
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# evaluate the model
print("accuracy score: ", accuracy_score(y_test, y_pred))
print("confusion matrix: \n", confusion_matrix(y_test, y_pred))
# Start the report on its own line: printing it right after the label shifts the
# header row out of alignment with the table body. The strings are concatenated
# (rather than passed as two print arguments) so no separator space is inserted
# in front of the header.
print("classification report:\n" + classification_report(y_test, y_pred))

accuracy score:  0.6122448979591837
confusion matrix: 
 [[ 7 12]
 [ 7 23]]
classification report:                precision    recall  f1-score   support

           0       0.50      0.37      0.42        19
           1       0.66      0.77      0.71        30

    accuracy                           0.61        49
   macro avg       0.58      0.57      0.57        49
weighted avg       0.60      0.61      0.60        49



In [63]:
# from sklearn.tree import export_graphviz
# import os
# from subprocess import call

# # Export the first tree in the forest
# tree = model.estimators_[0]

# export_graphviz(
#     tree,
#     out_file="./saved_models/decsion_tree_dot_code_file_no_15.dot",
#     feature_names=x.columns,
#     filled=True,
#     rounded=True,
# )
# # Create the output directory if it doesn't exist (this must run before the export above, which writes into it)
# os.makedirs("./saved_models", exist_ok=True)

# # Export all trees in the forest
# for i, tree_in_forest in enumerate(model.estimators_):
#     pass
#     # export_graphviz(
#     #     tree_in_forest,
#     #     out_file=f"./saved_models/decision_tree_{i}.dot",
#     #     feature_names=x.columns,
#     #     filled=True,
#     #     rounded=True,
#     # ) 
#     # Convert to PNG with the Graphviz `dot` command (Graphviz must be installed on the local machine)
#     # call(['dot', '-Tpng', f"./saved_models/decision_tree_{i}.dot", '-o', f"./saved_models/decision_tree_{i}.png", '-Gdpi=600'])

# Random Forest for a regression problem

In [64]:


# Regression task: `tip` is the target, every other column is a feature.
# NOTE: this cell rebinds `y`, `y_train`, `y_test`, `y_pred` and `model`, replacing
# the objects created for the classification task earlier in the notebook.
X = df.drop('tip', axis = 1)
y = df['tip']

# train test split the data (20% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# create, train and predict the model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


# evaluate the model
# The MSE is computed once and reused for the RMSE instead of being recomputed.
mse = mean_squared_error(y_test, y_pred)
print('mean squared error: ', mse)
print('mean absolute error: ', mean_absolute_error(y_test, y_pred))
print('r2 score: ', r2_score(y_test, y_pred))
print('root mean squared error: ', np.sqrt(mse))


mean squared error:  0.9496955984183694
mean absolute error:  0.7704663265306128
r2 score:  0.2402261043743361
root mean squared error:  0.9745232672534655
