# Random Forest
Random Forest is an ensemble learning algorithm that builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting. It works by randomly selecting subsets of features and samples from the training data to train each tree, and then averaging their predictions to make the final prediction. 

Its advantages include handling both regression and classification problems, reducing overfitting, handling high-dimensional datasets, and estimating feature importance. Its disadvantages are that it is slower and more computationally expensive than a single decision tree, is difficult to interpret, and may not perform well on imbalanced or noisy datasets.

In [27]:
import pandas as pd
import numpy as np
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='output/spam_email.csv')

In [28]:
# Splitting the data into train and test samples.
from sklearn.model_selection import train_test_split

# Every column except 'spam' is a predictor; 'spam' is the target.
df_predictors = df.drop('spam', axis = 1)
df_predicted = df['spam']

# A fixed random_state makes the split reproducible. Without it the partition
# depends on the state of the global NumPy RNG when this cell runs, so the
# scores reported further down change on every "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(df_predictors,
                                                    df_predicted,
                                                    random_state = 30)

In [29]:
# Random forest model
np.random.seed(30)
from sklearn.ensemble import RandomForestClassifier

# random_state is passed to the estimator itself so that the fitted forest
# (and any clone of it made later, e.g. during cross-validation) is
# reproducible regardless of how much of the global NumPy RNG was consumed.
forest = RandomForestClassifier(n_estimators = 1000, random_state = 30)
forest.fit(X_train, y_train)

# Prediction score (mean accuracy on the held-out test sample) and the
# individual predictions used to count the misclassified rows.
fscore = forest.score(X_test, y_test)
fy_pred = forest.predict(X_test)

n_mislabeled = (y_test != fy_pred).sum()

# Single f-string instead of mixing %-formatting, str.format and a backslash
# continuation; the printed text is unchanged.
print(f"Number of mislabeled points out of a total {X_test.shape[0]} points : {n_mislabeled} \n"
      f"Model score: {fscore:.2%}")

Number of mislabeled points out of a total 1151 points : 60 
Model score: 94.79%


### K-Fold Cross Validation

In [30]:
from sklearn.model_selection import cross_val_score

# cv=20 requests 20 folds, i.e. 20 train/validation splits, each with its own fitted model.
scores = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=20)

def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of float
        Scores to summarise, e.g. the array returned by ``cross_val_score``.
        Plain lists and tuples are converted to a NumPy array first; any
        other object is used as-is and must provide ``mean()`` and ``std()``.

    Returns
    -------
    None
        The summary is written to standard output. Mean and standard
        deviation are shown both as raw values and as percentages.
    """
    # Lists/tuples have no .mean()/.std(); convert them so they are accepted too.
    if isinstance(scores, (list, tuple)):
        scores = np.asarray(scores)

    # Compute each statistic once and reuse it for both the raw and % rendering.
    mean = scores.mean()
    std = scores.std()

    print("Scores:", scores)
    print("\nMean:", mean, f"({mean:.2%})")
    print("\nStandard deviation:", std, f"({std:.2%})")

# Summarise the 20 per-fold scores computed above.
display_scores(scores=scores)

Scores: [0.95953757 0.95375723 0.97687861 0.94797688 0.95953757 0.93063584
 0.96531792 0.95375723 0.96531792 0.95953757 0.95930233 0.97093023
 0.97093023 0.91860465 0.93604651 0.94767442 0.97674419 0.9127907
 0.93604651 0.97674419]

Mean: 0.9539034144374241 (95.39%)

Standard deviation: 0.01833765384073895 (1.83%)


### Repeated K-Fold Cross Validation

In [32]:
np.random.seed(30)

from sklearn.model_selection import RepeatedKFold, cross_val_score

# 20 folds repeated 5 times gives 100 train/validation splits in total;
# random_state=2 seeds the splitter.
cv = RepeatedKFold(n_repeats=5, n_splits=20, random_state=2)

scores = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=cv)

display_scores(scores=scores)

Scores: [0.95375723 0.95953757 0.94219653 0.94797688 0.97109827 0.97109827
 0.93641618 0.95953757 0.95375723 0.94219653 0.97093023 0.93604651
 0.93023256 0.93023256 0.95930233 0.94186047 0.97674419 0.94186047
 0.96511628 0.95930233 0.97109827 0.97687861 0.95953757 0.95953757
 0.92485549 0.93641618 0.96531792 0.95953757 0.94797688 0.96531792
 0.94186047 0.94186047 0.97093023 0.95348837 0.94767442 0.94186047
 0.94186047 0.95348837 0.94186047 0.94767442 0.94219653 0.97687861
 0.97109827 0.94797688 0.95953757 0.96531792 0.93063584 0.95375723
 0.96531792 0.94797688 0.94767442 0.95348837 0.95930233 0.91860465
 0.99418605 0.93023256 0.94767442 0.94767442 0.95930233 0.95930233
 0.92485549 0.98265896 0.95375723 0.97109827 0.98843931 0.94797688
 0.94797688 0.97687861 0.95375723 0.95953757 0.95348837 0.9127907
 0.94767442 0.9244186  0.93604651 0.97674419 0.94767442 0.94186047
 0.93604651 0.95930233 0.93641618 0.93641618 0.97109827 0.93063584
 0.95953757 0.96531792 0.97109827 0.94797688 0.93641618