### Review the "RF experiments- Useful hints for SciKit users for HW 2.docx" file on Canvas

In [None]:
pip install pandas scikit-learn numpy matplotlib

## Add Imports

In [None]:
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from collections import Counter
import matplotlib.pyplot as plt

## Create The Pipeline

In [None]:
class RandomForestPipeline:
    """Load a labelled CSV, train a random forest on it, and report its scores.

    The intended call order is ``load_data()`` -> ``train()`` -> ``evaluate()``.
    Calling a step before its prerequisite raises ``RuntimeError`` with a
    message naming the missing step.
    """

    def __init__(self, file_path, n_estimators=500, max_features='sqrt', random_state=0):
        """Store the configuration; no file is read and no model is built here.

        Args:
            file_path: Path of the CSV file read by ``load_data``.
            n_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.
            max_features: Forwarded to ``RandomForestClassifier(max_features=...)``.
            random_state: Seed used for both the train/test split and the forest.
        """
        self.file_path = file_path
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        # Populated by train() once the forest has been fitted.
        self.clf = None
        # Populated by load_data().
        self.X_train = self.X_test = self.y_train = self.y_test = None

    def load_data(self, test_size=0.2, label_column='Label'):
        """Read the CSV and split it into shuffled train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split(test_size=...)``.
            label_column: Name of the target column; every other column is
                used as a feature.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``, which is also
            stored on the instance.

        Raises:
            KeyError: If ``label_column`` is not a column of the CSV.
        """
        df = pd.read_csv(self.file_path)
        X = df.drop(label_column, axis=1)
        y = df[label_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, shuffle=True, random_state=self.random_state
        )
        return self.X_train, self.X_test, self.y_train, self.y_test

    def train(self, cv=4):
        """Cross-validate on the training split, then fit and report scores.

        Prints the per-fold and mean cross-validation scores, the score on the
        held-out test split, and the out-of-bag accuracy and error.

        Args:
            cv: Forwarded to ``cross_val_score(cv=...)``.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("No training data loaded; call load_data() before train().")

        clf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_features=self.max_features,
            oob_score=True,
            random_state=self.random_state
        )

        scores = cross_val_score(clf, self.X_train, self.y_train, cv=cv)
        print("Cross-validation scores:", scores)
        print("Mean CV score:", np.mean(scores))

        clf.fit(self.X_train, self.y_train)
        # Publish the model only after a successful fit so that evaluate()
        # never sees an unfitted estimator.
        self.clf = clf
        print("Test set score:", self.clf.score(self.X_test, self.y_test))
        print("OOB Accuracy:", self.clf.oob_score_)
        print("OOB Error:", 1 - self.clf.oob_score_)

    def evaluate(self):
        """Print a classification report and plot the test-set confusion matrix.

        Raises:
            RuntimeError: If ``train()`` has not completed successfully yet.
        """
        if self.clf is None:
            raise RuntimeError("Model is not trained; call train() before evaluate().")

        y_pred = self.clf.predict(self.X_test)
        print("\nClassification Report:\n", classification_report(self.y_test, y_pred))

        # Pin the label order so the matrix rows/columns match the display labels.
        cm = confusion_matrix(self.y_test, y_pred, labels=self.clf.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=self.clf.classes_)
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.show()

## Set Training DB File

In [None]:
# Path of the training CSV handed to RandomForestPipeline; load_data() expects a 'Label' column.
DATA_FILE = "data/original-training-db-e1-positive.csv"

## Init Pipeline

In [None]:
# Number of trees in the forest (passed to the pipeline as n_estimators).
NTREE = 1000
# Passed to the pipeline as max_features, which it forwards to RandomForestClassifier.
MTRY = "sqrt"

# Only stores the configuration; no data is read until load_data() is called.
pipeline = RandomForestPipeline(file_path=DATA_FILE, n_estimators=NTREE, max_features=MTRY)

## Step 1: Load the Data and Split It into Train/Test Sets

In [None]:
# Read the CSV and create the train/test split, which is stored on the pipeline object.
pipeline.load_data()

## Step 2: Cross-Validate and Train the Random Forest

In [None]:
# Cross-validate on the training split, fit the forest, and print the CV, test-set, and OOB scores.
pipeline.train()

## Step 3: Evaluate on the Test Set

In [None]:
# Print the classification report and plot the confusion matrix for the test split.
pipeline.evaluate()