<a href="https://colab.research.google.com/github/bennybahnam/cs549-final-project/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RANDOM FOREST MODEL (Bahnam Bahnam)


import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score

# 1. Load the Preprocessed Data
# The file name is defined once so the progress message, the actual read and
# the error hint always refer to the same file.
DATA_PATH = "processed_urls.csv"

print(f"Loading {DATA_PATH}...")
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Print a hint for the reader, then re-raise so execution stops here
    # instead of continuing with `df` undefined.
    # NOTE(review): "Step 1" presumably refers to the preprocessing step that
    # writes this CSV -- confirm against the project layout.
    print(f"ERROR: '{DATA_PATH}' not found. Did you run Step 1?")
    raise

# 2. Drop non-numeric columns
# Raw URL-component columns are removed before modelling. errors='ignore'
# means any of these names that are absent from the frame are skipped
# silently instead of raising.
url_component_cols = [
    'url', 'scheme', 'subdomain', 'registrable_domain',
    'suffix', 'path', 'query', 'fragment', 'port',
    'username', 'password', 'host',
]
df_clean = df.drop(columns=url_component_cols, errors='ignore')

# Features are every remaining column except the label; the label is 'type'.
label_col = 'type'
X = df_clean.drop(columns=[label_col])
y = df_clean[label_col]

# 3. Split Data
# Hold out 30% of the rows for testing. stratify=y keeps the class
# proportions of `y` in both partitions, and the fixed random_state makes
# the partition repeatable across runs.
print("Splitting Data (70% Train, 30% Test)...")
partitions = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = partitions

print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples: {len(X_test)}")

# 4. Train Random Forest
print("\nStarting Training...")
# Base estimator: fixed seed for repeatability; class_weight='balanced'
# asks scikit-learn to weight classes inversely to their frequency.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Simple grid search to satisfy requirements
# 1 x 2 x 2 = 4 parameter combinations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100],
    'max_depth': [20, None],
    'min_samples_split': [2, 5]
}

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump if
# the system clock is adjusted during a multi-minute fit.
start = time.perf_counter()
# Candidates are ranked by recall; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='recall', n_jobs=-1)
grid_search.fit(X_train, y_train)
end = time.perf_counter()

# The timed span covers the whole search (all CV fits plus the final refit),
# not a single model fit.
best_rf = grid_search.best_estimator_
print(f"Training Complete in {end - start:.2f} seconds.")
print(f"Best Params: {grid_search.best_params_}")

# 5. Results
# Score the tuned model on the held-out test partition only.
y_pred = best_rf.predict(X_test)

separator = "="*30
print("\n" + separator)
print("FINAL RESULTS")
print(separator)

# Headline metrics, each computed immediately before it is printed.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")
test_recall = recall_score(y_test, y_pred)
print(f"Recall:   {test_recall:.4f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix of true labels (first argument) vs predictions (second).
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

Loading processed_urls.csv...
Splitting Data (70% Train, 30% Test)...
Training Samples: 353453
Testing Samples: 151480

Starting Training...
Training Complete in 528.18 seconds.
Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}

FINAL RESULTS
Accuracy: 0.9677
Recall:   0.9307

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    103721
           1       0.97      0.93      0.95     47759

    accuracy                           0.97    151480
   macro avg       0.97      0.96      0.96    151480
weighted avg       0.97      0.97      0.97    151480

Confusion Matrix:
[[102142   1579]
 [  3312  44447]]
