In [3]:
#!/usr/bin/env python
# coding: utf-8

# 🌳 Final Random Forest Model

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Load training and test data
data = pd.read_csv('./data/train_data.csv')
test_data = pd.read_csv('./data/test_data.csv')

# Define column groups
categorical_cols = ['protocol_type', 'service', 'flag']
drop_cols = ['num_outbound_cmds', 'is_host_login']
label_col = 'class'

# Drop unnecessary columns from training data
data = data.drop(columns=drop_cols, errors='ignore')

# Ensure categorical columns are string
for col in categorical_cols:
    if col in data.columns:
        data[col] = data[col].astype('string')

# Encode label
data[label_col] = data[label_col].map({'normal': 0, 'anomaly': 1})

# Split features and label
X = data.drop(columns=[label_col])
y = data[label_col]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42069)

# Build preprocessing pipeline
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', cat_transformer, categorical_cols)],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42069,
        n_jobs=-1
    ))
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Evaluate on validation split
y_test_pred = pipeline.predict(X_test)
y_test_proba = pipeline.predict_proba(X_test)[:, 1]
print("Validation Results:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC:", roc_auc_score(y_test, y_test_proba))

# === APPLY TO TEST SET ===
# Drop columns and set categorical types
test_data = test_data.drop(columns=drop_cols, errors='ignore')
for col in categorical_cols:
    if col in test_data.columns:
        test_data[col] = test_data[col].astype('string')

# Predict on new test data
test_preds = pipeline.predict(test_data)
test_proba = pipeline.predict_proba(test_data)[:, 1]

# If ground truth available
if 'class' in test_data.columns:
    y_true = test_data['class'].map({'normal': 0, 'anomaly': 1})
    print("Test Data Results:")
    print(classification_report(y_true, test_preds))
    print("Test ROC AUC:", roc_auc_score(y_true, test_proba))




Validation Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3362
           1       1.00      1.00      1.00      2936

    accuracy                           1.00      6298
   macro avg       1.00      1.00      1.00      6298
weighted avg       1.00      1.00      1.00      6298

ROC AUC: 0.9999159138763581


