In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

In [2]:
# Load the pre-split training and testing CSV files (paths are relative to
# the notebook's working directory).
train_csv_path = "../datasets/training_dataset_new.csv"
test_csv_path = "../datasets/testing_dataset_new.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Column groups used throughout the notebook.
# Numeric inputs: standardized with StandardScaler before training.
numerical_features = [
    'dur',
    'spkts',
    'dpkts',
    'sbytes',
    'dbytes',
    'rate',
    'sttl',
    'ct_srv_src',
    'ct_dst_ltm',
]

# Categorical inputs: expanded into one-hot indicator columns.
categorical_features = [
    'proto',
    'service',
    'state',
]

# Label column: encoded to integer class ids with LabelEncoder.
target = 'attack_cat'

In [4]:
# One-hot encode the categorical columns.
#
# Train: drop the first level of every categorical column so that level acts
# as the implicit baseline (all indicator columns equal to 0).
train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True)

# Test: keep EVERY level. `drop_first` removes the alphabetically first level
# *of the frame being encoded*, which is not necessarily the level that was
# dropped for train. Example: train levels {A, B, C} -> columns B, C; test
# levels {B, C} with drop_first -> column C only, so after aligning to the
# training columns every test row with level B would look like baseline A.
# Keeping all levels here is safe because the subsequent reindex against
# `train_df.columns` keeps exactly the training columns and discards the rest
# (including the train baseline level and any level unseen during training).
test_df = pd.get_dummies(test_df, columns=categorical_features)

In [5]:
# Align the test frame with the training frame's columns (same set, same
# order): columns that exist only in train are created in test and filled
# with 0, and columns that exist only in test are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [6]:
# Collect the indicator columns produced by one-hot encoding; they are
# recognised by the "<original column>_" prefix in their names.
encoded_prefixes = ('proto_', 'service_', 'state_')
categorical_encoded_features = [
    column_name
    for column_name in train_df.columns
    if column_name.startswith(encoded_prefixes)
]

# Final model input: numeric columns first, then the one-hot columns in the
# order they appear in the training frame.
features = numerical_features + categorical_encoded_features

In [7]:
# Encode the string class labels as integer ids. The encoder is fitted on the
# training labels only; the test labels are mapped with the same fitted
# encoder so both frames share one label <-> id mapping.
le = LabelEncoder()

train_label_ids = le.fit_transform(train_df[target])
train_df[target] = train_label_ids

test_label_ids = le.transform(test_df[target])
test_df[target] = test_label_ids

In [8]:
# Split each frame into model inputs (X) and encoded labels (y).
#
# The feature matrices are explicit copies: they are modified in place by the
# scaling step, and assigning into a frame obtained via `df[columns]` without
# `.copy()` triggers pandas' SettingWithCopyWarning because pandas cannot tell
# whether the write was meant to reach the parent frame.
X_train = train_df[features].copy()
y_train = train_df[target]
X_test = test_df[features].copy()
y_test = test_df[target]

In [9]:
# Standardize the numeric columns. The scaler is fitted on the training data
# only and then reused, unchanged, to transform the test data.
scaler = StandardScaler()

train_numeric = X_train[numerical_features].copy()
X_train[numerical_features] = scaler.fit_transform(train_numeric)

test_numeric = X_test[numerical_features].copy()
X_test[numerical_features] = scaler.transform(test_numeric)

In [10]:
# Random forest hyperparameters: a fixed seed for reproducible fits, and
# 'balanced' class weights so classes are weighted inversely to their
# frequency in the training labels.
forest_params = {
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [11]:
# Predict a class id for every test row. The model was fitted on
# label-encoded targets, so the predictions are integer ids, not class names.
y_pred = model.predict(X_test)

In [12]:
# Human-readable class names, in the order of their encoded ids.
target_names = [str(class_name) for class_name in le.classes_]

# LabelEncoder maps le.classes_[i] -> i, so the full set of label ids is
# 0..n_classes-1. Passing it explicitly keeps `target_names` aligned with the
# report rows even when some class never occurs in y_test or y_pred; without
# `labels`, the report infers the labels from the data and fails when their
# count differs from len(target_names).
label_ids = list(range(len(target_names)))

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=label_ids, target_names=target_names))

Accuracy:  0.753
                precision    recall  f1-score   support

      Analysis       0.05      0.03      0.04      2000
      Backdoor       0.07      0.14      0.09      1746
           DoS       0.34      0.65      0.45     12264
      Exploits       0.84      0.52      0.65     33393
       Fuzzers       0.71      0.28      0.40     18184
       Generic       0.95      0.98      0.96     40000
        Normal       0.79      0.96      0.87     56000
Reconnaissance       0.91      0.73      0.81     10491
     Shellcode       0.61      0.30      0.40      1133
         Worms       0.71      0.34      0.46       130

      accuracy                           0.75    175341
     macro avg       0.60      0.49      0.51    175341
  weighted avg       0.79      0.75      0.75    175341



In [13]:
# Persist the fitted artifacts so they can be reloaded without retraining.
output_dir = "../tune_files"

# exist_ok=True creates the directory (and any missing parents) if needed and
# is a no-op when it already exists, avoiding the check-then-create race of
# `if not os.path.exists(...)`.
os.makedirs(output_dir, exist_ok=True)

# File name -> fitted object; written in this order.
# NOTE(review): the training column order (`features`) is not saved here;
# code that reloads the model must rebuild the same columns in the same order.
artifacts = {
    "random_forest_model.joblib": model,
    "scaler.joblib": scaler,
    "label_encoder.joblib": le,
}
for file_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, os.path.join(output_dir, file_name))

print("Model, scaler, and label encoder saved successfully.")