# Data Loading

In [20]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

In [7]:
path = kagglehub.dataset_download("solarmainframe/ids-intrusion-csv")

In [8]:
df = pd.read_csv(f"{path}/02-14-2018.csv")
print("Shape:", df.shape)
df.head()

Shape: (1048575, 80)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


# Preprocessing

In [21]:
print("Unique Labels:", df['Label'].unique())
print("\nLabel Distribution:\n", df['Label'].value_counts())

Unique Labels: ['Benign' 'FTP-BruteForce' 'SSH-Bruteforce']

Label Distribution:
 Label
Benign            667626
FTP-BruteForce    193360
SSH-Bruteforce    187589
Name: count, dtype: int64


In [22]:
print("Number of rows:", len(df))
print("Number of columns:", len(df.columns))

Number of rows: 1048575
Number of columns: 81


In [23]:
df.columns

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [24]:
print("Missing values per column:")
print(df.isnull().sum())
print("\nTotal missing:", df.isnull().sum().sum())

Missing values per column:
Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
y                0
Length: 81, dtype: int64

Total missing: 0


In [25]:
# Fill numeric missing values with median
df = df.fillna(df.median(numeric_only=True))
print("Missing after fill:", df.isnull().sum().sum())

Missing after fill: 0


In [26]:
drop_cols = ['Timestamp', 'Dst Port', 'Protocol', 'Label']
X = df.drop(columns=drop_cols)
y = df['Label']

In [27]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Class Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

Class Mapping: {'Benign': np.int64(0), 'FTP-BruteForce': np.int64(1), 'SSH-Bruteforce': np.int64(2)}


In [29]:
import numpy as np

X = X.replace([np.inf, -np.inf], np.nan)

In [30]:
# Fill NaN with column median (safer than mean for skewed traffic features)
X = X.fillna(X.median(numeric_only=True))

# Scaling

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Feature matrix shape:", X_scaled.shape)

Feature matrix shape: (1048575, 77)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (838860, 77) Test: (209715, 77)


# Training

In [33]:
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Confusion Matrix:
 [[133525      0      0]
 [     0  38672      0]
 [     0      2  37516]]

Classification Report:
                 precision    recall  f1-score   support

        Benign       1.00      1.00      1.00    133525
FTP-BruteForce       1.00      1.00      1.00     38672
SSH-Bruteforce       1.00      1.00      1.00     37518

      accuracy                           1.00    209715
     macro avg       1.00      1.00      1.00    209715
  weighted avg       1.00      1.00      1.00    209715



In [47]:
from sklearn.metrics import accuracy_score

# Training predictions
y_train_pred = clf.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred) * 100

# Testing predictions
y_test_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Training Accuracy: {train_acc:.2f}%")
print(f"Testing Accuracy: {test_acc:.2f}%")

Training Accuracy: 100.00%
Testing Accuracy: 100.00%


# Saving

In [35]:
joblib.dump(clf, "ids_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

# Testing

In [37]:
import joblib
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Load saved model & preprocessing
clf = joblib.load("ids_model.pkl")
scaler = joblib.load("scaler.pkl")
le = joblib.load("label_encoder.pkl")

In [41]:
df = pd.read_csv("/content/df_sample_1000.csv")

# Drop same columns as training
drop_cols = ['Timestamp', 'Dst Port', 'Protocol', 'Label']
X = df.drop(columns=drop_cols)
y = df['Label']

In [42]:
import numpy as np
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))

In [43]:
# Encode labels
y_encoded = le.transform(y)

# Scale features
X_scaled = scaler.transform(X)

In [46]:
# Predict on full dataset (or you can do on test split)
y_pred = clf.predict(X_scaled)

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_encoded, y_pred))

# Classification Report
print("\nClassification Report:\n",
      classification_report(y_encoded, y_pred, target_names=le.classes_, labels=np.unique(y_encoded)))

Confusion Matrix:
 [[ 95   0]
 [  0 905]]

Classification Report:
                 precision    recall  f1-score   support

        Benign       1.00      1.00      1.00        95
FTP-BruteForce       1.00      1.00      1.00       905

      accuracy                           1.00      1000
     macro avg       1.00      1.00      1.00      1000
  weighted avg       1.00      1.00      1.00      1000



