In [1]:
!pip install pandas 

import pandas as pd
print("pandas version:", pd.__version__)


pandas version: 2.2.3


In [7]:
!pip install scikit-learn
import sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("scikit-learn version:", sklearn.__version__)

scikit-learn version: 1.6.1


In [12]:
!pip install requests 



In [9]:
!pip install seaborn



In [11]:
!pip install matplotlib
import matplotlib.pyplot as plt

print("matplotlib version:", matplotlib.__version__)



In [10]:
import numpy as np
import requests
import os
import seaborn as sns

print("numpy version:", np.__version__)
print("requests version:", requests.__version__)
print("seaborn version:", sns.__version__)


numpy version: 2.2.6
requests version: 2.32.5
seaborn version: 0.13.2


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from scipy.io import arff

In [2]:
import pandas as pd
data=pd.read_csv("KDDDataset.txt")
df = pd.DataFrame(data)
print(df.head())

   0   tcp   private   REJ    0.1    0.2  0.3  0.4  0.5  0.6  ...  0.04.1  \
0  0   tcp   private   REJ      0      0    0    0    0    0  ...    0.00   
1  2   tcp  ftp_data    SF  12983      0    0    0    0    0  ...    0.61   
2  0  icmp     eco_i    SF     20      0    0    0    0    0  ...    1.00   
3  1   tcp    telnet  RSTO      0     15    0    0    0    0  ...    0.31   
4  0   tcp      http    SF    267  14515    0    0    0    0  ...    1.00   

   0.06.1  0.00.3  0.00.4  0.00.5  0.00.6  1.00.2  1.00.3  neptune  21  
0    0.06    0.00    0.00    0.00     0.0    1.00    1.00  neptune  21  
1    0.04    0.61    0.02    0.00     0.0    0.00    0.00   normal  21  
2    0.00    1.00    0.28    0.00     0.0    0.00    0.00    saint  15  
3    0.17    0.03    0.02    0.00     0.0    0.83    0.71    mscan  11  
4    0.00    0.01    0.03    0.01     0.0    0.00    0.00   normal  21  

[5 rows x 43 columns]


In [3]:

x=df.iloc[:, :-2].copy()      #select all columns except the last 2 (second last is label [normal/attak])
y=df.iloc[:,-2].copy()         #select last 2 columns


In [9]:

from sklearn.preprocessing import LabelEncoder

for col in x.columns:
    if x[col].dtype == 'object':
        x[col]=LabelEncoder().fit_transform(x[col].astype(str))     # Convert all text columns in X into numbers using LabelEncoder


In [10]:
y=y.apply(lambda v:0 if str(v).lower() == "normal" else 1)    # Convert each label in y to 0 if it is "normal", otherwise 1 (attack)

In [11]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train,y_test = train_test_split(       #x=features , y=labels
    x , y ,train_size=0.3 ,random_state=42,stratify=y)  #splitting data into training and test datasets

In [12]:
from sklearn.ensemble import RandomForestClassifier

classify=RandomForestClassifier(n_estimators=100,random_state=42)  #creating Random Forest Classifier Model
classify.fit(x_train,y_train)                                      #Train the model on training data


In [18]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from scipy.io import arff

prediction = classify.predict(x_test)  # Make predictions on the test set

# Create confusion matrix with labels to ensure 2x2 shape
cm = confusion_matrix(y_test, prediction, labels=[0, 1])
print("\n Confusion Matrix:")
print(pd.DataFrame(cm,
                   index=["Actual Normal (0)", "Actual Attack (1)"],
                   columns=["Predicted Normal (0)", "Predicted Attack (1)"]))

# Classification report with zero_division=0 to handle cases with no predicted or true samples gracefully
print("\n Classification Report:")
print(classification_report(y_test, prediction, labels=[0, 1], target_names=["Normal (0)", "Attack (1)"], zero_division=0))



 Confusion Matrix:
                   Predicted Normal (0)  Predicted Attack (1)
Actual Normal (0)                     0                     0
Actual Attack (1)                     0                 15781

 Classification Report:
              precision    recall  f1-score   support

  Normal (0)       0.00      0.00      0.00         0
  Attack (1)       1.00      1.00      1.00     15781

    accuracy                           1.00     15781
   macro avg       0.50      0.50      0.50     15781
weighted avg       1.00      1.00      1.00     15781



In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Load data from CSV file into a pandas DataFrame
# DataFrame: 2D table-like data structure with rows (samples) and columns (features & label)
data = pd.read_csv("KDDDataset.txt")
df = pd.DataFrame(data)
print(df.head())  # Print first 5 rows to inspect data

# Separate features and label columns from DataFrame
# Features: Input variables used by the model to make predictions (independent variables)
# Label (target): The variable to predict (dependent variable, e.g. "normal" or "attack")
x = df.iloc[:, :-1].copy()   # All columns except the last are features
y = df.iloc[:, -1].copy()    # Last column is the label

# Handle missing values in features by replacing with the most frequent value per column
imputer = SimpleImputer(strategy='most_frequent')
x = pd.DataFrame(imputer.fit_transform(x), columns=x.columns)

# Identify categorical columns with text data needing conversion to numbers
cat_cols = [col for col in x.columns if x[col].dtype == 'object']

# Encode categorical features as numeric codes for model compatibility
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()  # Converts text labels to integer codes
    x[col] = le.fit_transform(x[col].astype(str))
    label_encoders[col] = le  # Save encoders if needed for decoding later

# Convert target labels to binary numeric form
# 0 if label equals “normal”, 1 otherwise (indicating some kind of attack)
y = y.apply(lambda v: 0 if str(v).lower() == "normal" else 1)

"""
STRATIFY use:
Keeps balance of normal/attack ratio same in both sets (train and test).
If you split randomly without stratify, you might accidentally put mostly normal 
samples in training and very few attacks in testing.
"""

# Split dataset into training and testing sets
# train_test_split: function to split data
# train_size=0.7 means 70% data for training, 30% for testing
# stratify=y maintains the original class distribution in train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42, stratify=y
)

# Define hyperparameter grid for Random Forest model tuning
param_grid = {
    'n_estimators': [50, 100],          # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of each tree
    'min_samples_split': [2, 5]        # Minimum samples needed to split a node
}

"""
1.classify → variable name for classifier model
2.RandomForestClassifier → ML algorithm that builds many decision trees and combines them to improve accuracy
3.n_estimators=100 → use 100 trees in the forest
4.random_state=42 → same randomness every run for reproducibility
"""

# RandomForestClassifier: Ensemble of decision trees for classification tasks
rfc = RandomForestClassifier(random_state=42)

# GridSearchCV performs cross-validation and searches the best hyperparameters from param_grid
# cv=3 means 3-fold cross-validation
# scoring='f1' optimizes the F1 score metric (harmonic mean of precision and recall)
grid_search = GridSearchCV(rfc, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(x_train, y_train)  # Train models with different parameters and select the best

best_model = grid_search.best_estimator_  # Extract best model after tuning

print(f"Best hyperparameters: {grid_search.best_params_}")

# Make predictions on the test dataset using the trained best model
prediction = best_model.predict(x_test)

# confusion_matrix: Shows number of correct/incorrect predictions split by classes
# Labels: [0, 1] represent Normal (0) and Attack (1) classes
cm = confusion_matrix(y_test, prediction, labels=[0, 1])
print("\nConfusion Matrix:")
print(pd.DataFrame(cm,
                   index=["Actual Normal (0)", "Actual Attack (1)"],
                   columns=["Predicted Normal (0)", "Predicted Attack (1)"]))


"""
Accuracy → overall correctness
Precision → reliability of “attack” alarms
Recall → ability to catch actual attacks
F1-Score → tradeoff between precision & recall
"""

# classification_report: Summary of key classification metrics per class
print("\nClassification Report:")
print(classification_report(y_test, prediction, labels=[0, 1],
                            target_names=["Normal (0)", "Attack (1)"],
                            zero_division=0))  # zero_division=0 avoids divide-by-zero errors


   0   tcp   private   REJ    0.1    0.2  0.3  0.4  0.5  0.6  ...  0.04.1  \
0  0   tcp   private   REJ      0      0    0    0    0    0  ...    0.00   
1  2   tcp  ftp_data    SF  12983      0    0    0    0    0  ...    0.61   
2  0  icmp     eco_i    SF     20      0    0    0    0    0  ...    1.00   
3  1   tcp    telnet  RSTO      0     15    0    0    0    0  ...    0.31   
4  0   tcp      http    SF    267  14515    0    0    0    0  ...    1.00   

   0.06.1  0.00.3  0.00.4  0.00.5  0.00.6  1.00.2  1.00.3  neptune  21  
0    0.06    0.00    0.00    0.00     0.0    1.00    1.00  neptune  21  
1    0.04    0.61    0.02    0.00     0.0    0.00    0.00   normal  21  
2    0.00    1.00    0.28    0.00     0.0    0.00    0.00    saint  15  
3    0.17    0.03    0.02    0.00     0.0    0.83    0.71    mscan  11  
4    0.00    0.01    0.03    0.01     0.0    0.00    0.00   normal  21  

[5 rows x 43 columns]
Best hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimat