#NETWORK INTRUSION DETECTION SYSTEM USING MACHINE LEARNING

IMPORTING NECESSARY LIBRARIES

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

#DATA PREPROCESSING

In [None]:
# Column names applied to the KDD files when they are read: 41 feature
# columns followed by the attack label and a difficulty score.
col_names = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins logged_in
    num_compromised root_shell su_attempted num_root num_file_creations
    num_shells num_access_files num_outbound_cmds is_host_login
    is_guest_login count srv_count serror_rate srv_serror_rate
    rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate
    srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate
    dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate
    dst_host_srv_rerror_rate label difficulty_level
""".split()

LOADING THE DATASET

In [None]:
# Read the train and test files with the explicit column names; since only
# `names` is given, the first line of each file is parsed as data.
df, df_test = (
    pd.read_csv(path, names=col_names)
    for path in ("/content/KDDTrain+.txt", "/content/KDDTest+.txt")
)

EXPLORATORY DATA ANALYSIS

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Report how many columns the training frame has.
print(f"df has {df.shape[1]} columns")

In [None]:
# Report how many column names were supplied, for comparison with the frame.
print(f"col_names has {len(col_names)} elements")

In [None]:
# Show the (rows, columns) shape and the column index of the training frame.
print(df.shape)
print(df.columns)

In [None]:
# (Re)apply the canonical column names to both frames.
for frame in (df, df_test):
    frame.columns = col_names

In [None]:
# Collapse the individual attack names into five integer classes
# (0 = normal, 1-4 = attack groups). Names missing from the lookup map to NaN.
attack_groups = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm', 'worm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
        'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'sqlattack',
        'xterm'],
}
label_map = {
    attack: code
    for code, attacks in attack_groups.items()
    for attack in attacks
}
for frame in (df, df_test):
    frame['label'] = frame['label'].map(label_map)

In [None]:
# Bar chart of sample counts per class, ordered from most to least frequent.
class_order = df['label'].value_counts().index
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='label', order=class_order, palette='Set2')
plt.xlabel("Attack Type")
plt.ylabel("Number of Samples")
plt.title("Class Distribution of Attack Types")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

The dataset is highly imbalanced.

In [None]:
# Notebook shell command: install the package that provides ProfileReport.
!pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build a profiling report for the training frame and write it to train_data.html.
prof = ProfileReport(df)
prof.to_file(output_file='train_data.html')

In [None]:
# Build a profiling report for the test frame and write it to test_data.html.
# Rebinds `prof`, replacing the training report object.
prof = ProfileReport(df_test)
prof.to_file(output_file='test_data.html')

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
df.info()

In [None]:
# Summarise the test frame's columns, dtypes and non-null counts.
df_test.info()

In [None]:
# Count missing values per column in the training frame.
df.isnull().sum()

No missing values in training dataset.

In [None]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

No missing values in testing dataset.

In [None]:
# Descriptive statistics for the training frame.
df.describe()

In [None]:
# Descriptive statistics for the test frame.
df_test.describe()

In [None]:
# Print the training label column for inspection.
print(df['label'])

#FEATURE ENGINEERING

In [None]:
# Encode attack names as five integer classes via a name -> class lookup
# built per attack family.
# NOTE(review): if the earlier label-mapping cell has already run, the column
# is numeric and this replace leaves it unchanged; only `df` is touched here.
attack_families = {
    0: ['normal'],
    # DoS (Denial of Service)
    1: ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
        'mailbomb', 'apache2', 'processtable', 'udpstorm', 'worm'],
    # Probe (Surveillance/Scanning)
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    # R2L (Remote to Local)
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf',
        'spy', 'warezclient', 'warezmaster', 'sendmail', 'named',
        'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    # U2R (User to Root)
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit',
        'ps', 'sqlattack', 'xterm'],
}
name_to_class = {
    name: cls
    for cls, names in attack_families.items()
    for name in names
}
df['label'] = df['label'].replace(name_to_class)

In [None]:
# Pairwise correlations over the int64/float64 columns, drawn as an
# annotated heatmap.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_df.corr()

heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', square=True, cbar=True)
plt.figure(figsize=(20, 17))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

The correlation heatmap helps us understand the relationships between the features and identify patterns in the dataset.

In [None]:
# One-hot encode categorical features
# The three string-valued columns to encode; the encoder returns a dense
# array and is configured to ignore categories it did not see during fit.
cat_cols = ['protocol_type', 'service', 'flag']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Fit the encoder on the training categoricals and wrap the result in a
# frame labelled with the generated one-hot column names.
train_onehot = encoder.fit_transform(df[cat_cols])
df_cat = pd.DataFrame(train_onehot, columns=encoder.get_feature_names_out(cat_cols))

In [None]:
# Encode the test categoricals with the already-fitted encoder (no refit)
# and label the columns with the same one-hot names.
test_onehot = encoder.transform(df_test[cat_cols])
df_test_cat = pd.DataFrame(test_onehot, columns=encoder.get_feature_names_out(cat_cols))

In [None]:
# Align test set columns with training
# NOTE(review): both frames take their column names from the same fitted
# encoder, so this outer join looks like a no-op safeguard — confirm.
df_cat, df_test_cat = df_cat.align(df_test_cat, join='outer', axis=1, fill_value=0)

In [None]:
# Remove the raw categorical columns from both frames and reset to a fresh
# 0..n-1 index so they can be joined column-wise with the encoded frames.
df_final, df_test_final = (
    frame.drop(columns=cat_cols).reset_index(drop=True)
    for frame in (df, df_test)
)

In [None]:
# Append the one-hot columns to the right of the remaining columns.
df_final = pd.concat((df_final, df_cat), axis='columns')
df_test_final = pd.concat((df_test_final, df_test_cat), axis='columns')

In [None]:
# Split the merged training frame into the target column and the feature matrix.
# NOTE(review): 'difficulty_level' is still among the feature columns here —
# confirm that is intended.
y = df_final['label']
X = df_final.drop(columns='label')

In [None]:
# Same target/feature split for the merged test frame.
y_test = df_test_final['label']
X_test = df_test_final.drop(columns='label')

In [None]:
# Cast every feature column to float64 ahead of resampling and scaling.
X, X_test = (features.astype(np.float64) for features in (X, X_test))

 HANDLING IMBALANCED DATASET

In [None]:
# Applying ADASYN to balance classes
# Oversamples the full training feature matrix with ADASYN's default
# settings; no random_state is passed, so results may differ between runs.
adasyn = ADASYN()
X_res, y_res = adasyn.fit_resample(X, y)

In [None]:
# Feature scaling
# Learn mean/variance from the resampled training data only, then apply those
# same statistics to the test set. Calling fit_transform on the test set would
# standardise it with its own statistics and overwrite the parameters held in
# `scaler`, which is pickled later in the notebook.
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Re-create and re-fit a one-hot encoder on the training categoricals to
# obtain the encoded column names; this rebinds `encoder`.
# NOTE(review): X_encoded is not referenced again in the visible cells.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(df[cat_cols])
encoded_feature_names = encoder.get_feature_names_out(cat_cols)

In [None]:
import numpy as np
# Label the scaled matrix with names in the same order as the columns of X:
# the non-categorical columns come first (what remains after dropping
# cat_cols and 'label'), and the one-hot columns were concatenated to their
# right. Listing the encoded names first would attach every name to the
# wrong column.
numeric_data = df.drop(columns=cat_cols + ['label'])
numeric_feature_names = numeric_data.columns
all_feature_names = list(numeric_feature_names) + list(encoded_feature_names)
X_df = pd.DataFrame(X_res_scaled, columns=all_feature_names)

In [None]:
# Preview the first rows of the scaled, resampled feature frame.
X_df.head()

Discretization

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Replace each int64/float64 column of X with its index among 5 equal-width
# bins; the result goes into a copy so X itself is left unchanged.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
binned_values = discretizer.fit_transform(X[numeric_cols])
X_discretized = X.copy()
X_discretized[numeric_cols] = binned_values

Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Keep the 13 discretized features that score highest on mutual information
# with the label, and report their names.
selector = SelectKBest(score_func=mutual_info_classif, k=13)
selector.fit(X_discretized, y)
X_selected = selector.transform(X_discretized)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
print("Selected Features:", selected_feature_names.tolist())

Handling Class imbalance Using Adasyn

In [None]:
# Applying ADASYN to balance classes
# Oversamples the discretized, feature-selected training matrix; rebinds
# `adasyn` to a fresh default-configured instance (no random_state passed).
adasyn = ADASYN()
X_ress, y_ress = adasyn.fit_resample(X_selected, y)

#TRAINING AND EVALUATION OF MODELS

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN

# Candidate classifiers, all with library-default hyperparameters apart from
# the XGBoost evaluation metric.
models = {
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
}

# Hold out 20% of the original (discretized, feature-selected) samples first,
# stratified so every class appears in both parts, and only then oversample
# the training part. Splitting an already-oversampled set puts synthetic
# samples, interpolated from training neighbours, into the evaluation data
# and inflates every score computed on it.
# NOTE(review): this rebinds X_test / y_test, shadowing the frames derived
# from the separate test file earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Fit every candidate on the training split, keep its held-out predictions
# by model name, and print its accuracy. The loop variables `model` and
# `y_pred` stay bound to the last entry afterwards; later cells read them.
y_preds = {}
for name, model in models.items():
    y_pred = y_preds[name] = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {score:.4f}")

In [None]:
import joblib
# Save your model after training
# Persist the XGBoost classifier by name rather than through the training
# loop's leftover `model` variable, which only points at it because XGBoost
# is the last entry in `models`.
joblib.dump(models["XGBoost"], 'model.pkl')
# NOTE(review): the classifiers in `models` are fitted on discretized,
# feature-selected data rather than on this scaler's output — confirm the
# scaler is the preprocessing artefact inference actually needs.
joblib.dump(scaler, 'scaler.pkl')

RANDOM FOREST

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the Random Forest predictions.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_preds["Random Forest"]))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix of true vs. predicted classes for Random Forest.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds["Random Forest"], cmap="Blues")

KNN

In [None]:
# Per-class precision/recall/F1 for the KNN predictions.
print("KNN Classification Report:")
print(classification_report(y_test, y_preds["KNN"]))

In [None]:
# Confusion matrix of true vs. predicted classes for KNN.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds["KNN"], cmap="Blues")

NAIVE BAYES

In [None]:
# Per-class precision/recall/F1 for the Naive Bayes predictions.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_preds["Naive Bayes"]))

In [None]:
# Confusion matrix of true vs. predicted classes for Naive Bayes.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds["Naive Bayes"], cmap="Blues")

DECISION TREE

In [None]:
# Per-class precision/recall/F1 for the Decision Tree predictions.
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_preds["Decision Tree"]))

In [None]:
# Confusion matrix of true vs. predicted classes for Decision Tree.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds["Decision Tree"], cmap="Blues")

XGBOOST

In [None]:
# Per-class precision/recall/F1 for the XGBoost predictions, looked up by
# model name like the other report cells. The bare `y_pred` variable is
# whatever a loop last left in it, so it is not a reliable source.
print("XGBoost Classification Report:")
print(classification_report(y_test, y_preds["XGBoost"]))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix of true vs. predicted classes for XGBoost, using the
# predictions stored under its name rather than the loop-leftover `y_pred`.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds["XGBoost"], cmap="Blues")

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out split for each model, keyed by model name.
accuracies = {
    model_name: accuracy_score(y_test, predictions)
    for model_name, predictions in y_preds.items()
}

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing model accuracies, with each value printed above its bar.
plt.figure(figsize=(10, 6))
plt.bar(accuracies.keys(), accuracies.values(), color='skyblue')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.ylim(0, 1.0)
plt.xticks(rotation=45)
plt.grid(axis='y')
for position, score in enumerate(accuracies.values()):
    # Place the label slightly above the top of the bar.
    plt.text(position, score + 0.01, f"{score:.2f}", ha='center', fontweight='bold')
plt.tight_layout()
plt.show()

BEST MODEL: XGBOOST