## Step 1: Load Libraries and Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

# Read the sample log records from CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="log_sample.csv")
# Preview the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)


## Step 2: Data Preprocessing

In [None]:
# Work on a copy so the raw DataFrame `df` stays untouched.
df_encoded = df.copy()

# Encode each categorical feature with its OWN LabelEncoder. Re-fitting a
# single encoder on a second column overwrites the classes learned for the
# first one, which makes it impossible to inverse-transform that column or
# to encode new data consistently later on.
encoders = {}
for col in ("protocol", "flags"):
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on 'flags', matching the state the
# previous single-encoder version left behind.

# Binary target: 0 = benign, 1 = DoS.
# NOTE: Series.map with a dict yields NaN for any value that is not a key,
# so labels other than exactly 'benign' / 'DoS' become NaN here.
df_encoded['label'] = df_encoded['label'].map({'benign': 0, 'DoS': 1})

# Drop timestamp and IPs (not useful in clustering/classification for now)
df_encoded = df_encoded.drop(columns=["timestamp", "src_ip", "dst_ip"])
df_encoded.head()


## Step 3: Clustering with K-Means

In [None]:
# Cluster on every column except the ground-truth label.
cluster_features = df_encoded.drop(columns=["label"])

# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_features)

# Fit a two-cluster K-Means model and store each row's assigned cluster.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)
df_encoded["cluster_kmeans"] = kmeans.labels_

# Cross-tabulate the true labels against the cluster assignments.
pd.crosstab(
    index=df_encoded['label'],
    columns=df_encoded['cluster_kmeans'],
    rownames=['Actual'],
    colnames=['Cluster'],
)


## Step 4: Classification with Random Forest

In [None]:
# Features: everything except the target and the K-Means cluster assignment.
X = df_encoded.drop(columns=["label", "cluster_kmeans"])
y = df_encoded["label"]

# Hold out 30% for testing. Stratifying on the label keeps the class ratio the
# same in both splits; without it a skewed dataset can leave one split with a
# single class, which makes ROC AUC undefined and the metrics unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Second probability column = score for the second class in clf.classes_,
# used as the ranking score for ROC AUC.
y_score = clf.predict_proba(X_test)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_score))
