In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score

In [None]:
# Step 1: Load the Dataset
# Read the raw attack records into a DataFrame. The path is relative, so the
# CSV file must sit in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='cybersecurity_attacks.csv')


In [None]:
# Step 2: Data Cleaning
# Builds `cleaned_data`, a fully numeric frame without missing values, from the
# raw `data` frame. `data` itself is never modified, so this cell can be re-run.

# Drop columns with more than 50% missing values.
# `thresh` is the minimum number of non-missing values a column needs in order
# to be kept; ceil(len / 2) is computed with integer arithmetic so that an int
# is passed, as dropna documents.
threshold = (len(data) + 1) // 2
cleaned_data = data.dropna(thresh=threshold, axis=1)

# Drop irrelevant columns. errors='ignore' tolerates names that are absent,
# e.g. because the missing-value filter above already removed them.
irrelevant_columns = ['Payload Data', 'User Information', 'Device Information', 'Timestamp',
                      'Source IP Address', 'Destination IP Address']
cleaned_data = cleaned_data.drop(columns=irrelevant_columns, errors='ignore')

# Check for non-numeric columns.
# Selecting "everything that is not a number" (rather than only `object`
# columns) also catches string-, category- and datetime-typed columns, which
# would otherwise reach median() and the scaler unencoded.
non_numeric_columns = cleaned_data.select_dtypes(exclude=['number']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# Encode all non-numeric columns using LabelEncoder.
# Missing entries are deliberately kept as their own category, labelled "nan",
# instead of being imputed: the median imputation below only ever affects the
# columns that were numeric to begin with.
for column in non_numeric_columns:
    le = LabelEncoder()
    raw_values = cleaned_data[column]
    as_text = raw_values.astype(object).where(raw_values.notna(), 'nan').astype(str)
    cleaned_data[column] = le.fit_transform(as_text)

# Impute missing values in the numeric columns with the column median
cleaned_data = cleaned_data.fillna(cleaned_data.median())

# Verify the dataset is fully numeric and complete
print("Data types after encoding:")
print(cleaned_data.dtypes)
assert not cleaned_data.isna().any().any(), "cleaned_data still contains missing values"

In [None]:
# Step 3: Standardize Data
# Fit the scaler on the cleaned features, then rescale those same features.
# The fitted `scaler` is kept so the identical transformation can be reapplied.
scaler = StandardScaler().fit(cleaned_data)
scaled_data = scaler.transform(cleaned_data)


In [None]:
# Step 4: Dimensionality Reduction for Visualization
# Project the standardized features onto their first two principal components
# so that cluster assignments can be drawn on a 2-D scatter plot.
# random_state pins the projection for the cases where scikit-learn selects a
# randomized SVD solver; the seed matches the one used for K-Means.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(scaled_data)


In [None]:
# Scatter plot of the two principal components, before any cluster colouring
pc1, pc2 = reduced_data[:, 0], reduced_data[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(pc1, pc2, alpha=0.5)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Reduced Data")
plt.show()

In [None]:
# Step 5: K-Means Clustering
# Partition the standardized data into three clusters.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones); pinning it keeps the
# result the same across versions and avoids the FutureWarning some emit.
# random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_data)

In [None]:
# Draw the K-Means assignment on the PCA projection, one colour per cluster label
pc1, pc2 = reduced_data[:, 0], reduced_data[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(pc1, pc2, c=kmeans_labels, alpha=0.7, cmap='viridis')
plt.colorbar(label="Cluster Label")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clustering Visualization")
plt.show()

In [None]:
# Evaluate K-Means with Silhouette Score
# The score is computed on the standardized features (not the 2-D projection)
# and reported rounded to two decimals.
kmeans_silhouette = silhouette_score(X=scaled_data, labels=kmeans_labels)
print(f"K-Means Silhouette Score: {kmeans_silhouette:.2f}")

In [None]:
# Step 6: DBSCAN Clustering
# Density-based clustering on the standardized features; the fitted estimator
# exposes one label per row through `labels_`.
dbscan = DBSCAN(eps=2, min_samples=5)
dbscan.fit(scaled_data)
dbscan_labels = dbscan.labels_

# Draw the DBSCAN assignment on the PCA projection
pc1, pc2 = reduced_data[:, 0], reduced_data[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(pc1, pc2, c=dbscan_labels, alpha=0.7, cmap='plasma')
plt.colorbar(label="Cluster Label")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("DBSCAN Clustering Visualization")
plt.show()

In [None]:
# Evaluate DBSCAN Clustering
# Count the distinct labels, leaving out -1, which this notebook treats as the
# marker for noise points rather than a cluster.
distinct_labels = set(dbscan_labels)
distinct_labels.discard(-1)
unique_labels = len(distinct_labels)
print(f"DBSCAN identified {unique_labels} clusters (excluding noise points).")


In [None]:
# Step 7: Hierarchical Clustering
# Build the Ward linkage over every row of the standardized data.
# NOTE(review): linkage presumably needs all pairwise distances, so memory use
# would grow quadratically with the row count - confirm this is acceptable for
# the full dataset, or cluster a sample instead.
linkage_matrix = linkage(scaled_data, 'ward')

# Draw the dendrogram, truncated to the top 5 levels of the merge tree
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix, p=5, truncate_mode="level")
plt.xlabel("Data Points")
plt.ylabel("Euclidean Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

In [None]:
# Step 8: Assign Clusters for Analysis
# Attach both label vectors to `cleaned_data` and plot the pairwise feature
# relationships, coloured by K-Means cluster.
#
# The feature list is captured by name, excluding the two label columns, so
# that (a) the pair plot shows real features only and does not draw the DBSCAN
# labels as if they were a feature, and (b) re-running this cell gives the same
# plot.
cluster_columns = ['KMeans Cluster', 'DBSCAN Cluster']
feature_columns = [col for col in cleaned_data.columns if col not in cluster_columns]

# NOTE: these assignments add columns to `cleaned_data` in place. Re-run Step 2
# before re-running Step 3, otherwise the labels would be scaled and clustered
# as if they were features.
cleaned_data['KMeans Cluster'] = kmeans_labels
cleaned_data['DBSCAN Cluster'] = dbscan_labels

# Visualize the dataset with clusters
sns.pairplot(cleaned_data, vars=feature_columns, hue='KMeans Cluster', diag_kind='kde')
plt.show()