In [1]:
# =========================================================
# 1. Import Library
# =========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# =========================================================
# 2. Load Dataset
# =========================================================
df = pd.read_csv("data/Wednesday-workingHours.pcap_ISCX.csv")
print("Data shape:", df.shape)

Data shape: (692703, 79)


In [3]:
# =========================================================
# 3. Pisahkan fitur dan label
# =========================================================
kolom_drop = [
    "Flow ID", "Source IP", "Destination IP",
    "Source Port", "Destination Port", "Timestamp"
]

for kol in kolom_drop:
    if kol in df.columns:
        df.drop(columns=kol, inplace=True)

# Simpan label asli
if "Label" in df.columns:
    labels_true = df["Label"].copy()
    df.drop(columns=["Label"], inplace=True)
else:
    labels_true = None

# Hapus baris kosong
df.dropna(inplace=True)

In [4]:
# =========================================================
# 4. Normalisasi SELURUH DATA (baru nanti di-sampling)
# =========================================================
X = df.select_dtypes(include=[np.number])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =========================================================
# 5. Sampling maksimum 10.000 data setelah normalisasi
# =========================================================
sample_size = min(10000, len(X_scaled))
sample_indices = np.random.choice(len(X_scaled), sample_size, replace=False)

X_sample = X_scaled[sample_indices]
if labels_true is not None:
    y_sample = labels_true.iloc[sample_indices].reset_index(drop=True)
else:
    y_sample = None

print(f"Jumlah data diambil: {len(X_sample)} dari total {len(X_scaled)}")

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# =========================================================
# 6. Reduksi dimensi untuk visualisasi
# =========================================================
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_sample)

In [None]:
# =========================================================
# 7. KMeans Clustering
# =========================================================
kmeans = KMeans(n_clusters=2, random_state=42)
labels_kmeans = kmeans.fit_predict(X_sample)

sil_k = silhouette_score(X_sample, labels_kmeans)
db_k = davies_bouldin_score(X_sample, labels_kmeans)

print("\n=== Evaluasi KMeans ===")
print(f"Silhouette Score : {sil_k:.4f}")
print(f"Davies-Bouldin Index : {db_k:.4f}")

In [None]:
# =========================================================
# 8. DBSCAN Clustering
# =========================================================
dbscan = DBSCAN(eps=1.5, min_samples=10)
labels_dbscan = dbscan.fit_predict(X_sample)

unique_db = np.unique(labels_dbscan)
if len(unique_db) > 1:
    sil_db = silhouette_score(X_sample, labels_dbscan)
    db_db = davies_bouldin_score(X_sample, labels_dbscan)
else:
    sil_db, db_db = None, None

print("\n=== Evaluasi DBSCAN ===")
print(f"Cluster unik : {unique_db}")
if sil_db is not None:
    print(f"Silhouette Score : {sil_db:.4f}")
    print(f"Davies-Bouldin Index : {db_db:.4f}")
else:
    print("DBSCAN menghasilkan 1 cluster (tidak bisa dihitung silhouette).")

In [None]:
# =========================================================
# 9. Visualisasi hasil clustering
# =========================================================
def plot_cluster(X_pca, labels, title):
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=labels, palette="tab10", s=15)
    plt.title(title)
    plt.show()

plot_cluster(X_pca, labels_kmeans, "KMeans Clustering (PCA 2D)")
plot_cluster(X_pca, labels_dbscan, "DBSCAN Clustering (PCA 2D)")

In [None]:
# =========================================================
# 10. Analisis perbandingan terhadap label asli
# =========================================================
if y_sample is not None:
    result = pd.DataFrame({
        "Label_True": y_sample.values,
        "KMeans": labels_kmeans,
        "DBSCAN": labels_dbscan
    })

    print("\n=== Distribusi Cluster KMeans vs Label Asli ===")
    print(result.groupby("KMeans")["Label_True"].value_counts(), "\n")

    print("=== Distribusi Cluster DBSCAN vs Label Asli ===")
    print(result.groupby("DBSCAN")["Label_True"].value_counts(), "\n")

In [None]:
# =========================================================
# 11. Analisis fitur pembeda antar cluster (KMeans)
# =========================================================
df_features = pd.DataFrame(X_sample, columns=X.columns)
df_features["Cluster_KMeans"] = labels_kmeans

cluster_means = df_features.groupby("Cluster_KMeans").mean()
diff = cluster_means.T
if 0 in diff.columns and 1 in diff.columns:
    diff["Difference"] = abs(diff[0] - diff[1])
else:
    diff["Difference"] = diff.max(axis=1) - diff.min(axis=1)

print("=== Fitur yang Paling Membedakan antar Cluster (KMeans) ===")
print(diff.sort_values("Difference", ascending=False).head(10))

# =========================================================
# 12. Analisis Noise DBSCAN
# =========================================================
noise_ratio = np.sum(labels_dbscan == -1) / len(labels_dbscan)
print(f"\n=== Analisis Noise DBSCAN ===")
print(f"Jumlah data noise: {(labels_dbscan == -1).sum()} dari {len(labels_dbscan)} "
      f"({noise_ratio*100:.2f}%)")

if y_sample is not None:
    print(result[result["DBSCAN"] == -1]["Label_True"].value_counts())

In [None]:
# =========================================================
# 13. Kesimpulan Otomatis
# =========================================================
print("\n================== KESIMPULAN ==================")

# a. Apakah cluster memisahkan normal vs serangan?
if y_sample is not None:
    distribusi = result.groupby("KMeans")["Label_True"].value_counts(normalize=True).unstack().fillna(0)
    print("\n>> Apakah cluster memisahkan normal vs serangan?")
    print(distribusi)
    if (distribusi.max().max() > 0.8):
        print("✅ Cluster KMeans cukup baik memisahkan normal dan serangan.")
    else:
        print("⚠️  Cluster KMeans belum memisahkan dengan baik.")

# b. Apakah DBSCAN menemukan noise menarik?
print("\n>> Apakah DBSCAN menemukan noise menarik?")
if noise_ratio > 0.05:
    print(f"✅ Ya, sekitar {noise_ratio*100:.2f}% data terdeteksi sebagai noise (kemungkinan anomali).")
else:
    print(f"⚠️  Tidak banyak noise yang terdeteksi (hanya {noise_ratio*100:.2f}%).")

# c. Fitur pembeda
print("\n>> Fitur utama yang membedakan cluster:")
print(diff.sort_values("Difference", ascending=False).head(5))

# d. Apakah clustering membantu eksplorasi?
print("\n>> Apakah clustering cukup membantu?")
if sil_k > 0.3 or (sil_db is not None and sil_db > 0.3):
    print("✅ Ya, clustering membantu memahami pola trafik (indikasi cluster cukup terpisah).")
else:
    print("⚠️  Clustering belum terlalu efektif untuk memisahkan pola trafik secara jelas.")