In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load the flow table (feature columns plus the 'Label' column) from parquet.
gm = pd.read_parquet(path='gmgm.parquet')

In [None]:
# Lift pandas' display truncation so the complete dtype listing is printed.
# NOTE: these options are global and remain active for every later cell.
_display_options = {
    'display.max_rows': None,     # show all rows (also affects .head())
    'display.max_columns': None,  # show all columns
    'display.width': 1000,
}
for _option, _value in _display_options.items():
    pd.set_option(_option, _value)

# Frame dimensions first, then the dtype of every column.
print(gm.shape, gm.dtypes, sep='\n')

(17143447, 63)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32

In [None]:
# Shuffle all rows with a fixed seed; ignore_index=True gives the shuffled
# frame a fresh 0..n-1 index.
gm = gm.sample(frac=1, random_state=42, ignore_index=True)
print('done')

done


In [None]:
# Split the frame into the feature matrix X (everything except 'Label')
# and the target vector y (the 'Label' column).
X = gm.drop(columns='Label')
y = gm['Label']
print('done')

done


In [None]:
# Hold out 20% of the rows as a temporary pool (divided into validation and
# test in the next cell); the remaining 80% form the training set.
# Stratifying on y keeps the class proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Cut the temporary pool in half: one half for validation, one half for test
# (10% of the full data each), again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, shuffle=True, stratify=y_temp, random_state=42
)

In [None]:
# Log-transform every split with log1p, i.e. log(1 + x).
# WARNING: the splits are overwritten in place, so do not run this cell twice
# without re-running the split cells first.
# NOTE(review): log1p returns NaN below -1 and -inf at -1; this assumes every
# feature is non-negative — TODO confirm.
X_train = np.log1p(X_train)
# The result must be written back to X_val: the scaling and PCA cells read
# X_val, so storing it under a different name would leave the validation split
# un-logged while train and test are logged.
X_val = np.log1p(X_val)
X_validation = X_val  # alias kept for any code that still reads this name
X_test = np.log1p(X_test)
print('log transform done')

log transform done


In [None]:
# Standardise (z-score) rather than min-max normalise, to better preserve the
# shape of the distributions for anomaly detection. The statistics are learned
# from the training split only and then reused for validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = (scaler.transform(split) for split in (X_val, X_test))
print("standardscaler done")

standardscaler done


In [None]:
# PCA keeping as many components as needed to explain 95% of the variance.
# The projection is fitted on the training split and applied unchanged to the
# validation and test splits.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_val_pca, X_test_pca = (pca.transform(split) for split in (X_val, X_test))
print("pca done")

pca done


In [None]:
############## TODO: uncomment this cell later
# import joblib

# # This saves the math used to scale and shrink your data
# joblib.dump(scaler, 'scaler_brain.joblib')
# joblib.dump(pca, 'pca_brain.joblib')

# print("Step 1 Done: Scaler and PCA saved!")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time

# Candidate classifiers, all trained on the PCA-reduced features.
# - class_weight='balanced' reweights samples inversely to class frequency.
# - random_state=42 pins the randomised estimators (same seed as the splits)
#   so that reruns produce comparable reports.
# - n_jobs=-1 means "use all CPU cores"; it is only set on the forest, where
#   the individual trees are built in parallel.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', max_depth=20, random_state=42),
    "Random Forest": RandomForestClassifier(
        class_weight='balanced', n_estimators=100, max_depth=20, n_jobs=-1, random_state=42
    ),
}

# Fitted estimators keyed by display name, kept for later comparison/saving.
results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")

    # Train on the full PCA data; time the fit on its own so the reported
    # training time does not silently include prediction.
    fit_start = time.perf_counter()
    model.fit(X_train_pca, y_train)
    fit_minutes = (time.perf_counter() - fit_start) / 60
    print(f"Training Time: {fit_minutes:.2f} minutes")

    # Store the model as soon as it is fitted, so it survives an interrupted
    # prediction/report step.
    results[name] = model

    # Predict on the validation split (timed separately).
    predict_start = time.perf_counter()
    y_pred = model.predict(X_val_pca)
    predict_minutes = (time.perf_counter() - predict_start) / 60
    print(f"Prediction Time: {predict_minutes:.2f} minutes")

    # Per-class precision / recall / F1 on the validation split.
    print(classification_report(y_val, y_pred))

--- Training Logistic Regression ---
Training Time: 0.46 minutes
              precision    recall  f1-score   support

           0       0.52      0.25      0.34    438449
           1       0.78      0.92      0.84   1275896

    accuracy                           0.75   1714345
   macro avg       0.65      0.59      0.59   1714345
weighted avg       0.71      0.75      0.72   1714345

--- Training Decision Tree ---


In [None]:
############## TODO: uncomment this cell later
# import joblib

# # This saves the 3 models created in the loop above
# # We loop through the 'results' dictionary you made
# if 'results' in locals():
#     for name, model in results.items():
#         # Replaces spaces with underscores (e.g. "Random Forest" -> "Random_Forest.joblib")
#         filename = name.replace(" ", "_") + ".joblib"
#         joblib.dump(model, filename)
#         print(f"Saved: {filename}")
# else:
#     print("Error: Could not find the 'results' dictionary. Did you run the cell above?")

# print("Step 2 Done: Main models saved!")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# 1. "Mini" training set for the slow models: the first 50,000 rows of the
# PCA-reduced training data together with their labels.
X_train_small, y_train_small = X_train_pca[:50000], y_train.iloc[:50000]

slow_models = {
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "SVM (RBF)": SVC(class_weight='balanced', kernel='rbf', cache_size=2000),
}

for name, model in slow_models.items():
    print(f"--- Training {name} (On Subset) ---")
    start = time.time()

    # Fit on the small subset only.
    model.fit(X_train_small, y_train_small)

    # Score against the complete validation split to see real performance.
    print("Predicting... (This part might be slow for KNN)")
    y_pred = model.predict(X_val_pca)

    # The reported time covers fitting and prediction together.
    elapsed_minutes = (time.time() - start) / 60
    print(f"Time: {elapsed_minutes:.2f} minutes")
    print(classification_report(y_val, y_pred))

In [None]:
from sklearn import metrics

# Accuracy log (in percent) shared by the baseline-classifier cells; it is
# created on first use so this cell also runs on a fresh kernel.
Classifier_accuracy = globals().get('Classifier_accuracy', [])

# Baseline KNN with default hyper-parameters, fitted on the scaled (non-PCA)
# training features and scored on the held-out test split.
# n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
# NOTE(review): this uses the complete training split, so prediction may be
# very slow — consider the subset approach used for the "slow models" cell.
knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
Classifier_accuracy.append(accuracy*100)
print("Accuracy of KNN Classifier : %.2f" % (accuracy*100))

In [None]:
############## TODO: uncomment this cell later
# import joblib

# # This saves the KNN model
# joblib.dump(knn_clf, 'knn_model.joblib')

# print("Step 3 Done: KNN model saved!")

In [None]:
from sklearn import metrics

# Accuracy log (in percent) shared by the baseline-classifier cells; it is
# created on first use so this cell also runs on a fresh kernel.
Classifier_accuracy = globals().get('Classifier_accuracy', [])

# Baseline SVM with default hyper-parameters, fitted on the scaled (non-PCA)
# training features and scored on the held-out test split.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples; on the complete training split this is unlikely to finish in
# reasonable time — consider fitting on a subset as in the "slow models" cell.
svc_clf = SVC()
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
Classifier_accuracy.append(accuracy*100)
print("Accuracy of SVM Classifier : %.2f" % (accuracy*100) )

In [None]:
############## TODO: uncomment this cell later
# import joblib

# # This saves the SVM model
# joblib.dump(svc_clf, 'svm_model.joblib')

# print("Step 4 Done: SVM model saved!")

In [None]:
# `keras` must be the Keras sub-package: the TensorFlow root module has no
# `Sequential`, so aliasing `tensorflow` itself as `keras` fails at model build.
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import class_weight

# One batch size for both training and inference; a large batch keeps the
# number of steps manageable on a data set of this size.
DNN_BATCH_SIZE = 2048

# 1. Calculate Class Weights for DNN
# (Since we didn't undersample, the network must be told about the imbalance.)
dnn_classes = np.unique(y_train)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=dnn_classes,
    y=y_train
)
# Keras expects a {class_index: weight} mapping with plain Python numbers.
# NOTE(review): assumes the labels are the integers 0 and 1 — TODO confirm.
dnn_weights = {int(label): float(weight) for label, weight in zip(dnn_classes, weights)}
print(f"DNN Class Weights: {dnn_weights}")

# 2. Build the Model
model_dnn = keras.Sequential([
    # Input: one value per retained PCA component.
    keras.Input(shape=(X_train_pca.shape[1],)),

    # Hidden Layers
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),  # Prevents overfitting
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),

    # Output Layer: 1 Neuron, Sigmoid (for Binary 0/1)
    layers.Dense(1, activation='sigmoid')
])

# 3. Compile
model_dnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# 4. Train
print("--- Training DNN ---")
history = model_dnn.fit(
    X_train_pca, y_train,
    validation_data=(X_val_pca, y_val),
    epochs=10,                   # Keep low for speed testing
    batch_size=DNN_BATCH_SIZE,
    class_weight=dnn_weights,
    verbose=1
)

# 5. Evaluate
# The network outputs probabilities with shape (n, 1). Threshold at 0.5 and
# flatten to a 1-D label vector so it lines up with y_val.
y_pred_probs = model_dnn.predict(X_val_pca, batch_size=DNN_BATCH_SIZE)
y_pred_dnn = (y_pred_probs > 0.5).astype("int32").ravel()

print(classification_report(y_val, y_pred_dnn))

In [None]:
############## TODO: uncomment this cell later
# # Keras models save differently (no joblib needed here)
# model_dnn.save('dnn_model.keras')

# print("Step 5 Done: Deep Learning model saved!")

In [None]:
from sklearn import metrics

# Accuracy log (in percent) shared by the baseline-classifier cells; it is
# created on first use so this cell also runs on a fresh kernel.
Classifier_accuracy = globals().get('Classifier_accuracy', [])

# Shallow baseline tree (depth 5), fitted on the scaled (non-PCA) training
# features and scored on the held-out test split. random_state pins the
# tie-breaking between equally good splits so reruns give the same tree.
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
Classifier_accuracy.append(accuracy*100)
print("Accuracy of Decision Tree Classifier : %.2f" % (accuracy*100) )

In [None]:
############## TODO: uncomment this cell later
# import joblib

# # Let's call this the "Simple" tree to distinguish it
# joblib.dump(dt_clf, 'simple_decision_tree_depth5.joblib')
# print("Step 6 Done: Simple Decision Tree saved!")