In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import joblib
#pyarrow

In [2]:
gm = pd.read_parquet('gmgm.parquet')

In [3]:
pd.set_option('display.max_rows', None)    # Display all rows (in case you use .head())
pd.set_option('display.max_columns', None) # Display all columns
pd.set_option('display.width', 1000)
print(gm.shape)
print(gm.dtypes)

(17143447, 63)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32

In [4]:
import gc # Garbage Collector interface

print("Separating Label...")

# 1. Rip the 'Label' column out of 'gm' and put it into 'y'.
# This modifies 'gm' instantly. 'gm' no longer has the label.
y = gm.pop('Label') 

# 2. Now 'gm' IS your 'X' (features only). Just rename the variable.
X = gm 

# 3. Delete the reference to the name 'gm'
del gm 

# 4. Force Python to release memory immediately
gc.collect()

print("Separation Done. RAM cleaned.")

Separating Label...
Separation Done. RAM cleaned.


In [5]:
#split data 80:20 for train and temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size = 0.20,
    stratify = y,
    random_state = 42,
    shuffle = True # shuffle code
)

In [6]:
#split temp 1:1 for validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size = 0.50,
    stratify = y_temp,
    random_state = 42,
    shuffle = True
)

In [6]:
# #log transform (don't run twice)
# X_train= np.log1p(X_train)
# x_val = np.log1p(X_val)
# X_test = np.log1p(X_test)
# print('log transform done')

In [8]:
# #standardscaler (Standardization (Z-Score): ) and not normalization to preserve the shape better for anomaly detection
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
# X_test = scaler.transform(X_test)
# print("standardscaler done")

In [9]:
# #pca
# pca = PCA(n_components = 0.95)
# X_train_pca = pca.fit_transform(X_train)
# X_val_pca = pca.transform(X_val)
# X_test_pca = pca.transform(X_test)
# print("pca done")
# joblib.dump(pca, 'pca_brain.joblib')
# print("Step 1 Done: PCA saved!")

In [10]:
Classifier_accuracy = []

In [11]:
#TRAINING DTREE, RF, LOGIRES, CLAS
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time

# Dictionary of models
# n_jobs=-1 means "Use all CPU cores"
cores = 4
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=cores),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', max_depth=20),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=20, n_jobs=cores)
}

results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")
    start = time.time()

    # Train on Full PCA Data
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_val)

    end = time.time()
    print(f"Training Time: {(end - start)/60:.2f} minutes")

    # Print Report
    print(classification_report(y_val, y_pred))

    # Save for comparison
    results[name] = model

--- Training Logistic Regression ---
Training Time: 5.16 minutes
              precision    recall  f1-score   support

           0       0.69      0.54      0.61    438449
           1       0.85      0.92      0.88   1275896

    accuracy                           0.82   1714345
   macro avg       0.77      0.73      0.75   1714345
weighted avg       0.81      0.82      0.81   1714345

--- Training Decision Tree ---
Training Time: 8.94 minutes
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    438449
           1       1.00      1.00      1.00   1275896

    accuracy                           1.00   1714345
   macro avg       1.00      1.00      1.00   1714345
weighted avg       1.00      1.00      1.00   1714345

--- Training Random Forest ---
Training Time: 15.24 minutes
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    438449
           1       1.00      1.00      1.00   1275896

In [12]:
############# DI UNCOMMENT NANTI BANG
import joblib

# This saves the 3 models created in the loop above
# We loop through the 'results' dictionary you made
if 'results' in locals():
    for name, model in results.items():
        # Replaces spaces with underscores (e.g. "Random Forest" -> "Random_Forest.joblib")
        filename = name.replace(" ", "_") + ".joblib"
        joblib.dump(model, filename)
        print(f"Saved: {filename}")
else:
    print("Error: Could not find the 'results' dictionary. Did you run the cell above?")

print("Step 2 Done: Main models saved!")

Saved: Logistic_Regression.joblib
Saved: Decision_Tree.joblib
Saved: Random_Forest.joblib
Step 2 Done: Main models saved!


In [13]:
#TRAINING KNN AND SVC

###

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# 1. Create a "Mini" dataset just for these slow models
# Taking 50,000 rows is usually enough for SVM/KNN to converge
X_train_small = X_train[:50000]
y_train_small = y_train[:50000]

slow_models = {
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=cores),
    "SVM (RBF)": SVC(class_weight='balanced', kernel='rbf', cache_size=2000)
}

for name, model in slow_models.items():
    print(f"--- Training {name} (On Subset) ---")
    start = time.time()

    # Train on SMALL Data
    model.fit(X_train_small, y_train_small)

    # Predict on Validation (also subset if validation is huge, otherwise full is ok but slow)
    # Let's predict on full validation to see real performance
    print("Predicting... (This part might be slow for KNN)")
    y_pred = model.predict(X_val)

    end = time.time()
    print(f"Time: {(end - start)/60:.2f} minutes")
    print(classification_report(y_val, y_pred))

--- Training KNN (On Subset) ---
Predicting... (This part might be slow for KNN)
Time: 0.51 minutes
              precision    recall  f1-score   support

           0       0.98      0.97      0.97    438449
           1       0.99      0.99      0.99   1275896

    accuracy                           0.99   1714345
   macro avg       0.98      0.98      0.98   1714345
weighted avg       0.99      0.99      0.99   1714345

--- Training SVM (RBF) (On Subset) ---
Predicting... (This part might be slow for KNN)
Time: 50.47 minutes
              precision    recall  f1-score   support

           0       0.26      1.00      0.41    438449
           1       0.97      0.00      0.00   1275896

    accuracy                           0.26   1714345
   macro avg       0.61      0.50      0.21   1714345
weighted avg       0.79      0.26      0.11   1714345



In [23]:
if 'KNN' in slow_models:
    joblib.dump(slow_models['KNN'], 'knn_model.joblib')
    print("Saved KNN from dictionary.")

Saved KNN from dictionary.
Saved SVM from dictionary.


In [23]:
if 'SVM (RBF)' in slow_models:
    joblib.dump(slow_models['SVM (RBF)'], 'svm_model.joblib')
    print("Saved SVM from dictionary.")

Saved KNN from dictionary.
Saved SVM from dictionary.


In [None]:
#TRAINING DNN

###

In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
import numpy as np

# 1. Calculate Class Weights
# We handle the imbalance by weighing the minority class higher
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
dnn_weights = {0: weights[0], 1: weights[1]}
print(f"DNN Class Weights: {dnn_weights}")

# 2. Build the Model
# We use X_train.shape[1] to automatically get the number of columns (60+)
model_dnn = keras.Sequential([
    layers.InputLayer(input_shape=(X_train.shape[1],)), 
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

model_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Train
print("--- Training DNN ---")
# batch_size=2048 is good for both CPU and GPU on large data
history = model_dnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=2048, 
    class_weight=dnn_weights,
    verbose=1
)

# 4. Save
model_dnn.save('dnn_model.keras')
print("DNN Model Saved.")

# 5. Evaluate
print("Evaluating on Validation Set...")
y_pred_probs = model_dnn.predict(X_val)
# Convert probabilities (0.1, 0.9) to binary labels (0, 1)
y_pred_dnn = (y_pred_probs > 0.5).astype("int32")

print(classification_report(y_val, y_pred_dnn))

DNN Class Weights: {0: np.float64(1.9550097459996072), 1: np.float64(0.6718201549421563)}
--- Training DNN ---


E0000 00:00:1766045739.863636 2952656 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1766045739.870530 2952656 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.3417 - loss: 1106128000.0000 - val_accuracy: 0.2558 - val_loss: 0.6944
Epoch 2/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.4445 - loss: 1069231.2500 - val_accuracy: 0.2558 - val_loss: 0.6942
Epoch 3/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.5037 - loss: 236227.1094 - val_accuracy: 0.7442 - val_loss: 0.6915
Epoch 4/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.5037 - loss: 1838013.6250 - val_accuracy: 0.2558 - val_loss: 0.6940
Epoch 5/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.5104 - loss: 1.0239 - val_accuracy: 0.7442 - val_loss: 0.6902
Epoch 6/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.5016 - loss: 1.3646 - val_accuracy: 0.2558 - v

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [26]:
model_dnn.save('dnn_model.keras')
print("Step 5 Done: Deep Learning model saved!")

Step 5 Done: Deep Learning model saved!


In [27]:
accuracy = accuracy_score(y_test, y_pred)
Classifier_accuracy.append(accuracy*100)
print("Accuracy of Decision Tree Classifier : %.2f" % (accuracy*100) )

Accuracy of Decision Tree Classifier : 25.67
