In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import joblib
#pyarrow

In [2]:
# Load the full flow-feature table (features plus the 'Label' column) from the
# local Parquet file. NOTE(review): the '#pyarrow' remark in the import cell
# suggests the pyarrow engine is expected to be installed - confirm.
gm = pd.read_parquet(path='gmgm.parquet')

In [3]:
# Session-wide display settings: show every column and use a wide console so
# wide frames are not wrapped or elided.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print(gm.shape)

# The dtypes listing has one entry per column, which can exceed pandas' default
# row limit and get truncated. Lift the row limit ONLY for this print:
# leaving display.max_rows at None globally would make any accidental display
# of this very large frame try to render every single row. (.head() is not
# affected by this option either way - it always returns a fixed number of rows.)
with pd.option_context('display.max_rows', None):
    print(gm.dtypes)

(17143447, 63)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32

In [4]:
import gc  # garbage-collector interface

print("Separating Label...")

# X is bound to the very same DataFrame object as gm (no copy is made).
X = gm

# pop() removes the 'Label' column from that shared object in place and
# returns it, so X holds features only and y holds the target.
y = X.pop('Label')

# Drop the old name so only X refers to the frame from here on.
# NOTE: this makes the cell non-idempotent - re-running it requires
# reloading gm first.
del gm

# Ask the collector to reclaim any unreachable objects now. The frame itself
# stays alive because X still references it.
gc.collect()

print("Separation Done. RAM cleaned.")

Separating Label...
Separation Done. RAM cleaned.


In [5]:
# First split: keep 80% of the rows for training and hold out 20% as a
# temporary pool (X_temp / y_temp). Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the shuffle
# repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [6]:
# Second split: cut the 20% hold-out pool in half, giving a validation set and
# a test set of equal size (each 10% of the original rows). Stratified on
# y_temp and seeded, like the first split.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    shuffle=True,
    random_state=42,
    stratify=y_temp,
)

In [8]:
import numpy as np
from sklearn.preprocessing import StandardScaler

print("--- Creating Scaled Data for DNN/SVM ---")

# The scaled copies live in NEW variables; X_train / X_val keep their raw
# values for models that do not need scaling.
scaler_dnn = StandardScaler()

# Step 1 (inner call): log1p compresses the large-magnitude features.
# Step 2 (outer call): standardise. The scaler's statistics are learned from
# the training split only ...
X_train_scaled = scaler_dnn.fit_transform(np.log1p(X_train))

# ... and then re-used unchanged on the validation split, so no validation
# information leaks into the scaler.
X_val_scaled = scaler_dnn.transform(np.log1p(X_val))

print("Data Scaled. Ready for DNN/SVM.")

--- Creating Scaled Data for DNN/SVM ---
Data Scaled. Ready for DNN/SVM.


In [None]:
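# #pca -- disabled experiment, kept for reference.
# NOTE(review): as written this fits PCA on the raw X_train, not on the
# log/standard-scaled data; confirm that is intended before re-enabling.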
# pca = PCA(n_components = 0.95)
# X_train_pca = pca.fit_transform(X_train)
# X_val_pca = pca.transform(X_val)
# X_test_pca = pca.transform(X_test)
# print("pca done")
# joblib.dump(pca, 'pca_brain.joblib')
# print("Step 1 Done: PCA saved!")

In [9]:
# Accumulator meant to collect per-classifier accuracy results.
# NOTE(review): none of the visible cells append to it - confirm it is still
# used further down, otherwise remove it.
Classifier_accuracy = []

In [None]:
#TRAINING DNN

###

In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run.
keras.utils.set_random_seed(42)

# One batch size for both fit() and predict(); predict() would otherwise fall
# back to its small default and take far more steps over the validation split.
DNN_BATCH_SIZE = 2048

# 1. Weights
# 'balanced' gives each class a weight inversely proportional to its frequency
# in y_train, so the minority class is not drowned out in the loss.
dnn_classes = np.unique(y_train)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=dnn_classes,
    y=y_train
)
# Key every weight by its actual label value instead of hard-coding 0 and 1,
# so the mapping stays correct whatever the label encoding is.
dnn_weights = {int(label): float(w) for label, w in zip(dnn_classes, weights)}

# 2. Build Model
# Two hidden ReLU layers with dropout, and a single sigmoid unit that outputs
# the probability of class 1.
model_dnn = keras.Sequential([
    # keras.Input(shape=...) replaces InputLayer(input_shape=...), whose
    # `input_shape` argument is deprecated in Keras 3.
    keras.Input(shape=(X_train_scaled.shape[1],)),

    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid')
])

model_dnn.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# 3. Train
print("--- Training DNN on SCALED Data ---")
history = model_dnn.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=10,
    batch_size=DNN_BATCH_SIZE,
    class_weight=dnn_weights,
    verbose=1
)

# 4. Evaluate
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels for the
# validation split.
y_pred_probs = model_dnn.predict(X_val_scaled, batch_size=DNN_BATCH_SIZE)
y_pred_dnn = (y_pred_probs > 0.5).astype("int32")
print(classification_report(y_val, y_pred_dnn))

# 5. Save
model_dnn.save('dnn_model_scaled.keras')

2025-12-18 16:41:02.463257: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-18 16:41:02.463447: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-18 16:41:02.488478: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-18 16:41:03.069940: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

--- Training DNN on SCALED Data ---
Epoch 1/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9945 - loss: 0.0153 - val_accuracy: 0.9976 - val_loss: 0.0068
Epoch 2/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9973 - loss: 0.0079 - val_accuracy: 0.9979 - val_loss: 0.0063
Epoch 3/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9975 - loss: 0.0072 - val_accuracy: 0.9978 - val_loss: 0.0058
Epoch 4/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9976 - loss: 0.0069 - val_accuracy: 0.9981 - val_loss: 0.0055
Epoch 5/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9977 - loss: 0.0066 - val_accuracy: 0.9979 - val_loss: 0.0057
Epoch 6/10
[1m6697/6697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.9978 - loss: 0.0065 - val_accuracy: 

In [None]:
#TRAINING KNN AND SVC

###

In [12]:
# classification_report is imported here as well so this cell does not depend
# on the DNN cell (and therefore on TensorFlow) having been run first.
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Number of training rows given to the SVM; the model is fit on a subset to
# keep kernel-SVM training time manageable.
SVM_TRAIN_ROWS = 50000

# 1. Create Scaled Subset
# Take the leading rows of the ALREADY SCALED training data. train_test_split
# shuffled the rows, so this prefix is a random sample of the training split.
X_train_scaled_small = X_train_scaled[:SVM_TRAIN_ROWS]
# y_train is a pandas Series whose index was shuffled by the split; .iloc makes
# the slice explicitly positional so the labels line up row-for-row with the
# NumPy feature array above.
y_train_small = y_train.iloc[:SVM_TRAIN_ROWS]

print("--- Training SVM on SCALED Subset ---")

# class_weight='balanced' compensates for the class imbalance (roughly 1:3 in
# the recorded validation report); cache_size is the kernel cache in MB.
svc_clf = SVC(class_weight='balanced', kernel='rbf', cache_size=2000)

svc_clf.fit(X_train_scaled_small, y_train_small)

# Evaluate on the FULL scaled validation split.
print("Predicting SVM...")
y_pred_svm = svc_clf.predict(X_val_scaled)

print(classification_report(y_val, y_pred_svm))

# Persist the fitted model; as the cell's last expression, the returned list of
# written file names is shown as the cell output.
joblib.dump(svc_clf, 'svm_final_scaled.joblib')

--- Training SVM on SCALED Subset ---
Predicting SVM...
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    438449
           1       1.00      0.99      1.00   1275896

    accuracy                           0.99   1714345
   macro avg       0.99      0.99      0.99   1714345
weighted avg       0.99      0.99      0.99   1714345



['svm_final_scaled.joblib']