## Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, average_precision_score)
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

2025-10-23 19:23:41.804751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-23 19:23:53.689785: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-23 19:23:57.525141: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-23 19:24:20.816929: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Rutas

In [2]:
# Location of the training CSV, relative to the notebook directory.
# Built with pathlib so the separator is right on every OS; the previous
# Windows-style literal also depended on the invalid "\d" escape sequence.
# Kept as a plain string so any code that treats it as text keeps working.
DATASET_PATH = str(Path("..") / "dataset" / "dataset_final.csv")

In [3]:
# Read the whole CSV at DATASET_PATH into a single in-memory DataFrame.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)

In [4]:
# Structural overview of the loaded frame: dimensions, dtypes, first rows.
print(df.shape)
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

(2660377, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2660377 entries, 0 to 2660376
Data columns (total 20 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Flow Duration                int64  
 1   Total Fwd Packets            int64  
 2   Total Backward Packets       int64  
 3   Total Length of Fwd Packets  float64
 4   Total Length of Bwd Packets  float64
 5   Fwd Packet Length Max        float64
 6   Fwd Packet Length Min        float64
 7   Fwd Packet Length Mean       float64
 8   Bwd Packet Length Max        float64
 9   Bwd Packet Length Min        float64
 10  Bwd Packet Length Mean       float64
 11  Flow Bytes/s                 float64
 12  Flow Packets/s               float64
 13  Init_Win_bytes_forward       int64  
 14  Init_Win_bytes_backward      int64  
 15  act_data_pkt_fwd             int64  
 16  min_seg_size_forward         int64  
 17  Active Mean                  float64
 18  Idle Mean                   

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Idle Mean,Attack_Label
0,4,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,3000000.0,500000.0,329,-1,1,20,0.0,0.0,BENIGN
1,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,12000000.0,2000000.0,329,-1,1,20,0.0,0.0,BENIGN
2,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,12000000.0,2000000.0,329,-1,1,20,0.0,0.0,BENIGN
3,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,12000000.0,2000000.0,329,-1,1,20,0.0,0.0,BENIGN
4,3,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,4000000.0,666666.7,245,-1,1,20,0.0,0.0,BENIGN


In [5]:
# Row count for each value of the original multi-class label.
df.loc[:, "Attack_Label"].value_counts()

Attack_Label
BENIGN              2104911
DoS Hulk             231073
PortScan             158930
DDoS                 128027
DoS GoldenEye         10293
FTP-Patator            7938
SSH-Patator            5897
DoS slowloris          5796
DoS Slowhttptest       5499
Bot                    1966
Infiltration             36
Heartbleed               11
Name: count, dtype: int64

In [6]:
# Binary target: 0 when the label is exactly "BENIGN", 1 for every other label.
df["y"] = df["Attack_Label"].astype(str).ne("BENIGN").astype(int)

In [7]:
# Class balance of the binary target.
df.loc[:, "y"].value_counts()

y
0    2104911
1     555466
Name: count, dtype: int64

In [8]:
# Every column except the raw label and the derived binary target is a model input.
NON_FEATURE_COLS = ("Attack_Label", "y")
feature_cols = [name for name in df.columns if name not in NON_FEATURE_COLS]
feature_cols

['Flow Duration',
 'Total Fwd Packets',
 'Total Backward Packets',
 'Total Length of Fwd Packets',
 'Total Length of Bwd Packets',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Init_Win_bytes_forward',
 'Init_Win_bytes_backward',
 'act_data_pkt_fwd',
 'min_seg_size_forward',
 'Active Mean',
 'Idle Mean']

In [9]:
# Independent copy of the feature columns so the cleaning below never touches df.
X = df.loc[:, feature_cols].copy()

In [10]:
# Replace negative initial-window values with 0 in whichever of these columns exist.
for win_col in ("Init_Win_bytes_forward", "Init_Win_bytes_backward"):
    if win_col not in X.columns:
        continue
    X[win_col] = X[win_col].clip(lower=0)

In [11]:
# Turn +/-inf into NaN, then fill every NaN with the median of its own column.
X = (
    X.replace([np.inf, -np.inf], np.nan)
     .pipe(lambda frame: frame.fillna(frame.median(numeric_only=True)))
)

In [12]:
# Target vector as a standalone integer NumPy array.
y = df["y"].to_numpy(dtype=int, copy=True)

In [13]:
# Stratified split in two steps: hold out 30 % of the rows, then cut that
# hold-out in half to obtain the validation and test sets.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X.to_numpy(), y,
    test_size=0.30,
    stratify=y,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=SPLIT_SEED,
)

In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [15]:

# Number of feature columns in the scaled training matrix.
input_dim = X_train.shape[-1]

In [16]:
# -----------------------------
# 2) Stacked autoencoder (unsupervised)
# -----------------------------
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling repeat from run to run (GPU kernels may still add small noise).
tf.keras.utils.set_random_seed(42)

enc_units = [256, 128, 64]  # encoder layer widths; any number of entries works
bottleneck = 32             # size of the compressed code

ae_in = layers.Input(shape=(input_dim,), name="ae_input")

# Encoder: one ReLU Dense layer per entry of enc_units, then the named bottleneck.
# Looping (instead of indexing [0], [1], [2]) means editing enc_units really
# changes the architecture rather than being ignored or raising IndexError.
h = ae_in
for units in enc_units:
    h = layers.Dense(units, activation="relu")(h)
code = layers.Dense(bottleneck, activation="relu", name="bottleneck")(h)

# Decoder: mirror of the encoder, ending in a linear layer of the input width.
h = code
for units in reversed(enc_units):
    h = layers.Dense(units, activation="relu")(h)
ae_out = layers.Dense(input_dim, activation="linear")(h)

ae = models.Model(ae_in, ae_out, name="stacked_autoencoder")
ae.compile(optimizer=optimizers.Adam(1e-3), loss="mse")

# Keep the best-validation-loss model on disk and stop once it stops improving.
ckpt_dir = Path("checkpoints")
ckpt_dir.mkdir(exist_ok=True, parents=True)
ckpt_ae = callbacks.ModelCheckpoint(
    filepath=str(ckpt_dir / "ae_best.keras"),
    save_best_only=True, monitor="val_loss", mode="min"
)
early_ae = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Reconstruction task: the inputs are also the targets.
hist_ae = ae.fit(
    X_train, X_train,
    validation_data=(X_val, X_val),
    epochs=100, batch_size=512,
    callbacks=[ckpt_ae, early_ae],
    verbose=1
)


Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [17]:

# Encoder sub-model: runs from the autoencoder's input up to its "bottleneck" layer.
bottleneck_layer = ae.get_layer("bottleneck")
encoder = models.Model(inputs=ae.input, outputs=bottleneck_layer.output, name="encoder")

In [18]:
# Compressed (bottleneck) representation of each split, in train / val / test order.
Z_train, Z_val, Z_test = (
    encoder.predict(split, batch_size=1024, verbose=0)
    for split in (X_train, X_val, X_test)
)

In [None]:
# -----------------------------
# 3) Clasificador CNN + LSTM
#    Tratamos el code (bottleneck) como secuencia 1D: (len=bottleneck, ch=1)
# -----------------------------
def to_seq(arr):
    """Reshape a 2-D array (rows, features) to (rows, features, 1)."""
    n_rows, n_steps = arr.shape[0], arr.shape[1]
    return arr.reshape((n_rows, n_steps, 1))

# Add the trailing channel axis to every split's encoded matrix.
Ztr_seq, Zva_seq, Zte_seq = (to_seq(codes) for codes in (Z_train, Z_val, Z_test))

seq_len = Ztr_seq.shape[1]

# Conv1D -> pool -> Conv1D -> dropout -> LSTM -> dense -> dropout, applied in
# order, followed by a single sigmoid output unit.
inp_seq = layers.Input(shape=(seq_len, 1), name="seq_input")
x = inp_seq
for stage in (
    layers.Conv1D(filters=64, kernel_size=3, padding="same", activation="relu"),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(filters=128, kernel_size=3, padding="same", activation="relu"),
    layers.Dropout(0.3),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
):
    x = stage(x)
out = layers.Dense(1, activation="sigmoid")(x)

clf = models.Model(inp_seq, out, name="cnn_lstm_ddos")

# "balanced" class weights computed from the training labels (0=BENIGN, 1=ATTACK).
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = dict(zip(map(int, classes), class_weights))
print("Class weights:", class_weight_dict)

clf.compile(
    optimizer=optimizers.Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Checkpointing and early stopping follow validation accuracy; the learning
# rate is halved whenever validation loss plateaus.
reduce_lr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, verbose=1)
early_clf = callbacks.EarlyStopping(monitor="val_accuracy", patience=10, restore_best_weights=True)
ckpt_clf = callbacks.ModelCheckpoint(
    filepath=str(ckpt_dir / "cnn_lstm_best.keras"),
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
)

hist_clf = clf.fit(
    Ztr_seq, y_train,
    validation_data=(Zva_seq, y_val),
    epochs=100,
    batch_size=512,
    class_weight=class_weight_dict,
    callbacks=[ckpt_clf, early_clf, reduce_lr],
    verbose=1,
)


Class weights: {0: 0.6319452409570276, 1: 2.394725404165359}
Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100

In [None]:
# -----------------------------
# 4) Evaluation
# -----------------------------
# Predicted ATTACK probability for each test row, thresholded at 0.5 for hard labels.
y_prob = clf.predict(Zte_seq, batch_size=1024, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification report (Test):")
print(classification_report(y_test, y_pred, digits=4, target_names=["BENIGN","ATTACK"]))

print("Confusion matrix (Test):")
print(confusion_matrix(y_test, y_pred))

# Threshold-free metrics computed from the raw probabilities. Only ValueError
# (how scikit-learn rejects invalid metric inputs, e.g. a single class in
# y_test) is reported and skipped; any other exception now propagates instead
# of being hidden behind a printed message.
try:
    roc = roc_auc_score(y_test, y_prob)
    pr  = average_precision_score(y_test, y_prob)
    print(f"ROC-AUC: {roc:.5f} | PR-AUC: {pr:.5f}")
except ValueError as e:
    print("AUC calc error:", e)


Classification report (Test):
              precision    recall  f1-score   support

      BENIGN     0.9990    0.9895    0.9942    315737
      ATTACK     0.9617    0.9962    0.9786     83320

    accuracy                         0.9909    399057
   macro avg     0.9803    0.9929    0.9864    399057
weighted avg     0.9912    0.9909    0.9910    399057

Confusion matrix (Test):
[[312428   3309]
 [   315  83005]]
ROC-AUC: 0.99959 | PR-AUC: 0.99841


In [None]:
# -----------------------------
# 5) Persist artifacts
# -----------------------------
import joblib

# The fitted scaler and both models are written into the checkpoint directory.
scaler_path = ckpt_dir / "scaler.pkl"
encoder_path = ckpt_dir / "encoder_best.keras"
clf_path = ckpt_dir / "cnn_lstm_best_final.keras"

joblib.dump(scaler, str(scaler_path))
encoder.save(str(encoder_path))
clf.save(str(clf_path))

In [None]:
# -----------------------------
# 6) Función de inferencia en crudo (DataFrame -> predicción)
# -----------------------------
def predict_from_df(df_in: pd.DataFrame) -> np.ndarray:
    """Return the ATTACK probability for every row of a raw flow DataFrame.

    The input is cleaned the same way the training cells clean ``X``
    (negative ``Init_Win_bytes_*`` set to 0, +/-inf turned into NaN, NaN
    imputed), then passed through the module-level ``scaler``, ``encoder``
    and ``clf``.

    Parameters
    ----------
    df_in : pd.DataFrame
        Must contain every column named in the module-level ``feature_cols``.
        Any other column (``Attack_Label``, ``y``, a saved index, ...) is
        ignored, and the column order of ``df_in`` does not matter.

    Returns
    -------
    np.ndarray
        1-D array with one probability per input row; empty for an empty frame.

    Raises
    ------
    ValueError
        If ``df_in`` lacks one or more of the training feature columns.
    """
    # Select exactly the training features, in training order, so the scaler
    # and the encoder always see the layout they were fitted on.
    wanted = list(feature_cols)
    missing = [name for name in wanted if name not in df_in.columns]
    if missing:
        raise ValueError(f"df_in is missing required feature columns: {missing}")
    Xx = df_in.loc[:, wanted].copy()

    # Nothing to score: skip the models instead of feeding them a zero-row batch.
    if len(Xx) == 0:
        return np.empty(0, dtype=np.float32)

    # Same clean-up as the training cells.
    for col in ["Init_Win_bytes_forward", "Init_Win_bytes_backward"]:
        if col in Xx.columns:
            Xx[col] = Xx[col].clip(lower=0)
    Xx = Xx.replace([np.inf, -np.inf], np.nan)

    # Impute with the median of this batch (unchanged behaviour). A column with
    # no finite value in the batch (e.g. a single-row frame) has no median, so
    # whatever is still missing falls back to the training mean kept by the
    # fitted scaler rather than reaching the models as NaN.
    # NOTE(review): training imputed with the full-dataset median; persisting
    # those medians and using them here would remove the remaining skew.
    Xx = Xx.fillna(Xx.median(numeric_only=True))
    Xx = Xx.fillna(pd.Series(scaler.mean_, index=Xx.columns))

    Xs = scaler.transform(Xx.values)
    Zx = encoder.predict(Xs, batch_size=1024, verbose=0)
    # Add the channel axis from the encoder output's own shape rather than
    # relying on the global ``bottleneck`` constant.
    Zx = Zx.reshape((Zx.shape[0], Zx.shape[1], 1))
    p = clf.predict(Zx, batch_size=1024, verbose=0).ravel()
    return p