In [8]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


In [9]:
train_path = "../raw_data/UNSW_NB15_training-set.csv"
test_path  = "../raw_data/UNSW_NB15_testing-set.csv"

assert os.path.exists(train_path), "Arquivo de treino não encontrado"
assert os.path.exists(test_path), "Arquivo de teste não encontrado"

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

train_df.shape, test_df.shape

((175341, 45), (82332, 45))

In [10]:
leak_cols = ["label", "attack_cat"]

X_train = train_df.drop(columns=leak_cols)
y_train = train_df["label"]

X_test  = test_df.drop(columns=leak_cols)
y_test  = test_df["label"]




In [11]:
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
num_cols = X_train.select_dtypes(exclude=["object"]).columns.tolist()

print("Categóricas:", cat_cols)
print("Numéricas :", len(num_cols))


Categóricas: ['proto', 'service', 'state']
Numéricas : 40


In [12]:
n_train = len(X_train)

X_all = pd.concat([X_train, X_test], axis=0)

X_all = pd.get_dummies(
    X_all,
    columns=cat_cols,
    drop_first=True
)

X_train = X_all.iloc[:n_train].copy()
X_test  = X_all.iloc[n_train:].copy()

X_train.shape, X_test.shape


((175341, 194), (82332, 194))

In [13]:
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols]  = scaler.transform(X_test[num_cols])


In [14]:
# === Conversão final para NumPy ===
X_train_np = X_train.to_numpy(dtype=np.float32)
X_test_np  = X_test.to_numpy(dtype=np.float32)

y_train_np = y_train.to_numpy(dtype=np.float32)
y_test_np  = y_test.to_numpy(dtype=np.float32)

print(X_train_np.shape, X_test_np.shape)



(175341, 194) (82332, 194)


In [15]:
X_train_t = torch.from_numpy(X_train_np)
y_train_t = torch.from_numpy(y_train_np)

X_test_t  = torch.from_numpy(X_test_np)
y_test_t  = torch.from_numpy(y_test_np)

train_ds = TensorDataset(X_train_t, y_train_t)
test_ds  = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=512, shuffle=False)


In [16]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)


In [17]:
pos_weight = torch.tensor(
    (y_train_np == 0).sum() / (y_train_np == 1).sum(),
    dtype=torch.float32
)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)


In [18]:
model = MLP(X_train_np.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [19]:
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")


Epoch 1/10 - Loss: 0.0930
Epoch 2/10 - Loss: 0.0678
Epoch 3/10 - Loss: 0.0587
Epoch 4/10 - Loss: 0.0504
Epoch 5/10 - Loss: 0.0457
Epoch 6/10 - Loss: 0.0432
Epoch 7/10 - Loss: 0.0409
Epoch 8/10 - Loss: 0.0397
Epoch 9/10 - Loss: 0.0377
Epoch 10/10 - Loss: 0.0370


In [20]:
model.eval()
all_probs = []

with torch.no_grad():
    for xb, _ in test_loader:
        logits = model(xb)
        probs = torch.sigmoid(logits)
        all_probs.append(probs)

y_scores = torch.cat(all_probs).cpu().numpy()


In [21]:
threshold = 0.32
y_pred = (y_scores >= threshold).astype(int)

tn, fp, fn, tp = confusion_matrix(y_test_np, y_pred).ravel()

accuracy  = accuracy_score(y_test_np, y_pred)
precision = precision_score(y_test_np, y_pred)
recall    = recall_score(y_test_np, y_pred)
roc_auc   = roc_auc_score(y_test_np, y_scores)
fpr       = fp / (fp + tn)

print(f"Threshold : {threshold}")
print(f"Accuracy  : {accuracy*100:.2f}%")
print(f"Precision : {precision*100:.2f}%")
print(f"Recall    : {recall*100:.2f}%")
print(f"FPR       : {fpr*100:.2f}%")
print(f"ROC AUC   : {roc_auc*100:.2f}%")


Threshold : 0.32
Accuracy  : 57.44%
Precision : 67.49%
Recall    : 43.82%
FPR       : 25.86%
ROC AUC   : 66.21%


In [77]:
print([c for c in X_train.columns if "attack" in c.lower()])


[]
