In [4]:
import os, pickle, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import NearMiss

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input



DATA_PATH  = "rawdata/combined_iot_data.csv"  
TARGET_COL = "label"                           

DROP_COLS  = ["device"]                       

TEST_SIZE  = 0.30
RANDOM_STATE = 47

DO_BALANCE = True 
DO_SCALE   = True  

df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()

print("Dataset shape:", df.shape)
print("\nClass distribution:\n", df[TARGET_COL].value_counts().head(20))

df = df.dropna(subset=[TARGET_COL]).copy()

drop_now = [c for c in ([TARGET_COL] + DROP_COLS) if c in df.columns]
X_df = df.drop(columns=drop_now).copy()
y = df[TARGET_COL].astype(str).values

cat_cols = X_df.select_dtypes(include=["object"]).columns
for c in cat_cols:
    X_df[c] = LabelEncoder().fit_transform(X_df[c].astype(str))

X_df = X_df.replace([np.inf, -np.inf], np.nan).fillna(0)

X = X_df.values
feature_names = list(X_df.columns)

print("\nX shape:", X.shape, " | y shape:", y.shape)
print("Num features:", len(feature_names))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

if DO_BALANCE:
    nm = NearMiss()
    X_train, y_train = nm.fit_resample(X_train, y_train)
    print("\nAfter NearMiss - X_train:", X_train.shape, " y_train:", np.unique(y_train, return_counts=True))

scaler = None
if DO_SCALE:
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test  = scaler.transform(X_test)

def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average="weighted")
    print("\n" + "="*70)
    print(name)
    print("="*70)
    print("Accuracy:", acc)
    print("F1 (weighted):", f1)
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return {"name": name, "acc": acc, "f1": f1}


Dataset shape: (701648, 117)

Class distribution:
 label
Mirai     652100
Benign     49548
Name: count, dtype: int64

X shape: (701648, 115)  | y shape: (701648,)
Num features: 115

After NearMiss - X_train: (69368, 115)  y_train: (array(['Benign', 'Mirai'], dtype=object), array([34684, 34684]))


In [5]:
results = []

lr = LogisticRegression(max_iter=2000, n_jobs=-1)
lr.fit(X_train, y_train)
results.append(evaluate_model("Logistic Regression", lr, X_test, y_test))

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X_train, y_train)
results.append(evaluate_model("Decision Tree", dt, X_test, y_test))

rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train, y_train)
results.append(evaluate_model("Random Forest", rf, X_test, y_test))

if HAS_XGB:
    le_y = LabelEncoder()
    y_train_enc = le_y.fit_transform(y_train)
    y_test_enc  = le_y.transform(y_test)

    xgb = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    xgb.fit(X_train, y_train_enc)

    y_pred_enc = xgb.predict(X_test)
    y_pred = le_y.inverse_transform(y_pred_enc)

    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average="weighted")

    print("\n" + "="*70)
    print("XGBoost")
    print("="*70)
    print("Accuracy:", acc)
    print("F1 (weighted):", f1)
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    results.append({"name": "XGBoost", "acc": acc, "f1": f1})

else:
    print("\nXGBoost no disponible. Instala con: pip install xgboost")




Logistic Regression
Accuracy: 0.7144777785695622
F1 (weighted): 0.7840843358587943

Confusion Matrix:
 [[ 14861      3]
 [ 60098 135533]]

Classification Report:
               precision    recall  f1-score   support

      Benign       0.20      1.00      0.33     14864
       Mirai       1.00      0.69      0.82    195631

    accuracy                           0.71    210495
   macro avg       0.60      0.85      0.57    210495
weighted avg       0.94      0.71      0.78    210495


Decision Tree
Accuracy: 0.8677355756668804
F1 (weighted): 0.8946515744969729

Confusion Matrix:
 [[ 14864      0]
 [ 27841 167790]]

Classification Report:
               precision    recall  f1-score   support

      Benign       0.35      1.00      0.52     14864
       Mirai       1.00      0.86      0.92    195631

    accuracy                           0.87    210495
   macro avg       0.67      0.93      0.72    210495
weighted avg       0.95      0.87      0.89    210495


Random Forest
Accuracy:

In [6]:

from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(
    n_estimators=500,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

et.fit(X_train, y_train)
results.append(evaluate_model("Extra Trees", et, X_test, y_test))



Extra Trees
Accuracy: 0.9995344307465736
F1 (weighted): 0.9995351372920427

Confusion Matrix:
 [[ 14864      0]
 [    98 195533]]

Classification Report:
               precision    recall  f1-score   support

      Benign       0.99      1.00      1.00     14864
       Mirai       1.00      1.00      1.00    195631

    accuracy                           1.00    210495
   macro avg       1.00      1.00      1.00    210495
weighted avg       1.00      1.00      1.00    210495



In [7]:
classes = sorted(list(np.unique(y_train)))
n_classes = len(classes)


label_to_id = {c:i for i,c in enumerate(classes)}
y_train_i = np.array([label_to_id[v] for v in y_train], dtype=np.int32)
y_test_i  = np.array([label_to_id[v] for v in y_test],  dtype=np.int32)

nn = Sequential()
nn.add(Input(shape=(X_train.shape[1],)))
nn.add(Dense(64, activation="relu"))
nn.add(Dropout(0.2))
nn.add(Dense(32, activation="relu"))
nn.add(Dropout(0.2))
nn.add(Dense(16, activation="relu"))

if n_classes == 2:
    nn.add(Dense(1, activation="sigmoid"))
    nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    nn.fit(X_train, (y_train_i == 1).astype(np.float32), validation_split=0.2, epochs=10, batch_size=256, verbose=1)
    y_pred_prob = nn.predict(X_test, verbose=0).ravel()
    y_pred_i = (y_pred_prob >= 0.5).astype(int)
else:
    nn.add(Dense(n_classes, activation="softmax"))
    nn.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    nn.fit(X_train, y_train_i, validation_split=0.2, epochs=10, batch_size=256, verbose=1)
    y_pred_i = nn.predict(X_test, verbose=0).argmax(axis=1)

# vuelve a etiquetas originales
id_to_label = {i:c for c,i in label_to_id.items()}
y_pred = np.array([id_to_label[i] for i in y_pred_i])

print("\n" + "="*70)
print("Neural Network (Keras)")
print("="*70)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9941 - loss: 0.0366 - val_accuracy: 1.0000 - val_loss: 6.8801e-05
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0059 - val_accuracy: 1.0000 - val_loss: 1.0210e-05
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0038 - val_accuracy: 1.0000 - val_loss: 3.0062e-06
Epoch 4/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9999 - loss: 7.0250e-04 - val_accuracy: 1.0000 - val_loss: 4.6617e-07
Epoch 5/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9999 - loss: 1.9095e-04 - val_accuracy: 1.0000 - val_loss: 2.4108e-07
Epoch 6/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9999 - loss: 2.0604e-04 - val_accuracy: 1.0000 - val_loss: 1.9