In [1]:

# 1) Dependencias
!pip install -q scikit-learn pandas matplotlib seaborn

import os
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")



# 5) Load the data
# Both files are read with the column names supplied by `columns` (defined
# outside this section); passing `names=` makes pandas treat the first row
# of each file as data rather than as a header.
train_path = "/content/KDDTrain+.txt"
test_path = "/content/KDDTest+.txt"
df_train = pd.read_csv(train_path, names=columns)
df_test = pd.read_csv(test_path, names=columns)

# Report the (rows, columns) shape of each partition.
for tag, frame in (("Train:", df_train), ("Test :", df_test)):
    print(tag, frame.shape)

# 6) Preprocessing
# 6.1 / 6.2 Derive the binary target (0 when the label is "normal", 1 for
# anything else) and then discard the raw "label" and "difficulty" columns.
# Both steps mutate the frames in place.
for frame in (df_train, df_test):
    frame["label_binary"] = frame["label"].ne("normal").astype("int64")
    frame.drop(columns=["label", "difficulty"], inplace=True)

# 6.3 One-hot encoding
# Train and test are stacked before encoding so that both partitions end up
# with exactly the same set of dummy columns.
categorical_cols = ["protocol_type", "service", "flag"]
n_train = len(df_train)
df_all = pd.get_dummies(
    pd.concat([df_train, df_test], axis=0),
    columns=categorical_cols,
)

# Split back into the original partitions by row position.
df_train_enc, df_test_enc = df_all.iloc[:n_train], df_all.iloc[n_train:]

target = "label_binary"
X_train, y_train = df_train_enc.drop(columns=[target]), df_train_enc[target]
X_test, y_test = df_test_enc.drop(columns=[target]), df_test_enc[target]

# 6.4 Scaling
# The scaler is fitted on the training features only; the test features are
# transformed with those same training statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 7) Models
# Candidate classifiers, keyed by the display name used when reporting.
# Both SVCs keep probability=True because predict_proba is called on every
# model during evaluation. scikit-learn produces those probability estimates
# with an internal cross-validation that shuffles the data, so random_state
# is pinned (same seed as the decision tree) to make the probabilities, and
# the ROC AUC computed from them, reproducible between runs.
models = {
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "SVM RBF": SVC(kernel="rbf", probability=True, random_state=42),
    "SVM Lineal": SVC(kernel="linear", probability=True, random_state=42),
}

# 8) Training and evaluation
# Each model is fitted on the training split, timed, and scored on the test
# split. One metrics row per model is collected in `results`.
results = []
# Model name -> class-1 scores on X_test. Cached here so the ROC plot below
# does not have to run inference on every model a second time.
probabilities = {}

for name, model in models.items():
    # perf_counter is meant for measuring elapsed intervals; time.time()
    # follows the wall clock, which can be adjusted while the model is fitting.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba holds the score of the second class.
    y_prob = model.predict_proba(X_test)[:, 1]
    probabilities[name] = y_prob

    results.append({
        "Modelo": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC_AUC": roc_auc_score(y_test, y_prob),
        "Train_time_s": train_time,
    })

    print(f"\n{name}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=4))

# 9) Final results
# One row per model, best F1 first.
df_results = pd.DataFrame(results).sort_values("F1", ascending=False)
display(df_results)

# 10) ROC curves
# Drawn from the scores cached during evaluation; the dashed diagonal marks
# the line where the true-positive rate equals the false-positive rate.
plt.figure(figsize=(7, 5))
for name, y_prob in probabilities.items():
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], "--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC - NSL-KDD")
plt.legend()
plt.grid()
plt.show()



⚠️ Descarga manual requerida (1 sola vez):

1. Ve a: https://www.unb.ca/cic/datasets/nsl.html
2. Descarga:
   - KDDTrain+.txt
   - KDDTest+.txt
3. Súbelos a Colab en la carpeta /content/

Luego vuelve a ejecutar esta celda.



AssertionError: Falta KDDTrain+.txt en /content/