In [None]:
# PHASE 2: ADVANCED MODEL (DEEP LEARNING - LSTM)
# -----------------------------------------------
# VERSION 2 (Corrected): Includes a fix for the `dtype: object` error.
#
# INSTRUCTIONS FOR COLAB:
# 1. Ensure `Runtime > Change runtime type` is set to `T4 GPU`.
# 2. Ensure 'KDDTrain+.txt' and 'KDDTest+.txt' are uploaded.
# 3. Run this cell.

# Step 1: Import All Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import os

# Import TensorFlow and Keras libraries for Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Startup banner: name the phase and report the TensorFlow version in use.
print(
    "--- Phase 2 (Corrected): Advanced LSTM Model ---",
    f"TensorFlow Version: {tf.__version__}",
    "-" * 30,
    sep="\n",
)

# Step 2: Define Column Names and Load Data
# Both data files must be present in the working directory. If either one is
# missing, only an error message is printed; the rest of the pipeline lives in
# the `else` branch and is therefore skipped.
if not all(os.path.exists(path) for path in ('KDDTrain+.txt', 'KDDTest+.txt')):
    print("--- ERROR ---")
    print("Please upload 'KDDTrain+.txt' and 'KDDTest+.txt' using the file sidebar on the left.")
    print("-" * 30)
else:
    print("--- Data Files Found. Loading... ---")
    # The files carry no header row, so the column names are supplied here in
    # file order. The final two entries are the target ('label') and
    # 'difficulty', which is discarded immediately after loading.
    col_names = [
        'duration',
        'protocol_type',
        'service',
        'flag',
        'src_bytes',
        'dst_bytes',
        'land',
        'wrong_fragment',
        'urgent',
        'hot',
        'num_failed_logins',
        'logged_in',
        'num_compromised',
        'root_shell',
        'su_attempted',
        'num_root',
        'num_file_creations',
        'num_shells',
        'num_access_files',
        'num_outbound_cmds',
        'is_host_login',
        'is_guest_login',
        'count',
        'srv_count',
        'serror_rate',
        'srv_serror_rate',
        'rerror_rate',
        'srv_rerror_rate',
        'same_srv_rate',
        'diff_srv_rate',
        'srv_diff_host_rate',
        'dst_host_count',
        'dst_host_srv_count',
        'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate',
        'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
        'label',
        'difficulty',
    ]
    # Read each split and drop the 'difficulty' column in one chained step.
    train_data = pd.read_csv(
        'KDDTrain+.txt', header=None, names=col_names
    ).drop(columns='difficulty')
    test_data = pd.read_csv(
        'KDDTest+.txt', header=None, names=col_names
    ).drop(columns='difficulty')
    print("--- Data Loading Successful ---")
    print("-" * 30)


    # Step 3: Data Preprocessing
    # Produces X_train / X_test (one-hot encoded, standardized features),
    # y_train / y_test (binary targets), the fitted `scaler`, and
    # `feature_names` (final feature column order).
    print("--- Starting Data Preprocessing ---")

    # Binary target: 0 for the 'normal' label, 1 for every other label.
    # Vectorized comparison instead of a per-row Python lambda.
    train_data['label'] = (train_data['label'] != 'normal').astype('int64')
    test_data['label'] = (test_data['label'] != 'normal').astype('int64')

    # One-hot encode on the concatenated frame so train and test end up with
    # exactly the same dummy columns in the same order. With `columns=...`,
    # get_dummies keeps the untouched columns first and appends the dummy
    # groups in the order the categorical columns are listed.
    categorical_cols = ['protocol_type', 'service', 'flag']
    combined_data = pd.concat([train_data, test_data], axis=0)
    combined_data = pd.get_dummies(combined_data, columns=categorical_cols)

    # Split back by position: the first len(train_data) rows are the train set.
    n_train_rows = len(train_data)
    train_processed = combined_data.iloc[:n_train_rows]
    test_processed = combined_data.iloc[n_train_rows:]

    y_train = train_processed['label']
    y_test = test_processed['label']
    X_train = train_processed.drop('label', axis=1)
    X_test = test_processed.drop('label', axis=1)

    # Keep X_test in X_train's column order. Both splits come from the same
    # encoded frame, so no column can actually be missing; fill_value=0 only
    # matters if the encoding above is ever done per split.
    X_train_cols = X_train.columns
    X_test = X_test.reindex(columns=X_train_cols, fill_value=0)

    # Numerical columns in their original file order. Deriving them from
    # `col_names` (rather than from set arithmetic) keeps the order, and hence
    # the column order the scaler is fitted on, identical on every run.
    non_numeric = set(categorical_cols) | {'label', 'difficulty'}
    numerical_cols = [name for name in col_names if name not in non_numeric]

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # Save feature names for Phase 3 (XAI)
    feature_names = X_train.columns.tolist()

    print("--- Data Preprocessing Complete ---")


    # ------------------------------------------
    # Uniform dtype for TensorFlow
    # Features and labels are all cast to one numerical type (float32)
    # *before* the DataFrames are converted to NumPy arrays.
    # ------------------------------------------
    print("--- Casting data to uniform float32 type ---")
    X_train, X_test = (frame.astype(np.float32) for frame in (X_train, X_test))
    y_train, y_test = (target.astype(np.float32) for target in (y_train, y_test))
    print("--- Casting Complete ---")
    print("-" * 30)


    # Step 4: Reshape Data for LSTM
    print("--- Reshaping Data for LSTM ---")

    # Pull the underlying NumPy arrays out of the DataFrames.
    X_train_np = X_train.values
    X_test_np = X_test.values

    # (rows, features) -> (rows, 1, features): insert a middle axis of length 1.
    train_rows, train_features = X_train_np.shape
    test_rows, test_features = X_test_np.shape
    X_train_reshaped = X_train_np.reshape((train_rows, 1, train_features))
    X_test_reshaped = X_test_np.reshape((test_rows, 1, test_features))

    print(f"X_train shape (3D): {X_train_reshaped.shape}")
    print(f"X_test shape (3D): {X_test_reshaped.shape}")
    print("-" * 30)


    # Step 5: Build the LSTM Model Architecture
    print("--- Building LSTM Model ---")

    # The input is declared with an explicit Input object as the first entry
    # of the Sequential model rather than via `input_shape=` on the LSTM
    # layer, which Keras 3 warns against. The per-sample shape is everything
    # after the batch axis of the reshaped training data.
    model = Sequential([
        tf.keras.Input(shape=X_train_reshaped.shape[1:]),
        LSTM(64),
        Dropout(0.2),
        # Single sigmoid unit: one probability per sample for the binary label.
        Dense(1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    print("-" * 30)


    # Step 6: Train the LSTM Model
    print("--- Starting Model Training (LSTM) ---")
    print("This will take a few minutes, even with a GPU...")

    # Training configuration. Note that the test split is passed as
    # validation data, so the per-epoch validation metrics reported during
    # training are computed on the test set.
    fit_settings = {
        "epochs": 10,
        "batch_size": 64,
        "validation_data": (X_test_reshaped, y_test),
        "verbose": 1,
    }
    history = model.fit(X_train_reshaped, y_train, **fit_settings)

    print("--- Model Training Complete ---")
    print("-" * 30)


    # Step 7: Evaluate the "Promise" Model
    print("--- Starting Model Evaluation (LSTM) ---")

    # Predicted probabilities above 0.5 are labelled 1, everything else 0.
    y_pred_probs = model.predict(X_test_reshaped)
    above_threshold = y_pred_probs > 0.5
    y_pred = above_threshold.astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy (LSTM): {accuracy * 100:.2f}%")
    print("\n")

    # Per-class precision / recall / F1, with readable names for classes 0 and 1.
    class_labels = ['Normal (0)', 'Attack (1)']
    report_text = classification_report(y_test, y_pred, target_names=class_labels)
    print("Classification Report (LSTM):")
    print(report_text)
    print("-" * 30)