In [2]:
# PHASE 1: BASELINE MODEL (RANDOM FOREST)
# ----------------------------------------
# This script loads the NSL-KDD dataset, preprocesses it,
# and trains a Random Forest classifier to establish
# a baseline for our "AI-Powered Cyber Defense" project.
#
# INSTRUCTIONS FOR COLAB:
# 1. Click the "Files" icon on the left sidebar.
# 2. Click "Upload" and select 'KDDTrain+.txt' and 'KDDTest+.txt'.
# 3. Once uploaded, run this cell.

# Step 1: Import All Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os # Import 'os' to check if files exist

def binarize_labels(labels):
    """Map raw traffic labels to a binary target.

    Args:
        labels: pandas Series of label strings.

    Returns:
        An int64 Series with the same index as ``labels``: 0 where the
        label equals 'normal', 1 for every other value.
    """
    # Vectorised comparison instead of a per-row Python lambda.
    return (labels != 'normal').astype('int64')


def encode_categoricals(train_df, test_df, categorical_cols):
    """One-hot encode ``categorical_cols`` over train and test together.

    Both frames are stacked before encoding so that they end up with the
    same dummy columns, even when a category value appears in only one of
    them.

    Args:
        train_df: training DataFrame.
        test_df: testing DataFrame with the same columns as ``train_df``.
        categorical_cols: names of the columns to one-hot encode.

    Returns:
        A tuple ``(combined, train_part, test_part)``: the stacked and
        encoded frame, followed by its first ``len(train_df)`` rows and the
        remaining rows.
    """
    stacked = pd.concat([train_df, test_df], axis=0)
    # A single get_dummies call keeps the untouched columns first and then
    # appends '<column>_<value>' indicator columns for each encoded column,
    # in the order given by categorical_cols.
    encoded = pd.get_dummies(stacked, columns=list(categorical_cols))
    n_train = len(train_df)
    return encoded, encoded.iloc[:n_train], encoded.iloc[n_train:]


def numeric_feature_names(all_cols, categorical_cols, excluded=('label', 'difficulty')):
    """Return the columns that are neither categorical nor excluded.

    Args:
        all_cols: every column name, in file order.
        categorical_cols: names of the one-hot encoded columns.
        excluded: additional names to leave out (target / metadata columns).

    Returns:
        A list of the remaining names in the same order as ``all_cols``
        (a plain set difference would give an arbitrary order).
    """
    skip = set(categorical_cols) | set(excluded)
    return [name for name in all_cols if name not in skip]


print("--- Phase 1: Random Forest Baseline ---")

# Step 2: Define Column Names and Load Data
# Check if data files are uploaded to the Colab session
if not (os.path.exists('KDDTrain+.txt') and os.path.exists('KDDTest+.txt')):
    print("--- ERROR ---")
    print("Please upload 'KDDTrain+.txt' and 'KDDTest+.txt' using the file sidebar on the left.")
    print("-" * 30)
else:
    print("--- Data Files Found. Loading... ---")
    col_names = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'label', 'difficulty'
    ]

    # Load the training and testing data (the files carry no header row)
    train_data = pd.read_csv('KDDTrain+.txt', header=None, names=col_names)
    test_data = pd.read_csv('KDDTest+.txt', header=None, names=col_names)

    # 'difficulty' is not used as a feature, so drop it right after loading
    train_data = train_data.drop('difficulty', axis=1)
    test_data = test_data.drop('difficulty', axis=1)

    print("--- Data Loading Successful ---")
    print(f"Training data shape: {train_data.shape}")
    print(f"Testing data shape: {test_data.shape}")
    print("-" * 30)


    # Step 3: Data Preprocessing
    print("--- Starting Data Preprocessing ---")

    # 3a. Label Binarization: 'normal' = 0, 'attack' = 1
    train_data['label'] = binarize_labels(train_data['label'])
    test_data['label'] = binarize_labels(test_data['label'])

    # 3b. Identify Categorical Features
    categorical_cols = ['protocol_type', 'service', 'flag']
    print(f"Categorical features to encode: {categorical_cols}")

    # 3c. One-Hot Encoding (Combined Train + Test), then split back
    combined_data, train_processed, test_processed = encode_categoricals(
        train_data, test_data, categorical_cols
    )

    # 3d. Create Final X (Features) and y (Labels)
    X_train = train_processed.drop('label', axis=1)
    y_train = train_processed['label']
    X_test = test_processed.drop('label', axis=1)
    y_test = test_processed['label']

    # 3e. Align columns. Encoding on the combined frame already gives both
    # splits the same columns, so this is only a safety net: it forces
    # X_test to carry exactly X_train's columns, in X_train's order,
    # filling any column that is absent from X_test with 0.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # 3f. Feature Scaling
    # Numerical columns are those that weren't one-hot encoded; keeping them
    # in col_names order makes the scaler's fitted attributes reproducible.
    numerical_cols = numeric_feature_names(col_names, categorical_cols)

    # Fit on the training split only and reuse those statistics for test
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    print("--- Data Preprocessing Complete ---")
    print(f"X_train shape after processing: {X_train.shape}")
    print(f"X_test shape after processing: {X_test.shape}")
    print("-" * 30)


    # Step 4: Model Training (Random Forest)
    print("--- Starting Model Training (Random Forest) ---")

    # Fixed random_state for repeatable runs; n_jobs=-1 uses every CPU core
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf_classifier.fit(X_train, y_train)

    print("--- Model Training Complete ---")
    print("-" * 30)


    # Step 5: Model Evaluation
    print("--- Starting Model Evaluation ---")

    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print("\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Normal (0)', 'Attack (1)']))
    print("-" * 30)

--- Phase 1: Random Forest Baseline ---
--- Data Files Found. Loading... ---
--- Data Loading Successful ---
Training data shape: (125973, 42)
Testing data shape: (22544, 42)
------------------------------
--- Starting Data Preprocessing ---
Categorical features to encode: ['protocol_type', 'service', 'flag']
--- Data Preprocessing Complete ---
X_train shape after processing: (125973, 122)
X_test shape after processing: (22544, 122)
------------------------------
--- Starting Model Training (Random Forest) ---
--- Model Training Complete ---
------------------------------
--- Starting Model Evaluation ---
Model Accuracy: 76.48%


Classification Report:
              precision    recall  f1-score   support

  Normal (0)       0.65      0.97      0.78      9711
  Attack (1)       0.97      0.61      0.75     12833

    accuracy                           0.76     22544
   macro avg       0.81      0.79      0.76     22544
weighted avg       0.83      0.76      0.76     22544

------------