### Imports

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import os
from sklearn.preprocessing import StandardScaler
import glob
from pathlib import Path

# Single source of truth for the seed so NumPy and PyTorch cannot drift apart.
RANDOM_SEED = 42

# Seed both RNGs used by this notebook (NumPy's global RNG and PyTorch's
# default generator) for reproducible runs.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

### Data Loading and Cleaning

In [None]:
# Load dataset
# Resolve the dataset directory: prefer the known absolute path, then the
# relative path (for other environments), and finally a local "Dataset" folder.
if os.path.exists(r"c:\Users\pompk\Desktop\Final-Car\Csci-Capstone\Dataset"):
    data_path = Path(r"c:\Users\pompk\Desktop\Final-Car\Csci-Capstone\Dataset")
elif os.path.exists(r"../Dataset"):  # Try relative path if in a different env
    data_path = Path(r"../Dataset")
else:
    # Last resort; the emptiness check below raises if this folder has no CSVs.
    data_path = Path("Dataset")
    print("Warning: Defaulting to local 'Dataset' folder. Ensure it exists.")

# Path.glob yields files in a filesystem-dependent order. Sorting makes the row
# order of the concatenated frame (and everything derived from it, e.g. which
# duplicate rows survive and the training batch order) reproducible.
csv_files = sorted(data_path.glob("*.csv"))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_path}")

print("Loading CSV files...")
# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
print(f"Loaded {len(csv_files)} files with shape {df.shape}")

# CLEANING STEPS FROM IsoForest.ipynb
# 1. Column standardization: trim, lower-case, and replace spaces with "_".
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 2. String stripping.
#    Only genuine strings are stripped: `.str.strip()` would replace every
#    non-string value in a mixed-type object column with NaN, silently
#    discarding data.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# 3. Replace +/-inf with NaN so they are handled like any other missing value.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# 4. Drop columns with >= 30% missing values (keep those strictly below 30%).
df = df.loc[:, df.isna().mean() < 0.3]

# 5. Fill numeric NaN with the per-column median.
#    Passing the medians as a Series fills each numeric column by name and
#    leaves non-numeric columns untouched, without assigning into a frame
#    that was derived from a selection.
numeric_cols = df.select_dtypes(include="number").columns
df = df.fillna(df[numeric_cols].median())

# 6. Drop duplicate rows (first occurrence is kept).
df = df.drop_duplicates()

# 7. Drop specific columns; names absent from the frame are ignored.
drop_cols = [
    "fwd_header_length.1",
    "fwd_avg_bytes/bulk", "fwd_avg_packets/bulk", "fwd_avg_bulk_rate",
    "bwd_avg_bytes/bulk", "bwd_avg_packets/bulk", "bwd_avg_bulk_rate",
]
df = df.drop(columns=drop_cols, errors="ignore")

In [None]:
# -------------------------------
# Layered IDS: Autoencoder (Anomaly Detection)
# -------------------------------

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import os
from sklearn.preprocessing import StandardScaler

# -------------------------------
# 1️⃣ Feature Selection & Correlation
# -------------------------------
# Candidate features: every int64/float64 column except the target, if present.
numeric_features = list(df.select_dtypes(include=['int64', 'float64']).columns)
try:
    numeric_features.remove('label')
except ValueError:
    # No numeric 'label' column; nothing to exclude.
    pass

X_unsup = df[numeric_features]

# Prune redundant features: inspect only the strict upper triangle of the
# absolute correlation matrix so each pair is considered once, and drop the
# later column of any pair whose correlation exceeds 0.9.
corr_matrix = X_unsup.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
to_drop = upper.columns[(upper > 0.9).any(axis=0)].tolist()
X_unsup_clean = X_unsup.drop(columns=to_drop)

print(f"Dropped correlated features: {len(to_drop)}")

# -------------------------------
# 2️⃣ Scaling
# -------------------------------
# Standardize every retained feature; the fitted scaler is kept for later use.
scaler = StandardScaler()
scaler.fit(X_unsup_clean)
data_normalized = scaler.transform(X_unsup_clean)

# D = number of samples (rows), K = number of features (columns).
D = data_normalized.shape[0]
K = data_normalized.shape[1]
print(f"Dataset shape: {D} samples, {K} features")

# Convert the scaled numpy array into a float32 PyTorch tensor.
data_tensor = torch.from_numpy(data_normalized.astype(np.float32))

# -------------------------------
# 3️⃣ Model Definition
# -------------------------------
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder used for reconstruction-error scoring.

    The encoder maps ``input_dim`` features to ``hidden_dim`` units through a
    linear layer followed by ReLU; the decoder maps them back to ``input_dim``
    through a linear layer followed by ``output_activation``.

    Args:
        input_dim: Number of input (and reconstructed) features.
        hidden_dim: Width of the bottleneck layer.
        output_activation: Optional module applied to the decoder output.
            ``None`` (default) gives a linear output, which is required when
            the inputs are standardized (zero mean, values outside [0, 1]):
            a sigmoid can only emit values in (0, 1) and therefore can never
            reconstruct such targets. Pass ``nn.Sigmoid()`` only for inputs
            scaled to [0, 1].
    """

    def __init__(self, input_dim, hidden_dim, output_activation=None):
        super().__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        # Decoder. The activation stays at index 1 of the Sequential (Identity
        # when none is requested) so parameter names such as
        # "decoder.0.weight" are the same regardless of the activation used.
        if output_activation is None:
            output_activation = nn.Identity()
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            output_activation,
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Hyperparameters: the input width follows the scaled feature count, and the
# bottleneck is fixed at 16 units.
input_dim, hidden_dim = K, 16

# Build the model, the reconstruction loss (mean squared error) and the
# optimizer (Adam, learning rate 1e-3).
model = Autoencoder(input_dim, hidden_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -------------------------------
# 4️⃣ Training Loop
# -------------------------------
num_epochs = 30
batch_size = 32

print("Training model...")
model.train()
for epoch in range(num_epochs):
    # Reshuffle every epoch: the rows come from concatenated CSV files, so
    # consecutive rows are strongly correlated and fixed-order mini-batches
    # would bias the gradient steps.
    permutation = torch.randperm(D)
    epoch_loss = 0.0

    for i in range(0, D, batch_size):
        batch_data = data_tensor[permutation[i:i + batch_size]]

        # Forward pass: the autoencoder is trained to reproduce its input.
        outputs = model(batch_data)
        loss = criterion(outputs, batch_data)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so the (possibly smaller) last batch does not
        # skew the epoch average.
        epoch_loss += loss.item() * batch_data.size(0)

    if (epoch + 1) % 5 == 0:
        # Report the mean loss over the whole epoch rather than the loss of
        # whichever batch happened to be processed last.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / max(D, 1):.4f}')

# -------------------------------
# 5️⃣ Evaluation & Threshold
# -------------------------------
# Score the full dataset with gradients disabled and the model in eval mode.
model.eval()
with torch.no_grad():
    # Bottleneck activations for every sample.
    encoded_output = model.encoder(data_tensor).numpy()
    # Per-sample reconstruction error: squared error averaged over features.
    ae_outputs = model(data_tensor)
    reconstruction_errors = (ae_outputs - data_tensor).pow(2).mean(dim=1).numpy()

# Anomaly threshold: three standard deviations above the mean error.
mean_error = reconstruction_errors.mean()
std_error = reconstruction_errors.std()
threshold_value = mean_error + 3 * std_error

print(
    "Reconstruction Error Statistics:",
    f"Mean: {mean_error:.6f}",
    f"Std Dev: {std_error:.6f}",
    f"Threshold (mean + 3*std): {threshold_value:.6f}",
    sep="\n",
)

# -------------------------------
# 6️⃣ Save Artifacts
# -------------------------------
dump_dir = 'AutoEncoderDumps'
# exist_ok avoids the check-then-create race and is a no-op on re-runs.
os.makedirs(dump_dir, exist_ok=True)

# NOTE: pickling the whole model stores a reference to the Autoencoder class,
# not its code. Loading this file requires the same class to be importable
# under the same module path (here the notebook's __main__). It is kept for
# existing consumers; only load it from trusted sources.
joblib.dump(model, os.path.join(dump_dir, 'autoencoder_model.pkl'))
joblib.dump(scaler, os.path.join(dump_dir, 'autoencoder_scaler.pkl'))
joblib.dump({'mean': mean_error, 'std': std_error, 'threshold': threshold_value}, os.path.join(dump_dir, 'autoencoder_threshold.pkl'))

# Portable checkpoint: weights plus everything needed to rebuild the network
# and select the same input columns, independent of where the class lives.
torch.save(
    {
        'state_dict': model.state_dict(),
        'input_dim': input_dim,
        'hidden_dim': hidden_dim,
        'feature_names': list(X_unsup_clean.columns),
    },
    os.path.join(dump_dir, 'autoencoder_state.pt'),
)

print(f"Artifacts saved to {dump_dir}: autoencoder_model.pkl, autoencoder_scaler.pkl, autoencoder_threshold.pkl, autoencoder_state.pt")