In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from collections import Counter
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import torch.nn.functional as F


from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score, 
    roc_curve
)



from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm



import warnings
# Notebook-wide configuration.
# Install a blanket "ignore" filter so no warning category is shown for the
# remainder of the session.
warnings.simplefilter("ignore")

# Show every column when a DataFrame is rendered instead of eliding the middle.
pd.set_option('display.max_columns', None)


In [None]:
# Read the CIC-IDS-2017 parquet file into `df2`, then report its shape and
# show the last ten rows as a quick sanity check of the load.
dataset_path = r"E:\Thesis\Defence\Datasets\CIC_IDS_2017_Binary_label_is_Label.parquet"
df2 = pd.read_parquet(dataset_path)

print(f"Dataset CIC IDS 2017 Shape: {df2.shape}")
display(df2.tail(10))

In [None]:
class LightAutoencoder(nn.Module):
    """Small fully-connected autoencoder with a mirrored encoder and decoder.

    Encoder: one ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stage per width
    in ``encoding_dims``; the Dropout of the last stage is removed, so the
    encoder ends on the ReLU of the bottleneck layer.

    Decoder: the same kind of stage for every width except the bottleneck, in
    reverse order, followed by ``Linear(..., input_dim) -> Sigmoid``. Because
    of the final Sigmoid every reconstructed value lies in the range 0..1.

    Args:
        input_dim: Number of input features (size of the last tensor axis).
        encoding_dims: Non-empty sequence of hidden widths; the last entry is
            the bottleneck size. Defaults to ``(64, 32)``.
        dropout_rate: Probability passed to every ``nn.Dropout`` layer.

    Raises:
        ValueError: If ``encoding_dims`` is empty.
    """

    def __init__(self, input_dim, encoding_dims=(64, 32), dropout_rate=0.1):
        super().__init__()

        # Work on a private list so tuples, lists or other sequences are all
        # accepted and the caller's object is never shared with the module.
        dims = list(encoding_dims)
        if not dims:
            raise ValueError("encoding_dims must contain at least one layer size")

        # ---- encoder: input_dim -> dims[0] -> ... -> dims[-1] ----
        encoder_layers = []
        prev_dim = input_dim
        for dim in dims:
            encoder_layers.extend(self._dense_stage(prev_dim, dim, dropout_rate))
            prev_dim = dim
        # Drop the trailing Dropout so the bottleneck code is not randomly zeroed.
        self.encoder = nn.Sequential(*encoder_layers[:-1])

        # ---- decoder: dims[-1] -> ... -> dims[0] -> input_dim ----
        decoder_layers = []
        prev_dim = dims[-1]
        for dim in reversed(dims[:-1]):
            decoder_layers.extend(self._dense_stage(prev_dim, dim, dropout_rate))
            prev_dim = dim
        decoder_layers.extend([
            nn.Linear(prev_dim, input_dim),
            nn.Sigmoid(),
        ])
        self.decoder = nn.Sequential(*decoder_layers)

        # Re-initialise every Linear layer (see _init_weights).
        self.apply(self._init_weights)

    @staticmethod
    def _dense_stage(in_dim, out_dim, dropout_rate):
        """Return the modules of one ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stage."""
        return [
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        ]

    def _init_weights(self, m):
        """Xavier-uniform weights and zero bias for ``nn.Linear``; other modules are untouched."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.decoder(self.encoder(x))

    def encode(self, x):
        """Return only the bottleneck representation of ``x``."""
        return self.encoder(x)

def train_and_evaluate_autoencoder(
    X_train,
    X_val,
    X_test,
    device,
    checkpoint_path=r'E:\Thesis\Defence\ModelWeights\AE_CIC2017Final.pth',
    max_epochs=100,
    batch_size=512,
    early_stopping_patience=5,
):
    """Train a ``LightAutoencoder`` to reconstruct ``X_train`` and return the best model.

    The model is optimised with AdamW on an MSE reconstruction loss, the
    learning rate is halved by ``ReduceLROnPlateau`` when the validation loss
    stalls, gradients are clipped to a norm of 1.0, and training stops early
    once the validation loss has not improved for ``early_stopping_patience``
    consecutive epochs.

    Every time the validation loss improves, the weights are written to
    ``checkpoint_path`` and also kept in memory. The in-memory copy is what is
    restored at the end, so the returned model can never pick up a stale
    checkpoint left on disk by an earlier run.

    Args:
        X_train: 2-D array-like of training samples (rows) by features (columns).
        X_val: 2-D array-like of validation samples with the same feature count.
        X_test: Unused; accepted only so existing callers keep working.
        device: ``torch.device`` (or device string) to train on.
        checkpoint_path: File the best ``state_dict`` is saved to.
        max_epochs: Upper bound on the number of training epochs.
        batch_size: Mini-batch size for both loaders.
        early_stopping_patience: Non-improving epochs tolerated before stopping.

    Returns:
        ``(model, results)`` where ``results`` is a dict with the per-epoch
        mean batch losses under ``"train_losses"`` and ``"val_losses"``.

    Raises:
        ValueError: If ``X_train`` has fewer than two rows or ``X_val`` is empty.
    """
    n_train, n_val = len(X_train), len(X_val)
    if n_train < 2:
        # BatchNorm1d cannot compute batch statistics from a single sample.
        raise ValueError("X_train must contain at least two samples")
    if n_val == 0:
        raise ValueError("X_val must contain at least one sample")

    input_dim = X_train.shape[1]
    model = LightAutoencoder(input_dim, encoding_dims=[64, 32], dropout_rate=0.1).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

    train_tensor = torch.tensor(X_train, dtype=torch.float32)
    val_tensor = torch.tensor(X_val, dtype=torch.float32)

    # A trailing batch holding exactly one sample makes BatchNorm1d raise in
    # train mode, so that single leftover sample is dropped; in every other
    # case all samples are used, as before.
    drop_singleton_batch = (n_train % batch_size == 1)
    train_loader = data_utils.DataLoader(
        data_utils.TensorDataset(train_tensor),
        batch_size=batch_size, shuffle=True, drop_last=drop_singleton_batch
    )
    val_loader = data_utils.DataLoader(
        data_utils.TensorDataset(val_tensor),
        batch_size=batch_size, shuffle=False
    )

    best_val_loss = float('inf')
    best_state = None  # in-memory copy of the best weights seen in THIS run
    patience_counter = 0
    train_losses, val_losses = [], []

    for epoch in range(max_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0
        for (batch_x,) in train_loader:
            batch_x = batch_x.to(device)

            optimizer.zero_grad()
            reconstructed = model(batch_x)
            loss = criterion(reconstructed, batch_x)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

        # ---- validation pass (no gradients, BatchNorm/Dropout in eval mode) ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for (batch_x,) in val_loader:
                batch_x = batch_x.to(device)
                reconstructed = model(batch_x)
                val_loss += criterion(reconstructed, batch_x).item()

        # Mean of the per-batch losses (each batch counts equally).
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        scheduler.step(avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0

            torch.save(model.state_dict(), checkpoint_path)
            # Clone so later optimizer steps cannot modify the snapshot.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")

    # Restore the best weights of this run. `best_state` stays None only when
    # the validation loss never compared below infinity (e.g. NaN losses).
    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        print("Validation loss never improved; returning the weights from the last epoch.")

    results = {
        "train_losses": train_losses,
        "val_losses": val_losses
    }
    return model, results

In [None]:
# One seed for the data splits and for torch, so a Restart & Run All repeats
# the same splits, weight initialisation, dropout masks and batch order.
SEED = 42

# Every column except `Label` is used as a feature; `Label` is the target.
x_bin2 = df2.drop(columns=['Label'])
y_bin2 = df2['Label']

# Stratified 80/20 hold-out: the 20% test part is set aside first ...
X_temp, X_test_final, y_temp, y_test_final = train_test_split(
    x_bin2, y_bin2, test_size=0.2, random_state=SEED, stratify=y_bin2
)

# ... then the remaining 80% is split 80/20 again (64% train / 16% validation
# of the full table).
X_train_bin, X_val_bin, y_train_bin, y_val_bin = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=SEED, stratify=y_temp
)

# Plain NumPy arrays for the torch training routine.
# NOTE(review): no scaling is applied in this cell, while the autoencoder's
# decoder ends in a Sigmoid (outputs confined to 0..1) — confirm the parquet
# features are already min-max scaled.
X_train = X_train_bin.values
X_val = X_val_bin.values
X_test = X_test_final.values

# The three parts must partition the original table exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(df2), "split lost or duplicated rows"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nDevice: {device}")

# Seed the torch random number generators before the model is built and trained.
torch.manual_seed(SEED)

model, results = train_and_evaluate_autoencoder(X_train, X_val, X_test, device)
