<a href="https://colab.research.google.com/github/fadhluibnu/ANOMALY_IOT_NETWORK_DETECTION/blob/main/2_IOT_ANOMALI_DETECTION_GROX_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bagian 1: Preprocessing

In [22]:
# Colab-only: mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive (used later for CSV dumps) are available.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, VarianceThreshold
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

def enhanced_preprocessing(train_data, test_data):
    """
    Enhanced preprocessing pipeline for IoT network anomaly detection.
    Includes improved feature engineering, selection, and balancing.

    The pipeline imputes missing values, removes duplicates, encodes the
    ``attack_cat`` target, winsorizes and log-transforms skewed numeric
    features, one-hot encodes categorical features, selects features
    (variance threshold + mutual information), applies robust scaling,
    resamples the training set, and finally builds train/validation/test
    tensors and data loaders.

    All statistics (medians, modes, quantiles, encoder, selector, scaler) are
    fitted on the training frame and then applied to the test frame.

    Side effects:
        * Writes the imputed frames to
          ``/content/drive/MyDrive/ANOMALI DUN DUN/{train,test}_data.csv``.
        * Saves ``class_distribution.png`` in the working directory
          (best effort; failures are reported, not raised).

    Args:
        train_data: Original training dataframe. Must contain the columns
            ``id``, ``label`` and ``attack_cat``.
        test_data: Original testing dataframe with the same columns.

    Returns:
        processed_data: Dictionary with all processed datasets and loaders
        (keys: ``train_data``, ``test_data``, ``X_train``, ``y_train``,
        ``X_val``, ``y_val``, ``X_test``, ``y_test``, ``train_loader``,
        ``val_loader``, ``test_loader``, ``input_dim``, ``num_classes``,
        ``class_weights``).

    Raises:
        KeyError: If ``id``, ``label`` or ``attack_cat`` is missing.
    """
    print("\n🔍 Starting enhanced preprocessing pipeline...")

    # Make copies to avoid modifying originals
    train_data = train_data.copy()
    test_data = test_data.copy()

    # 1. Data Exploration
    print("\n📊 Dataset Overview:")
    print(f"Training data shape: {train_data.shape}")
    print(f"Testing data shape: {test_data.shape}")

    # Check for missing values
    train_missing = train_data.isnull().sum().sum()
    test_missing = test_data.isnull().sum().sum()
    print(f"Missing values - Training: {train_missing}, Testing: {test_missing}")

    # 2. Drop rows with missing attack_cat (target variable)
    print("\n🔍 Handling missing target values...")
    train_data = train_data.dropna(subset=['attack_cat'])
    test_data = test_data.dropna(subset=['attack_cat'])
    print(f"After dropping rows with missing targets - Train: {train_data.shape}, Test: {test_data.shape}")

    # 3. Identify numerical and categorical columns
    numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numerical_cols = [col for col in numerical_cols if col not in ['id', 'label', 'attack_cat']]
    categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

    # 4. Handle remaining missing values
    print("\n🔍 Handling remaining missing values...")
    for col in numerical_cols:
        # Use the training median for both frames (more robust than mean);
        # computed once so the test set never influences the statistic.
        train_median = train_data[col].median()
        train_data[col] = train_data[col].fillna(train_median)
        test_data[col] = test_data[col].fillna(train_median)

    for col in categorical_cols:
        if col != 'attack_cat':
            # Use the training mode (most frequent) for categorical features
            train_mode = train_data[col].mode()[0]
            train_data[col] = train_data[col].fillna(train_mode)
            test_data[col] = test_data[col].fillna(train_mode)

    # Snapshot of the imputed (not yet encoded) frames on Google Drive.
    train_data.to_csv('/content/drive/MyDrive/ANOMALI DUN DUN/train_data.csv', index=False)
    test_data.to_csv('/content/drive/MyDrive/ANOMALI DUN DUN/test_data.csv', index=False)

    # 5. Remove duplicate rows
    print("\n🔍 Removing duplicate entries...")
    train_data = train_data.drop_duplicates()
    test_data = test_data.drop_duplicates()
    print(f"After removing duplicates - Train: {train_data.shape}, Test: {test_data.shape}")

    # 6. Encode attack_cat (target variable)
    print("\n🔍 Encoding target variable...")
    attack_mapping = {
        'Normal': 0, 'Generic': 1, 'Exploits': 2, 'Fuzzers': 3, 'DoS': 4,
        'Reconnaissance': 5, 'Analysis': 6, 'Backdoor': 7, 'Shellcode': 8, 'Worms': 9
    }
    # Strip surrounding whitespace so values such as ' Fuzzers ' still map.
    train_data['attack_cat'] = train_data['attack_cat'].astype(str).str.strip().map(attack_mapping)
    test_data['attack_cat'] = test_data['attack_cat'].astype(str).str.strip().map(attack_mapping)

    # Categories missing from the mapping become NaN; a NaN target would break
    # feature selection and tensor conversion, so report and drop those rows.
    train_unmapped = int(train_data['attack_cat'].isna().sum())
    test_unmapped = int(test_data['attack_cat'].isna().sum())
    if train_unmapped or test_unmapped:
        print(f"  Dropping rows with unknown attack categories - Train: {train_unmapped}, Test: {test_unmapped}")
        train_data = train_data.dropna(subset=['attack_cat'])
        test_data = test_data.dropna(subset=['attack_cat'])
    train_data['attack_cat'] = train_data['attack_cat'].astype(int)
    test_data['attack_cat'] = test_data['attack_cat'].astype(int)

    # Check class distribution
    class_counts = train_data['attack_cat'].value_counts().sort_index()
    print("Class distribution after encoding:")
    for class_id, count in class_counts.items():
        print(f"  Class {class_id}: {count} samples ({100*count/len(train_data):.2f}%)")

    # 7. IMPROVED: Handle outliers first, then transform data (before scaling)
    print("\n🔍 Handling outliers with improved Winsorization...")
    for col in numerical_cols:
        # Use more conservative percentiles for winsorization (0.5% - 99.5%)
        lower_bound = train_data[col].quantile(0.005)
        upper_bound = train_data[col].quantile(0.995)

        # Apply clipping to both train and test data
        train_data[col] = train_data[col].clip(lower=lower_bound, upper=upper_bound)
        test_data[col] = test_data[col].clip(lower=lower_bound, upper=upper_bound)

    # 8. Advanced feature transformation for skewed features
    print("\n🔍 Applying transformations for skewed numerical features...")
    for col in numerical_cols:
        # Check if data is significantly skewed
        skewness = train_data[col].skew()
        if abs(skewness) > 1.5:  # More aggressive threshold for transformation
            # Apply log transformation (adding a constant to handle zeros/negatives).
            # Test values were clipped to the training range above, so the
            # training minimum is a valid lower bound for both frames.
            train_min = train_data[col].min()
            offset = 1 - min(0, train_min)  # Ensure all values are positive

            train_data[col] = np.log1p(train_data[col] + offset)
            test_data[col] = np.log1p(test_data[col] + offset)
            print(f"  Applied log transform to {col} (skewness: {skewness:.2f})")

    # 9. Encoding categorical features with enhanced handling
    print("\n🔍 Encoding categorical features...")
    # Re-detect categorical columns: attack_cat is numeric by now and must not
    # be one-hot encoded, and the encoder cannot be fitted on zero columns.
    categorical_cols = train_data.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        # handle_unknown='ignore' encodes categories unseen in training as all zeros
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_train = encoder.fit_transform(train_data[categorical_cols])
        encoded_test = encoder.transform(test_data[categorical_cols])

        encoded_train = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(categorical_cols))
        encoded_test = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_cols))

        # Reset the index so the frames align row-by-row with the encoded
        # frames (which carry a fresh RangeIndex) when concatenated.
        train_data = train_data.drop(columns=categorical_cols).reset_index(drop=True)
        test_data = test_data.drop(columns=categorical_cols).reset_index(drop=True)

        train_data = pd.concat([train_data, encoded_train], axis=1)
        test_data = pd.concat([test_data, encoded_test], axis=1)

        print(f"  Encoded {len(categorical_cols)} categorical features into {encoded_train.shape[1]} binary features")

    print("AWKWKWInfo")
    print(train_data.info())
    print(test_data.info())

    # 10. IMPROVED: Two-stage feature selection using variance and mutual information
    print("\n🔍 Performing improved two-stage feature selection...")
    # Exclude target and ID columns
    X_train = train_data.drop(columns=['id', 'label', 'attack_cat'])
    y_train = train_data['attack_cat']

    # Stage 1: Remove low variance features
    selector_var = VarianceThreshold(threshold=0.01)
    selector_var.fit(X_train)
    low_var_features = X_train.columns[~selector_var.get_support()].tolist()
    if low_var_features:
        print(f"  Removed {len(low_var_features)} low variance features")
        X_train = X_train.drop(columns=low_var_features)

    # Stage 2: Select features using mutual information
    # Keep more features (95% of remaining or max 150 features)
    k = min(150, int(X_train.shape[1] * 0.95))
    selector_mi = SelectKBest(mutual_info_classif, k=k)
    selector_mi.fit(X_train, y_train)

    # Get selected feature names
    selected_features = X_train.columns[selector_mi.get_support()].tolist()
    print(f"  Selected {len(selected_features)} features after two-stage selection")

    # Keep only selected features; copy so the in-place scaling below writes
    # to independent frames rather than to slices of the wider ones.
    features_to_keep = ['id', 'label', 'attack_cat'] + selected_features
    train_data = train_data[features_to_keep].copy()
    test_data = test_data[features_to_keep].copy()

    # Update numerical columns list to reflect selected features only
    numerical_cols = [col for col in selected_features if col in numerical_cols]

    # 11. IMPROVED: Apply scaling after outlier handling
    print("\n🔍 Applying robust scaling to numerical features...")
    scaler = RobustScaler()  # Still more robust than StandardScaler

    # If no numerical columns remain after selection, skip scaling
    if numerical_cols:
        train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
        test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

    # 12. IMPROVED: Enhanced resampling with ADASYN and undersampling
    print("\n🔍 Applying enhanced balanced resampling...")
    # Prepare data for resampling
    X_train = train_data.drop(columns=['id', 'label', 'attack_cat'])
    y_train = train_data['attack_cat']

    # Print class distribution before resampling
    print("  Class distribution before resampling:")
    for class_id, count in y_train.value_counts().sort_index().items():
        print(f"    Class {class_id}: {count} samples ({100*count/len(y_train):.2f}%)")

    # Calculate max samples per class - balanced but with limits
    max_samples = min(8000, int(len(y_train) * 0.15))  # Limit max samples per minority class
    sampling_strategy = {i: min(max_samples, count) for i, count in y_train.value_counts().items()}

    # For majority class, keep more samples but not too many
    majority_class = y_train.value_counts().idxmax()
    sampling_strategy[majority_class] = min(int(len(y_train) * 0.3), y_train.value_counts()[majority_class])

    print("  Target sample counts per class:")
    for class_id, target_count in sorted(sampling_strategy.items()):
        print(f"    Class {class_id}: {target_count} samples")

    # Apply the resampling pipeline
    # NOTE(review): the undersampling targets are capped at the ORIGINAL class
    # counts, so the class oversampled by ADASYN/SMOTE appears to be cut back
    # to its original size afterwards - confirm this is the intended balance.
    try:
        # First try ADASYN which requires multiple samples per class
        resampler = Pipeline([
            ('over', ADASYN(sampling_strategy='minority', random_state=42, n_neighbors=5)),
            ('under', RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42))
        ])
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)
    except ValueError as e:
        print(f"  ADASYN failed: {e}. Falling back to SMOTE.")
        # Fall back to SMOTE with different neighbor settings
        from imblearn.over_sampling import SMOTE
        resampler = Pipeline([
            ('over', SMOTE(sampling_strategy='minority', random_state=42, k_neighbors=3)),
            ('under', RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42))
        ])
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

    # Print class distribution after resampling
    print("  Class distribution after resampling:")
    for class_id, count in pd.Series(y_train_resampled).value_counts().sort_index().items():
        print(f"    Class {class_id}: {count} samples ({100*count/len(y_train_resampled):.2f}%)")

    # Create new DataFrame with resampled data
    train_data_resampled = pd.DataFrame(X_train_resampled, columns=X_train.columns)
    train_data_resampled['attack_cat'] = y_train_resampled
    train_data_resampled['id'] = range(len(train_data_resampled))
    # Binary label (0=Normal, 1=attack), stored as int to match the original 0/1 column
    train_data_resampled['label'] = (train_data_resampled['attack_cat'] != 0).astype(int)

    # 13. Check for any remaining issues
    print("\n🔍 Final data quality check...")
    # Check for infinities
    train_data_resampled = train_data_resampled.replace([np.inf, -np.inf], np.nan)
    test_data = test_data.replace([np.inf, -np.inf], np.nan)

    # Check for NaN and fill if any
    if train_data_resampled.isnull().sum().sum() > 0:
        print(f"  Found {train_data_resampled.isnull().sum().sum()} NaN values in training data. Filling with column medians.")
        for col in train_data_resampled.columns:
            if train_data_resampled[col].isnull().sum() > 0:
                if train_data_resampled[col].dtype in ['int64', 'float64']:
                    train_data_resampled[col] = train_data_resampled[col].fillna(train_data_resampled[col].median())

    if test_data.isnull().sum().sum() > 0:
        print(f"  Found {test_data.isnull().sum().sum()} NaN values in test data. Filling with column medians.")
        for col in test_data.columns:
            if test_data[col].isnull().sum() > 0:
                if test_data[col].dtype in ['int64', 'float64']:
                    test_data[col] = test_data[col].fillna(test_data[col].median())

    # 14. IMPROVED: Create validation set from training data
    print("\n🔍 Creating train/validation split...")
    # Extract features and targets
    X_train_final = train_data_resampled.drop(columns=['id', 'label', 'attack_cat'])
    y_train_final = train_data_resampled['attack_cat']
    X_test_final = test_data.drop(columns=['id', 'label', 'attack_cat'])
    y_test_final = test_data['attack_cat']

    # Split into training and validation sets with stratification
    from sklearn.model_selection import train_test_split
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_final, y_train_final, test_size=0.2, random_state=42, stratify=y_train_final
    )

    print(f"  Training data: {X_train_split.shape[0]} samples")
    print(f"  Validation data: {X_val_split.shape[0]} samples")
    print(f"  Test data: {X_test_final.shape[0]} samples")

    # 15. Convert to tensors
    X_train_tensor = torch.tensor(X_train_split.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_split.values, dtype=torch.long)

    X_val_tensor = torch.tensor(X_val_split.values, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val_split.values, dtype=torch.long)

    X_test_tensor = torch.tensor(X_test_final.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_final.values, dtype=torch.long)

    # 16. Create datasets
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    # 17. Create data loaders with optimized parameters
    print("\n🔍 Creating data loaders...")
    batch_size = 64  # Can adjust based on available memory

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
        drop_last=False
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        drop_last=False
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        drop_last=False
    )

    print(f"\n✅ Preprocessing complete!")
    print(f"  Final training set: {len(train_dataset)} samples with {X_train_split.shape[1]} features")
    print(f"  Final validation set: {len(val_dataset)} samples")
    print(f"  Final test set: {len(test_dataset)} samples with {X_test_final.shape[1]} features")

    # 18. Visualize class distribution (best effort: a plotting failure must
    # not abort preprocessing, but the reason is reported)
    try:
        plt.figure(figsize=(15, 5))

        plt.subplot(1, 3, 1)
        train_class_counts = pd.Series(y_train_split).value_counts().sort_index()
        plt.bar(train_class_counts.index.astype(str), train_class_counts.values)
        plt.title('Training Data Class Distribution')
        plt.xlabel('Attack Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        plt.subplot(1, 3, 2)
        val_class_counts = pd.Series(y_val_split).value_counts().sort_index()
        plt.bar(val_class_counts.index.astype(str), val_class_counts.values)
        plt.title('Validation Data Class Distribution')
        plt.xlabel('Attack Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        plt.subplot(1, 3, 3)
        test_class_counts = pd.Series(y_test_final).value_counts().sort_index()
        plt.bar(test_class_counts.index.astype(str), test_class_counts.values)
        plt.title('Test Data Class Distribution')
        plt.xlabel('Attack Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.savefig('class_distribution.png')
        plt.close()
    except Exception as exc:
        print(f"  Could not create visualization (possibly running in non-graphical environment): {exc}")

    # Calculate class weights for later use in loss function.
    # Size everything by the largest class id as well as the number of distinct
    # classes, so a gap in the class ids cannot produce mismatched array lengths.
    num_classes = max(len(np.unique(y_train_final)), int(y_train_final.max()) + 1)
    class_counts = np.bincount(y_train_split, minlength=num_classes)

    # Calculate inverse frequency class weights, bounded to prevent extreme values
    class_weights = np.ones(num_classes)
    present = class_counts > 0
    if present.any():
        class_weights[present] = 1.0 / class_counts[present]
        # Normalize weights to sum to num_classes
        class_weights = class_weights * (num_classes / class_weights.sum())
        # Bound weights to reasonable range
        class_weights = np.clip(class_weights, 0.1, 10.0)

    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

    # Return processed data and related objects
    return {
        'train_data': train_data_resampled,
        'test_data': test_data,
        'X_train': X_train_split,
        'y_train': y_train_split,
        'X_val': X_val_split,
        'y_val': y_val_split,
        'X_test': X_test_final,
        'y_test': y_test_final,
        'train_loader': train_loader,
        'val_loader': val_loader,
        'test_loader': test_loader,
        'input_dim': X_train_split.shape[1],
        'num_classes': num_classes,
        'class_weights': class_weights_tensor
    }

# Bagian 2: Model CNN + DBN Ensemble

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix, classification_report
import time
import os
import seaborn as sns
from tqdm import tqdm

# Improved 2D CNN for tabular data
class RevisedCNN(nn.Module):
    """2D CNN classifier for flat (tabular) feature vectors.

    Each input row is zero-padded and reshaped into a single-channel square
    grid, passed through two conv/batch-norm/pool stages and then through a
    three-layer fully connected head that emits ``output_dim`` logits.

    Args:
        input_dim: Number of features per sample.
        output_dim: Number of output logits.
        dropout_rate: Dropout probability used after each hidden FC layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.3):
        super().__init__()

        # Side length of the square grid; the "+ 1" guarantees side**2 > input_dim,
        # so there is always at least one padded cell.
        side = int(np.sqrt(input_dim)) + 1
        self.input_dim = input_dim
        self.reshape_dim = side
        self.pad_size = side ** 2 - input_dim

        # Stage 1: 5x5 convolution (size-preserving) followed by 2x2 max pooling
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 3x3 convolution (size-preserving) followed by 2x2 max pooling
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Each pooling stage halves the spatial side (floor division);
        # the result is clamped to at least 1 when sizing the FC input.
        pooled_side = max(1, (side // 2) // 2)
        self.flattened_size = 128 * pooled_side * pooled_side

        # Classification head
        self.fc1 = nn.Linear(self.flattened_size, 512)
        self.bn_fc1 = nn.BatchNorm1d(512)
        self.dropout_fc1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(512, 256)
        self.bn_fc2 = nn.BatchNorm1d(256)
        self.dropout_fc2 = nn.Dropout(dropout_rate)

        self.fc3 = nn.Linear(256, output_dim)

    def forward(self, x):
        """Return class logits for a batch of flat feature vectors."""
        n_rows = x.size(0)

        # Right-pad the feature axis with zeros, then fold into (N, 1, side, side)
        grid = F.pad(x, (0, self.pad_size))
        out = grid.view(n_rows, 1, self.reshape_dim, self.reshape_dim)

        # Convolutional stages: conv -> batch norm -> leaky ReLU -> max pool
        conv_stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
        )
        for conv, norm, pool in conv_stages:
            out = pool(F.leaky_relu(norm(conv(out)), negative_slope=0.1))

        out = out.view(n_rows, -1)

        # Hidden FC stages: linear -> batch norm -> leaky ReLU -> dropout
        dense_stages = (
            (self.fc1, self.bn_fc1, self.dropout_fc1),
            (self.fc2, self.bn_fc2, self.dropout_fc2),
        )
        for dense, norm, drop in dense_stages:
            out = drop(F.leaky_relu(norm(dense(out)), negative_slope=0.1))

        return self.fc3(out)


# Improved RBM with proper pretraining abilities
class ImprovedRBM(nn.Module):
    """Restricted Boltzmann Machine layer with sigmoid units.

    Parameters:
        W: weight matrix of shape (input_size, hidden_size).
        b: hidden-unit bias, shape (hidden_size,).
        c: visible-unit bias, shape (input_size,).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        # W is first drawn from a small Gaussian and then overwritten by the
        # Xavier/Glorot initialisation at the end of the constructor.
        self.W = nn.Parameter(torch.randn(input_size, hidden_size) * 0.01)
        self.b = nn.Parameter(torch.zeros(hidden_size))
        self.c = nn.Parameter(torch.zeros(input_size))
        self.input_size = input_size
        self.hidden_size = hidden_size

        nn.init.xavier_uniform_(self.W)

    def forward(self, v):
        """Return hidden-unit activation probabilities for visible input ``v``."""
        n_rows = v.size(0)

        # Flatten anything beyond (batch, features)
        if v.dim() > 2:
            v = v.view(n_rows, -1)

        # A feature-count mismatch is tolerated: copy the overlapping columns
        # into a zero tensor of the expected width (truncate or zero-pad).
        if v.size(1) != self.input_size:
            shared = min(v.size(1), self.input_size)
            fitted = torch.zeros(n_rows, self.input_size, device=v.device)
            fitted[:, :shared] = v[:, :shared]
            v = fitted

        # L2-normalise every row before the affine map
        unit_v = F.normalize(v, p=2, dim=1)
        return torch.sigmoid(torch.matmul(unit_v, self.W) + self.b)

    def sample_h(self, v):
        """Return ``(binary hidden samples, hidden probabilities)`` given ``v``."""
        probs = self.forward(v)
        return torch.bernoulli(probs), probs

    def sample_v(self, h):
        """Return ``(binary visible samples, visible probabilities)`` given ``h``."""
        probs = torch.sigmoid(torch.matmul(h, self.W.t()) + self.c)
        return torch.bernoulli(probs), probs

    def cd_k(self, v_data, k=1):
        """Run ``k`` steps of Gibbs sampling for Contrastive Divergence.

        Returns:
            Tuple ``(v_data, h_data_probs, v_model_probs, h_model_probs)``:
            the untouched input, the hidden probabilities for the input, and
            the visible/hidden probabilities after the last Gibbs step.
        """
        hidden_state, h_data_probs = self.sample_h(v_data)

        # The chain starts from the hidden sample drawn for the data
        for _ in range(k):
            visible_state, v_model_probs = self.sample_v(hidden_state)
            hidden_state, h_model_probs = self.sample_h(visible_state)

        return v_data, h_data_probs, v_model_probs, h_model_probs

    def free_energy(self, v):
        """Return the free energy of each row of ``v``."""
        pre_activation = torch.matmul(v, self.W) + self.b
        hidden_part = F.softplus(pre_activation).sum(dim=1)
        visible_part = torch.matmul(v, self.c)
        return -hidden_part - visible_part


# Function to pretrain an RBM
def pretrain_rbm(rbm, dataloader, device, epochs=5, lr=0.001):
    """Pretrain RBM using Contrastive Divergence (CD-1).

    The CD statistics are turned into gradients by hand and handed to an Adam
    optimizer. They are computed under ``torch.no_grad()`` so that no autograd
    graph is built or kept alive through the manually assigned ``.grad``
    tensors.

    Args:
        rbm: Module exposing parameters ``W``, ``b``, ``c``, the attributes
            ``input_size``/``hidden_size`` and a ``cd_k(v, k)`` method.
        dataloader: Iterable yielding ``(data, target)`` batches; the target
            is ignored.
        device: Device the batches are moved to. The RBM itself is not moved
            here and is expected to live on that device already.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        The same ``rbm`` instance, updated in place.
    """
    optimizer = torch.optim.Adam(rbm.parameters(), lr=lr)
    rbm.train()

    print(f"Pretraining RBM {rbm.input_size} -> {rbm.hidden_size}...")

    for epoch in range(epochs):
        mean_loss = 0
        num_batches = 0

        for data, _ in dataloader:
            data = data.to(device)
            batch_len = data.size(0)

            with torch.no_grad():
                # Run k-step Contrastive Divergence
                v_data, h_data, v_model, h_model = rbm.cd_k(data, k=1)

                # Positive phase (data) minus negative phase (model)
                pos_associations = torch.matmul(v_data.t(), h_data)
                neg_associations = torch.matmul(v_model.t(), h_model)

                # Negated because the optimizer minimises
                grad_W = -(pos_associations - neg_associations) / batch_len
                grad_b = -(h_data - h_model).mean(0)
                grad_c = -(v_data - v_model).mean(0)

                # Reconstruction error is only a progress indicator
                recon_error = F.mse_loss(v_model, v_data)

            # Manually set gradients and let Adam apply the update
            optimizer.zero_grad()
            rbm.W.grad = grad_W
            rbm.b.grad = grad_b
            rbm.c.grad = grad_c
            optimizer.step()

            mean_loss += recon_error.item()
            num_batches += 1

        if num_batches == 0:
            # Nothing to learn from; avoid dividing by zero below
            print("  No batches available; skipping RBM pretraining.")
            break

        # Print epoch stats
        mean_loss /= num_batches
        print(f"  Epoch {epoch+1}/{epochs}, Reconstruction Error: {mean_loss:.6f}")

    return rbm


# Improved DBN with proper pretraining
class ImprovedDBN(nn.Module):
    """Deep Belief Network: a stack of RBM layers followed by an MLP head.

    Args:
        input_dim: Number of input features.
        hidden_dims: Non-empty sequence with the hidden size of each RBM.
        output_dim: Number of output logits.
        dropout_rate: Dropout probability in the two FC blocks.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate=0.3):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim

        # RBM stack: input_dim -> hidden_dims[0] -> hidden_dims[1] -> ...
        rbm_stack = [ImprovedRBM(input_dim, hidden_dims[0])]
        for fan_in, fan_out in zip(hidden_dims[:-1], hidden_dims[1:]):
            rbm_stack.append(ImprovedRBM(fan_in, fan_out))
        self.rbm_layers = nn.ModuleList(rbm_stack)

        def dense_block(fan_in, fan_out):
            # Linear -> batch norm -> leaky ReLU -> dropout
            return nn.Sequential(
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            )

        # MLP head fed by the last RBM's hidden probabilities
        self.fc_layers = nn.ModuleList([
            dense_block(hidden_dims[-1], 256),
            dense_block(256, 128),
        ])

        # Output layer
        self.output_layer = nn.Linear(128, output_dim)

        # Xavier-initialise every linear layer and zero its bias
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.xavier_uniform_(module.weight)
            nn.init.constant_(module.bias, 0)

    def pretrain(self, dataloader, device, epochs=5):
        """Greedily pretrain the RBM stack, one layer at a time.

        The first RBM is trained on ``dataloader`` directly; every later RBM
        is trained on the outputs of the RBMs below it, collected on the CPU
        and served from a fresh shuffled loader with the same batch size.

        Returns:
            ``self``, to allow call chaining.
        """
        print("Pretraining DBN layers...")

        # Bottom layer learns from the raw input batches
        self.rbm_layers[0] = pretrain_rbm(
            self.rbm_layers[0], dataloader, device, epochs)

        for depth in range(1, len(self.rbm_layers)):
            # Push every batch through the already-trained lower layers
            collected = []
            with torch.no_grad():
                for batch, _ in dataloader:
                    hidden = batch.to(device)
                    for lower in range(depth):
                        hidden = self.rbm_layers[lower](hidden)
                    collected.append(hidden.cpu())

            # Wrap the features in a loader; the zero targets are placeholders
            # because the RBM pretraining loop expects (data, target) pairs.
            features = torch.cat(collected, dim=0)
            placeholder_targets = torch.zeros(features.size(0))
            feature_loader = DataLoader(
                TensorDataset(features, placeholder_targets),
                batch_size=dataloader.batch_size,
                shuffle=True,
                num_workers=0
            )

            self.rbm_layers[depth] = pretrain_rbm(
                self.rbm_layers[depth], feature_loader, device, epochs)

        print("DBN pretraining complete.")
        return self

    def forward(self, x):
        """Return class logits for a batch of inputs."""
        # Collapse any trailing dimensions into one feature axis
        if x.dim() > 2:
            x = x.view(x.size(0), -1)

        # RBM stack first, then the FC blocks
        for layer in (*self.rbm_layers, *self.fc_layers):
            x = layer(x)

        return self.output_layer(x)


# Improved dynamic ensemble model
class ImprovedDynamicEnsembleModel(nn.Module):
    """Learned fusion of two classifiers' logits (CNN and DBN).

    Two fusion paths are blended by a per-sample gate ``g`` in (0, 1):

    * a *combiner* MLP applied to the concatenated logits of both models, and
    * a *class-wise attention* path in which, for every class, a tiny network
      produces two softmax weights used to average the two models' logits.

    The output is ``g * combiner + (1 - g) * class_wise``.

    Args:
        cnn_model: Module mapping a batch to ``output_dim`` logits.
        dbn_model: Module mapping the same batch to ``output_dim`` logits.
        output_dim: Number of classes.
    """

    def __init__(self, cnn_model, dbn_model, output_dim):
        super(ImprovedDynamicEnsembleModel, self).__init__()
        self.cnn_model = cnn_model
        self.dbn_model = dbn_model
        self.output_dim = output_dim

        # Class-wise attention - one small network per class, each producing
        # two softmax weights (one per base model)
        self.class_attention = nn.ModuleList([
            nn.Sequential(
                nn.Linear(2, 16),
                nn.LeakyReLU(0.1),
                nn.Linear(16, 2),
                nn.Softmax(dim=1)
            ) for _ in range(output_dim)
        ])

        # Gating network to decide contribution from each fusion path
        self.gate_network = nn.Sequential(
            nn.Linear(output_dim * 2, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # Combiner network
        self.combiner = nn.Sequential(
            nn.Linear(output_dim * 2, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(256, output_dim)
        )

    def forward(self, x):
        """Return fused logits of shape ``(batch, output_dim)``."""
        # Work on a copy so the caller's tensor is never touched by a sub-model
        x_original = x.clone()

        cnn_logits = self.cnn_model(x_original)
        dbn_logits = self.dbn_model(x_original)

        # Both the gate and the combiner consume the concatenated logits,
        # so build the concatenation once.
        combined = torch.cat([cnn_logits, dbn_logits], dim=1)
        gate_value = self.gate_network(combined)

        # Class-wise weighting
        weighted_outputs = []
        for i in range(self.output_dim):
            # The two models' logits for class i, shape (batch, 2)
            class_values = torch.cat([
                cnn_logits[:, i:i+1],
                dbn_logits[:, i:i+1]
            ], dim=1)

            # Softmax weights for (cnn, dbn) on this class
            weights = self.class_attention[i](class_values)

            # Weighted average of the two logits
            weighted_outputs.append((weights * class_values).sum(dim=1, keepdim=True))

        class_weighted_output = torch.cat(weighted_outputs, dim=1)

        # Dynamic final output
        return gate_value * self.combiner(combined) + (1 - gate_value) * class_weighted_output


# Focal Loss implementation for handling class imbalance
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    The loss for one sample is ``alpha[target] * (1 - p_t) ** gamma * CE``,
    where ``p_t`` is the softmax probability assigned to the true class and
    ``CE`` is the plain (unweighted) cross entropy.

    Args:
        alpha: Optional per-class weights indexed by class id (tensor or
            sequence). ``None`` disables class weighting.
        gamma: Focusing parameter; ``0`` reduces the loss to cross entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            per-sample losses unreduced.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha  # Class weights
        self.gamma = gamma  # Focusing parameter
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` (logits) against ``targets``.

        Args:
            inputs: Logits, one row per sample and one column per class.
            targets: Integer class ids, one per sample.
        """
        # Keep the cross entropy unweighted here: exp(-CE) only equals the
        # true-class probability when no class weight has been folded in.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        # Down-weight samples the model already classifies confidently.
        loss = (1 - pt) ** self.gamma * ce_loss

        # Apply the class weights after the focal term so they rescale the
        # loss without distorting p_t.
        if self.alpha is not None:
            alpha = torch.as_tensor(self.alpha, dtype=loss.dtype, device=loss.device)
            loss = alpha[targets] * loss

        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss


# Mixup data augmentation function
def mixup_data(x, y, alpha=0.2, device='cuda'):
    """Blend each sample in a batch with a randomly chosen partner (mixup).

    Args:
        x: Input batch; the first dimension is the batch dimension.
        y: Labels aligned with ``x`` along the first dimension.
        alpha: Beta-distribution parameter for the mixing coefficient.
            Values ``<= 0`` disable mixing (coefficient fixed at 1).
        device: Kept for backward compatibility only. The permutation is
            always created on ``x.device`` so the function also works on
            CPU-only hosts and never mixes devices when indexing.

    Returns:
        Tuple ``(mixed_x, y_a, y_b, lam)``: the blended inputs, the original
        labels, the partner labels, and the mixing coefficient.
    """
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0

    # Draw the shuffling index directly on the batch's device.
    index = torch.randperm(x.size(0), device=x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam


# Mixup criterion function
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the losses against both mixup label sets.

    The loss against ``y_a`` is weighted by ``lam`` and the loss against
    ``y_b`` by ``1 - lam``, mirroring how the inputs were blended.
    """
    loss_first = criterion(pred, y_a)
    loss_second = criterion(pred, y_b)
    partner_weight = 1 - lam
    return lam * loss_first + partner_weight * loss_second


# Training function with advanced techniques
def train_ensemble_model(ensemble_model, train_loader, val_loader, test_loader,
                        device, class_weights=None, epochs=30, lr=0.001,
                        focal_gamma=2.0, mixup_alpha=0.2, save_dir='.',
                        early_stopping_patience=7):
    """Train the ensemble with focal loss and mixup, then score it on the test set.

    Each epoch trains on ``train_loader`` (mixup is applied to roughly half of
    the batches), evaluates on ``val_loader``, steps a reduce-on-plateau
    scheduler on the validation loss, and checkpoints the weights whenever the
    validation accuracy improves. Training stops early once the validation
    accuracy has not improved for ``early_stopping_patience`` epochs. The best
    checkpoint (if any was written) is then evaluated on ``test_loader`` and
    diagnostic plots are written to ``save_dir``.

    Args:
        ensemble_model: Model to train; called as ``ensemble_model(inputs)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        test_loader: Iterable of ``(inputs, labels)`` test batches.
        device: Device that batches (and class weights) are moved to.
        class_weights: Optional per-class weight tensor for the focal loss.
        epochs: Maximum number of epochs.
        lr: Initial AdamW learning rate.
        focal_gamma: Focusing parameter of the focal loss.
        mixup_alpha: Beta parameter for mixup; ``<= 0`` disables mixup.
        save_dir: Directory for the checkpoint and the generated figures.
        early_stopping_patience: Epochs without a validation-accuracy
            improvement before training stops.

    Returns:
        Tuple ``(ensemble_model, test_acc)`` with the test accuracy in percent.
    """
    print("Starting advanced training process...")

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    best_model_path = os.path.join(save_dir, 'best_model.pt')

    # Setup optimizer with weight decay
    optimizer = optim.AdamW(ensemble_model.parameters(), lr=lr, weight_decay=1e-4)

    # Learning rate scheduler. The deprecated `verbose` flag is not passed;
    # learning-rate reductions are reported explicitly after each step below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6
    )

    # Loss function - use focal loss for imbalanced classes
    if class_weights is not None:
        class_weights = class_weights.to(device)

    criterion = FocalLoss(alpha=class_weights, gamma=focal_gamma)

    # Tracking metrics
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    best_val_acc = 0.0
    patience_counter = 0
    # Set once this run has written a checkpoint, so the final evaluation
    # never tries to load a missing (or stale) file.
    checkpoint_saved = False

    # Training loop with early stopping
    for epoch in range(epochs):
        # Training phase
        start_time = time.time()
        ensemble_model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        # Progress bar for training
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")

        for inputs, labels in train_iterator:
            inputs, labels = inputs.to(device), labels.to(device)

            # Apply mixup augmentation randomly with 50% probability
            use_mixup = np.random.random() < 0.5 and mixup_alpha > 0

            optimizer.zero_grad()
            if use_mixup:
                inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, mixup_alpha, device)
                outputs = ensemble_model(inputs)
                loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
            else:
                outputs = ensemble_model(inputs)
                loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()

            # Gradient clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), max_norm=1.0)

            optimizer.step()

            # Update statistics
            running_loss += loss.item()

            # Accuracy is only meaningful for non-mixup batches, whose labels
            # are not blended.
            if not use_mixup:
                predicted = outputs.detach().argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Calculate epoch metrics (guard against an empty loader)
        epoch_train_loss = running_loss / max(len(train_loader), 1)

        # If we used mixup on all batches, run a separate evaluation pass
        if total == 0:
            ensemble_model.eval()
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in train_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = ensemble_model(inputs)
                    predicted = outputs.argmax(dim=1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

        epoch_train_acc = 100.0 * correct / total if total else 0.0
        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc)

        # Validation phase
        ensemble_model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_preds = []
        all_targets = []

        # Progress bar for validation
        val_iterator = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]")

        with torch.no_grad():
            for inputs, labels in val_iterator:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass
                outputs = ensemble_model(inputs)

                # Calculate loss
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Calculate accuracy
                predicted = outputs.argmax(dim=1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

                # Save predictions and targets for confusion matrix
                all_preds.extend(predicted.cpu().numpy())
                all_targets.extend(labels.cpu().numpy())

        # Calculate validation metrics
        epoch_val_loss = val_loss / max(len(val_loader), 1)
        epoch_val_acc = 100.0 * val_correct / val_total if val_total else 0.0
        val_losses.append(epoch_val_loss)
        val_accs.append(epoch_val_acc)

        # Update learning rate based on validation loss and report reductions
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(epoch_val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Learning rate reduced to {lr_after:.2e}")

        # Calculate epoch duration
        epoch_time = time.time() - start_time

        # Print epoch results
        print(f"Epoch {epoch+1}/{epochs} | Time: {epoch_time:.1f}s | "
              f"Train Loss: {epoch_train_loss:.4f} | Train Acc: {epoch_train_acc:.2f}% | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc:.2f}%")

        # Check if this is the best model so far
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            patience_counter = 0

            # Save the best model
            torch.save(ensemble_model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"✅ New best model saved with Val Acc: {epoch_val_acc:.2f}%")

            # Generate confusion matrix for best model. Reporting is best
            # effort: a plotting failure must not abort training.
            try:
                cm = confusion_matrix(all_targets, all_preds)
                plt.figure(figsize=(12, 10))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
                plt.title(f'Validation Confusion Matrix - Epoch {epoch+1}')
                plt.xlabel('Predicted')
                plt.ylabel('True')
                plt.savefig(os.path.join(save_dir, f'confusion_matrix_epoch_{epoch+1}.png'))
                plt.close()

                # Print detailed classification report
                print("\nClassification Report:")
                print(classification_report(all_targets, all_preds))
            except Exception as e:
                print(f"Could not generate confusion matrix: {e}")
        else:
            patience_counter += 1

        # Early stopping check
        if patience_counter >= early_stopping_patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

    # Final evaluation on test set
    print("\nEvaluating final model on test set...")
    if checkpoint_saved:
        ensemble_model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        # No epoch improved on the initial best accuracy (or epochs == 0), so
        # there is nothing to restore; evaluate the weights as they are.
        print("No checkpoint was saved during training; evaluating current model weights.")
    ensemble_model.eval()
    test_correct = 0
    test_total = 0
    all_test_preds = []
    all_test_targets = []
    all_test_scores = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = ensemble_model(inputs)
            predicted = outputs.argmax(dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
            all_test_preds.extend(predicted.cpu().numpy())
            all_test_targets.extend(labels.cpu().numpy())
            all_test_scores.extend(F.softmax(outputs, dim=1).cpu().numpy())

    test_acc = 100.0 * test_correct / test_total if test_total else 0.0
    print(f"Test Accuracy: {test_acc:.2f}%")

    # Calculate metrics per class
    from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

    # Convert to arrays
    y_true = np.array(all_test_targets)
    y_pred = np.array(all_test_preds)
    y_score = np.array(all_test_scores)

    # Calculate precision, recall, and F1-score per class
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=None)

    # Calculate support-weighted aggregate metrics
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted')

    # Attempt to calculate ROC AUC score (best effort)
    try:
        roc_auc = roc_auc_score(y_true, y_score, multi_class='ovr', average='weighted')
        print(f"ROC AUC Score (weighted): {roc_auc:.4f}")
    except Exception as e:
        print(f"Could not calculate ROC AUC: {e}")

    print(f"Weighted Precision: {weighted_precision:.4f}")
    print(f"Weighted Recall: {weighted_recall:.4f}")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")

    # Generate final confusion matrix and report (best effort)
    try:
        cm = confusion_matrix(all_test_targets, all_test_preds)
        plt.figure(figsize=(14, 12))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Final Test Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.savefig(os.path.join(save_dir, 'final_test_confusion_matrix.png'))
        plt.close()

        # Per-class metrics visualization
        plt.figure(figsize=(14, 8))
        indices = np.arange(len(precision))
        width = 0.25

        plt.bar(indices - width, precision, width, label='Precision')
        plt.bar(indices, recall, width, label='Recall')
        plt.bar(indices + width, f1, width, label='F1-Score')

        plt.xlabel('Class')
        plt.ylabel('Score')
        plt.title('Per-Class Classification Metrics')
        plt.xticks(indices)
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, 'per_class_metrics.png'))
        plt.close()

        print("\nFinal Classification Report:")
        print(classification_report(all_test_targets, all_test_preds))
    except Exception as e:
        print(f"Could not generate final confusion matrix: {e}")

    # Plot training history
    plt.figure(figsize=(15, 6))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Training Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Accuracy over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'training_history.png'))
    plt.close()

    return ensemble_model, test_acc


# Running Preprocessing and CNN+DBN

In [25]:
# Main function to run the whole pipeline
def run_improved_iot_anomaly_detection(train_data, test_data, save_dir='model_results',
                                       *, epochs=30, lr=0.001, focal_gamma=2.0,
                                       mixup_alpha=0.2, dbn_pretrain_epochs=3):
    """Run preprocessing, model construction and training end to end.

    Args:
        train_data: Raw training dataframe, passed to ``enhanced_preprocessing``.
        test_data: Raw testing dataframe, passed to ``enhanced_preprocessing``.
        save_dir: Directory handed to the trainer for checkpoints and plots.
        epochs: Maximum number of ensemble training epochs.
        lr: Initial learning rate for ensemble training.
        focal_gamma: Focusing parameter of the focal loss.
        mixup_alpha: Beta parameter for mixup augmentation.
        dbn_pretrain_epochs: Epochs used for DBN layer pretraining.

    Returns:
        Dict with keys ``'model'``, ``'test_accuracy'``, ``'input_dim'``,
        ``'num_classes'`` and ``'model_path'``.
    """
    print("\n🚀 Starting improved IoT network anomaly detection pipeline...")

    # Step 1: Preprocessing data
    print("\n📊 Running enhanced data preprocessing...")
    processed_data = enhanced_preprocessing(train_data, test_data)

    # Extract only the components the rest of the pipeline consumes
    train_loader = processed_data['train_loader']
    val_loader = processed_data['val_loader']
    test_loader = processed_data['test_loader']
    input_dim = processed_data['input_dim']
    num_classes = processed_data['num_classes']
    class_weights = processed_data['class_weights']

    # Step 2: Set up device for training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The trainer accepts class_weights=None, so only move real weights.
    if class_weights is not None:
        class_weights = class_weights.to(device)
    print(f"\n🔧 Using device: {device}")

    # Step 3: Create improved CNN model
    print("\n🧠 Creating improved CNN model...")
    cnn_model = RevisedCNN(
        input_dim=input_dim,
        output_dim=num_classes,
        dropout_rate=0.3
    ).to(device)
    print(f"CNN model created with {input_dim} input features and {num_classes} output classes")

    # Step 4: Create improved DBN model with pretraining
    print("\n🧠 Creating improved DBN model...")
    dbn_model = ImprovedDBN(
        input_dim=input_dim,
        hidden_dims=[512, 384, 256],
        output_dim=num_classes,
        dropout_rate=0.3
    ).to(device)

    # Pretrain DBN layers
    dbn_model.pretrain(train_loader, device, epochs=dbn_pretrain_epochs)
    print("DBN model created and pretrained")

    # Step 5: Create improved ensemble model
    print("\n🧠 Creating improved dynamic ensemble model...")
    ensemble_model = ImprovedDynamicEnsembleModel(
        cnn_model=cnn_model,
        dbn_model=dbn_model,
        output_dim=num_classes
    ).to(device)
    print("Ensemble model created")

    # Step 6: Train the ensemble model
    print("\n🏋️‍♀️ Starting advanced training process...")
    trained_model, test_accuracy = train_ensemble_model(
        ensemble_model=ensemble_model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        device=device,
        class_weights=class_weights,
        epochs=epochs,
        lr=lr,
        focal_gamma=focal_gamma,
        mixup_alpha=mixup_alpha,
        save_dir=save_dir
    )

    print(f"\n🏁 Final test accuracy: {test_accuracy:.2f}%")

    # Return the trained model and results
    results = {
        'model': trained_model,
        'test_accuracy': test_accuracy,
        'input_dim': input_dim,
        'num_classes': num_classes,
        'model_path': os.path.join(save_dir, 'best_model.pt')
    }

    return results

import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the UNSW-NB15 dataset
# Assumes the training and testing CSV files exist at these mounted Drive paths
train_data = pd.read_csv('/content/drive/MyDrive/ANOMALI DUN DUN/UNSW_NB15_training-set.csv')
test_data = pd.read_csv('/content/drive/MyDrive/ANOMALI DUN DUN/UNSW_NB15_testing-set.csv')

# 2. Run the preprocessing and model pipeline
results = run_improved_iot_anomaly_detection(train_data, test_data, save_dir='iot_anomaly_results')

# Report the headline metric and where the best checkpoint was written
print(f"Final model accuracy: {results['test_accuracy']:.2f}%")
print(f"Model saved to: {results['model_path']}")


🚀 Starting improved IoT network anomaly detection pipeline...

📊 Running enhanced data preprocessing...

🔍 Starting enhanced preprocessing pipeline...

📊 Dataset Overview:
Training data shape: (82332, 45)
Testing data shape: (175341, 45)
Missing values - Training: 0, Testing: 0

🔍 Handling missing target values...
After dropping rows with missing targets - Train: (82332, 45), Test: (175341, 45)

🔍 Handling remaining missing values...

🔍 Removing duplicate entries...
After removing duplicates - Train: (82332, 45), Test: (175341, 45)

🔍 Encoding target variable...
Class distribution after encoding:
  Class 0: 37000 samples (44.94%)
  Class 1: 18871 samples (22.92%)
  Class 2: 11132 samples (13.52%)
  Class 3: 6062 samples (7.36%)
  Class 4: 4089 samples (4.97%)
  Class 5: 3496 samples (4.25%)
  Class 6: 677 samples (0.82%)
  Class 7: 583 samples (0.71%)
  Class 8: 378 samples (0.46%)
  Class 9: 44 samples (0.05%)

🔍 Handling outliers with improved Winsorization...

🔍 Applying transforma

Epoch 1/30 [Train]:   0%|          | 0/701 [00:00<?, ?it/s]


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3