# Installing Scikit-learn GPUs acceleration

In [1]:
%load_ext cuml.accel
import sklearn

[2025-05-27 03:07:09.931] [CUML] [info] cuML: Installed accelerator for sklearn.


2025-05-27 03:07:24.435627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748315244.633373      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748315244.693322      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2025-05-27 03:07:43.618] [CUML] [info] cuML: Installed accelerator for umap.
[2025-05-27 03:07:43.705] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-05-27 03:07:43.705] [CUML] [info] cuML: Successfully initialized accelerator.


### Use this command to run python file

In [2]:
import os
import warnings
import io
import contextlib
import optuna
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score, classification_report
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# MAB

In [3]:
def get_data(path):
    fnames = os.listdir(path)
    datalist = [pd.read_csv(os.path.join(path, fname)) for fname in tqdm(fnames)]
    dataframe = pd.concat(datalist)
    for d in datalist: del d
    dataframe.columns = dataframe.columns.str.strip()
    return dataframe

def clean_data(dataframe):
    dataframe.columns = dataframe.columns.str.strip()
    dataframe.drop_duplicates(inplace=True)
    
    numeric_cols = dataframe.select_dtypes(include=np.number).columns
    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataframe.drop(columns=["Fwd Header Length.1"], errors="ignore", inplace=True)

    # Fill missing values
    if 'Flow Bytes/s' in dataframe.columns:
        med_flow_bytes = dataframe['Flow Bytes/s'].median()
        dataframe['Flow Bytes/s'] = dataframe['Flow Bytes/s'].fillna(med_flow_bytes)
    
    if 'Flow Packets/s' in dataframe.columns:
        med_flow_packets = dataframe['Flow Packets/s'].median()
        dataframe['Flow Packets/s'] = dataframe['Flow Packets/s'].fillna(med_flow_packets)

    dataframe['Attack Type'] = dataframe['Label'].apply(lambda x: 'BENIGN' if str(x).lower() == 'benign' or str(x) == '0' else 'ATTACK')
    dataframe.drop('Label', axis=1, inplace=True)
    return dataframe

def preprocess_data(dataframe, is_train=True, scaler=None, ipca=None, kept_columns=None):
    # Optimize dtypes
    for col in dataframe.columns:
        col_type = dataframe[col].dtype
        if col_type != object:
            c_min = dataframe[col].min()
            c_max = dataframe[col].max()
            if str(col_type).find('float') >= 0 and c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                dataframe[col] = dataframe[col].astype(np.float32)
            elif str(col_type).find('int') >= 0 and c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                dataframe[col] = dataframe[col].astype(np.int32)

    if is_train:
        # Drop constant columns
        num_unique = dataframe.nunique()
        kept_columns = num_unique[num_unique > 1].index
        dataframe = dataframe[kept_columns]
    else:
        dataframe = dataframe[kept_columns]

    # Separate features and label
    attacks = dataframe['Attack Type']
    features = dataframe.drop('Attack Type', axis=1)

    if is_train:
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)

        n_components = len(features.columns) // 2
        ipca = IncrementalPCA(n_components=n_components, batch_size=500)
        for batch in np.array_split(scaled_features, len(features) // 500 + 1):
            ipca.partial_fit(batch)
    else:
        scaled_features = scaler.transform(features)

    transformed_features = ipca.transform(scaled_features)
    data = pd.DataFrame(transformed_features, columns=[f'PC{i+1}' for i in range(transformed_features.shape[1])])
    
    # Add labels after feature columns
    data['Attack Number'] = attacks.apply(lambda x: 1 if x == 'ATTACK' else 0).values

    # Optional: shuffle rows
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)

    if is_train:
        return data, scaler, ipca, kept_columns
    else:
        return data

In [4]:
class ModelFineTuner:
    def __init__(self, model_name, n_trials=50):
        self.model_name = model_name
        self.n_trials = n_trials
        self.study = optuna.create_study(direction='maximize')
        self.best_model = None
        self.best_params = None

    def _rf_objective(self, trial, X_train, y_train, X_test, y_test):
        n_estimators = trial.suggest_int("n_estimators", 100, 400)
        max_depth = trial.suggest_int("max_depth", 5, 50)
    
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42
        )
        
        model.fit(X_train, y_train)
        y_scores = model.predict_proba(X_test)[:, 1]
        return average_precision_score(y_test, y_scores)

    def _xgb_objective(self, trial, X_train, y_train, X_test, y_test):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 4, 40),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 5),
            "use_label_encoder": False,
            "eval_metric": "logloss"
        }
    
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)
        y_scores = model.predict_proba(X_test)[:, 1]
        return average_precision_score(y_test, y_scores)

    def _lgbm_objective(self, trial, X_train, y_train, X_test, y_test):
        params = {
            "max_depth": trial.suggest_int("max_depth", 4, 30),
            "num_leaves": trial.suggest_int("num_leaves", 20, 150),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "class_weight": 'balanced',
            "verbosity": -1
        }
        model = LGBMClassifier(**params)
        model.fit(X_train, y_train)
        y_scores = model.predict_proba(X_test)[:, 1]
        return average_precision_score(y_test, y_scores)

    def fine_tune(self, X_train, y_train, X_test, y_test):
        objective_mapping = {
            'RandomForest': self._rf_objective,
            'XGBoost': self._xgb_objective,
            'LightGBM': self._lgbm_objective
        }
    
        if self.model_name not in objective_mapping:
            raise ValueError(f"Unsupported model name: {self.model_name}")
    
        print(f"=== Tuning {self.model_name} ===")
        self.study.optimize(lambda trial: objective_mapping[self.model_name](trial, X_train, y_train, X_test, y_test), 
                            n_trials=self.n_trials)
    
        best_params = self.study.best_params
        self.best_params = best_params
    
        if self.model_name == 'RandomForest':
            self.best_model = RandomForestClassifier(**best_params, random_state=42)
        elif self.model_name == 'XGBoost':
            self.best_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
        elif self.model_name == 'LightGBM':
            self.best_model = LGBMClassifier(**best_params)
        else:
            raise ValueError(f"Unsupported model: {self.model_name}")
    
        self.best_model.fit(X_train, y_train)
        preds = self.best_model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        report = classification_report(y_test, preds, output_dict=True)
    
        return self.best_model, acc, report

In [5]:
class OptunaMABThompsonSampling:
    def __init__(self, n_clusters, n_trials=3):
        self.n_clusters = n_clusters
        self.n_trials = n_trials
        self.model_names = ['RandomForest', 'XGBoost', 'LightGBM']
        self.n_arms = len(self.model_names)
        self.arms = [None] * self.n_arms
        self.alpha = np.ones((n_clusters, self.n_arms))
        self.beta = np.ones((n_clusters, self.n_arms))
        self.cluster_models = [None] * n_clusters
        self.max_merges_per_cluster = 4
        self.cluster_merge_count = {i: 0 for i in range(n_clusters)}
        self.history = [[] for _ in range(self.n_clusters)]
        self.cluster_mapping = {}  # Track cluster merging

    def select_arm(self, cluster_id):
        samples = np.random.beta(self.alpha[cluster_id], self.beta[cluster_id])
        return np.argmax(samples)

    def update(self, cluster_id, arm_id, reward):
        self.alpha[cluster_id, arm_id] += reward
        self.beta[cluster_id, arm_id] += 1 - reward

    def _merge_clusters(self, source_cluster, target_cluster, X_train, y_train):
        print(f"[Cluster {source_cluster}] merging into [Cluster {target_cluster}]")
    
        source_mask = self.cluster_assignments == source_cluster
        self.cluster_assignments[source_mask] = target_cluster
    
        self.cluster_mapping[source_cluster] = target_cluster
        self.cluster_merge_count[target_cluster] += 1
    
        # Recompute center
        target_mask = self.cluster_assignments == target_cluster
        self.cluster_centers[target_cluster] = np.mean(X_train[target_mask], axis=0)

    def train_and_evaluate(self, X_train, y_train, X_val, y_val):
        # Initial clustering
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.cluster_assignments = self.kmeans.fit_predict(X_train)
        self.cluster_centers = self.kmeans.cluster_centers_.copy()
        
        # Initialize cluster mapping
        self.cluster_mapping = {i: i for i in range(self.n_clusters)}
        
        # Handle clusters with insufficient class diversity
        for cluster_id in range(self.n_clusters):
            cluster_mask = self.cluster_assignments == cluster_id
            cluster_y_train = y_train[cluster_mask]
        
            if len(cluster_y_train) == 0:
                continue
        
            if len(np.unique(cluster_y_train)) < 2:
                # Tìm cluster gần nhất về mặt khoảng cách + có đủ nhãn + chưa bị merge quá nhiều
                dists = np.linalg.norm(self.cluster_centers - self.cluster_centers[cluster_id], axis=1)
                nearest_ids = np.argsort(dists)
        
                merged = False
                for target_id in nearest_ids:
                    if target_id == cluster_id:
                        continue
        
                    if self.cluster_merge_count[target_id] >= self.max_merges_per_cluster:
                        continue
        
                    target_mask = self.cluster_assignments == target_id
                    target_y = y_train[target_mask]
        
                    if len(target_y) > 0 and len(np.unique(target_y)) >= 2:
                        self._merge_clusters(cluster_id, target_id, X_train, y_train)
                        merged = True
                        break
        
                if not merged:
                    print(f"[Warning] Could not merge cluster {cluster_id} - no suitable target found")

        # Get validation cluster assignments using updated mapping
        raw_val_clusters = self.kmeans.predict(X_val)
        val_cluster_assignments = np.array([self.cluster_mapping.get(c, c) for c in raw_val_clusters])
        
        total_preds = []
        total_labels = []
        
        # Train models for each effective cluster
        active_clusters = set(self.cluster_assignments)
        
        for cluster_id in active_clusters:
            cluster_mask = self.cluster_assignments == cluster_id
            cluster_X_train = X_train[cluster_mask]
            cluster_y_train = y_train[cluster_mask]
            
            val_mask = val_cluster_assignments == cluster_id
            cluster_X_val = X_val[val_mask]
            cluster_y_val = y_val[val_mask]
            
            if len(cluster_X_train) == 0:
                print(f"[Cluster {cluster_id}] has no training data after merging.")
                continue
                
            if len(cluster_X_val) == 0:
                print(f"[Cluster {cluster_id}] has no validation data.")
                # Still train the model for this cluster
                pass
            
            print(f"\n[Cluster {cluster_id}] Training data: {len(cluster_X_train)}, "
                  f"Validation data: {len(cluster_X_val)}, "
                  f"Classes: {np.unique(cluster_y_train)}")
            
            best_arm_id = -1
            best_reward = -1
            best_model = None
            
            print(f"[Cluster {cluster_id}] Training all arms...")
            for arm_id, model_name in enumerate(self.model_names):
                try:
                    tuner = ModelFineTuner(model_name, n_trials=self.n_trials)
                    if len(cluster_X_val) > 0:
                        model, acc, report = tuner.fine_tune(cluster_X_train, cluster_y_train, 
                                                           cluster_X_val, cluster_y_val)
                    else:
                        # Use cross-validation if no validation data
                        from sklearn.model_selection import cross_val_score
                        model, _, _ = tuner.fine_tune(cluster_X_train, cluster_y_train, 
                                                    cluster_X_train[:1], cluster_y_train[:1])  # dummy validation
                        acc = np.mean(cross_val_score(model, cluster_X_train, cluster_y_train, cv=3))
                    
                    print(f"[Cluster {cluster_id}] Arm {arm_id} ({model_name}) Accuracy: {acc:.4f}")
                    
                    if acc > best_reward:
                        best_reward = acc
                        best_arm_id = arm_id
                        best_model = model
                    
                    self.history[cluster_id].append((arm_id, acc))
                    
                except Exception as e:
                    print(f"[Cluster {cluster_id}] Arm {arm_id} ({model_name}) failed: {str(e)}")
                    continue
            
            if best_model is not None:
                # Update Thompson Sampling
                self.update(cluster_id, best_arm_id, best_reward)
                self.cluster_models[cluster_id] = best_model
                self.arms[best_arm_id] = best_model
                
                # Make predictions on validation data if available
                if len(cluster_X_val) > 0:
                    preds = best_model.predict(cluster_X_val)
                    total_preds.extend(preds)
                    total_labels.extend(cluster_y_val)
                
                print(f"[Cluster {cluster_id}] Selected Arm: {best_arm_id} "
                      f"({self.model_names[best_arm_id]}) with Accuracy: {best_reward:.4f}")
            else:
                print(f"[Cluster {cluster_id}] No successful model training")
        
        if len(total_preds) > 0:
            print("\n=== Overall Evaluation ===")
            print("Accuracy:", accuracy_score(total_labels, total_preds))
            print(classification_report(total_labels, total_preds))
        else:
            print("\n=== No validation predictions available ===")

    def lenapollon_predict(self, X_test, y_test):
        """Fixed prediction method that respects cluster merging"""
        # Get raw cluster assignments
        raw_test_clusters = self.kmeans.predict(X_test)
        
        # Apply cluster mapping to get effective cluster assignments
        test_cluster_assignments = np.array([self.cluster_mapping.get(c, c) for c in raw_test_clusters])
        
        y_pred = np.zeros(len(X_test))
        
        for cluster_id in range(self.n_clusters):
            cluster_mask = test_cluster_assignments == cluster_id
            cluster_X_test = X_test[cluster_mask]
            
            if len(cluster_X_test) == 0:
                continue

            print(f"Predicting cluster {cluster_id} with {len(cluster_X_test)} samples")
            
            model = self.cluster_models[cluster_id]
            if model is not None:
                try:
                    y_pred[cluster_mask] = model.predict(cluster_X_test)
                except Exception as e:
                    print(f"[Warning] Prediction failed for cluster {cluster_id}: {str(e)}")
            else:
                print(f"[Warning] No trained model for cluster {cluster_id}. "
                      f"Using fallback prediction.")
                # Fallback: use the most successful arm globally
                fallback_arm = np.argmax([len([h for hist in self.history for h in hist if h[0] == i]) 
                                        for i in range(self.n_arms)])
                if self.arms[fallback_arm] is not None:
                    y_pred[cluster_mask] = self.arms[fallback_arm].predict(cluster_X_test)
        
        print("\n=== Testing Performance ===")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        
        # Additional diagnostics
        print(f"\nCluster distribution in test set:")
        unique_clusters, counts = np.unique(test_cluster_assignments, return_counts=True)
        for cluster_id, count in zip(unique_clusters, counts):
            print(f"Cluster {cluster_id}: {count} samples")

In [6]:
train_df = get_data(path='/kaggle/input/network-intrusion-dataset')
cleaned_train_df = clean_data(train_df)
preprocessed_train_df, scaler, ipca, kept_columns = preprocess_data(cleaned_train_df)

100%|██████████| 8/8 [00:24<00:00,  3.07s/it]


In [7]:
cleaned_train_df['Attack Type'].value_counts()

Attack Type
BENIGN    2096484
ATTACK     425878
Name: count, dtype: int64

In [8]:
preprocessed_train_df['Attack Number'].value_counts()

Attack Number
0    2096484
1     425878
Name: count, dtype: int64

In [9]:
def standardize_data(test_df):
    test_df.drop(columns=['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'Timestamp'], errors="ignore", inplace=True)
    column_mapping = {
        'Dst Port': 'Destination Port',
        'Tot Fwd Pkts': 'Total Fwd Packets',
        'Tot Bwd Pkts': 'Total Backward Packets',
        'TotLen Fwd Pkts': 'Total Length of Fwd Packets',
        'TotLen Bwd Pkts': 'Total Length of Bwd Packets',
        'Fwd Pkt Len Max': 'Fwd Packet Length Max',
        'Fwd Pkt Len Min': 'Fwd Packet Length Min',
        'Fwd Pkt Len Mean': 'Fwd Packet Length Mean',
        'Fwd Pkt Len Std': 'Fwd Packet Length Std',
        'Bwd Pkt Len Max': 'Bwd Packet Length Max',
        'Bwd Pkt Len Min': 'Bwd Packet Length Min',
        'Bwd Pkt Len Mean': 'Bwd Packet Length Mean',
        'Bwd Pkt Len Std': 'Bwd Packet Length Std',
        'Flow Byts/s': 'Flow Bytes/s',
        'Flow Pkts/s': 'Flow Packets/s',
        'Fwd IAT Tot': 'Fwd IAT Total',
        'Bwd IAT Tot': 'Bwd IAT Total',
        'Fwd IAT Mean': 'Fwd IAT Mean',
        'Fwd IAT Std': 'Fwd IAT Std',
        'Fwd IAT Max': 'Fwd IAT Max',
        'Fwd IAT Min': 'Fwd IAT Min',
        'Bwd IAT Mean': 'Bwd IAT Mean',
        'Bwd IAT Std': 'Bwd IAT Std',
        'Bwd IAT Max': 'Bwd IAT Max',
        'Bwd IAT Min': 'Bwd IAT Min',
        'Fwd PSH Flags': 'Fwd PSH Flags',
        'Bwd PSH Flags': 'Bwd PSH Flags',
        'Fwd URG Flags': 'Fwd URG Flags',
        'Bwd URG Flags': 'Bwd URG Flags',
        'Fwd Header Len': 'Fwd Header Length',
        'Bwd Header Len': 'Bwd Header Length',
        'Fwd Pkts/s': 'Fwd Packets/s',
        'Bwd Pkts/s': 'Bwd Packets/s',
        'Pkt Len Min': 'Min Packet Length',
        'Pkt Len Max': 'Max Packet Length',
        'Pkt Len Mean': 'Packet Length Mean',
        'Pkt Len Std': 'Packet Length Std',
        'Pkt Len Var': 'Packet Length Variance',
        'FIN Flag Cnt': 'FIN Flag Count',
        'SYN Flag Cnt': 'SYN Flag Count',
        'RST Flag Cnt': 'RST Flag Count',
        'PSH Flag Cnt': 'PSH Flag Count',
        'ACK Flag Cnt': 'ACK Flag Count',
        'URG Flag Cnt': 'URG Flag Count',
        'CWE Flag Count': 'CWE Flag Count',
        'ECE Flag Cnt': 'ECE Flag Count',
        'Pkt Size Avg': 'Average Packet Size',
        'Fwd Seg Size Avg': 'Avg Fwd Segment Size',
        'Bwd Seg Size Avg': 'Avg Bwd Segment Size',
        'Fwd Byts/b Avg': 'Fwd Avg Bytes/Bulk',
        'Fwd Pkts/b Avg': 'Fwd Avg Packets/Bulk',
        'Fwd Blk Rate Avg': 'Fwd Avg Bulk Rate',
        'Bwd Byts/b Avg': 'Bwd Avg Bytes/Bulk',
        'Bwd Pkts/b Avg': 'Bwd Avg Packets/Bulk',
        'Bwd Blk Rate Avg': 'Bwd Avg Bulk Rate',
        'Subflow Fwd Pkts': 'Subflow Fwd Packets',
        'Subflow Fwd Byts': 'Subflow Fwd Bytes',
        'Subflow Bwd Pkts': 'Subflow Bwd Packets',
        'Subflow Bwd Byts': 'Subflow Bwd Bytes',
        'Init Fwd Win Byts': 'Init_Win_bytes_forward',
        'Init Bwd Win Byts': 'Init_Win_bytes_backward',
        'Fwd Act Data Pkts': 'act_data_pkt_fwd',
        'Fwd Seg Size Min': 'min_seg_size_forward',
        'Fwd Header Len': 'Fwd Header Length',
    }
    test_df = test_df.rename(columns=column_mapping)
    test_cleaned_df = clean_data(test_df)
    return test_cleaned_df

In [10]:
def balance_labels(X, y, random_state=42):
    """
    Trả về X và y đã được cân bằng giữa các nhãn bằng cách undersample.
    """
    # Tách nhãn
    mask_0 = y == 0
    mask_1 = y == 1

    count_0 = mask_0.sum()
    count_1 = mask_1.sum()
    min_count = min(count_0, count_1)

    # Undersample cho cả hai lớp
    X_bal = pd.concat([
        X[mask_0].sample(n=min_count, random_state=random_state),
        X[mask_1].sample(n=min_count, random_state=random_state)
    ])
    y_bal = pd.concat([
        y[mask_0].sample(n=min_count, random_state=random_state),
        y[mask_1].sample(n=min_count, random_state=random_state)
    ])

    # Shuffle
    X_bal = X_bal.sample(frac=1, random_state=random_state).reset_index(drop=True)
    y_bal = y_bal.reset_index(drop=True)

    return X_bal, y_bal

# GAN of duckduck (using PCAP file)

In [11]:
benign_df = pd.read_csv("/kaggle/input/duckduck-gan/benign_adv.csv")
benign_df['Label'] = 'Benign'
attack_df = pd.read_csv("/kaggle/input/duckduck-gan/apacheddos_adv.csv")
ddos_ip = ['172.28.30.88', '172.28.31.235', '172.28.30.90']
attack_df['Label'] = attack_df.apply(lambda x: 'Attack' if x['Dst IP'] == '172.28.31.229' and x['Src IP'] in ddos_ip else 'Benign', axis=1)
test_df = pd.concat([benign_df, attack_df], ignore_index=True)
standardized_test_df = standardize_data(test_df)
preprocessed_test_df = preprocess_data(standardized_test_df, False, scaler, ipca, kept_columns)

# Demo Apache DDoS

In [12]:
# benign_df = pd.read_csv("/kaggle/input/ml4sec-project-demo/benign_traffic.csv")
# benign_df['Label'] = 'Benign'
# attack_df = pd.read_csv("/kaggle/input/ml4sec-project-demo/apache_ddos.csv")
# ddos_ip = ['172.28.30.88', '172.28.31.235', '172.28.30.90']
# attack_df['Label'] = attack_df.apply(lambda x: 'Attack' if x['Dst IP'] == '172.28.31.229' and x['Src IP'] in ddos_ip else 'Benign', axis=1)
# test_df = pd.concat([benign_df, attack_df], ignore_index=True)
# standardized_test_df = standardize_data(test_df)
# preprocessed_test_df = preprocess_data(standardized_test_df, False, scaler, ipca, kept_columns)

# GAN of baobao (using CSV)

In [13]:
# test_df = pd.read_csv("/kaggle/input/baobao-ganadv/adv_cicids2017.csv")
# test_df = test_df.drop(test_df.columns[0], axis=1)
# cleaned_test_df = clean_data(test_df)
# preprocessed_test_df = preprocess_data(cleaned_test_df, False, scaler, ipca, kept_columns)

In [14]:
X = preprocessed_train_df.copy().drop('Attack Number', axis=1)
y = preprocessed_train_df['Attack Number']
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
mab = OptunaMABThompsonSampling(n_clusters=5)
mab.train_and_evaluate(X_train.values, y_train.values, X_val.values, y_val.values)
X_test = preprocessed_test_df.copy().drop('Attack Number', axis=1)
y_test = preprocessed_test_df['Attack Number']
mab.lenapollon_predict(X_test.values, y_test.values)

[Cluster 1] merging into [Cluster 0]
[Cluster 2] merging into [Cluster 0]
[Cluster 3] merging into [Cluster 0]
[Cluster 4] merging into [Cluster 0]


[I 2025-05-27 03:09:13,666] A new study created in memory with name: no-name-aa67d8a4-fdf4-48ae-8aa1-8eb83782f48d



[Cluster 0] Training data: 1765653, Validation data: 756709, Classes: [0 1]
[Cluster 0] Training all arms...
=== Tuning RandomForest ===


[I 2025-05-27 03:10:12,374] Trial 0 finished with value: 0.9996811272641396 and parameters: {'n_estimators': 194, 'max_depth': 24}. Best is trial 0 with value: 0.9996811272641396.
[I 2025-05-27 03:10:38,162] Trial 1 finished with value: 0.9937404299367041 and parameters: {'n_estimators': 150, 'max_depth': 8}. Best is trial 0 with value: 0.9996811272641396.
[I 2025-05-27 03:12:19,833] Trial 2 finished with value: 0.999581152129107 and parameters: {'n_estimators': 339, 'max_depth': 41}. Best is trial 0 with value: 0.9996811272641396.
[I 2025-05-27 03:13:19,915] A new study created in memory with name: no-name-5c512dd7-cd01-4828-b6ad-50fb0023c698


[Cluster 0] Arm 0 (RandomForest) Accuracy: 0.9980
=== Tuning XGBoost ===


[I 2025-05-27 03:14:39,097] Trial 0 finished with value: 0.9997679905791479 and parameters: {'n_estimators': 270, 'max_depth': 37, 'learning_rate': 0.07846656934268532, 'scale_pos_weight': 2.252702559609424}. Best is trial 0 with value: 0.9997679905791479.
[I 2025-05-27 03:15:13,364] Trial 1 finished with value: 0.9997646435062961 and parameters: {'n_estimators': 99, 'max_depth': 36, 'learning_rate': 0.19884439201216852, 'scale_pos_weight': 4.277785800287642}. Best is trial 0 with value: 0.9997679905791479.
[I 2025-05-27 03:15:27,045] Trial 2 finished with value: 0.9990897688101951 and parameters: {'n_estimators': 70, 'max_depth': 5, 'learning_rate': 0.1830138391213584, 'scale_pos_weight': 4.435285527148478}. Best is trial 0 with value: 0.9997679905791479.
[I 2025-05-27 03:16:48,198] A new study created in memory with name: no-name-f5ee7fa1-6704-42d8-a942-13bbb20809c2


[Cluster 0] Arm 1 (XGBoost) Accuracy: 0.9982
=== Tuning LightGBM ===


[I 2025-05-27 03:18:14,958] Trial 0 finished with value: 0.9991138339355614 and parameters: {'max_depth': 11, 'num_leaves': 29, 'learning_rate': 0.019090414940533523, 'feature_fraction': 0.7055586276558419, 'bagging_fraction': 0.8067345834114653, 'bagging_freq': 3, 'min_child_samples': 40, 'n_estimators': 461}. Best is trial 0 with value: 0.9991138339355614.
[I 2025-05-27 03:20:09,215] Trial 1 finished with value: 0.9996796322149771 and parameters: {'max_depth': 12, 'num_leaves': 85, 'learning_rate': 0.022282262840819397, 'feature_fraction': 0.7986692042750179, 'bagging_fraction': 0.6022735116528658, 'bagging_freq': 2, 'min_child_samples': 29, 'n_estimators': 491}. Best is trial 1 with value: 0.9996796322149771.
[I 2025-05-27 03:20:39,670] Trial 2 finished with value: 0.9993171650621409 and parameters: {'max_depth': 30, 'num_leaves': 93, 'learning_rate': 0.03519357215580011, 'feature_fraction': 0.7485317706788674, 'bagging_fraction': 0.9267613025444202, 'bagging_freq': 2, 'min_child_sa

[Cluster 0] Arm 2 (LightGBM) Accuracy: 0.9980
[Cluster 0] Selected Arm: 1 (XGBoost) with Accuracy: 0.9982

=== Overall Evaluation ===
Accuracy: 0.9981697059239417
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    628946
           1       0.99      1.00      0.99    127763

    accuracy                           1.00    756709
   macro avg       1.00      1.00      1.00    756709
weighted avg       1.00      1.00      1.00    756709

Predicting cluster 0 with 24945 samples

=== Testing Performance ===
Accuracy: 0.24401683704149127
              precision    recall  f1-score   support

           0       0.22      1.00      0.37      5458
           1       0.96      0.03      0.07     19487

    accuracy                           0.24     24945
   macro avg       0.59      0.51      0.22     24945
weighted avg       0.80      0.24      0.13     24945


Cluster distribution in test set:
Cluster 0: 24945 samples
