<a href="https://colab.research.google.com/github/funway/nid-imbalance-study/blob/main/classification/cnn_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CNN 卷积神经网络
🚀 NYIT 880 | 🧑🏻‍💻 funway

## Modules import & Globals setup

In [2]:
### Modules ###
from pathlib import Path
from datetime import datetime
from collections import Counter
from io import StringIO

import pandas as pd
import numpy as np
import tensorflow as tf

## mount google drive
from google.colab import drive
drive.mount('/content/drive')


### Globals ###
## Dataset name and Google Drive folder layout used throughout the notebook
dataset = 'cse-cic-ids2018'
project_folder = Path('/content/drive/MyDrive/NYIT/880')

# Preprocessed inputs
preprocessed_folder = project_folder / 'data' / 'preprocessed'
scaled_folder = preprocessed_folder / 'scaled'
splits_folder = preprocessed_folder / 'splits'

# Re-balanced training sets and the CGAN models stored next to them
balanced_folder = project_folder / 'data' / 'balanced'
cgan_folder = balanced_folder / 'models'

# Classifier outputs
model_folder = project_folder / 'data' / 'classification' / 'models'
report_folder = project_folder / 'data' / 'classification' / 'reports'

## Every value of the Label column, in a fixed order; the position is the class index
unique_labels = [
    'Benign',
    'Bot',
    'Brute Force -Web',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-SlowHTTPTest',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'Infilteration',
    'SQL Injection',
    'SSH-Bruteforce',
]
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
print(f"[{datetime.now().strftime('%x %X')}] 🏷️ Label mapping: {label_mapping}")

### Global random seeds ###
# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)
# Seed passed as random_state to the imbalanced-learn samplers in this notebook.
op_seed = 42

### Utilities ###
# 导入 utility.ipynb 模块
%run /content/drive/MyDrive/NYIT/880/code/utils/utility.ipynb

Mounted at /content/drive
[06/25/25 06:10:10] 🏷️ Label mapping: {'Benign': 0, 'Bot': 1, 'Brute Force -Web': 2, 'Brute Force -XSS': 3, 'DDOS attack-HOIC': 4, 'DDOS attack-LOIC-UDP': 5, 'DDoS attacks-LOIC-HTTP': 6, 'DoS attacks-GoldenEye': 7, 'DoS attacks-Hulk': 8, 'DoS attacks-SlowHTTPTest': 9, 'DoS attacks-Slowloris': 10, 'FTP-BruteForce': 11, 'Infilteration': 12, 'SQL Injection': 13, 'SSH-Bruteforce': 14}
导入 utility.ipynb 模块. version 1.0.1


## 可调参数

In [3]:
# Force the classifier to be retrained even when a saved final model already exists
retrain = False

# Scaling method. One of [standard, minmax, robust, l1pminmax]
scaling_method = 'l1pminmax'

# Target resampling scheme; 0 means the data has not been re-balanced
resample_scheme = 2

# Oversampling method: file name of the pre-trained CGAN generator plus optional filter settings
cgan_model = 'cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001])_generator.keras'
cgan_filter_strategy = 0
cgan_filter_keep_high = True
# NOTE(review): cgan_filter_mark carries the h/l flag but is not used by oversampling_method
# below, so runs that differ only in cgan_filter_keep_high share the same file names — confirm.
cgan_filter_mark = f"f{'h' if cgan_filter_keep_high else 'l'}{cgan_filter_strategy}" if cgan_filter_strategy else ''
# The [:-16] slice drops the 16-character '_generator.keras' suffix of cgan_model.
oversampling_method = cgan_model[:-16] + (f'f{cgan_filter_strategy}' if cgan_filter_strategy else '')

# Undersampling method ('NA' means no undersampling)
undersampling_method = 'rus'

# Classifier tag used in the output file names
classifier = 'CNN'

##########################
# Scheme 0 is the un-resampled baseline, so both sampling tags are reset to 'NA'
if resample_scheme == 0:
    oversampling_method = 'NA'
    undersampling_method = 'NA'
##########################

# Where the final model is saved once training has finished
model_file_final = model_folder / f'{dataset}_{scaling_method}_s{resample_scheme}_{oversampling_method}_{undersampling_method}_{classifier}-final.keras'

# Where the best model seen during training is saved
model_file_best  = model_folder / f'{dataset}_{scaling_method}_s{resample_scheme}_{oversampling_method}_{undersampling_method}_{classifier}-best.keras'

# Models to evaluate
models_to_evaluate = [model_file_best, model_file_final]

## Loading Data

In [4]:
# Choose the training files: the raw split for scheme 0, otherwise a pre-balanced set
# from balanced_folder. Only undersampling methods other than 'NA' and 'rus' add
# their own suffix to the balanced file name.
if resample_scheme == 0:
    file_X_train = splits_folder / f'train_X_{scaling_method}.npy'
    file_y_train = splits_folder / 'train_y.npy'
else:
    train_tag = f'{scaling_method}_s{resample_scheme}_{oversampling_method}'
    if undersampling_method not in ('NA', 'rus'):
        train_tag = f'{train_tag}_{undersampling_method}'
    file_X_train = balanced_folder / f'train_X_{train_tag}.npy'
    file_y_train = balanced_folder / f'train_y_{train_tag}.npy'

# Validation and test sets always come from the original splits
file_X_valid, file_y_valid = splits_folder / f'valid_X_{scaling_method}.npy', splits_folder / 'valid_y.npy'
file_X_test, file_y_test = splits_folder / f'test_X_{scaling_method}.npy', splits_folder / 'test_y.npy'

print(f"[{now()}] Loading datasets...")
if not file_X_train.exists():
    # No pre-balanced file on disk: fall back to the raw split and flag that
    # oversampling still has to be done in this session.
    print(f'[{now()}] ⚠️ File not found: {file_X_train}')
    print('Load original files')
    X_train = np.load(splits_folder / f'train_X_{scaling_method}.npy')
    y_train = np.load(splits_folder / 'train_y.npy')
    neet_oversample = True
else:
    X_train = np.load(file_X_train)
    y_train = np.load(file_y_train)
    neet_oversample = False
X_valid, y_valid = np.load(file_X_valid), np.load(file_y_valid)
X_test, y_test = np.load(file_X_test), np.load(file_y_test)

[06/24/25 23:10:14 PDT] Loading datasets...
[06/24/25 23:10:15 PDT] ⚠️ File not found: /content/drive/MyDrive/NYIT/880/data/balanced/train_X_l1pminmax_s2_cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001]).npy
Load original files


### 重采样函数定义

In [5]:
def generate_samples(generator: tf.keras.Model, target_class: int, num_samples: int):
    """
    Draw synthetic samples of a single class from a conditional generator.

    Args:
        generator (tensorflow.keras.Model): Generator taking ``[noise, labels]`` as input.
        target_class (int): Class index every generated sample is conditioned on.
        num_samples (int): How many samples to generate.

    Returns:
        The output of ``generator.predict`` for the ``num_samples`` noise/label pairs.
    """
    # The width of the first (noise) input gives the latent dimension.
    latent_dim = generator.input_shape[0][1]

    # Standard-normal noise, shape (num_samples, latent_dim)
    latent_batch = np.random.normal(0, 1, size=(num_samples, latent_dim))

    # Label column of shape (num_samples, 1), every entry equal to target_class
    class_column = np.full((num_samples, 1), fill_value=target_class, dtype=np.int32)

    return generator.predict([latent_batch, class_column], verbose=0)


def cgan_undersample(cgan_discriminator: tf.keras.Model, sampling_strategy: dict, X: np.ndarray, y: np.ndarray, keep_high_score=True, batch_size=8192):
    """
    Undersample a dataset using the scores of a CGAN discriminator.

    For every class listed in ``sampling_strategy`` that has more samples than its
    target, all samples of that class are scored by the discriminator and only
    ``target`` of them are kept. Classes at or below their target, and classes
    not listed at all, are kept unchanged.

    Args:
        cgan_discriminator (tensorflow.keras.Model): Discriminator taking ``[X, labels]``.
        sampling_strategy (dict): Maps class index -> number of samples to keep.
        X (numpy.ndarray): Feature matrix.
        y (numpy.ndarray): Integer class label per row of ``X``.
        keep_high_score (bool): Keep the highest-scoring samples when True,
            the lowest-scoring ones when False.
        batch_size (int): Batch size used while scoring, so a large class is not
            pushed through the discriminator in one single forward pass.

    Returns:
        tuple: ``(X_res, y_res)`` with the kept rows in their original order.
    """
    print(f'[{now()}] 📉 CGAN Undersampling ...')
    print(f'  original X.shape: {X.shape}')
    print(f'  original labels_counts: {get_label_counts(y)}')
    print(f'  undersample to: {sampling_strategy}')

    keep_idxs = []

    for cls, target_n in sampling_strategy.items():
        idxs = np.where(y == cls)[0]

        # Nothing to drop if the class is already at or below its target
        if len(idxs) <= target_n:
            keep_idxs.extend(idxs.tolist())
            print(f'[{now()}] Skipping class [{cls}]: {len(idxs)} ≤ {target_n}')
            continue

        # Score every sample of this class, in batches
        X_cls = X[idxs]
        y_cls = y[idxs].reshape(-1, 1)
        scores = cgan_discriminator.predict([X_cls, y_cls], batch_size=batch_size, verbose=0)
        scores = np.reshape(scores, -1)

        # argsort is ascending; reverse it to rank the highest scores first
        order = np.argsort(scores)
        if keep_high_score:
            order = order[::-1]
        top_idxs = idxs[order[:target_n]]

        keep_idxs.extend(top_idxs.tolist())
        print(f'[{now()}] Dropping {len(idxs) - target_n} samples for class [{cls}]: {len(idxs)} -> {target_n}')

    # Classes that are not part of sampling_strategy are kept in full
    all_classes = set(np.unique(y))
    leftover = all_classes - set(sampling_strategy.keys())
    for cls in leftover:
        keep_idxs.extend(np.where(y == cls)[0].tolist())

    # Build the result in original row order. The explicit integer dtype keeps an
    # empty selection usable as an index (np.sort([]) would yield a float array).
    keep_idxs = np.sort(np.asarray(keep_idxs, dtype=np.intp))
    X_res = X[keep_idxs]
    y_res = y[keep_idxs]

    print(f'[{now()}] 📉 After CGAN Undersampling:')
    print(f'  X_res.shape: {X_res.shape}')
    print(f'  Labels: {get_label_counts(y_res)}')

    return X_res, y_res

def cgan_oversample(cgan_generator: tf.keras.Model, sampling_strategy: dict, X: np.ndarray, y: np.ndarray):
    """
    Top up under-represented classes with samples drawn from a CGAN generator.

    Args:
        cgan_generator (tensorflow.keras.Model): Generator handed to ``generate_samples``.
        sampling_strategy (dict): Maps class index -> desired number of samples.
        X (numpy.ndarray): Feature matrix.
        y (numpy.ndarray): Class label per row of ``X``.

    Returns:
        tuple: ``(X_res, y_res)`` — the original data followed by the generated
        samples, appended in the iteration order of ``sampling_strategy``.
    """
    counts_before = get_label_counts(y)

    print(f'[{now()}] 📈 CGAN Oversampling ...')
    print(f'  original X.shaep: {X.shape}')
    print(f'  original labels_counts: {counts_before}')
    print(f'  oversample to: {sampling_strategy}')

    # Start from the original data and append one generated chunk per class
    X_parts, y_parts = [X], [y]

    for cls, desired_n in sampling_strategy.items():
        current_n = counts_before.get(cls, 0)
        shortfall = desired_n - current_n

        # Classes already at or above their target are left untouched
        if not (shortfall > 0):
            print(f'[{now()}] Skipping class [{cls}]: {current_n} ≥ {desired_n}')
            continue

        print(f'[{now()}] Generating {shortfall} samples for class [{cls}]: {current_n} -> {desired_n}')
        X_parts.append(generate_samples(cgan_generator, cls, shortfall))
        y_parts.append(np.full(shortfall, cls, dtype=np.int32))

    X_res, y_res = np.concatenate(X_parts), np.concatenate(y_parts)

    print(f'[{now()}] 📈 After CGAN Oversampling:')
    print(f'  X_res.shape: {X_res.shape}')
    print(f'  Labels: {get_label_counts(y_res)}')

    return X_res, y_res

### 过采样

In [6]:
# Only runs when no pre-balanced training file could be loaded.
if neet_oversample:
    print(f"[{now()}] 📈 Oversampling with {oversampling_method}")

    # ROS (random) oversampling
    from imblearn.over_sampling import RandomOverSampler

    # A method tag starting with 'ros' selects a ROS scheme via the digit that follows it
    if oversampling_method.startswith('ros'):
        ros_scheme = int(oversampling_method[3])
        oversample_to = ros_schemes[ros_scheme]
        print(f'[{now()}] Applying ROS oversampling to: {oversample_to}')

        oversampler = RandomOverSampler(sampling_strategy=oversample_to, random_state=op_seed)
        X_train, y_train = oversampler.fit_resample(X_train, y_train)

        print(f'[{now()}] After ROS oversampling:')
        print(f'  X.shape: {X_train.shape}, y.shape: {y_train.shape}')
        print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_train).items())} }\n')
    else:
        print(f'[{now()}] No need to apply ROS oversampling.')


    # Load the pre-trained CGAN; the discriminator file shares the generator's name
    generator_file = cgan_folder / scaling_method / cgan_model
    discriminator_file = cgan_folder / scaling_method / cgan_model.replace('generator', 'discriminator')

    print(f"[{now()}] 📡 Loading pre-trained generator from {generator_file}")
    generator = tf.keras.models.load_model(generator_file)

    print(f"[{now()}] 📡 Loading pre-trained discriminator from {discriminator_file}")
    discriminator = tf.keras.models.load_model(discriminator_file)


    # Optional CGAN-based filtering (undersampling) before oversampling.
    # cgan_filter_keep_high is forwarded so the configured high/low choice is
    # actually honoured instead of always keeping the high-scoring samples.
    if cgan_filter_strategy:
        print(f'[{now()}] 🟡 Apply CGAN Undersampling.')
        X_train, y_train = cgan_undersample(discriminator, cgan_filter_schemes[cgan_filter_strategy], X_train, y_train,
                                            keep_high_score=cgan_filter_keep_high)


    # CGAN oversampling up to the targets of the selected resampling scheme
    print(f'[{now()}] 🟢 Apply CGAN Oversampling.')
    resample_to = resample_schemes[resample_scheme]
    X_train, y_train = cgan_oversample(generator, resample_to, X_train, y_train)


[06/24/25 23:10:43 PDT] 📈 Oversampling with cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001])
[06/24/25 23:10:44 PDT] No need to apply ROS oversampling.
[06/24/25 23:10:44 PDT] 📡 Loading pre-trained generator from /content/drive/MyDrive/NYIT/880/data/balanced/models/l1pminmax/cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001])_generator.keras
[06/24/25 23:10:50 PDT] 📡 Loading pre-trained discriminator from /content/drive/MyDrive/NYIT/880/data/balanced/models/l1pminmax/cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001])_discriminator.keras
[06/24/25 23:10:51 PDT] 🟢 Apply CGAN Oversampling.
[06/24/25 23:10:52 PDT] 📈 CGAN Oversampling ...
  original X.shaep: (3797547, 70)
  original labels_counts: {0: 1600000, 1: 228953, 2: 489, 3: 184, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 70, 14: 150071}
  oversample to: {0: 80000

### 欠采样

In [7]:
## Undersample over-represented classes down to the targets of the selected scheme
from imblearn.under_sampling import RandomUnderSampler

label_counts = get_label_counts(y_train)
resample_to = resample_schemes[resample_scheme]
# Only classes that currently exceed their target need to be undersampled
undersample_to = {
    k: resample_to[k]
    for k in resample_to
    if resample_to[k] < label_counts.get(k, 0)
}

print(f"[{now()}] 当前样本数: {X_train.shape}, {label_counts}")
print(f"[{now()}] Undersample to: {undersample_to}")

# The discriminator is otherwise only loaded inside the `if neet_oversample:` cell,
# which is skipped when a pre-balanced training file was found. Load it on demand
# so the CGAN branches below do not fail with a NameError.
if undersampling_method in ('cgan', 'cganlow') and 'discriminator' not in globals():
    discriminator_file = cgan_folder / scaling_method / cgan_model.replace('generator', 'discriminator')
    print(f"[{now()}] 📡 Loading pre-trained discriminator from {discriminator_file}")
    discriminator = tf.keras.models.load_model(discriminator_file)

if undersampling_method == 'rus':
    print(f"[{now()}] 📉 使用 RUS 欠采样")

    rus = RandomUnderSampler(sampling_strategy=undersample_to, random_state=op_seed)
    X_train, y_train = rus.fit_resample(X_train, y_train)

elif undersampling_method == 'cgan':
    print(f"[{now()}] 📉 使用 CGAN 欠采样")
    X_train, y_train = cgan_undersample(discriminator, undersample_to, X_train, y_train)

elif undersampling_method == 'cganlow':
    print(f"[{now()}] 📉 使用 CGAN 欠采样 (保留评分低的)")
    X_train, y_train = cgan_undersample(discriminator, undersample_to, X_train, y_train, keep_high_score=False)

elif undersampling_method == 'iht':
    print(f"[{now()}] 📉 使用 IHT 欠采样")
    from imblearn.under_sampling import InstanceHardnessThreshold
    from xgboost import XGBClassifier

    # GPU-accelerated XGBClassifier serves as the estimator inside IHT
    iht = InstanceHardnessThreshold(
        estimator=XGBClassifier(tree_method='hist', device='cuda', n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss'),
        sampling_strategy={9: 100_000, 12: 110_000},
        random_state=op_seed)
    X_train, y_train = iht.fit_resample(X_train, y_train)
    counts_after_iht = get_label_counts(y_train)
    print(f'[{now()}] After IHT Undersampling: {X_train.shape}, {counts_after_iht}')

    # IHT cannot guarantee the exact target counts, so RUS finishes the job.
    # The RUS targets are re-derived from the post-IHT counts: a class that IHT
    # already reduced to or below its target must be left out, because
    # RandomUnderSampler rejects a target larger than the current class size.
    rus_targets = {k: n for k, n in undersample_to.items() if n < counts_after_iht.get(k, 0)}
    rus = RandomUnderSampler(sampling_strategy=rus_targets, random_state=op_seed)
    X_train, y_train = rus.fit_resample(X_train, y_train)

elif undersampling_method == 'NA':
    print(f"[{now()}] 不进行欠采样")

else:
    raise ValueError(f'[{now()}] ❌ 未定义的欠采样方法: {undersampling_method}')

print(f"[{now()}] 欠采样完成: {X_train.shape}, {get_label_counts(y_train)}")

[06/24/25 23:11:17 PDT] 当前样本数: (4103422, 70), {0: 1600000, 1: 228953, 2: 50000, 3: 50000, 4: 548809, 5: 50000, 6: 460953, 7: 100000, 8: 369530, 9: 111912, 10: 50000, 11: 154683, 12: 128511, 13: 50000, 14: 150071}
[06/24/25 23:11:17 PDT] Undersample to: {0: 800000, 1: 200000, 4: 200000, 6: 200000, 8: 200000, 9: 110000, 11: 150000, 12: 120000, 14: 150000}
[06/24/25 23:11:17 PDT] 📉 使用 RUS 欠采样
[06/24/25 23:11:18 PDT] 欠采样完成: (2480000, 70), {0: 800000, 1: 200000, 2: 50000, 3: 50000, 4: 200000, 5: 50000, 6: 200000, 7: 100000, 8: 200000, 9: 110000, 10: 50000, 11: 150000, 12: 120000, 13: 50000, 14: 150000}


In [8]:
# Text summary of the three data sets (file names, shapes, label counts); it is
# printed here and later prepended to every evaluation report.
report_dataset = ''.join(
    f'\n{title} set: {file_X.name}, {file_y.name} \n'
    f'    shape: {features.shape}, {labels.shape} \n'
    f'    labels: {get_label_counts(labels)} \n'
    for title, file_X, file_y, features, labels in (
        ('Train', file_X_train, file_y_train, X_train, y_train),
        ('Valid', file_X_valid, file_y_valid, X_valid, y_valid),
        ('Test', file_X_test, file_y_test, X_test, y_test),
    )
)

print(report_dataset)


Train set: train_X_l1pminmax_s2_cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001]).npy, train_y_l1pminmax_s2_cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001]).npy 
    shape: (2480000, 70), (2480000,) 
    labels: {0: 800000, 1: 200000, 2: 50000, 3: 50000, 4: 200000, 5: 50000, 6: 200000, 7: 100000, 8: 200000, 9: 110000, 10: 50000, 11: 150000, 12: 120000, 13: 50000, 14: 150000} 

Valid set: valid_X_l1pminmax.npy, valid_y.npy 
    shape: (474693, 70), (474693,) 
    labels: {0: 200000, 1: 28619, 2: 61, 3: 23, 4: 68601, 5: 173, 6: 57619, 7: 4151, 8: 46191, 9: 13989, 10: 1099, 11: 19335, 12: 16064, 13: 9, 14: 18759} 

Test set: test_X_l1pminmax.npy, test_y.npy 
    shape: (474694, 70), (474694,) 
    labels: {0: 200000, 1: 28619, 2: 61, 3: 23, 4: 68602, 5: 173, 6: 57619, 7: 4151, 8: 46191, 9: 13989, 10: 1099, 11: 19336, 12: 16064, 13: 8, 14: 18759} 



In [9]:
## Add a channel axis to the feature matrices X and one-hot encode the label vectors y
from tensorflow.keras.utils import to_categorical

if X_train.ndim == 2:  # only reshape while the data is still 2-D (guards against re-running the cell)
    # Append a trailing axis to every feature matrix (the channel axis expected by Conv1D)
    X_train = np.expand_dims(X_train, 2)
    X_valid = np.expand_dims(X_valid, 2)
    X_test = np.expand_dims(X_test, 2)

    # One-hot encode the labels. The width is fixed to the full label set; without
    # num_classes, to_categorical infers it from the largest label present, so a
    # split that lacks the highest class would get a narrower matrix.
    num_classes = len(unique_labels)
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_valid = to_categorical(y_valid, num_classes=num_classes)
    y_test = to_categorical(y_test, num_classes=num_classes)

print(f"[{now()}] 升维之后:", X_train.shape, X_valid.shape, X_test.shape)
print(f"[{now()}] After one-hot:", y_train.shape, y_valid.shape, y_test.shape)

[06/24/25 23:11:19 PDT] 升维之后: (2480000, 70, 1) (474693, 70, 1) (474694, 70, 1)
[06/24/25 23:11:19 PDT] After one-hot: (2480000, 15) (474693, 15) (474694, 15)


## Train Model

In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


def train_model(x_train, y_train, x_val, y_val, batch_size=256, epochs=100):
    """Build, compile and fit the 1D-CNN classifier.

    Args:
        x_train: Training features; ``x_train.shape[1:]`` is used as the model input shape.
        y_train: One-hot training labels; ``y_train.shape[1]`` is used as the number of classes.
        x_val: Validation features.
        y_val: One-hot validation labels.
        batch_size (int, optional): Mini-batch size passed to ``fit``. Defaults to 256.
        epochs (int, optional): Maximum number of epochs. Defaults to 100.

    Returns:
        tuple: ``(model, history)`` — the fitted model and the object returned by ``fit``.
    """
    n_classes = y_train.shape[1]

    # Input layer: one sample has shape (features, depth)
    net_input = tf.keras.Input(x_train.shape[1:])

    # Two convolutional stages with 32 and then 64 filters. Each stage is two
    # Conv1D layers (kernel size 3, ReLU, 'same' padding, he_uniform init)
    # followed by max-pooling, dropout and batch normalisation.
    features = net_input
    for n_filters in (32, 64):
        for _ in range(2):
            features = layers.Conv1D(n_filters, 3, activation='relu', padding='same',
                                     kernel_initializer='he_uniform')(features)
        features = layers.MaxPooling1D(pool_size=2, strides=2)(features)
        features = layers.Dropout(0.2)(features)
        features = layers.BatchNormalization()(features)

    # Classification head: flatten, one hidden dense layer, softmax over the classes
    features = layers.Flatten()(features)
    features = layers.Dense(32, activation='relu')(features)
    class_probs = layers.Dense(n_classes, activation='softmax')(features)

    model = models.Model(inputs=net_input, outputs=class_probs)

    print(f"[{now()}] model summary:")
    model.summary()

    # Nadam optimizer; only the learning rate (step size) is set explicitly
    optimizer = Nadam(learning_rate=0.001)

    # Stop when val_loss has not improved for 15 epochs and roll back to the best weights
    early_stopping = EarlyStopping(
        monitor='val_loss',
        min_delta=1e-6,
        patience=15,
        restore_best_weights=True,
    )

    # Multiply the learning rate by 0.1 when val_loss has stalled for 10 epochs
    lr_schedule = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=10,
    )

    # Save the whole model to model_file_best whenever val_accuracy reaches a new best
    checkpoint = ModelCheckpoint(
        filepath=model_file_best,
        monitor='val_accuracy',
        save_best_only=True,
    )

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Fit and report the wall-clock training time
    time_start = datetime.now()
    print(f"[{now()}] 🏋️‍♂️ Training model...")
    history = model.fit(
        x_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_val, y_val),
        verbose=1,
        callbacks=[early_stopping, lr_schedule, checkpoint],
    )
    time_end = datetime.now()
    print(f"[{now()}] 🕗 Training completed. Time elapsed: {time_end - time_start}")

    return model, history

In [None]:
model_file = model_file_final

# Train when a retrain is forced or no final model has been saved yet;
# otherwise reuse the model already on disk.
if retrain or not model_file.exists():
    print(f"[{now()}] 🤖 build and train model ({model_file.name})...")
    model, history = train_model(X_train, y_train, X_valid, y_valid)

    # Dump the per-epoch metrics collected by fit
    print(f"[{now()}] history:\n  {history.history}")

    # Persist the final model
    model.save(model_file)
    print(f"[{now()}] 💾 model saved to {model_file}")
else:
    print(f"[{now()}] ⏭️ 模型已存在，不重新训练 ({model_file})")

[06/24/25 23:11:21 PDT] 🤖 build and train model (cse-cic-ids2018_l1pminmax_s2_cgan-b(n128,f70,c15,e100,b512,gen[128,[128, 256, 512],0.0003],dis[64,[256, 128],0.0001])_rus_CNN-final.keras)...
[06/24/25 23:11:21 PDT] model summary:


[06/24/25 23:11:21 PDT] 🏋️‍♂️ Training model...
Epoch 1/100
[1m9688/9688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - accuracy: 0.9043 - loss: 0.2390 - val_accuracy: 0.9468 - val_loss: 0.1358 - learning_rate: 0.0010
Epoch 2/100
[1m9688/9688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9251 - loss: 0.1697 - val_accuracy: 0.9495 - val_loss: 0.1349 - learning_rate: 0.0010
Epoch 3/100
[1m9688/9688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9263 - loss: 0.1668 - val_accuracy: 0.9485 - val_loss: 0.1344 - learning_rate: 0.0010
Epoch 4/100
[1m9688/9688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9270 - loss: 0.1652 - val_accuracy: 0.9482 - val_loss: 0.1340 - learning_rate: 0.0010
Epoch 5/100
[1m9688/9688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9274 - loss: 0.1641 - val_accuracy: 0.9490 - val_loss: 0.1336 - learning_rate: 0.0010
Ep

## Evaluate Model

In [None]:
# 导入 evaluation.ipynb 模块
%run /content/drive/MyDrive/NYIT/880/code/utils/evaluation.ipynb

def get_model_summary_string(model):
    """Return the text of ``model.summary()`` as a string instead of printing it."""
    with StringIO() as buffer:
        def _collect(line):
            # summary() hands over one chunk per call; terminate each with a newline
            buffer.write(line + '\n')

        model.summary(print_fn=_collect)
        return buffer.getvalue()

# Evaluate every saved model on the test set and write one text report per model.
for model_file in models_to_evaluate:
    print(f"[{now()}] ⚙️ Loading model from {model_file}")
    model = models.load_model(model_file)

    # Model information
    report_model_info = f"[{now()}] 🤖 Model: {model_file.name}\n"
    report_model_info += f"[{now()}] ================= 🧠 Model Info =================\n"
    report_model_info += f"Model summary: {get_model_summary_string(model)}"
    print(report_model_info)

    # Built-in evaluate (disabled)
    # report_model_info += f"[{now()}] ================= 🔶 tf.keras.Model.evaluate() =================\n"
    # results = model.evaluate(X_test, y_test, verbose=1, return_dict=True)
    # report_model_info += f"{results}\n"

    # Predict class probabilities, then reduce them to class indices.
    # Each row of y_pred_probs holds one sample's probability per class,
    # e.g. [0.1, 0.7, 0.2] means 0.1 for class 0 and 0.7 for class 1.
    y_pred_probs = model.predict(X_test, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)  # predicted class = column with the largest probability
    y_true = np.argmax(y_test, axis=1)  # true class recovered from the one-hot labels

    # Build the evaluation report
    report_eval = generate_evaluation_report(y_true=y_true, y_predict=y_pred,
                                             label_mapping=label_mapping,
                                             figure_output=report_folder / 'png' / f'{model_file.stem}_confusion_matrix.png',
                                             figure_show=True)
    print(report_eval)

    # Save the report. The text contains emoji, so the encoding is pinned to UTF-8
    # instead of relying on the platform default, and the target folder is created
    # if it does not exist yet.
    report_file = report_folder / f'{model_file.stem}_report.txt'
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report_all = report_dataset + '\n' + report_model_info + '\n' + report_eval
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report_all)
        print(f"[{now()}] 💾 Evaluation report saved to {report_file}")
    print('============================================================')

In [None]:
# Final cell: announce the end of the run, then call runtime.unassign().
# NOTE(review): presumably this releases the Colab VM so it stops consuming resources — confirm.
from google.colab import runtime
print(f'[{now()}] ⛔ 运行结束. shutdown now...')
runtime.unassign()