<a href="https://colab.research.google.com/github/funway/Countdown/blob/master/imbalance%20processing/cGAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用 cGAN 对训练集进行过采样




## Google Env

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset folders configured in
# this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Modules import & Globals setup

In [None]:
### Modules ###
from pathlib import Path
from datetime import datetime
from collections import Counter
import numpy as np
import os


### Globals ###

## Numeric encoding of the Label feature.
## The position of a name in this tuple is its class id.
_label_names = (
    "Benign",
    "Bot",
    "Brute Force -Web",
    "Brute Force -XSS",
    "DDOS attack-HOIC",
    "DDOS attack-LOIC-UDP",
    "DDoS attacks-LOIC-HTTP",
    "DoS attacks-GoldenEye",
    "DoS attacks-Hulk",
    "DoS attacks-SlowHTTPTest",
    "DoS attacks-Slowloris",
    "FTP-BruteForce",
    "Infilteration",
    "SQL Injection",
    "SSH-Bruteforce",
)
label_mapping = {name: class_id for class_id, name in enumerate(_label_names)}

## Candidate per-label target counts (four schemes are defined).
# Targets for the non-benign labels shared by schemes 1 and 2.
# The trailing number on each line is the original count in the training set.
_attack_targets = {
    1: 200000,   # down from 228953
    2: 20000,    # up from 489
    3: 20000,    # up from 184
    4: 200000,   # down from 548809
    5: 20000,    # up from 1384
    6: 200000,   # down from 460953
    7: 100000,   # up from 33206
    8: 200000,   # down from 369530
    9: 111912,   # unchanged (111912)
    10: 50000,   # up from 8792
    11: 154683,  # unchanged (154683)
    12: 128511,  # unchanged (128511)
    13: 20000,   # up from 70
    14: 150071,  # unchanged (150071)
}

resample_schemes = {
    # Scheme 1: label 0 vs. sum of non-zero labels is roughly 160:157;
    # label 0 keeps its original 1600000 samples.
    1: {0: 1600000, **_attack_targets},
    # Scheme 2: label 0 vs. the largest non-zero label is roughly 3:2;
    # label 0 is reduced from 1600000.
    2: {0: 300000, **_attack_targets},
    # Scheme 3: label 0 vs. sum of non-zero labels is 160:160;
    # every non-zero label gets 114300 samples.
    3: {0: 1600000, **dict.fromkeys(range(1, 15), 114300)},
    # Scheme 4: every label gets 200000 samples.
    4: dict.fromkeys(range(15), 200000),
}

## Data directories
datasets_folder = Path('/content/drive/MyDrive/NYIT/870/datasets')
dataset = 'CSE-CIC-IDS2018'
preprocessed_folder = datasets_folder.joinpath('preprocessed', dataset)
balanced_folder = datasets_folder.joinpath('balanced', dataset)

In [None]:
# Scaling variant of the preprocessed training set; it is interpolated into
# the input file names (train_X_<scaling_method>.npy).
scaling_method = 'standard'
# scaling_method = 'robust'

# Key into resample_schemes; resample_to holds the per-label target counts.
resample_scheme = 2
resample_to = resample_schemes[resample_scheme]

# Tag of the oversampling pipeline. A 'ROS' prefix enables the random
# oversampling pre-step; the tag is also embedded in output file names.
# oversampling_method = 'cGAN'
oversampling_method = 'ROS1+cGAN'
# undersampling_method = 'NM'
# undersampling_method = 'NM'

In [None]:
# Locate the scaled training features and labels.
integrated_dir = preprocessed_folder / 'integrated'
X_file = integrated_dir / f'train_X_{scaling_method}.npy'
y_file = integrated_dir / f'train_label_{scaling_method}.npy'

# Load both arrays into memory.
X, y = np.load(X_file), np.load(y_file)

# Report array shapes and the per-label sample counts.
label_hist = {int(k): v for k, v in sorted(Counter(y).items())}
print(f'[{datetime.now().strftime("%x %X")}] {X_file.name} shape: {X.shape}, {y_file.name} shape: {y.shape}')
print(f'Labels: {label_hist}\n')

[04/26/25 22:45:51] train_X_standard.npy shape: (3797547, 70), train_label_standard.npy shape: (3797547,)
Labels: {0: 1600000, 1: 228953, 2: 489, 3: 184, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 70, 14: 150071}



## 利用 ROS 提前补充极少数类样本
- 先用 ROS 随机复制的方式，将极少数类样本扩展到可接受的程度后再进行 oversampling


In [None]:
from imblearn.over_sampling import RandomOverSampler

# ROS target counts for the rarest labels, keyed by the method-name prefix
# that selects them.
ros_presets = {
    'ROS+': {2: 1000, 3: 500, 13: 500},
    'ROS1+': {2: 1000, 3: 1000, 13: 1000},
}

oversample_to = {}
if not oversampling_method.startswith('ROS'):
    print(f'[{datetime.now().strftime("%x %X")}] No need to ROS oversampling.')
else:
    # Pick the preset whose prefix matches; an unknown 'ROS...' tag keeps
    # the empty strategy.
    for prefix, preset in ros_presets.items():
        if oversampling_method.startswith(prefix):
            oversample_to = dict(preset)
            break

    # Randomly duplicate samples of the selected labels up to their targets.
    oversampler = RandomOverSampler(sampling_strategy=oversample_to, random_state=42)
    X, y = oversampler.fit_resample(X, y)

    ros_hist = {int(k): v for k, v in sorted(Counter(y).items())}
    print(f'[{datetime.now().strftime("%x %X")}] After ROS oversampling:')
    print(f'  X.shape: {X.shape}, y.shape: {y.shape}')
    print(f'  Labels: {ros_hist}\n')

[04/26/25 22:45:54] After ROS oversampling:
  X.shape: (3799804, 70), y.shape: (3799804,)
  Labels: {0: 1600000, 1: 228953, 2: 1000, 3: 1000, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 1000, 14: 150071}



## cGAN (conditional Generative Adversarial Network, 条件生成式对抗网络)





### 定义 cGAN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model

# 1️⃣ cGAN generator
def build_generator(noise_dim, feature_dim, num_classes):
    """
    Build the generator of a conditional GAN.

    The class label is embedded into a ``noise_dim``-sized vector and
    concatenated with the noise vector, giving a ``2 * noise_dim`` input to
    a stack of Dense(relu) + BatchNormalization layers (256, 512, 1024
    units). A final linear Dense layer emits ``feature_dim`` values.

    Args:
        noise_dim (int): Length of the noise vector.
        feature_dim (int): Number of features in a generated sample.
        num_classes (int): Size of the label vocabulary for the embedding.

    Returns:
        Model: Keras model named "Generator" mapping ``[noise, label]`` to a
        feature vector.
    """
    # Two inputs: the noise vector and a single integer class label.
    z_in = layers.Input(shape=(noise_dim,))
    class_in = layers.Input(shape=(1,), dtype='int32')

    # Label -> embedding of length noise_dim, flattened from
    # (1, noise_dim) to (noise_dim,).
    class_vec = layers.Embedding(num_classes, noise_dim)(class_in)
    class_vec = layers.Flatten()(class_vec)

    # Condition the noise on the label by concatenation.
    hidden = layers.Concatenate()([z_in, class_vec])

    # Three widening fully connected stages, each followed by batch norm.
    for width in (256, 512, 1024):
        hidden = layers.Dense(width, activation='relu')(hidden)
        hidden = layers.BatchNormalization()(hidden)

    # Linear output layer: one value per feature.
    features = layers.Dense(feature_dim, activation='linear')(hidden)

    return Model(inputs=[z_in, class_in], outputs=features, name="Generator")

# 2️⃣ cGAN discriminator
def build_discriminator(feature_dim, num_classes):
    """
    Build the discriminator of a conditional GAN.

    The class label is embedded into a ``feature_dim``-sized vector and
    concatenated with the sample, giving a ``2 * feature_dim`` input to two
    Dense + LeakyReLU(0.2) + Dropout(0.4) stages (512, 256 units). A single
    sigmoid unit emits the real/fake score.

    Args:
        feature_dim (int): Number of features in an input sample.
        num_classes (int): Size of the label vocabulary for the embedding.

    Returns:
        Model: Keras model named "Discriminator" mapping ``[sample, label]``
        to a score in [0, 1]; the training loop uses 1 for real and 0 for
        generated samples.
    """
    # Two inputs: the feature vector and a single integer class label.
    sample_in = layers.Input(shape=(feature_dim,))
    class_in = layers.Input(shape=(1,), dtype='int32')

    # Label -> embedding of length feature_dim, flattened from
    # (1, feature_dim) to (feature_dim,).
    class_vec = layers.Embedding(num_classes, feature_dim)(class_in)
    class_vec = layers.Flatten()(class_vec)

    # Condition the sample on the label by concatenation.
    hidden = layers.Concatenate()([sample_in, class_vec])

    # Two narrowing fully connected stages with leaky activation and dropout.
    for width in (512, 256):
        hidden = layers.Dense(width)(hidden)
        hidden = layers.LeakyReLU(0.2)(hidden)
        hidden = layers.Dropout(0.4)(hidden)

    # Sigmoid output: probability-like real/fake score.
    score = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[sample_in, class_in], outputs=score, name="Discriminator")

# 3️⃣ Combined cGAN
def build_gan(generator, discriminator):
    """
    Chain the generator and discriminator into one conditional GAN model.

    Args:
        generator (Model): Maps ``[noise, label]`` to a generated sample.
        discriminator (Model): Maps ``[sample, label]`` to a validity score.
            Its ``trainable`` flag is set to False by this function.

    Returns:
        Model: Keras model named "cGAN" mapping ``[noise, label]`` to the
        discriminator's score for the generated sample.
    """
    # Freeze the discriminator: the combined model is meant to update only
    # the generator's weights.
    discriminator.trainable = False

    # The noise length is read from the generator's first input shape.
    z_in = layers.Input(shape=(generator.input_shape[0][1],))
    class_in = layers.Input(shape=(1,), dtype='int32')

    # Generate a sample, then score it under the same label.
    synthetic = generator([z_in, class_in])
    verdict = discriminator([synthetic, class_in])

    return Model(inputs=[z_in, class_in], outputs=verdict, name="cGAN")

# 4️⃣ Training loop
def train_cgan(X_train, y_train, generator, discriminator, gan, noise_dim, num_classes, epochs=2000, batch_size=512, log_interval=500):
    """
    Train the conditional GAN by alternating discriminator and generator steps.

    Note that one "epoch" here is a single mini-batch step, not a full pass
    over ``X_train``. The loop runs ``epochs + 1`` steps (0..epochs
    inclusive) so that the last step can be logged.

    Args:
        X_train (numpy.ndarray): Training features, one row per sample.
        y_train (numpy.ndarray): Training labels aligned with ``X_train``.
        generator (tensorflow.keras.Model): The generator model.
        discriminator (tensorflow.keras.Model): The compiled discriminator.
        gan (tensorflow.keras.Model): The compiled combined cGAN model.
        noise_dim (int): Length of the noise vector.
        num_classes (int): Labels for generated samples are drawn from
            ``[0, num_classes)``.
        epochs (int, optional): Number of training steps. Defaults to 2000.
        batch_size (int, optional): Samples per step. Defaults to 512.
        log_interval (int, optional): Print losses every this many steps;
            a falsy value disables progress logging. Defaults to 500.

    Raises:
        ValueError: If the training set is empty, features and labels differ
            in length, or ``batch_size`` is not positive.
    """
    n_samples = X_train.shape[0]
    if n_samples == 0:
        raise ValueError("X_train is empty; cannot train the cGAN.")
    if len(y_train) != n_samples:
        raise ValueError(f"X_train has {n_samples} rows but y_train has {len(y_train)} labels.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be positive, got {batch_size}.")

    start_time = datetime.now()
    print(f"[{datetime.now().strftime('%x %X')}] Training cGAN for {epochs} epochs with batch size {batch_size}...")

    valid = np.ones((batch_size, 1))  # discriminator target for real samples
    fake = np.zeros((batch_size, 1))  # discriminator target for generated samples

    for epoch in range(epochs + 1):
        # Draw a random batch of real samples (with replacement).
        idx = np.random.randint(0, n_samples, batch_size)
        real_samples, real_labels = X_train[idx], y_train[idx]
        # The label input expects shape (batch_size, 1), not (batch_size,).
        real_labels = real_labels.reshape(-1, 1)

        # Generate a batch of fake samples for uniformly drawn labels.
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        gen_labels = np.random.randint(0, num_classes, (batch_size, 1))
        gen_samples = generator.predict([noise, gen_labels], verbose=0)

        ## Discriminator step: one update on real data, one on generated data.
        d_loss_real = discriminator.train_on_batch([real_samples, real_labels], valid)
        d_loss_fake = discriminator.train_on_batch([gen_samples, gen_labels], fake)

        ## Generator step: fresh noise and labels, trained through the
        ## combined model to make the discriminator output "real".
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        sampled_labels = np.random.randint(0, num_classes, (batch_size, 1))
        g_loss = gan.train_on_batch([noise, sampled_labels], valid)

        if log_interval and epoch % log_interval == 0:
            # Report the mean of both discriminator halves; the real-batch
            # loss alone hides how well generated samples are rejected.
            d_loss = 0.5 * (d_loss_real + d_loss_fake)
            print(
                f"[{datetime.now().strftime('%x %X')}] Epoch {epoch}: "
                f"Discriminator loss={d_loss:.4f} (real={d_loss_real:.4f}, fake={d_loss_fake:.4f}), "
                f"Generator loss={g_loss:.4f}"
            )

    end_time = datetime.now()
    print(f"[{end_time.strftime('%x %X')}] Training cGAN complete. Time elapsed: {end_time - start_time}")

# 5️⃣ Sample generation
def generate_samples(generator, target_class, num_samples, batch_size=None):
    """
    Generate samples of one class with a trained generator.

    Args:
        generator (tensorflow.keras.Model): The generator model; its first
            input shape supplies the noise length.
        target_class (int): Class label to condition every sample on.
        num_samples (int): Number of samples to generate.
        batch_size (int, optional): Forwarded to ``generator.predict`` when
            given; ``None`` keeps the model's default prediction batch size.

    Returns:
        numpy.ndarray: Array with ``num_samples`` rows of generated
        features. For ``num_samples == 0`` an empty ``(0, feature_dim)``
        array is returned without calling the model.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}.")

    if num_samples == 0:
        # Nothing to generate: skip predict(), which is not meant to be
        # called on an empty batch. float32 is assumed as the model output
        # dtype so stacking this with real data cannot widen the result.
        return np.empty((0, generator.output_shape[1]), dtype=np.float32)

    noise_dim = generator.input_shape[0][1]

    # Noise of shape (num_samples, noise_dim) and a constant label column.
    noise = np.random.normal(0, 1, (num_samples, noise_dim))
    labels = np.full((num_samples, 1), target_class)

    predict_kwargs = {'verbose': 0}
    if batch_size is not None:
        predict_kwargs['batch_size'] = batch_size
    return generator.predict([noise, labels], **predict_kwargs)

### 初始化 并 训练 cGAN

In [None]:
## 1️⃣ Configuration
noise_dim = 100
feature_dim = X.shape[1]
# The label embeddings need input_dim = largest label index + 1.
# len(np.unique(y)) would be too small if any class id were missing from y.
# Labels are assumed to be the non-negative ids from label_mapping.
num_classes = int(np.max(y)) + 1

forece_train = False  # force re-training even if a saved generator exists (name kept as-is, typo included)
save_dir = balanced_folder / 'models' / f'{scaling_method}_{oversampling_method}_n{noise_dim}_f{feature_dim}'
save_dir.mkdir(parents=True, exist_ok=True)
generator_file = save_dir / 'generator.keras'

if generator_file.exists() and not forece_train:
    # Reuse the previously trained generator.
    print(f"[{datetime.now().strftime('%x %X')}] 📡 Loading pre-trained generator from {generator_file}")
    generator = tf.keras.models.load_model(generator_file)
else:
    print(f"[{datetime.now().strftime('%x %X')}] 🚀 Training cGAN...")

    ## 2️⃣ Build the cGAN
    generator = build_generator(noise_dim, feature_dim, num_classes)
    discriminator = build_discriminator(feature_dim, num_classes)
    discriminator.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.0002, 0.5))
    gan = build_gan(generator, discriminator)
    gan.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.0002, 0.5))

    ## 3️⃣ Train the cGAN
    train_cgan(X, y, generator, discriminator, gan, noise_dim, num_classes, epochs=5000, batch_size=1024)

    ## 4️⃣ Save the models
    # The generator is written to the same path the load branch checks.
    generator.save(generator_file)
    discriminator.save(save_dir / 'discriminator.keras')
    gan.save(save_dir / 'cgan.keras')
    print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved models to {save_dir}")  # only the generator is needed for sampling

[04/26/25 22:45:55] 📡 Loading pre-trained generator from /content/drive/MyDrive/NYIT/870/datasets/balanced/CSE-CIC-IDS2018/models/standard_ROS1+cGAN_n100_f70/generator.keras


## 使用 cGAN 生成新样本

In [None]:
## TEST ##
from tensorflow.keras.layers import Embedding

# Noise length: second dimension of the generator's first input.
noise_dim = generator.input_shape[0][1]

# Feature count: second dimension of the generator's output.
feature_dim = generator.output_shape[1]

# Class count: input_dim of the first Embedding layer found (None if absent).
num_classes = None
for layer in generator.layers:
    if isinstance(layer, Embedding):
        num_classes = layer.input_dim
        break

print(f"[{datetime.now().strftime('%x %X')}] 打印生成器的参数信息")
print(f"generator info - Noise dim: {noise_dim}, Feature dim: {feature_dim}, Num classes: {num_classes}")

[04/26/25 22:45:55] 打印生成器的参数信息
generator info - Noise dim: 100, Feature dim: 70, Num classes: 15


In [None]:
oversample_to = {}
labels_counts = dict(sorted(Counter(y).items()))
print(f'[{datetime.now().strftime("%x %X")}] labels_counts: { {int(k): v for k, v in labels_counts.items()} }\n')

# Select the labels whose current count is below the scheme's target.
# A label that does not occur in y at all counts as 0 instead of raising.
for label, target in resample_to.items():
    if labels_counts.get(label, 0) < target:
        oversample_to[label] = target
# oversample_to = {5: 20000}
print(f'[{datetime.now().strftime("%x %X")}] oversample_to scheme{resample_scheme}: {oversample_to}\n')

# Overall timer
start_time = datetime.now()

# Start from the rows of all labels that do not need oversampling.
mask = ~np.isin(y, list(oversample_to.keys()))
X_resampled = X[mask]
y_resampled = y[mask]
running_counts = Counter(y_resampled)
print(f'X_resampled.shape(before): {X_resampled.shape}')
print(f'Labels(before): { {int(k): v for k, v in sorted(running_counts.items())} }\n')

# Collect the pieces and concatenate once at the end; stacking inside the
# loop would re-copy the whole multi-million-row matrix for every class.
X_parts = [X_resampled]
y_parts = [y_resampled]
total_rows = X_resampled.shape[0]

for cls, target_count in oversample_to.items():
    st = datetime.now()

    current_X = X[y == cls]
    current_count = current_X.shape[0]
    need = target_count - current_count
    print(f"[{datetime.now().strftime('%x %X')}] 处理标签类别[{cls}]: {current_count} -> {target_count}")

    # Generate the missing samples for this class.
    generated_X = generate_samples(generator, cls, need)

    # Original rows first, then generated rows, all labelled cls.
    X_parts.extend([current_X, generated_X])
    y_parts.append(np.full(target_count, cls))
    total_rows += target_count
    running_counts[cls] = target_count

    et = datetime.now()
    print(f"  Time elapsed: {et - st}. [{st.strftime('%x %X')} -> {et.strftime('%x %X')}]")
    print(f'  X_resampled.shape: {(total_rows, X.shape[1])}')
    print(f'  Labels: { {int(k): v for k, v in sorted(running_counts.items())} }\n')

X_resampled = np.vstack(X_parts)
y_resampled = np.concatenate(y_parts)
del X_parts, y_parts  # drop the extra references to the intermediate arrays

end_time = datetime.now()
print(f"[{end_time.strftime('%x %X')}] ✅ After oversampling:")
print(f"  Time elapsed: {end_time - start_time}. [{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]")
print(f'  X_resampled.shape: {X_resampled.shape}')
print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_resampled).items())} }\n')

[04/26/25 22:45:56] labels_counts: {0: 1600000, 1: 228953, 2: 1000, 3: 1000, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 1000, 14: 150071}

[04/26/25 22:45:56] oversample_to scheme2: {2: 20000, 3: 20000, 5: 20000, 7: 100000, 10: 50000, 13: 20000}

X_resampled.shape(before): (3753422, 70)
Labels(before): {0: 1600000, 1: 228953, 4: 548809, 6: 460953, 8: 369530, 9: 111912, 11: 154683, 12: 128511, 14: 150071}

[04/26/25 22:45:57] 处理标签类别[2]: 1000 -> 20000
  Time elapsed: 0:00:03.375864. [x 22:45:57 -> x 22:46:00]
  X_resampled.shape: (3773422, 70)
  Labels: {0: 1600000, 1: 228953, 2: 20000, 4: 548809, 6: 460953, 8: 369530, 9: 111912, 11: 154683, 12: 128511, 14: 150071}

[04/26/25 22:46:01] 处理标签类别[3]: 1000 -> 20000
  Time elapsed: 0:00:04.632296. [x 22:46:01 -> x 22:46:06]
  X_resampled.shape: (3793422, 70)
  Labels: {0: 1600000, 1: 228953, 2: 20000, 3: 20000, 4: 548809, 6: 460953, 8: 369530, 9: 111912, 11: 154683, 12: 128511, 14: 1500

In [None]:
# Output files: <input stem>_s<scheme>_<method>.npy inside balanced_folder.
name_suffix = f'_s{resample_scheme}_{oversampling_method}.npy'
X_resampled_file = balanced_folder / (X_file.stem + name_suffix)
y_resampled_file = balanced_folder / (y_file.stem + name_suffix)

# Oversampling counts as incomplete when any targeted label ended up at or
# below 95% of its target count.
incomplete_ratio = 0.95
labels_counts = dict(sorted(Counter(y_resampled).items()))
incomplete = any(
    labels_counts[label] <= target * incomplete_ratio
    for label, target in oversample_to.items()
)

if incomplete:
    # A trailing '+' in the file name marks data that still needs a simple
    # oversampling pass to reach its targets.
    X_resampled_file = X_resampled_file.with_name(X_resampled_file.stem + '+.npy')
    y_resampled_file = y_resampled_file.with_name(y_resampled_file.stem + '+.npy')

np.save(X_resampled_file, X_resampled)
np.save(y_resampled_file, y_resampled)

print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}")

[04/26/25 22:46:58] ✅ Saved to /content/drive/MyDrive/NYIT/870/datasets/balanced/CSE-CIC-IDS2018/train_X_standard_s2_ROS1+cGAN.npy & train_label_standard_s2_ROS1+cGAN.npy


## SMOTE 补全数据

因为基于 **邻居** 样本的过采样算法, 可能会因为找不到邻居而导致无法新增数据。<br/>
所以在最后用 SMOTE 算法进行兜底, 补全不足 oversample_to 目标的样本。<br/>
(注: 本 notebook 使用的 cGAN 会按需生成指定数量的样本, 因此这里通常会输出 "无需补全数据"。)


In [None]:
from imblearn.over_sampling import SMOTE

if incomplete:
    print(f"[{datetime.now().strftime('%x %X')}] ⚠️ 需要补全数据")

    # Optional: reload a previously saved resampled training set instead of
    # using the arrays still in memory.
    # X_resampled_file = balanced_folder / f'{X_file.stem}_s{resample_scheme}_BLSMOTE.npy'
    # y_resampled_file = balanced_folder / f'{y_file.stem}_s{resample_scheme}_BLSMOTE.npy'

    # X_resampled = np.load(X_resampled_file)
    # y_resampled = np.load(y_resampled_file)

    labels_counts = dict(sorted(Counter(y_resampled).items()))

    print(f'X_resampled.shape: {X_resampled.shape}')
    print(f'Labels: { {int(k): v for k, v in labels_counts.items()} }\n')

    # Labels that are still below their scheme target, with that target.
    oversample_to = {
        label: target
        for label, target in resample_to.items()
        if labels_counts[label] < resample_to[label]
    }
    print(f'[{datetime.now().strftime("%x %X")}] oversample_to: {oversample_to}\n')

    # Top the remaining labels up with SMOTE.
    sampler = SMOTE(sampling_strategy=oversample_to, random_state=42)
    X_completed, y_completed = sampler.fit_resample(X_resampled, y_resampled)

    # Report the completed distribution.
    completed_hist = {int(k): v for k, v in sorted(Counter(y_completed).items())}
    print(f'  X_completed.shape: {X_completed.shape}')
    print(f'  Labels: {completed_hist}\n')

    # Save next to the incomplete files, with 'SMOTE' appended to the stem.
    X_completed_file = X_resampled_file.with_name(X_resampled_file.stem + 'SMOTE.npy')
    y_completed_file = y_resampled_file.with_name(y_resampled_file.stem + 'SMOTE.npy')

    np.save(X_completed_file, X_completed)
    np.save(y_completed_file, y_completed)

    print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_completed_file} & {y_completed_file.name}")
else:
    print(f"[{datetime.now().strftime('%x %X')}] ✅ 无需补全数据")

[04/26/25 22:46:58] ✅ 无需补全数据
