<a href="https://colab.research.google.com/github/funway/nid-imbalance-study/blob/main/imbalance%20processing/BorderlineSMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用 **BorderlineSMOTE** 对训练集的少数样本进行过采样
- 传统 SMOTE 对所有少数类样本都进行处理(随机挑选邻居进行插值生成新数据)
- BorderlineSMOTE 只处理"边界上"的少数类样本


## Google Env

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Modules import & Globals setup

In [None]:
### Modules ###
from pathlib import Path
from datetime import datetime
from collections import Counter
import numpy as np


### Globals ###

## Numeric encoding of the Label feature
# Class names listed in code order: the position of a name is its integer code.
_label_names = (
    "Benign",
    "Bot",
    "Brute Force -Web",
    "Brute Force -XSS",
    "DDOS attack-HOIC",
    "DDOS attack-LOIC-UDP",
    "DDoS attacks-LOIC-HTTP",
    "DoS attacks-GoldenEye",
    "DoS attacks-Hulk",
    "DoS attacks-SlowHTTPTest",
    "DoS attacks-Slowloris",
    "FTP-BruteForce",
    "Infilteration",
    "SQL Injection",
    "SSH-Bruteforce",
)
label_mapping = {name: code for code, name in enumerate(_label_names)}

## Candidate target class distributions (four schemes).
## Keys are numeric label codes, values are the desired number of samples.

# Attack-label targets shared by schemes 1 and 2. The trailing comment on each
# line is the sample count noted for that label before resampling.
_attack_targets = {
    1: 200000,   # down from 228953
    2: 20000,    # up from 489
    3: 20000,    # up from 184
    4: 200000,   # down from 548809
    5: 20000,    # up from 1384
    6: 200000,   # down from 460953
    7: 100000,   # up from 33206
    8: 200000,   # down from 369530
    9: 111912,   # same as noted count (111912)
    10: 50000,   # up from 8792
    11: 154683,  # same as noted count (154683)
    12: 128511,  # same as noted count (128511)
    13: 20000,   # up from 70
    14: 150071,  # same as noted count (150071)
}

resample_schemes = {
    # Scheme 1. (label 0 : sum of non-zero labels) ≈ (160:157); label 0 is
    # kept at 1600000 and non-zero labels are boosted roughly in proportion.
    1: {0: 1600000, **_attack_targets},
    # Scheme 2. (label 0 : largest non-zero label) ≈ (3:2); label 0 goes
    # down from 1600000 to 300000.
    2: {0: 300000, **_attack_targets},
    # Scheme 3. (label 0 : sum of non-zero labels) = (1:1); every non-zero
    # label gets 114300 samples.
    3: {0: 1600000, **dict.fromkeys(range(1, 15), 114300)},
    # Scheme 4. Every label gets 200000 samples.
    4: dict.fromkeys(range(15), 200000),
}

## Data directories
dataset = 'CSE-CIC-IDS2018'
datasets_folder = Path('/content/drive/MyDrive/NYIT/870/datasets')
# <datasets>/preprocessed/<dataset> and <datasets>/balanced/<dataset>
preprocessed_folder = datasets_folder.joinpath('preprocessed', dataset)
balanced_folder = datasets_folder.joinpath('balanced', dataset)

In [None]:
# Scaled variant of the training set to use; the value is interpolated into
# the input file names. Uncomment the alternative below to switch variants.
scaling_method = 'standard'
# scaling_method = 'minmax'

# Key into resample_schemes selecting the target per-label sample counts.
resample_scheme = 3
resample_to = resample_schemes[resample_scheme]

# Oversampling pipeline identifier; it is also used as a suffix of the output
# file names. A leading 'ROS' enables the random-oversampling pre-step.
oversampling_method = 'ROS+BLSMOTE'

# Disabled setting, not used in this notebook as shown.
# undersampling_method = 'NM'

In [None]:
# Locate the scaled training features / labels inside the "integrated" folder
integrated_folder = preprocessed_folder / 'integrated'
X_file = integrated_folder / f'train_X_{scaling_method}.npy'
y_file = integrated_folder / f'train_label_{scaling_method}.npy'

# Load the training set
X = np.load(X_file)
y = np.load(y_file)

# Per-label sample counts, ordered by label
labels_counts = dict(sorted(Counter(y).items()))

loaded_at = datetime.now().strftime("%x %X")
printable_counts = {int(k): v for k, v in labels_counts.items()}
print(f'[{loaded_at}] {X_file.name} shape: {X.shape}')
print(f'Labels: {printable_counts}\n')

[04/21/25 00:17:51] train_X_standard.npy shape: (3797547, 70)
Labels: {0: 1600000, 1: 228953, 2: 489, 3: 184, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 70, 14: 150071}



## 利用 ROS 提前补充极少数类样本
- 先用 ROS 随机复制的方式，将极少数类样本扩展到可接受的程度后再进行 oversampling


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Optional pre-step: when the method name starts with 'ROS', randomly
# duplicate the rarest labels up to a small floor before the main oversampler.
oversample_to = {}
if not oversampling_method.startswith('ROS'):
    print(f'[{datetime.now().strftime("%x %X")}] No need to ROS oversampling.')
else:
    # ROS targets per method prefix; the first matching prefix wins and an
    # unmatched 'ROS...' name leaves the strategy empty.
    ros_targets_by_prefix = {
        'ROS+': {2: 1000, 3: 500, 13: 500},
        'ROS1+': {2: 1000, 3: 1000, 13: 1000},
    }
    oversample_to = next(
        (
            dict(targets)
            for prefix, targets in ros_targets_by_prefix.items()
            if oversampling_method.startswith(prefix)
        ),
        oversample_to,
    )

    oversampler = RandomOverSampler(random_state=42, sampling_strategy=oversample_to)
    X, y = oversampler.fit_resample(X, y)

    print(f'[{datetime.now().strftime("%x %X")}] After ROS oversampling:')
    print(f'  X.shape: {X.shape}, y.shape: {y.shape}')
    print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y).items())} }\n')

[04/21/25 00:17:54] After ROS oversampling:
  X.shape: (3798804, 70), y.shape: (3798804,)
  Labels: {0: 1600000, 1: 228953, 2: 1000, 3: 500, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 500, 14: 150071}



## BorderlineSMOTE

*   又是一个巨耗时的算法
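*   ❓使用 BorderlineSMOTE 对 oversample_to 字典内的所有标签进行一次性过采样时，有的标签可能完全没有被处理。可能的原因：BorderlineSMOTE 只会基于被判定为"危险"(danger，即处于类别边界上) 的少数类样本生成新样本；如果某个标签没有任何这样的样本，imbalanced-learn 会直接跳过该标签，且不会报错。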


In [None]:
from imblearn.over_sampling import BorderlineSMOTE


### 分阶段过采样函数 #############################################################
# 70 → 350 → 1750 → 8750 → 43750 这样
def staged_oversample_blsmote(X, y, label, target, factor=5, random_state=42, k_neighbors=5, m_neighbors=10):
    """Oversample a single label with BorderlineSMOTE in multiplicative stages.

    Instead of jumping straight to ``target``, the label is grown by at most
    ``factor`` times per stage (e.g. 70 -> 350 -> 1750 -> ... with factor=5),
    feeding the output of each stage into the next one.

    Args:
        X: Feature matrix passed to ``BorderlineSMOTE.fit_resample``.
        y: Label vector aligned with ``X``.
        label: The label to oversample.
        target: Desired number of samples for ``label``.
        factor: Growth multiplier applied to the current count per stage.
        random_state: Forwarded to ``BorderlineSMOTE``.
        k_neighbors: Forwarded to ``BorderlineSMOTE``.
        m_neighbors: Forwarded to ``BorderlineSMOTE``.

    Returns:
        A ``(X, y)`` tuple. The inputs are returned untouched when ``label``
        already has at least ``target`` samples. If a stage fails or produces
        no new samples, the data from the last successful stage is returned
        (best effort), so the final count may be below ``target``.
    """

    def count_label(labels):
        # Vectorised count of one label; avoids building a Counter over the
        # whole label array just to read a single entry.
        return int(np.count_nonzero(np.asarray(labels) == label))

    current_X, current_y = X, y
    current_count = count_label(current_y)

    # Nothing to do when the label already meets (or exceeds) the target.
    if current_count >= target:
        return X, y

    while current_count < target:
        next_target = min(current_count * factor, target)  # goal of this stage
        print(f"[{datetime.now().strftime('%x %X')}]  当前数量: {current_count}, 下阶段目标: {next_target}")

        oversampler = BorderlineSMOTE(
            sampling_strategy={label: next_target},
            random_state=random_state,
            k_neighbors=k_neighbors,
            m_neighbors=m_neighbors
        )
        try:
            stage_X, stage_y = oversampler.fit_resample(current_X, current_y)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and keep
            # whatever the previous stages produced.
            print(f"  ERROR: {e}")
            break

        stage_count = count_label(stage_y)
        if stage_count == current_count:
            # The sampler ran but added nothing for this label; further
            # stages would not make progress either.
            print("  ERROR: 无法新增样本! (标签的样本分布不适合使用该过采样方法)")
            break
        current_X, current_y, current_count = stage_X, stage_y, stage_count

        # The final stage aimed at `target` itself; stop even if the sampler
        # only got approximately there.
        if next_target == target:
            break

    return current_X, current_y
### 分阶段过采样函数 #############################################################


# 1. Select the labels whose current count is below the scheme's target
labels_counts = dict(sorted(Counter(y).items()))
oversample_to = {
    label: target
    for label, target in resample_to.items()
    if labels_counts[label] < target
}
print(f'[{datetime.now().strftime("%x %X")}] oversample_to: {oversample_to}\n')

# 2. Run the oversampling
start_time = datetime.now()

#############################################
# 2-1. One-shot oversampling of every label (disabled)
# Some labels may end up not being processed at all this way...
# oversampler = BorderlineSMOTE(sampling_strategy=oversample_to, random_state=42, k_neighbors=5, m_neighbors=10)
# X_resampled, y_resampled = oversampler.fit_resample(X, y)

#############################################
# 2-2. Label-by-label oversampling
# Start from the rows whose label does not need oversampling
mask = np.logical_not(np.isin(y, list(oversample_to)))
X_resampled, y_resampled = X[mask], y[mask]
print(f'[{datetime.now().strftime("%x %X")}] X_resampled.shape(before): {X_resampled.shape}')
print(f'  Labels(before): { {int(k): v for k, v in sorted(Counter(y_resampled).items())} }\n')

for label, target in oversample_to.items():
    print(f'[{datetime.now().strftime("%x %X")}] Oversampling [{label}]: {labels_counts[label]} -> {target} ...')
    st = datetime.now()

    # 2-2-1. Single-step oversampling of this label against the full data set
    sampler = BorderlineSMOTE(
        kind='borderline-1',
        sampling_strategy={label: target},
        k_neighbors=5,
        m_neighbors=10,
        random_state=42,
    )
    X_, y_ = sampler.fit_resample(X, y)

    # 2-2-2. Staged oversampling (disabled alternative)
    # X_, y_ = staged_oversample_blsmote(X, y, label, target, factor=5, random_state=42, k_neighbors=5, m_neighbors=10)

    et = datetime.now()
    print(f"  Time elapsed: {et - st}. [{st.strftime('%Y%m%d %X')} -> {et.strftime('%Y%m%d %X')}]")

    # Keep only this label's rows from the result (original + synthetic)
    mask_current_label = (y_ == label)
    X_current_label, y_current_label = X_[mask_current_label], y_[mask_current_label]

    # Append them to the accumulated result
    X_resampled = np.vstack([X_resampled, X_current_label])
    y_resampled = np.hstack([y_resampled, y_current_label])

    print(f'  X_resampled.shape: {X_resampled.shape}')
    print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_resampled).items())} }\n')
#############################################

# 3. Report the final result
print(f'[{datetime.now().strftime("%x %X")}] After oversampling:')
end_time = datetime.now()
print(f"  Time elapsed: {end_time - start_time}. [{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]")
print(f'  X_resampled.shape: {X_resampled.shape}')
print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_resampled).items())} }\n')

[04/21/25 00:17:55] oversample_to: {2: 114300, 3: 114300, 5: 114300, 7: 114300, 9: 114300, 10: 114300, 13: 114300}

[04/21/25 00:17:56] X_resampled.shape(before): (3641510, 70)
  Labels(before): {0: 1600000, 1: 228953, 4: 548809, 6: 460953, 8: 369530, 11: 154683, 12: 128511, 14: 150071}

[04/21/25 00:17:56] Oversampling [2]: 1000 -> 114300 ...
  Time elapsed: 0:00:37.050501. [20250421 00:17:56 -> 20250421 00:18:33]
  X_resampled.shape: (3755810, 70)
  Labels: {0: 1600000, 1: 228953, 2: 114300, 4: 548809, 6: 460953, 8: 369530, 11: 154683, 12: 128511, 14: 150071}

[04/21/25 00:18:34] Oversampling [3]: 500 -> 114300 ...
  Time elapsed: 0:00:18.538283. [20250421 00:18:34 -> 20250421 00:18:53]
  X_resampled.shape: (3870110, 70)
  Labels: {0: 1600000, 1: 228953, 2: 114300, 3: 114300, 4: 548809, 6: 460953, 8: 369530, 11: 154683, 12: 128511, 14: 150071}

[04/21/25 00:18:54] Oversampling [5]: 1384 -> 114300 ...
  Time elapsed: 0:00:51.015972. [20250421 00:18:54 -> 20250421 00:19:45]
  X_resampl

In [None]:
# Save the oversampled training set
X_resampled_file = balanced_folder / f'{X_file.stem}_s{resample_scheme}_{oversampling_method}.npy'
y_resampled_file = balanced_folder / f'{y_file.stem}_s{resample_scheme}_{oversampling_method}.npy'

# A label counts as incompletely oversampled when its final sample count is at
# or below 95% of its target; a label missing from the result counts as 0.
incomplete_ratio = 0.95
labels_counts = dict(sorted(Counter(y_resampled).items()))
incomplete = any(
    labels_counts.get(label, 0) <= target * incomplete_ratio
    for label, target in oversample_to.items()
)
if incomplete:
    # A trailing '+' in the file name marks an incomplete oversampling that
    # still needs to be topped up by an additional, simpler oversampler.
    X_resampled_file = X_resampled_file.with_name(X_resampled_file.stem + '+.npy')
    y_resampled_file = y_resampled_file.with_name(y_resampled_file.stem + '+.npy')

# np.save does not create missing parent directories; make sure the output
# folder exists so the result of the long oversampling run is not lost.
balanced_folder.mkdir(parents=True, exist_ok=True)

np.save(X_resampled_file, X_resampled)
np.save(y_resampled_file, y_resampled)

print(f'[{datetime.now().strftime("%x %X")}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}')

[04/21/25 01:51:27] ✅ Saved to /content/drive/MyDrive/NYIT/870/datasets/balanced/CSE-CIC-IDS2018/train_X_standard_s3_ROS+BLSMOTE.npy & train_label_standard_s3_ROS+BLSMOTE.npy


## SMOTE 补全数据

因为基于 **邻居** 样本的过采样算法, 可能会因为找不到邻居而导致无法新增数据。<br/>
所以在最后用 SMOTE 算法进行兜底, 补全不足 oversample_to 目标的样本。


In [None]:
from imblearn.over_sampling import SMOTE

if incomplete:
    print(f'[{datetime.now().strftime("%x %X")}] ⚠️ 需要补全数据')

    # Optionally reload a previously saved result instead of using the
    # in-memory arrays (disabled)
    # X_resampled_file = balanced_folder / f'{X_file.stem}_s{resample_scheme}_BLSMOTE.npy'
    # y_resampled_file = balanced_folder / f'{y_file.stem}_s{resample_scheme}_BLSMOTE.npy'

    # X_resampled = np.load(X_resampled_file)
    # y_resampled = np.load(y_resampled_file)

    labels_counts = dict(sorted(Counter(y_resampled).items()))

    print(f'X_resampled.shape: {X_resampled.shape}')
    print(f'Labels: { {int(k): v for k, v in labels_counts.items()} }\n')

    # Labels that are still below their scheme target and need topping up
    oversample_to = {
        label: target
        for label, target in resample_to.items()
        if labels_counts[label] < target
    }
    print(f'oversample_to: {oversample_to}\n')

    # Fill the gap with plain SMOTE
    sampler = SMOTE(random_state=42, sampling_strategy=oversample_to)
    X_completed, y_completed = sampler.fit_resample(X_resampled, y_resampled)

    # Report the completed distribution
    print(f'  X_completed.shape: {X_completed.shape}')
    print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_completed).items())} }\n')

    # Save next to the incomplete files, with 'SMOTE' appended to the stem
    X_completed_file = X_resampled_file.with_name(f'{X_resampled_file.stem}SMOTE.npy')
    y_completed_file = y_resampled_file.with_name(f'{y_resampled_file.stem}SMOTE.npy')

    np.save(X_completed_file, X_completed)
    np.save(y_completed_file, y_completed)

    print(f'[{datetime.now().strftime("%x %X")}] ✅ Saved to {X_completed_file} & {y_completed_file.name}')
else:
    print(f'[{datetime.now().strftime("%x %X")}] ✅ 无需补全数据')

[04/21/25 01:51:27] ✅ 无需补全数据
