<a href="https://colab.research.google.com/github/funway/nid-imbalance-study/blob/main/imbalance%20processing/UnderSampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用几种 UnderSampling 算法对数据集进行欠采样

- **RUS** (RandomUnderSampler)
- **IHT** (InstanceHardnessThreshold)
- **NM** (NearMiss, 已放弃: 内存开销过大)
- **FNM** (FAISS NearMiss)



## Google Env

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Modules import & Globals setup

In [None]:
### Modules ###
from pathlib import Path
from datetime import datetime
from collections import Counter
import numpy as np


### Globals ###

## Numeric encoding of the Label feature: each class name maps to its position below
label_mapping = {
    name: code
    for code, name in enumerate((
        "Benign",
        "Bot",
        "Brute Force -Web",
        "Brute Force -XSS",
        "DDOS attack-HOIC",
        "DDOS attack-LOIC-UDP",
        "DDoS attacks-LOIC-HTTP",
        "DoS attacks-GoldenEye",
        "DoS attacks-Hulk",
        "DoS attacks-SlowHTTPTest",
        "DoS attacks-Slowloris",
        "FTP-BruteForce",
        "Infilteration",
        "SQL Injection",
        "SSH-Bruteforce",
    ))
}

## Candidate sample distributions to try (scheme id -> {label: target sample count}).
## The number after each arrow is apparently the sample count before resampling
## (⤵️ where the target is lower, ⤴️ otherwise).
resample_schemes = {}

# Scheme 1. (label 0 : sum of non-zero labels) ≈ (160:157); non-zero labels boosted roughly in proportion
resample_schemes[1] = {
    0: 1600000,  # left unchanged
    1: 200000,   # ⤵️ 228953
    2: 20000,    # ⤴️ 489
    3: 20000,    # ⤴️ 184
    4: 200000,   # ⤵️ 548809
    5: 20000,    # ⤴️ 1384
    6: 200000,   # ⤵️ 460953
    7: 100000,   # ⤴️ 33206
    8: 200000,   # ⤵️ 369530
    9: 111912,   # ⤴️ 111912
    10: 50000,   # ⤴️ 8792
    11: 154683,  # ⤴️ 154683
    12: 128511,  # ⤴️ 128511
    13: 20000,   # ⤴️ 70
    14: 150071,  # ⤴️ 150071
}

# Scheme 2. (label 0 : largest non-zero label) ≈ (3:2).
# Same targets as scheme 1 for every non-zero label; only label 0 differs (⤵️ 1600000).
resample_schemes[2] = {**resample_schemes[1], 0: 300000}

# Scheme 3. (label 0 : sum of non-zero labels) = (1:1); every non-zero label gets 114300 samples
resample_schemes[3] = {0: 1600000, **dict.fromkeys(range(1, 15), 114300)}

# Scheme 4. every label gets 200,000 samples
resample_schemes[4] = dict.fromkeys(range(15), 200000)

## Data directories
datasets_folder = Path('/content/drive/MyDrive/NYIT/870/datasets')
dataset = 'CSE-CIC-IDS2018'
preprocessed_folder = datasets_folder.joinpath('preprocessed', dataset)
balanced_folder = datasets_folder.joinpath('balanced', dataset)

In [None]:
# Feature-scaling variant; part of the input file names
scaling_method = 'standard'
# scaling_method = 'minmax'

# Key into resample_schemes; resample_to holds that scheme's per-label target counts
resample_scheme = 2
resample_to = resample_schemes[resample_scheme]

# Tag of the over-sampling step; part of the input file names
oversampling_method = 'ROS1+cGAN'
# oversampling_method = 'ADASYN'
# oversampling_method = 'BLSMOTE+SMOTE'

In [None]:
# Feature and label files for the selected scaling / scheme / over-sampling combination
X_file, y_file = (
    balanced_folder / f'train_{part}_{scaling_method}_s{resample_scheme}_{oversampling_method}.npy'
    for part in ('X', 'label')
)

# Load the training set
X, y = np.load(X_file), np.load(y_file)

# Per-label sample counts, ordered by label
labels_counts = dict(sorted(Counter(y).items()))

print(datetime.now().strftime('%x %X'))
print(f'{X_file.stem}.shape: {X.shape}')
print(f'Labels: { {int(k): v for k, v in labels_counts.items()} }\n')

04/26/25 22:56:03
train_X_standard_s2_ROS1+cGAN.shape: (3983422, 70)
Labels: {0: 1600000, 1: 228953, 2: 20000, 3: 20000, 4: 548809, 5: 20000, 6: 460953, 7: 100000, 8: 369530, 9: 111912, 10: 50000, 11: 154683, 12: 128511, 13: 20000, 14: 150071}



In [None]:
# Pick the labels that need under-sampling and their targets: a label qualifies when its
# current sample count exceeds the target set by the selected scheme. A scheme label that
# does not occur in the data counts as 0 samples, so it is skipped instead of raising KeyError.
undersample_to = {
    label: target
    for label, target in resample_to.items()
    if labels_counts.get(label, 0) > target
}
print(f'undersample_to: {undersample_to}')

undersample_to: {0: 300000, 1: 200000, 4: 200000, 6: 200000, 8: 200000}


## **RUS** (RandomUnderSampler)
随机从多数类样本中删除样本，直到达到目标数量。
* 速度快到令人感动 (ಥ﹏ಥ)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampling: shrink every class listed in undersample_to to its target size
undersampling_method = 'RUS'
start_time = datetime.now()

undersampler = RandomUnderSampler(
    sampling_strategy=undersample_to,
    random_state=42,
)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Report elapsed time, resulting shape and per-label counts
print(f'After {undersampling_method} undersampling:')
end_time = datetime.now()
print(
    f"  Time elapsed: {end_time - start_time}. "
    f"[{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]"
)
print(f'  X_resampled.shape: {X_resampled.shape}')
print(f'  Labels: { dict(sorted((int(k), v) for k, v in Counter(y_resampled).items())) }\n')

# Save the results next to the input files, with the method name appended to the stem
X_resampled_file = X_file.parent / f'{X_file.stem}_{undersampling_method}.npy'
y_resampled_file = y_file.parent / f'{y_file.stem}_{undersampling_method}.npy'
np.save(X_resampled_file, X_resampled)
np.save(y_resampled_file, y_resampled)
print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}")

After RUS undersampling:
  Time elapsed: 0:00:00.579592. [04/26/25 22:56:03 -> 04/26/25 22:56:04]
  X_resampled.shape: (1875177, 70)
  Labels: {0: 300000, 1: 200000, 2: 20000, 3: 20000, 4: 200000, 5: 20000, 6: 200000, 7: 100000, 8: 200000, 9: 111912, 10: 50000, 11: 154683, 12: 128511, 13: 20000, 14: 150071}

[04/26/25 22:56:08] ✅ Saved to /content/drive/MyDrive/NYIT/870/datasets/balanced/CSE-CIC-IDS2018/train_X_standard_s2_ROS1+cGAN_RUS.npy & train_label_standard_s2_ROS1+cGAN_RUS.npy


In [None]:
# 补充: 使用 TomekLink 清洗边界
# from imblearn.under_sampling import TomekLinks

# print(f"[{datetime.now().strftime('%x %X')}] ⏰ TomekLinks cleaning...")
# start_time = datetime.now()

# undersampler = TomekLinks(n_jobs=-1)
# X_resampled_tl, y_resampled_tl = undersampler.fit_resample(X_resampled, y_resampled)

# end_time = datetime.now()
# print(f'After TomekLinks undersampling:')
# print(f"  Time elapsed: {end_time - start_time}. [{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]")
# print(f'  X_resampled.shape: {X_resampled_tl.shape}')
# print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_resampled_tl).items())} }\n')

[04/21/25 17:40:26] ⏰ TomekLinks cleaning...


## **IHT** (InstanceHardnessThreshold)
首先用一个分类器(imblearn 默认为 RandomForestClassifier, 也可以传入 LogisticRegression 等其他分类器)对每个样本预测概率，计算这个样本被正确分类的难度，然后根据这个"难度"排序，删除那些最难分类的样本。

- 耗时
- 内存消耗大
- 大数据时需要减少 `cv` 参数, 不然可能耗尽内存并报错 ` TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.`
- 无法保证欠采样到指定数量


In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold

# Instance-hardness under-sampling of the classes listed in undersample_to
undersampling_method = 'IHT'
start_time = datetime.now()
print(f"[{start_time.strftime('%x %X')}] Start {undersampling_method} undersampling...")
print(f'original X shape: {X.shape}')
print(f'undersample_to: {undersample_to}')

undersampler = InstanceHardnessThreshold(
    sampling_strategy=undersample_to,
    random_state=42,
    cv=3,
    n_jobs=-1,
)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Report elapsed time, resulting shape and per-label counts
print(f"[{datetime.now().strftime('%x %X')}] After {undersampling_method} undersampling:")
end_time = datetime.now()
print(
    f"  Time elapsed: {end_time - start_time}. "
    f"[{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]"
)
print(f'  X_resampled.shape: {X_resampled.shape}')
print(f'  Labels: { dict(sorted((int(k), v) for k, v in Counter(y_resampled).items())) }\n')

# Save the results next to the input files, with the method name appended to the stem
X_resampled_file = X_file.parent / f'{X_file.stem}_{undersampling_method}.npy'
y_resampled_file = y_file.parent / f'{y_file.stem}_{undersampling_method}.npy'
np.save(X_resampled_file, X_resampled)
np.save(y_resampled_file, y_resampled)
print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}")

[04/19/25 23:17:12] Start IHT undersampling...
original X shape: (4439222, 70)
undersample_to: {1: 114300, 4: 114300, 6: 114300, 8: 114300, 11: 114300, 12: 114300, 14: 114300}


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

## **NM** (NearMiss) 放弃！！
根据样本与少数类样本之间的"距离"来选择要保留的多数类样本。<br/>
**内存开销超级大！**<br/>
NearMiss 有三种版本:
* **version 1** 对每个(多数类)样本，计算它到 k 个最近的少数类样本的平均距离，保留平均距离最小的。
  * 更容易导致某些偏远的少数类样本没有多数类"邻居", 不利于后续对少数类样本进行过采样。
* **version 2** 对每个(多数类)样本，计算它到 k 个最远的少数类样本的平均距离，保留平均距离最小的。
  * 相比于 version 1, 3 更加保守一点。
* **version 3** 从少数类样本出发，选择它最近的 k 个多数类样本进行保留。
  * 对于极度 imbalance 的数据，会导致大部分多数类样本被删除(无法根据指定数量进行保留)。

In [None]:
# from imblearn.under_sampling import NearMiss
# print(f'undersample to: {undersample_to}')
# undersampling_method = 'NM'
# start_time = datetime.now()

# undersampler = NearMiss(sampling_strategy=undersample_to, version=2, n_neighbors=3, n_jobs=-1)
# X_resampled, y_resampled = undersampler.fit_resample(X, y)

# print(f'After {undersampling_method} undersampling:')
# end_time = datetime.now()
# print(f"  Time elapsed: {end_time - start_time}. [{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]")
# print(f'  X_resampled.shape: {X_resampled.shape}')
# print(f'  Labels: { {int(k): v for k, v in sorted(Counter(y_resampled).items())} }\n')

# # 保存文件
# X_resampled_file = X_file.with_name(f'{X_file.stem}_{undersampling_method}.npy')
# y_resampled_file = y_file.with_name(f'{y_file.stem}_{undersampling_method}.npy')
# np.save(X_resampled_file, X_resampled)
# np.save(y_resampled_file, y_resampled)
# print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}")

undersample to: {1: 200000, 2: 20000, 3: 20000, 4: 200000, 6: 200000, 7: 100000, 8: 200000, 13: 20000}


## **FNM** (FAISS-NearMiss)
- 由于 imblearn 的 NM 在计算 KNN 近邻的时候需要大量内存，对于百万级别的数据就是灾难。

- 所以改成使用 FAISS 库的 IndexHNSWFlat 索引计算近邻，速度快很多(但它是近似最近邻搜索，结果不保证完全精确)

- FAISS 还能利用 GPU 加速，但是看起来 colab 好像不支持

- 代码是利用 chatGPT 老师跟 NearMiss 源码修改的，不保证正确性


In [None]:
############################################
## Switch the nearest-neighbour search to faiss
############################################
# Print the installed faiss-gpu version, if any
!pip show faiss-gpu|grep Version:
!echo -e "\033[0m"  # reset the terminal text colour

# Install the faiss libraries
# NOTE(review): faiss-cpu is unpinned, and both a CPU and a GPU wheel are installed here;
# they appear to provide the same `faiss` module -- confirm which one should be kept.
!apt install libomp-dev
!pip install faiss-cpu
!pip install faiss-gpu-cu11==1.8.0.1


import faiss
from imblearn.under_sampling import NearMiss
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import _safe_indexing

class FAISSNearMiss(NearMiss):
    """NearMiss under-sampler (version 1 only) whose neighbour search uses a FAISS HNSW index.

    The k-NN search of the parent class is replaced by an approximate
    ``faiss.IndexHNSWFlat`` lookup; the final selection still goes through the
    inherited ``_selection_dist_based`` helper.

    NOTE(review): the index is built over *all* samples and every sample is searched
    against it, so the neighbours used for ranking are not restricted to minority-class
    samples. This looks like a deviation from the upstream NearMiss-1 definition --
    confirm that it is intended.
    """

    def _fit_resample(self, X, y):
        """Under-sample every class listed in ``self.sampling_strategy_``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; a float32 copy is made for FAISS, the returned rows
            come from ``X`` itself.
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.

        Returns
        -------
        (X_resampled, y_resampled)
            Rows of ``X`` / ``y`` at the kept indices, grouped by class in the order
            of ``np.unique(y)``. The kept indices are also stored in
            ``self.sample_indices_``.

        Raises
        ------
        NotImplementedError
            If ``self.version`` is not 1 and at least one class has to be under-sampled.
        """
        # Fail before the expensive index build rather than after it.
        if self.version != 1 and self.sampling_strategy_:
            raise NotImplementedError('Only version 1 is implemented for FAISSNearMiss')

        # FAISS works on float32 data: convert once and use the same copy for both
        # add() and search().
        X_f32 = np.ascontiguousarray(X, dtype=np.float32)

        # Build the FAISS index (HNSW). A brute-force IndexFlatL2 was tried and was too slow.
        M = 32  # max neighbours per graph node: larger is more accurate but slower
        efConstruction = 1024  # candidate-list size while building: larger is more accurate but slower
        index = faiss.IndexHNSWFlat(X_f32.shape[1], M)
        index.hnsw.efConstruction = efConstruction
        index.add(X_f32)

        # Search k + 1 neighbours for every sample.
        # NOTE(review): the extra neighbour presumably absorbs each sample's own
        # zero-distance self-match; assumes self.n_neighbors is an int -- confirm.
        print(f"[{datetime.now().strftime('%x %X')}] Caculating nearest neighbors by FAISS...")
        distances, nearest_neighbors = index.search(X_f32, k=self.n_neighbors + 1)
        print(f"[{datetime.now().strftime('%x %X')}]   Nearest neighbors shape: {nearest_neighbors.shape}")
        print(f"[{datetime.now().strftime('%x %X')}]   distances shape: {distances.shape}")

        # FAISS L2 indexes report *squared* Euclidean distances. The selection below sums
        # distances per sample, so convert to plain Euclidean distances first; the clamp
        # guards against tiny negative values caused by float round-off.
        distances = np.sqrt(np.maximum(distances, 0.0))

        # No search is done through this object, but the inherited NearMiss code used
        # below still reads self.nn_.n_neighbors, so it has to exist.
        self.nn_ = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs)

        ########################################################################
        # The rest is adapted from the NearMiss source code

        # Start from an empty int array so the concatenated result keeps an integer dtype
        # even when there is nothing to append.
        kept_indices = [np.empty((0,), dtype=int)]

        for target_class in np.unique(y):
            print(f"[{datetime.now().strftime('%x %X')}] processing target class {target_class}...")
            target_class_indices = np.flatnonzero(y == target_class)

            if target_class in self.sampling_strategy_:
                # Positions (within this class) of the samples whose summed neighbour
                # distance is smallest.
                index_target_class = self._selection_dist_based(
                    X,
                    y,
                    distances[target_class_indices],
                    self.sampling_strategy_[target_class],
                    target_class,
                    sel_strategy="nearest",
                )
            else:
                # Classes that are not under-sampled are kept in full.
                index_target_class = slice(None)

            kept_indices.append(target_class_indices[index_target_class])

        idx_under = np.concatenate(kept_indices, axis=0)
        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

# FAISS-backed NearMiss under-sampling of the classes listed in undersample_to
undersampling_method = 'FNM'
start_time = datetime.now()
print(f"[{start_time.strftime('%x %X')}] Start {undersampling_method} undersampling...")

undersampler = FAISSNearMiss(sampling_strategy=undersample_to)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Report elapsed time, resulting shape and per-label counts
print(f'After {undersampling_method} undersampling:')
end_time = datetime.now()
print(
    f"  Time elapsed: {end_time - start_time}. "
    f"[{start_time.strftime('%x %X')} -> {end_time.strftime('%x %X')}]"
)
print(f'  X_resampled.shape: {X_resampled.shape}')
print(f'  Labels: { dict(sorted((int(k), v) for k, v in Counter(y_resampled).items())) }\n')

# Save the results next to the input files, with the method name appended to the stem
X_resampled_file = X_file.parent / f'{X_file.stem}_{undersampling_method}.npy'
y_resampled_file = y_file.parent / f'{y_file.stem}_{undersampling_method}.npy'
np.save(X_resampled_file, X_resampled)
np.save(y_resampled_file, y_resampled)
print(f"[{datetime.now().strftime('%x %X')}] ✅ Saved to {X_resampled_file} & {y_resampled_file.name}")

[0m[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libomp-dev is already the newest version (1:14.0-55~exp2).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
[04/07/25 03:38:34] Start FNM undersampling...
[04/07/25 05:25:20] Caculating nearest neighbors by FAISS...
[04/07/25 05:27:40]   Nearest neighbors shape: (3983422, 4)
[04/07/25 05:27:40]   distances shape: (3983422, 4)
[04/07/25 05:27:41] processing target class 0...
[04/07/25 05:27:41] processing target class 1...
[04/07/25 05:27:41] processing target class 2...
[04/07/25 05:27:41] processing target class 3...
[04/07/25 05:27:41] processing target class 4...
[04/07/25 05:27:42] processing target class 5...
[04/07/25 05:27:42] processing target class 6...
[04/07/25 05:27:42] processing target class 7...
[04/07/25 05:27:42] processing target class 8...
[04/07/25 05:27:43] processing target class 9...
[04/07/25 05:27:43] processing target class 10...
[04/07/25 0