<a href="https://colab.research.google.com/github/funway/nid-imbalance-study/blob/main/preprocessing/pre_process_integrate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 整合数据集，生成 train, validate, test 数据文件

## Google Colab Env

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; every dataset path used by the
# cells below lives under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Modules import & Globals setup

In [None]:
### Modules ###
from pathlib import Path
from datetime import datetime
import numpy as np


### Globals ###
## Data file locations (all under the mounted Google Drive)
dataset = 'CSE-CIC-IDS2018'
_datasets_root = '/content/drive/MyDrive/NYIT/870/datasets'
dataset_folder = f'{_datasets_root}/original/{dataset}/'
preprocessed_folder = f'{_datasets_root}/preprocessed/{dataset}/'
balanced_folder = f'{_datasets_root}/balanced/{dataset}/'

# Scaling variant whose feature files are used by the cells below.
# Supported: standard, minmax, robust, l1pstandard, l1pminmax
# scaling_method = 'standard'
scaling_method = 'l1pstandard'

# Numeric encoding of the Label feature: each class name is mapped to its
# position in this tuple (Benign -> 0, ..., SSH-Bruteforce -> 14).
_label_names = (
    "Benign",
    "Bot",
    "Brute Force -Web",
    "Brute Force -XSS",
    "DDOS attack-HOIC",
    "DDOS attack-LOIC-UDP",
    "DDoS attacks-LOIC-HTTP",
    "DoS attacks-GoldenEye",
    "DoS attacks-Hulk",
    "DoS attacks-SlowHTTPTest",
    "DoS attacks-Slowloris",
    "FTP-BruteForce",
    "Infilteration",
    "SQL Injection",
    "SSH-Bruteforce",
)
label_mapping = {name: code for code, name in enumerate(_label_names)}

## Integration
* 将预处理后生成的多个特征文件 X.npy 与对应的 label.npy 标签文件, 整合到两个文件 all_X.npy 以及 all_label.npy 中
* 同时对 all_X 进行一次纵向裁剪，删除全零值的特征列。理论上删除全零特征列的动作应该是在整合之后、scaling 之前。但我们这里是先 scaling 后整合，所以删除全零列也就放在这里的整合之后了。
* 经过验证，原始文件以及经 StandardScaler、MinMaxScaler 处理后的文件，全零列都是 [32 34 56 57 58 59 60 61] 这 8 列。
* robust, l1pstandard, l1pminmax 处理后的全零列是 [32 33 34 50 56 57 58 59 60 61]，多了 33, 50 两列。



In [None]:
## Integrate the per-file data ##
# For every "<name>_label.npy" found under separated/, load it together with
# the matching "<name>_X_<scaling_method>.npy", stack everything into
# all_X / all_label, then drop the feature columns that are zero in every row.

all_X = []  # per-file feature arrays; concatenated into one array below
all_label = []  # per-file label arrays; concatenated into one array below

label_reg = '*_label.npy'
# Sorted so that the row order of the integrated arrays does not depend on the
# order in which the filesystem happens to list the files.
label_files = sorted((Path(preprocessed_folder) / 'separated/').rglob(label_reg))

# zero_columns[j] is True only while column j is (numerically) zero in every
# file processed so far. Accumulating it per file covers ALL rows of all_X
# without building full-size temporaries on the concatenated array.
zero_columns = None

for label_file in label_files:
    print(f'label_file: {label_file}')
    # Swap the suffix on the file name only, so a parent directory whose name
    # contains "_label.npy" can never be rewritten by accident.
    X_file = label_file.with_name(
        label_file.name[:-len('_label.npy')] + f'_X_{scaling_method}.npy'
    )
    print(f'X_file: {X_file}')

    # Load this file's labels and features
    labels = np.load(label_file)
    X = np.load(X_file)
    print(f'  X.shape: {X.shape}')
    print(f'  labels.shape: {labels.shape}')

    # A column is all-zero overall only if it is all-zero in each file.
    file_zero_columns = np.all(np.isclose(X, 0, atol=1e-15), axis=0)
    if zero_columns is None:
        zero_columns = file_zero_columns
    else:
        zero_columns = zero_columns & file_zero_columns

    # Collect for concatenation
    all_X.append(X)
    all_label.append(labels)

# Merge the per-file arrays into one big array each
all_label = np.concatenate(all_label, axis=0)
all_X = np.concatenate(all_X, axis=0)
print(f'[{datetime.now().strftime("%x %X")}] After Integration, all_X.shape: {all_X.shape}')
print(f'[{datetime.now().strftime("%x %X")}] After Integration, all_label.shape: {all_label.shape}')

# Count the samples of each label
unique_labels, counts = np.unique(all_label, return_counts=True)
# Print the per-label counts
for label, count in zip(unique_labels, counts):
    print(f"标签 {label}: {count} 个样本")

# Remove the columns that are all-zero across the WHOLE integrated data set.
# (The mask must come from every file, not just the last one loaded, otherwise
# a column that is only zero in the last file would be dropped everywhere.)
print(f'[{datetime.now().strftime("%x %X")}] 全零列: {np.flatnonzero(zero_columns)}')
all_X = all_X[:, ~zero_columns]
print(f'[{datetime.now().strftime("%x %X")}] 删除全零值的列后, all_X.shape: {all_X.shape}')

# Saving the untrimmed arrays is disabled; re-enable the lines below to write them.
# np.save(Path(preprocessed_folder) / f'integrated/all_X_{scaling_method}.npy', all_X)
# np.save(Path(preprocessed_folder) / f'integrated/all_label_{scaling_method}.npy', all_label)
# print('✅ Saved')


label_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-28-02-2018_TrafficForML_CICFlowMeter_label.npy
X_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-28-02-2018_TrafficForML_CICFlowMeter_X_l1pstandard.npy
  X.shape: (606902, 78)
  labels.shape: (606902,)
label_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-21-02-2018_TrafficForML_CICFlowMeter_label.npy
X_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-21-02-2018_TrafficForML_CICFlowMeter_X_l1pstandard.npy
  X.shape: (1048575, 78)
  labels.shape: (1048575,)
label_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-14-02-2018_TrafficForML_CICFlowMeter_label.npy
X_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/separated/Wednesday-14-02-2018_TrafficForML_CICFlowMeter_X_l1pstand

## Trim
*   由于 CSE-CIC-IDS2018 数据集数据量太大，所以需要对整合后的数据文件进行裁剪
*   对于不同标签值的样本，保留的数据从原始数据中随机抽取，而不是顺序抽取



In [None]:
## Trim the integrated data and save the result ##

print(f'[{datetime.now().strftime("%x %X")}] 裁剪整合后的数据文件. Before Trim, all_X.shape: {all_X.shape}')

# Upper bound on the number of samples kept for each listed label (key -> cap).
# Labels 1 and 3 have fewer than 300,000 samples to begin with, so in practice
# only label 0 is actually reduced.
trim_to = {0:2000000, 1:300000, 3:300000}

# Per-label pieces of the trimmed data set; concatenated further down
trimed_X = []
trimed_label = []

# Capped labels: draw a random subset (not the first rows) for each of them
for label, cap in trim_to.items():
    print(f'Triming label: {label}')

    # Row positions of every sample carrying this label
    candidates = np.nonzero(all_label == label)[0]

    # Never ask for more rows than the label actually has
    keep_count = min(candidates.shape[0], cap)

    # Re-seed before every draw so each run selects exactly the same rows
    np.random.seed(42)
    picked = np.random.choice(candidates, keep_count, replace=False)

    trimed_X.append(all_X[picked])
    trimed_label.append(all_label[picked])

# Labels without an entry in trim_to are kept in full
remaining_labels = np.unique(all_label)
for label in remaining_labels:
    if label in trim_to:
        continue
    print(f'Keep all data of label: {label}')
    rows = np.nonzero(all_label == label)[0]
    trimed_X.append(all_X[rows])
    trimed_label.append(all_label[rows])

# Stitch the per-label pieces back together
trimed_X = np.concatenate(trimed_X, axis=0)
trimed_label = np.concatenate(trimed_label, axis=0)

# Report the shapes after trimming
print(f"[{datetime.now().strftime('%x %X')}] 裁剪后的 X 数据形状: {trimed_X.shape}")
print(f"[{datetime.now().strftime('%x %X')}] 裁剪后的 label 数据形状: {trimed_label.shape}")
# Report how many samples each label ended up with
unique_labels, counts = np.unique(trimed_label, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f"标签 {label}: {count} 个样本")

# Write the trimmed arrays as .npy files
integrated_dir = Path(preprocessed_folder) / 'integrated'
np.save(integrated_dir / f'trimed_X_{scaling_method}.npy', trimed_X)
np.save(integrated_dir / f'trimed_label_{scaling_method}.npy', trimed_label)
print(f'[{datetime.now().strftime("%x %X")}] ✅ Saved')

[04/20/25 04:56:39] 裁剪整合后的数据文件. Before Trim, all_X.shape: (9320035, 68)
Triming label: 0
Triming label: 1
Triming label: 3
Keep all data of label: 2
Keep all data of label: 4
Keep all data of label: 5
Keep all data of label: 6
Keep all data of label: 7
Keep all data of label: 8
Keep all data of label: 9
Keep all data of label: 10
Keep all data of label: 11
Keep all data of label: 12
Keep all data of label: 13
Keep all data of label: 14
[04/20/25 04:56:48] 裁剪后的 X 数据形状: (4746934, 68)
[04/20/25 04:56:48] 裁剪后的 label 数据形状: (4746934,)
标签 0: 2000000 个样本
标签 1: 286191 个样本
标签 2: 611 个样本
标签 3: 230 个样本
标签 4: 686012 个样本
标签 5: 1730 个样本
标签 6: 576191 个样本
标签 7: 41508 个样本
标签 8: 461912 个样本
标签 9: 139890 个样本
标签 10: 10990 个样本
标签 11: 193354 个样本
标签 12: 160639 个样本
标签 13: 87 个样本
标签 14: 187589 个样本
[04/20/25 04:57:02] ✅ Saved


## Split to Train, Valid, Test
*   将整合好的数据集 **按比例** 分割为 **Train**(训练集), **Valid**(验证集), **Test**(测试集)
*   比例暂定为 Train: 80%, Valid: 10%, Test: 10%



In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter


def _label_counts(y):
    """Return {label (as int): number of samples}, ordered by label value."""
    return {int(k): v for k, v in sorted(Counter(y).items())}


# The trimmed data set is read from, and the three splits are written to, this folder
integrated_dir = Path(preprocessed_folder) / 'integrated'
X_file = integrated_dir / f'trimed_X_{scaling_method}.npy'
label_file = integrated_dir / f'trimed_label_{scaling_method}.npy'

print(f'X_file: {X_file}')
print(f'label_file: {label_file}')

# Load the complete trimmed data set
X = np.load(X_file)
label = np.load(label_file)
print(f'Original X.shape: {X.shape}')
print(f'Original labels: {_label_counts(label)}\n')

# Step 1: 80% train / 20% hold-out, stratified by label
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, label, test_size=0.2, random_state=42, stratify=label
)

# Step 2: halve the hold-out into validation and test (10% / 10% of the whole)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)
# The intermediate hold-out arrays are no longer needed
del X_holdout, y_holdout

# Report and save each split: <split>_X_<scaling>.npy and <split>_label_<scaling>.npy
for split_name, X_part, y_part in (
    ('train', X_train, y_train),
    ('valid', X_valid, y_valid),
    ('test', X_test, y_test),
):
    print(f'X_{split_name}.shape: {X_part.shape}')
    print(f'y_{split_name} labels: {_label_counts(y_part)}\n')
    np.save(integrated_dir / f'{split_name}_X_{scaling_method}.npy', X_part)
    np.save(integrated_dir / f'{split_name}_label_{scaling_method}.npy', y_part)

print(f'[{datetime.now().strftime("%x %X")}] ✅ Saved')

X_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/integrated/trimed_X_l1pstandard.npy
label_file: /content/drive/MyDrive/NYIT/870/datasets/preprocessed/CSE-CIC-IDS2018/integrated/trimed_label_l1pstandard.npy
Original X.shape: (4746934, 68)
Original labels: {0: 2000000, 1: 286191, 2: 611, 3: 230, 4: 686012, 5: 1730, 6: 576191, 7: 41508, 8: 461912, 9: 139890, 10: 10990, 11: 193354, 12: 160639, 13: 87, 14: 187589}

X_train.shape: (3797547, 68)
y_train labels: {0: 1600000, 1: 228953, 2: 489, 3: 184, 4: 548809, 5: 1384, 6: 460953, 7: 33206, 8: 369530, 9: 111912, 10: 8792, 11: 154683, 12: 128511, 13: 70, 14: 150071}

X_valid.shape: (474693, 68)
y_valid labels: {0: 200000, 1: 28619, 2: 61, 3: 23, 4: 68601, 5: 173, 6: 57619, 7: 4151, 8: 46191, 9: 13989, 10: 1099, 11: 19335, 12: 16064, 13: 9, 14: 18759}

X_test.shape: (474694, 68)
y_test labels: {0: 200000, 1: 28619, 2: 61, 3: 23, 4: 68602, 5: 173, 6: 57619, 7: 4151, 8: 46191, 9: 13989, 10: 1099, 11: 19336, 12: 16064