In [1]:
import pandas as pd

# Load the flow table the rest of the notebook works from (CIC-IDS2017, per the file name).
raw_csv_path = "D:/dataset/cleaned_improved_cicids2017.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split

# Traffic classes kept for this experiment (the benign class plus eight attack types).
selected_classes = [
    "BENIGN",
    "FTP-Patator", "SSH-Patator",
    "DoS Hulk", "DoS Slowhttptest", "DoS GoldenEye", "DoS Slowloris",
    "Portscan",
    "DDoS",
]

# Keep only rows whose `Label` is in the list above; copy so `df` itself is never modified.
label_mask = df['Label'].isin(selected_classes)
df_filtered = df[label_mask].copy()

# Features are every column except `Label`; the target stays as the raw string labels.
y = df_filtered['Label']
X = df_filtered.drop(columns=['Label'])

# Stratified 80/20 split made BEFORE oversampling, so the test split holds only original rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Per-class counts of the training split; reused below to size the oversampling targets.
target_stats = Counter(y_train)
print("Before SMOTE (Train):", target_stats)
print("Before SMOTE (Test):", Counter(y_test))  # the test split must stay untouched

# Number of BENIGN rows in the training split (reference size for the targets).
benign_count = target_stats["BENIGN"]

# Target size for every attack class: 50% of the BENIGN training count.
# SMOTE can only add samples, so a class already at or above the target is left out.
new_count = int(benign_count * 0.5)
sampling_strategy = {
    label: new_count
    for label, current in target_stats.items()
    if label != "BENIGN" and new_count > current
}

print("SMOTE Sampling Strategy:", sampling_strategy)

# Oversample the training split only.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts after oversampling; the test counts are printed again to show they did not change.
print("After SMOTE (Train):", Counter(y_train_resampled))
print("After SMOTE (Test - Unchanged):", Counter(y_test))

# Re-attach the labels to build the final train / test tables.
df_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns).assign(
    Label=y_train_resampled
)
df_test = pd.DataFrame(X_test, columns=X.columns).assign(Label=y_test)

# Final shapes.
print("Final Train Data:", df_train_resampled.shape)
print("Final Test Data:", df_test.shape)

# Persist both tables (optional step).
df_train_resampled.to_csv("train_resampled.csv", index=False)
df_test.to_csv("test_original.csv", index=False)


Before SMOTE (Train): Counter({'BENIGN': 1275631, 'Portscan': 127253, 'DoS Hulk': 126774, 'DDoS': 76115, 'DoS GoldenEye': 6054, 'FTP-Patator': 3178, 'DoS Slowloris': 3087, 'SSH-Patator': 2369, 'DoS Slowhttptest': 1392})
Before SMOTE (Test): Counter({'BENIGN': 318909, 'Portscan': 31813, 'DoS Hulk': 31694, 'DDoS': 19029, 'DoS GoldenEye': 1513, 'FTP-Patator': 794, 'DoS Slowloris': 772, 'SSH-Patator': 592, 'DoS Slowhttptest': 348})
SMOTE Sampling Strategy: {'DDoS': 637815, 'DoS Hulk': 637815, 'DoS GoldenEye': 637815, 'Portscan': 637815, 'FTP-Patator': 637815, 'SSH-Patator': 637815, 'DoS Slowloris': 637815, 'DoS Slowhttptest': 637815}
After SMOTE (Train): Counter({'BENIGN': 1275631, 'DDoS': 637815, 'DoS Hulk': 637815, 'DoS GoldenEye': 637815, 'Portscan': 637815, 'FTP-Patator': 637815, 'SSH-Patator': 637815, 'DoS Slowloris': 637815, 'DoS Slowhttptest': 637815})
After SMOTE (Test - Unchanged): Counter({'BENIGN': 318909, 'Portscan': 31813, 'DoS Hulk': 31694, 'DDoS': 19029, 'DoS GoldenEye': 151

<hr>

In [3]:
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter

# Traffic classes kept for this experiment (the benign class plus eight attack types).
selected_classes = [
    "BENIGN",
    "FTP-Patator", "SSH-Patator",
    "DoS Hulk", "DoS Slowhttptest", "DoS GoldenEye", "DoS Slowloris",
    "Portscan",
    "DDoS",
]

# Keep only rows whose `Label` is in the list above; copy so `df` itself is never modified.
label_mask = df['Label'].isin(selected_classes)
df_filtered = df[label_mask].copy()

# Features are every column except `Label`; labels are kept as raw strings (no encoding here).
y = df_filtered['Label']
X = df_filtered.drop(columns=['Label'])

# Per-class counts of the whole filtered set; reused below to size the oversampling targets.
target_stats = Counter(y)
print("Before SMOTE:", target_stats)

# Number of BENIGN rows (reference size for the targets).
benign_count = target_stats["BENIGN"]

# Target size for every attack class: 10% of the BENIGN count.
# SMOTE can only add samples, so a class already at or above the target is left out.
new_count = int(benign_count * 0.1)
sampling_strategy = {
    label: new_count
    for label, current in target_stats.items()
    if label != "BENIGN" and new_count > current
}

print("SMOTE Sampling Strategy:", sampling_strategy)

# Oversample only the classes listed in `sampling_strategy`.
# NOTE(review): SMOTE is fitted on ALL filtered rows here, with no hold-out split.
# If a test set is later carved out of `df_resampled`, synthetic rows interpolated
# from test-side originals can end up in training -- confirm how downstream code splits.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class counts after oversampling.
print("After SMOTE:", Counter(y_resampled))

# Re-attach the labels to build the final table.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(Label=y_resampled)

# Preview the first rows as the cell output.
df_resampled.head()



Before SMOTE: Counter({'BENIGN': 1594540, 'Portscan': 159066, 'DoS Hulk': 158468, 'DDoS': 95144, 'DoS GoldenEye': 7567, 'FTP-Patator': 3972, 'DoS Slowloris': 3859, 'SSH-Patator': 2961, 'DoS Slowhttptest': 1740})
SMOTE Sampling Strategy: {'FTP-Patator': 159454, 'SSH-Patator': 159454, 'DoS Slowloris': 159454, 'DoS Slowhttptest': 159454, 'DoS Hulk': 159454, 'DoS GoldenEye': 159454, 'Portscan': 159454, 'DDoS': 159454}
After SMOTE: Counter({'BENIGN': 1594540, 'FTP-Patator': 159454, 'SSH-Patator': 159454, 'DoS Slowloris': 159454, 'DoS Slowhttptest': 159454, 'DoS Hulk': 159454, 'DoS GoldenEye': 159454, 'Portscan': 159454, 'DDoS': 159454})


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.0,0.0,...,22509459,17,12685486.0,5296658.0,20694308,6499982,0,0,0,BENIGN
1,17,65511209,6,6,288,288,48,48,48.0,0.0,...,1506210,1506210,64004884.0,0.0,64004884,64004884,0,0,0,BENIGN
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,18833050.0,48523116,5463561,0,0,0,BENIGN
3,17,67037196,8,8,384,384,48,48,48.0,0.0,...,11034681,11034681,55956316.0,0.0,55956316,55956316,0,0,0,BENIGN
4,17,68045057,8,8,384,384,48,48,48.0,0.0,...,11043596,11043596,56943904.0,0.0,56943904,56943904,0,0,0,BENIGN


In [4]:
# Rebind `df` to the oversampled frame so the cells below operate on it.
# This is a plain assignment, not a copy: `df` and `df_resampled` now name the same
# object, and the originally loaded table is no longer reachable through `df`.
df = df_resampled

In [5]:
# Write the oversampled table to disk without the index column
# (this cell sits before the label-encoding cell).
smote_csv_path = 'D:/dataset/0219_Paper_Dataset/cic_ids_smote.csv'
df.to_csv(smote_csv_path, index=False)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names in `Label` as integer codes, in place on `df`.
#
# The dtype guard makes this cell safe to re-run. Without it, a second execution would
# re-fit the encoder on the already-encoded integers, and the class-name -> code mapping
# held in `label_encoder.classes_` would be replaced by a meaningless int -> int one.
if not pd.api.types.is_numeric_dtype(df['Label']):
    label_encoder = LabelEncoder()
    df['Label'] = label_encoder.fit_transform(df['Label'])

    # Show which integer stands for which class so the counts printed below are readable.
    print({name: code for code, name in enumerate(label_encoder.classes_.tolist())})

# Per-class row counts of the encoded column.
print(df['Label'].value_counts())

Label
0    1594540
6     159454
8     159454
5     159454
4     159454
3     159454
2     159454
7     159454
1     159454
Name: count, dtype: int64


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Split the table into feature columns and the class labels.
# `labels` is expected to hold the integer codes produced by the label-encoding cell;
# the scatter plot below uses them directly as colour values.
tmp = df.drop(labels = 'Label',axis=1)
labels = df['Label']

# 1. Standardise every feature (zero mean, unit variance) before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tmp)

# 2. Project onto the leading principal components.
n_components = 25  # number of components to keep
# random_state is pinned like every other stochastic step in this notebook: when
# scikit-learn's 'auto' solver selects the randomized SVD, an unseeded PCA yields
# slightly different components (and a different saved CSV) on every run.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)


# 3. Cumulative explained-variance ratio over the kept components.
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
print("누적 분산 비율 (cumulative explained variance):")
print(explained_variance_ratio)

# 4. Wrap the projection in a DataFrame (PC1..PCn) and attach the labels positionally
#    (`labels.values` drops the index, so rows are matched by order, not by index).
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(n_components)])
df_merged = X_pca_df.assign(label=labels.values)

# 5. Report the dtype of the attached label column.
print("Label 데이터 타입:", df_merged['label'].dtype)

# 6. Sanity check that the label order survived the merge. This holds by construction,
#    because `label` was just assigned from `labels.values`.
assert np.array_equal(labels.values, df_merged['label'].values), "Label 순서가 일치하지 않습니다."

# 7. Scatter plot of the first two principal components, coloured by class code.
import matplotlib.pyplot as plt
if n_components >= 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.5)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA 결과 시각화')
    plt.colorbar(label='Label')
    plt.show()

In [None]:
# Display the PCA-projected table (the PC columns plus `label`) as the cell output.
df_merged

In [None]:
# Write the PCA-projected table (PC columns plus `label`) to disk without the index column.
# NOTE(review): the file name says "smote03", while the oversampling ratio visible in this
# notebook is 0.1 -- confirm the name matches the data actually written.
pca_csv_path = 'cic_ids_smote03_pca25.csv'
df_merged.to_csv(pca_csv_path, index=False)