In [None]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter
import xgboost as xgb
from sklearn.metrics import accuracy_score ,confusion_matrix ,precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [None]:
# Load the full dataset from its Parquet snapshot.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
parquet_path = 'D:/碩士機器學習/data/特徵87/600000數據實驗/df_all2.parquet'
df_all = pd.read_parquet(parquet_path)

In [None]:
# Separate the target column ('Label') from the remaining feature columns,
# then show both shapes as a quick sanity check.
all_labels = df_all['Label']
all_feature = df_all.drop(columns=['Label'])
print(all_feature.shape, all_labels.shape)

In [None]:
# Inspect the class distribution: number of rows per label, smallest count first.
label_counts = all_labels.value_counts(ascending=True)
print(label_counts)

#unique_values = all_feature["SimillarHTTP"].unique()
#print(unique_values)


In [None]:
# Convert non-numeric data to integer codes.
# The target labels get their own encoder, kept in `le`.
le = LabelEncoder()
all_label_tran = le.fit_transform(all_labels)

# Each listed feature column is re-encoded in place with a fresh, throw-away encoder.
# NOTE(review): 'Flow ID', the IP columns and 'Timestamp' look like per-flow identifiers;
# confirm they are meant to be used as model features.
for column_name in ('Source IP', 'Destination IP', 'Timestamp', 'Flow ID', 'SimillarHTTP'):
    all_feature[column_name] = LabelEncoder().fit_transform(all_feature[column_name])

In [None]:
# Feature subsets for experiments (original note: "XGBoost top-20 features").
# The 5- and 3-column subsets are the leading entries of the 20-column list.
top20_columns = [
    'Destination Port', 'Flow ID', 'Source Port', 'Timestamp', 'Fwd Header Length',
    'Flow Bytes/s', 'Fwd Seg Size Min', 'Flow Duration', 'Fwd Packets Length Total',
    'Fwd Header Length.1', 'Flow IAT Min', 'Packet Length Min', 'Fwd Packet Length Min',
    'Fwd IAT Min', 'Fwd Packet Length Max', 'Flow IAT Mean', 'Fwd Packet Length Mean',
    'Flow Packets/s', 'Avg Packet Size', 'Source IP',
]
xgboost_feature_20 = all_feature[top20_columns]
xgboost_feature_5 = all_feature[top20_columns[:5]]
xgboost_feature_3 = all_feature[top20_columns[:3]]

In [None]:
# Min-max normalisation of every feature column.
# To normalise a subset instead, swap `all_feature` for e.g. `xgboost_feature_5`
# in both places below.
# NOTE(review): the scaler is fit on the whole dataset, i.e. before any train/test
# split — confirm this is intended.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(all_feature)
normalized_features = pd.DataFrame(scaled_values, columns=all_feature.columns)
print(normalized_features)

In [None]:
# Feature overview: column names, non-null counts and dtypes.
df = pd.DataFrame(all_feature)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Random hold-out split: 10% of the rows go to the test set, fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    normalized_features,
    all_label_tran,
    test_size=0.1,
    random_state=0,
)
# Shapes of the four pieces, in the order they were returned.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
# Inspect the test split: number of test samples per encoded class id
# (position i of the printed array is the count for class i).
class_counts = np.bincount(y_test)
print(class_counts)

In [None]:
# Build the XGBClassifier model from an explicit hyper-parameter table.
xgb_params = {
    'n_estimators': 110,
    'max_depth': 6,
    'booster': 'gbtree',
    'subsample': 1,               # library default
    'colsample_bytree': 0.7,      # NOTE: not the default (XGBoost documents 1 as the default)
    'learning_rate': 0.2,         # learning rate
    'num_class': 12,              # NOTE(review): hard-coded; presumably the number of distinct labels — confirm
    'objective': 'multi:softmax',
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'mlogloss',    # alternatives noted by the author: mlogloss, merror
    # 'early_stopping_rounds': 10,  (disabled)
    'random_state': 10,
}
bst = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train the model on the training split.
# `eval_set` lists the (features, labels) pairs intended for per-boosting-round
# evaluation; it is built here but currently NOT passed to fit() (argument disabled).
eval_set = [
    (x_train, y_train),
    (x_test, y_test),
]
xgboost = bst.fit(X=x_train, y=y_train)  # eval_set=eval_set

In [None]:
# Predict class ids for the held-out TEST split with the trained model
# (the original comment said "training data", but the input here is x_test).
preds = bst.predict(x_test)

In [None]:
# Overall accuracy of the test-set predictions, as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Model score on each split (train first, then test) to compare fit vs. generalisation.
for split_name, features, targets in (
    ('訓練集: ', x_train, y_train),
    ('測試集: ', x_test, y_test),
):
    print(split_name, bst.score(features, targets))

In [None]:
# Weighted-average precision, recall and F1 (harmonic mean of precision and recall)
# of the test-set predictions, computed in that order.
precision, recall, f1 = (
    score_fn(y_test, preds, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Report each metric as a percentage with two decimals.
for template, value in (
    ("Precision: %.2f%%", precision),
    ("Recall: %.2f%%", recall),
    ("F1-score: %.2f%%", f1),
):
    print(template % (value * 100.0))

In [None]:
# Confusion-matrix heat map for the test-set predictions.
rcParams['font.family'] = 'Microsoft JhengHei'  # font that can render the CJK title

import seaborn as sns

# Display names for the encoded class ids 0..11, in encoder order.
# NOTE(review): hard-coded; presumably identical to le.classes_ — confirm.
cm_label = ['Benign', 'DrDoS_DNS', 'DrDoS_LDAP', 'DrDoS_MSSQL', 'DrDoS_NTP', 'DrDoS_NetBIOS', 'DrDoS_SNMP', 'DrDoS_SSDP', 'DrDoS_UDP', 'Syn', 'TFTP', 'UDP-lag']

# Pin the label set so the matrix is always len(cm_label) x len(cm_label), even if
# some class is absent from both y_test and preds; otherwise the tick labels below
# would no longer line up with the matrix.
cm = confusion_matrix(y_test, preds, labels=np.arange(len(cm_label)))

plt.figure(figsize=(16, 13))
# vmax caps the colour scale at 10000 so very large cells do not wash out the rest.
cms = sns.heatmap(cm, annot=True, cmap='Blues', fmt='d',annot_kws={"size": 10, "verticalalignment": "center", "horizontalalignment": "center"},vmin=0, vmax=10000)
# confusion_matrix(y_true, y_pred) puts TRUE labels on rows and PREDICTED labels on
# columns; the heat map draws rows along the y axis and columns along the x axis.
# The two axis titles were previously swapped.
plt.xlabel('Predicted labels',fontsize=15)
plt.ylabel('Y_test labels',rotation=90,fontsize=15)
plt.title('Confusion Matrix(第三次實驗)',fontsize=20)
cms.set_xticklabels(cm_label, rotation=45, ha='right',fontsize=14)
cms.set_yticklabels(cm_label, rotation=0,fontsize=14)
plt.show()

In [None]:
# Detailed per-class report for the test-set predictions.
print("\n precision、recall、f1詳細資訊")
classification = classification_report(y_true=y_test, y_pred=preds)
print(classification)

In [None]:
# Plot feature importance (top 20 features by 'weight').
rcParams['font.family'] = 'Microsoft JhengHei'  # font that can render the CJK title
plt.rcParams['font.size'] = 9

# Create the figure/axes explicitly and hand the axes to plot_importance.
# Without `ax=`, plot_importance opens its own new figure, so the figure created
# beforehand stayed empty ("Figure ... with 0 Axes") and its size/DPI settings never
# applied to the chart. The former 20x130-inch canvas at 300 DPI was never drawn on
# and would be impractically large, so a moderate size is used here.
fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
xgb.plot_importance(bst, ax=ax, importance_type='weight', max_num_features=20, height=0.5)
ax.set_title('XGBoost 特徵重要性（第三次實驗）', fontsize=12)
fig.tight_layout()
#fig.savefig('xgb_feature_importance.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Correlation matrix heat map for a hand-picked group of features.
import seaborn as sns

# Ordered list of 20 feature names; the 15/10/5/4-column frames below are its
# leading entries.
selected_features = [
    "Destination Port", "Flow ID", "Source Port", "Timestamp", "Flow Bytes/s",
    "Fwd Seg Size Min", "Fwd Packets Length Total", "Flow Duration", "Flow IAT Min",
    "Fwd Packet Length Max", "Packet Length Min", "Packet Length Max", "Flow IAT Std",
    "Fwd IAT Std", "Fwd Packet Length Min", "Avg Packet Size", "Flow IAT Max",
    "Fwd Packet Length Mean", "Fwd IAT Min", "Flow Packets/s",
]
# Eight columns actually shown in the heat map (not a simple prefix of the list above).
selected_features_1 = all_feature[[
    "Destination Port", "Flow ID", "Source Port", "Timestamp", "Flow Bytes/s",
    "Fwd Seg Size Min", "Fwd Packets Length Total", "Flow IAT Min",
]]
# NOTE(review): `xgboost_feature_5` is (re)assigned here; check it does not clash with
# a same-named subset defined elsewhere in the notebook.
xgboost_feature_15 = all_feature[selected_features[:15]]
xgboost_feature_10 = all_feature[selected_features[:10]]
xgboost_feature_5 = all_feature[selected_features[:5]]
xgboost_feature_4 = all_feature[selected_features[:4]]

corr_matrix = selected_features_1.corr()

heatmap_options = dict(
    annot=True,               # write the coefficient in every cell
    fmt=".2f",                # two decimals
    cmap='coolwarm',          # colour scheme
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)

plt.title("Feature Correlation Matrix", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Drop one feature out of every highly correlated pair among `selected_features`.
# (pandas / numpy are already imported in the notebook's import cell.)

# Absolute-correlation cut-off. The code has always used 0.9; the old comment
# claimed 0.8, so the value now lives in one named place.
CORR_THRESHOLD = 0.9

# Absolute correlation matrix of the selected features.
corr_matrix = all_feature[selected_features].corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair is
# inspected once; the masked-out cells become NaN and never exceed the threshold.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when it correlates above the threshold with any column that
# precedes it in `selected_features`.
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Remove those features and show the reduced frame.
df_reduced = all_feature[selected_features].drop(columns=to_drop)
print(df_reduced)

In [None]:
# Dump the structure of every tree in the trained booster as text and print each one.
# NOTE(review): this can produce a very large amount of output.
booster = bst.get_booster()
trees = booster.get_dump()
for tree in trees:
    print(tree)

In [None]:
# Plot the first tree of the model (currently disabled).
#plt.figure(figsize=(300, 300), dpi=300)
#xgb.plot_tree(bst, num_trees=0)
#plt.show()

In [None]:
# Print the feature names, ordered by importance.
# 'weight' importance scores as returned by the booster: feature name -> score.
importance = bst.get_booster().get_score(importance_type='weight')

# Two-column table (name, score), highest score first.
feature_importance_df = pd.DataFrame(
    {
        'Feature': list(importance.keys()),
        'Importance': list(importance.values()),
    }
).sort_values(by='Importance', ascending=False)

print("特徵名稱（依重要性排序）：")
for feature_name in feature_importance_df['Feature']:
    print(f" {feature_name}")