In [None]:
import pandas as pd
from itertools import product
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical #此套件不能用字串方式使用
from keras.models import Sequential, load_model
from keras.layers import Dropout,Dense, LSTM
from sklearn.metrics import confusion_matrix ,precision_score, recall_score, f1_score
import seaborn as sns
from sklearn.metrics import classification_report
from keras import regularizers
import time

In [None]:
def LSTM_all(df_all, number_feature=4, timesteps=10, d1= 0.3 ,d2 = 0.1, d3 = 0.1,
             epochs=30, batch_size=2000):
    """Train and evaluate an LSTM classifier on sliding windows of flow features.

    The first ``number_feature`` columns of a fixed feature list are selected,
    label-encoded where they are categorical, min-max scaled, and cut into
    overlapping windows of ``timesteps`` consecutive rows. Each window is paired
    with the one-hot label of the row that immediately follows it.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain a ``'Label'`` column plus the selected feature columns.
    number_feature : int
        How many leading entries of the feature list to use. Values above 20
        are clamped to 20 and values below 4 are clamped to 4 (a warning is
        printed in both cases).
    timesteps : int
        Number of consecutive rows in each LSTM input window.
    d1, d2, d3 : float
        Dropout rates applied after the LSTM layer, the first Dense layer and
        the second Dense layer respectively.
    epochs, batch_size : int
        Forwarded to ``model.fit``. The defaults reproduce the values that were
        previously hard-coded.

    Returns
    -------
    tuple
        ``(train_history, y_test, predict, model, timestop, method, loss)``:
        the Keras fit history, the one-hot test labels, the predicted class
        probabilities for the test split, the fitted model, the seconds elapsed
        from model construction through test prediction, a dict with the
        ``accuracy``/``precision``/``recall``/``f1`` scores (weighted averages
        for the last three), and the value returned by ``model.evaluate`` on
        the test split.

    Raises
    ------
    ValueError
        If ``timesteps`` is smaller than 1 or the frame has no more than
        ``timesteps`` rows, so that no window can be built.
    """
    # Clamp the feature count to the supported range [4, 20].
    if number_feature > 20:
        print(f"警告：number_feature={number_feature} 不合法，已自動設定為 20")
        number_feature = 20
    elif number_feature < 4:
        print(f"警告：number_feature={number_feature} 不合法，已自動設定為 4")
        number_feature = 4

    # 1. Encode the class labels as integer ids and one-hot encode them.
    label_ids = LabelEncoder().fit_transform(df_all['Label'])
    train_label_onehot = to_categorical(label_ids)
    # Size the softmax layer from the data instead of assuming 12 classes.
    n_classes = train_label_onehot.shape[1]

    # 2. Fixed feature list; the first `number_feature` entries are used.
    #    NOTE(review): presumably ordered by feature importance - confirm.
    feature_name_number = ["Destination Port", "Flow ID", "Source Port", "Timestamp", "Flow Bytes/s",
                            "Fwd Seg Size Min", "Fwd Packets Length Total", "Flow Duration", "Flow IAT Min", "Fwd Packet Length Max",
                            "Packet Length Min", "Packet Length Max", "Flow IAT Std", "Fwd IAT Std", "Fwd Packet Length Min",
                            "Avg Packet Size", "Flow IAT Max", "Fwd Packet Length Mean", "Fwd IAT Min", "Flow Packets/s"]
    selected_columns = feature_name_number[:number_feature]
    # Copy only the columns that are needed rather than the whole frame.
    selected_features = df_all[selected_columns].copy()

    # 3. Label-encode the non-numeric columns, but only those actually selected;
    #    encoding the others would be wasted work on a large frame.
    categorical_columns = ('Source IP', 'Destination IP', 'Timestamp', 'Flow ID', 'SimillarHTTP')
    for column in categorical_columns:
        if column in selected_features.columns:
            selected_features[column] = LabelEncoder().fit_transform(selected_features[column])

    # 4. Scale every selected feature to [0, 1].
    #    NOTE(review): the scaler is fitted on all rows before the train/test
    #    split, and the overlapping windows below are shuffled across that
    #    split, so the test metrics are likely optimistic. Left unchanged so
    #    that earlier results stay comparable.
    normalized_values = MinMaxScaler().fit_transform(selected_features)

    # 5. Build the sliding windows. Window i covers rows [i, i + timesteps) and
    #    is labelled with row i + timesteps (the row right after the window).
    n_windows = len(normalized_values) - timesteps
    if timesteps < 1 or n_windows <= 0:
        raise ValueError(
            f"Cannot build windows: timesteps={timesteps}, rows={len(normalized_values)}")
    # sliding_window_view returns (n_rows - timesteps + 1, n_features, timesteps)
    # as a zero-copy view; drop the last window (it has no following row to
    # label) and move the time axis to the middle for Keras.
    all_windows = np.lib.stride_tricks.sliding_window_view(normalized_values, timesteps, axis=0)
    reshaped_feature = all_windows[:n_windows].transpose(0, 2, 1)
    reshaped_label = train_label_onehot[timesteps:]

    # 6. Random 90/10 train/test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(reshaped_feature,reshaped_label, test_size=0.1,random_state=85)

    # 7. Build the network; the timer starts here and stops after prediction.
    starttime = time.time()
    model = Sequential()
    model.add(LSTM(input_shape=(timesteps, len(selected_columns)),units=256, unroll=False,
                    kernel_initializer='glorot_normal', activation='tanh',recurrent_dropout=0.0))
    # Dropout layers interleaved with the fully connected stack.
    model.add(Dropout(d1))
    model.add(Dense(units=128, activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(d2))
    model.add(Dense(units=64, activation='relu',kernel_initializer='he_normal'))
    model.add(Dropout(d3))
    model.add(Dense(units=32, activation='relu',kernel_initializer='he_normal'))
    # Output layer: one softmax unit per class found in the data.
    model.add(Dense(units=n_classes, kernel_initializer='glorot_uniform', activation='softmax'))

    # 8. Compile and train; 10% of the training split is held out for validation.
    model.compile(loss="categorical_crossentropy",optimizer = "adam", metrics = ['accuracy'])
    train_history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split = 0.1, verbose =1)

    # 9. Predict on the test split and stop the timer.
    predict = model.predict(x_test)
    timestop = time.time() - starttime

    # 10. Evaluate on the test split and compute the summary scores.
    loss = model.evaluate(x_test, y_test)
    y_true = np.argmax(y_test, axis = 1)
    y_pred = np.argmax(predict, axis = 1)
    method = {
        "accuracy" : accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "recall": recall_score(y_true, y_pred, average='weighted'),
        "f1": f1_score(y_true, y_pred, average='weighted')
    }
    return train_history, y_test, predict, model, timestop, method, loss

In [None]:
# Load the dataset once; every configuration below trains on this same frame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
d = pd.read_parquet('D:/碩士機器學習/data/特徵87/600000數據實驗/df_all2.parquet')

# Grid of (feature count, window length) configurations to train.
number_features_option = [20, 15, 10, 5, 4] 
timestep_option = [15, 10, 5] 
total = len(number_features_option)*len(timestep_option)

# One entry per configuration, in grid order (timesteps varies fastest).
results = []
run_no = 0
for number_features in number_features_option:
    for timestep in timestep_option:
        run_no += 1
        print(f"{run_no} / {total}:featues = {number_features}, timesteps = {timestep}")
        outcome = LSTM_all(d, number_feature=number_features, timesteps=timestep)
        train_history, y_test, predict, model, timestop, method, loss = outcome
        # Keep everything the reporting cells further down need for this run.
        results.append(dict(
            features=number_features,
            timesteps=timestep,
            train_history=train_history,
            y_test=y_test,
            predict=predict,
            loss=loss,
            model=model,
            timestop=timestop,
            method=method,
        ))

In [None]:
# Per-run timing plus test-set loss and accuracy.
# results[i]['loss'] is the list returned by model.evaluate; the model is compiled
# with metrics=['accuracy'], so index 0 is the cross-entropy loss and index 1 the
# accuracy. Only the accuracy is a ratio, so only it is shown as a percentage.
for i, run in enumerate(results):
    test_loss, test_accuracy = run['loss'][0], run['loss'][1]
    print(f"第{i+1}筆訓練時間: {run['timestop']:.1f} 秒 {run['features']}F-{run['timesteps']}T loss={test_loss:.4f} acc={test_accuracy*100:.3f}%")

In [None]:
# Show the architecture of the last configuration that was trained. Read it from
# `results` rather than relying on the `model` variable left over from the loop.
results[-1]["model"].summary()

In [None]:
# Per-run training curves: first one accuracy figure for every run, then one
# loss figure for every run. Each figure overlays the training and validation series.
rcParams['font.family'] = 'Microsoft JhengHei'
curve_specs = [
    ('accuracy', 'val_accuracy', ["acc ","val_acc"]),
    ('loss', 'val_loss', ["loss ","val_loss"]),
]
for train_key, val_key, legend_labels in curve_specs:
    for i, run in enumerate(results):
        plt.figure(figsize=(12, 6))
        plt.title(f"第{i+1}組參數 Train History {run['features']}F-{run['timesteps']}T")
        curves = run["train_history"].history
        plt.plot(curves[train_key], marker="o")
        plt.plot(curves[val_key], marker="o")
        plt.xlabel('Epoch')
        plt.legend(legend_labels)
        plt.grid(True)
        # To export a figure, call plt.savefig(..., dpi=300, bbox_inches='tight')
        # before plt.show(); the inline backend may close the figure once it is shown.
        plt.show()

In [None]:
# Overview figure: accuracy curves of every run on a 3 x 5 grid of axes,
# filled row by row in the order the runs were trained.
fig1, axes = plt.subplots(3, 5, figsize=(20, 12))
fig1.suptitle("Train History of All 15 Acc" , fontsize=20)
for i, run in enumerate(results):
    row, col = divmod(i, 5)
    ax = axes[row][col]

    history = run["train_history"].history
    label = f"{run['features']}F-{run['timesteps']}T"

    # Leave the panel empty when either accuracy series was not recorded.
    if not ('accuracy' in history and 'val_accuracy' in history):
        continue
    ax.plot(history['accuracy'], marker='o', label="acc")
    ax.plot(history['val_accuracy'], marker='o', label="val_acc")
    ax.set_title(label, fontsize=10)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Accuracy")
    ax.grid(True)
    ax.legend(fontsize="small")
# Reserve room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Overview figure: loss curves of every run on a 3 x 5 grid of axes,
# filled row by row in the order the runs were trained.
fig2, axes = plt.subplots(3, 5, figsize=(20, 12))  # 3 rows x 5 columns
# The title used to claim "recurrent_dropout=0.5", but the LSTM layer in this
# notebook is built with recurrent_dropout=0.0, so the stale suffix is dropped
# to match the accuracy overview figure.
fig2.suptitle("Train History of All 15 Loss", fontsize=20)
for i in range(len(results)):
    row = i // 5
    col = i % 5
    ax = axes[row][col]

    history = results[i]["train_history"].history
    label = f"{results[i]['features']}F-{results[i]['timesteps']}T"

    # Only draw the panel when both loss series were recorded.
    if 'loss' in history and 'val_loss' in history:
        ax.plot(history['loss'], marker='o', label="loss")
        ax.plot(history['val_loss'], marker='o', label="val_loss")
        ax.set_title(label, fontsize=10)
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.grid(True)
        ax.legend(fontsize="small")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the figure-level title
plt.show()

In [None]:
# Print the four test-set scores of every run as percentages.
metric_lines = (
    ("Accuracy: %.3f%%", "accuracy"),
    ("Precision: %.3f%%", "precision"),
    ("Recall: %.3f%%", "recall"),
    ("F1-score: %.3f%%", "f1"),
)
for i, run in enumerate(results):
    print(f"第{i+1}組參數{run['features']}F-{run['timesteps']}T")
    scores = run['method']
    for line_format, score_key in metric_lines:
        print(line_format % (scores[score_key] * 100.0))

In [None]:
# One confusion-matrix heatmap per run.
# Class names in encoded-id order.
# NOTE(review): assumes LabelEncoder's sorted label order matches this list - confirm against the dataset.
cm_label = ['Benign', 'DrDoS_DNS', 'DrDoS_LDAP', 'DrDoS_MSSQL', 'DrDoS_NTP', 'DrDoS_NetBIOS', 'DrDoS_SNMP', 'DrDoS_SSDP', 'DrDoS_UDP', 'Syn', 'TFTP', 'UDP-lag']
class_ids = np.arange(len(cm_label))
for i in range(len(results)):
    true_ids = np.argmax(results[i]['y_test'], axis=1)
    pred_ids = np.argmax(results[i]['predict'], axis=1)
    # Passing labels= keeps the matrix at one row/column per class even when a
    # class is absent from this test split, so the tick labels always line up.
    cm = confusion_matrix(true_ids, pred_ids, labels=class_ids)
    plt.figure(figsize=(16, 13))
    cms = sns.heatmap(cm, annot=True, cmap='Blues', fmt='d',annot_kws={"size": 10, "verticalalignment": "center", "horizontalalignment": "center"},vmin=0, vmax=10000)
    # confusion_matrix(y_true, y_pred) puts true classes on the rows (heatmap
    # y-axis) and predicted classes on the columns (x-axis); the two axis
    # labels were previously swapped.
    plt.xlabel('Predicted labels',fontsize=20)
    plt.ylabel('Y_test labels',rotation=90,fontsize=20)
    # The stale "recurrent_dropout=0.8" suffix is dropped: the LSTM layer in
    # this notebook is built with recurrent_dropout=0.0.
    plt.title(f"第{i+1}組參數的Confusion Matrix {results[i]['features']}F-{results[i]['timesteps']}T",fontsize=20)
    cms.set_xticklabels(cm_label, rotation=45, ha='right',fontsize=14)
    cms.set_yticklabels(cm_label, rotation=0,fontsize=14)
    # To export a figure, call plt.savefig(..., dpi=300, bbox_inches='tight')
    # before plt.show(); the inline backend may close the figure once it is shown.
    plt.show()

In [None]:
# Per-class precision / recall / F1 table for every run.
# Class names in encoded-id order.
# NOTE(review): assumes LabelEncoder's sorted label order matches this list - confirm against the dataset.
target_names=["Benign", "DrDoS_DNS", "DrDoS_LDAP", "DrDoS_MSSQL", "DrDoS_NTP", "DrDoS_NetBIOS", "DrDoS_SNMP", "DrDoS_SSDP", "DrDoS_UDP", "Syn", "TFTP", "UDP-lag"]
report_class_ids = np.arange(len(target_names))
for i in range(len(results)):
    print(f"\n 第{i +1}次 precision、recall、f1詳細資訊 {results[i]['features']}F-{results[i]['timesteps']}T")
    # Passing labels= stops classification_report from raising when a class is
    # missing from both the test labels and the predictions of this run.
    classification = classification_report(
        np.argmax(results[i]['y_test'],axis=1),
        np.argmax(results[i]['predict'],axis=1),
        labels=report_class_ids,
        target_names=target_names)
    print(classification)