In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from datetime import datetime

# sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

# keras
import keras
from keras import models, optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM, GRU, LeakyReLU
from keras import Model ,models, layers, optimizers, regularizers
from keras.callbacks import ModelCheckpoint
import keras.backend.tensorflow_backend as K
import tensorflow as tf

# Build a TF1-compatible session. allow_growth=True asks TensorFlow to grow its
# GPU memory use as needed instead of reserving it all up front.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
session = tf.compat.v1.Session(config=config)

In [None]:
# Fixed seed so the train/validation/test splits are reproducible across runs.
RANDOM_STATE = 42

train_df = pd.read_csv("dataset/total2.csv", index_col=2)

# Label encoding: 'Benign' -> 0, everything else -> 1.
train_df['Label'] = (train_df['Label'] != 'Benign').astype(int)

# Coerce every column to numeric; values that cannot be parsed become NaN.
train_df = train_df.apply(pd.to_numeric, errors='coerce')

# Missing-value handling: turn +/-inf into NaN, then fill every NaN with 0.0.
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Hold out 30% for testing, then split the remaining 70% again 70/30 into
# train/validation. Only the benign rows of the train split are used for fitting.
# NOTE(review): train_test_split shuffles rows by default, so the fixed-length
# windows built later are not contiguous in the original file order — confirm
# this is intended for a sequence model.
x_train, x_test = train_test_split(train_df, test_size=0.3, random_state=RANDOM_STATE)
x_train, x_val = train_test_split(x_train, test_size=0.3, random_state=RANDOM_STATE)

# Separate the labels from the features.
train_label = x_train['Label']
valid_label = x_val['Label'].astype('float64').values
test_label = x_test['Label'].astype('float64').values

x_valid = x_val.drop('Label', axis=1).values
x_test = x_test.drop('Label', axis=1).values

# Split train/validation rows into benign (y0) and malicious (y1) feature arrays.
x_train_y0 = x_train[train_label == 0].drop('Label', axis=1).values
x_train_y1 = x_train[train_label == 1].drop('Label', axis=1).values

x_valid_y0 = x_val[valid_label == 0].drop('Label', axis=1).values
x_valid_y1 = x_val[valid_label == 1].drop('Label', axis=1).values

# Persist the labels.
np.save('dataset/valid_label.npy', valid_label)
np.save('dataset/test_label.npy', test_label)

# Min-max normalisation.
# The scaler is fitted ONLY on the benign training rows and then reused for
# every other split. Re-fitting per split (as before) scales each split with
# its own min/max, which makes reconstruction errors incomparable between
# splits and leaks validation/test statistics into preprocessing.
# Values outside the training range may therefore fall outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

x_train_y0_scaled = scaler.fit_transform(x_train_y0)
x_valid_y0_scaled = scaler.transform(x_valid_y0)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

# Persist the scaled train / validation / test arrays.
np.save('dataset/x_train_y0_scaled.npy', x_train_y0_scaled)
np.save('dataset/x_valid_y0_scaled.npy', x_valid_y0_scaled)
np.save('dataset/x_valid_scaled.npy', x_valid_scaled)
np.save('dataset/x_test_scaled.npy', x_test_scaled)


In [None]:
# Reload the label and feature arrays written by the preprocessing cell, so the
# following cells can run without repeating the preprocessing.
valid_label, test_label = (
    np.load(path)
    for path in ('dataset/valid_label.npy', 'dataset/test_label.npy')
)

(x_train_y0_scaled,
 x_valid_y0_scaled,
 x_valid_scaled,
 x_test_scaled) = (
    np.load(path)
    for path in (
        'dataset/x_train_y0_scaled.npy',
        'dataset/x_valid_y0_scaled.npy',
        'dataset/x_valid_scaled.npy',
        'dataset/x_test_scaled.npy',
    )
)

In [None]:

# Number of consecutive rows grouped into one sequence sample.
# ************************** #
timesteps = 10
# ************************** #


def _trim_to_multiple(rows, steps):
    """Drop trailing rows so that the row count is a multiple of `steps`.

    Slices by an explicit length rather than by a negative remainder:
    `rows[:-(n % steps)]` turns into `rows[:0]` (an empty array) whenever the
    length is already a multiple of `steps`, which also made this cell
    destructive when it was run a second time.
    """
    usable = rows.shape[0] - rows.shape[0] % steps
    return rows[:usable]


def _to_windows(rows, steps):
    """Reshape 2D [rows, features] into 3D [samples, steps, features].

    `rows` must already have a row count that is a multiple of `steps`.
    """
    return rows.reshape((rows.shape[0] // steps, steps, rows.shape[1]))


# Trim every dataset to a whole number of windows.
x_train_y0_scaled = _trim_to_multiple(x_train_y0_scaled, timesteps)
x_valid_y0_scaled = _trim_to_multiple(x_valid_y0_scaled, timesteps)
x_valid_scaled = _trim_to_multiple(x_valid_scaled, timesteps)
x_test_scaled = _trim_to_multiple(x_test_scaled, timesteps)

# Trim the labels the same way so they stay aligned row-for-row with the features.
valid_label_rest = _trim_to_multiple(valid_label, timesteps)
test_label_rest = _trim_to_multiple(test_label, timesteps)

# Reshape input to be 3D [samples, timesteps, features].
x_train_y0_scaled_reshape = _to_windows(x_train_y0_scaled, timesteps)  # benign training set
x_valid_y0_scaled_reshape = _to_windows(x_valid_y0_scaled, timesteps)  # benign validation set
x_valid_scaled_reshape = _to_windows(x_valid_scaled, timesteps)        # full validation set
x_test_scaled_reshape = _to_windows(x_test_scaled, timesteps)          # full test set

In [None]:

lr = 0.001   # learning rate
batch = 100  # batch size
epochs = 10

# Take the feature count from the data instead of hard-coding it, so the model
# keeps working if the number of columns in the dataset changes.
n_features = x_train_y0_scaled_reshape.shape[2]

# Build, train and save the GRU autoencoder.
# (The variable keeps its historical name `lstm_ae`; the layers are GRUs.)
with tf.device('/gpu:0'):
    print("GPU load 완료",end=' / ')
    lstm_ae = models.Sequential()
    # Encoder: compress each window down to a single 16-unit vector.
    lstm_ae.add(layers.GRU(64, activation='relu', input_shape=(timesteps, n_features), return_sequences=True))
    lstm_ae.add(layers.GRU(32, activation='relu', return_sequences=True))
    lstm_ae.add(layers.GRU(16, activation='relu', return_sequences=False))
    # Repeat the encoded vector once per timestep so the decoder sees a sequence.
    lstm_ae.add(layers.RepeatVector(timesteps))

    # Decoder: mirror of the encoder, ending in a per-timestep projection back
    # to the original feature count.
    lstm_ae.add(layers.GRU(16, activation='relu', return_sequences=True))
    lstm_ae.add(layers.GRU(32, activation='relu', return_sequences=True))
    lstm_ae.add(layers.GRU(64, activation='relu', return_sequences=True))
    lstm_ae.add(layers.TimeDistributed(layers.Dense(n_features)))

    # Compile. No 'accuracy' metric: the targets are continuous reconstructions,
    # so classification accuracy carries no meaning here; the MSE loss is the
    # quantity to monitor.
    lstm_ae.compile(loss='mse', optimizer=optimizers.Adam(lr=lr))

    # Fit on benign windows only (input == target), validating on benign
    # validation windows.
    history = lstm_ae.fit(x_train_y0_scaled_reshape, x_train_y0_scaled_reshape,
                          epochs=epochs, batch_size=batch,
                          validation_data=(x_valid_y0_scaled_reshape, x_valid_y0_scaled_reshape))
    # Save the trained model.
    lstm_ae.save('./model/gru_ae.h5')

In [None]:
# NOTE(review): the training cell saves the model to './model/gru_ae.h5' while
# this cell loads from './newmodel/gru_ae.h5' — confirm which path is intended.
gru_ae = models.load_model('./newmodel/gru_ae.h5')


def reconstruction_error(rows, reconstructed):
    """Per-row mean squared error between the inputs and the model output.

    rows          -- 2D array [n_rows, n_features] fed to the model (after windowing).
    reconstructed -- 3D model output [samples, timesteps, n_features].
    Returns a 1D array with one error value per input row.
    """
    # Flatten the windows back to rows; the feature count comes from the output
    # itself rather than a hard-coded constant.
    restored = reconstructed.reshape(-1, reconstructed.shape[-1])
    return np.mean(np.power(rows - restored, 2), axis=1)


def classify_by_threshold(errors, threshold):
    """Return 1.0 where the error is >= threshold (anomalous), else 0.0."""
    return (errors >= threshold).astype('float64')


# Run the model on the validation set and compute the reconstruction error.
valid_x_predictions = gru_ae.predict(x_valid_scaled_reshape)
valid_mse = reconstruction_error(x_valid_scaled, valid_x_predictions)

# Threshold search grid: from TH_START downwards in TH_STEP steps to TH_STOP.
# The previous `while True` loop decremented from 0.04 and could only exit on
# `i == 0.1`, which is never reached, so it ran forever. Since the error is
# non-negative, every threshold <= 0 flags all rows and gives the same scores;
# stopping at 0.0 therefore covers everything the old loop could distinguish.
TH_START = 0.04
TH_STOP = 0.0
TH_STEP = 0.0002
n_steps = int(round((TH_START - TH_STOP) / TH_STEP))
thresholds = np.linspace(TH_START, TH_STOP, n_steps + 1)

best_f1 = 0
best_th = TH_START
# Find the threshold with the best F1 score on the validation set.
for th in thresholds:
    th = float(th)
    print('현재: ',th, end='\n')
    valid_pred = classify_by_threshold(valid_mse, th)

    f1 = f1_score(valid_label_rest, valid_pred)*100

    if f1 > best_f1:
        print('best f1 변경: ', best_f1, ' -> ', f1)
        print('best_th 변경: ', best_th, ' -> ', th)
        best_f1 = f1
        best_th = th

        acc = accuracy_score(valid_label_rest, valid_pred)*100
        recall = recall_score(valid_label_rest, valid_pred)*100
        pre = precision_score(valid_label_rest, valid_pred)*100
        print("accuracy_score :",acc)
        print("recall_score :",recall)
        print("precision_score :",pre)
        print("f1_score :",f1, end='\n\n')

print('\n\n')
print('best f1 : ', best_f1)
print('best th : ', best_th)

# Run the SAME loaded model on the test set. The previous code used `lstm_ae`,
# which is undefined unless the training cell ran in this kernel and may not
# be the model the threshold was tuned on.
test_x_predictions = gru_ae.predict(x_test_scaled_reshape)

# Reconstruction error on the test set.
test_mse = reconstruction_error(x_test_scaled, test_x_predictions)

# Classify with the threshold selected on the validation set.
test_pred = classify_by_threshold(test_mse, best_th)

# Report the results.
print("accuracy_score :",accuracy_score(test_label_rest, test_pred)*100)
print("recall_score :",recall_score(test_label_rest, test_pred)*100)
print("precision_score :",precision_score(test_label_rest, test_pred)*100)
print("f1_score :",f1_score(test_label_rest, test_pred)*100)