In [None]:
from google.colab import drive

In [None]:
import pandas as pd

In [None]:
# mounting & permission
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# 데이터 파일 path 선언
colab_path = "gdrive/My Drive/AI Security Project/"

# 파일 로드 (양성데이터 1개, 공격트래픽데이터 10개)
benign=pd.read_csv(colab_path +'8.benign.csv')
g_c=pd.read_csv(colab_path +'8.gafgyt.combo.csv')
g_j=pd.read_csv(colab_path +'8.gafgyt.junk.csv')
g_s=pd.read_csv(colab_path +'8.gafgyt.scan.csv')
g_t=pd.read_csv(colab_path +'8.gafgyt.tcp.csv')
g_u=pd.read_csv(colab_path +'8.gafgyt.udp.csv')
m_a=pd.read_csv(colab_path +'8.mirai.ack.csv')
m_sc=pd.read_csv(colab_path +'8.mirai.scan.csv')
m_sy=pd.read_csv(colab_path +'8.mirai.syn.csv')
m_u=pd.read_csv(colab_path +'8.mirai.udp.csv')
m_u_p=pd.read_csv(colab_path +'8.mirai.udpplain.csv')

In [None]:
# 각 Data Set에 target Column 선언
benign['type']='benign'
g_c['type']='gafgyt_combo'
g_j['type']='gafgyt_junk'
g_s['type']='gafgyt_scan'
g_t['type']='gafgyt_tcp'
g_u['type']='gafgyt_udp'
m_u['type']='mirai_udp'
m_a['type']='mirai_ack'
m_sc['type']='mirai_scan'
m_sy['type']='mirai_syn'
m_u_p['type']='mirai_udpplain'

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from pandas.plotting import scatter_matrix

# Modeling

In [None]:
data_set=pd.concat([benign,m_u,g_c,g_j,g_s,g_t,g_u,m_a,m_sc,m_sy,m_u_p],
               axis=0, sort=False, ignore_index=True)

data_set.groupby('type')['type'].count()

type
benign             46585
gafgyt_combo       54283
gafgyt_junk        28579
gafgyt_scan        27825
gafgyt_tcp         88816
gafgyt_udp        103720
mirai_ack         111480
mirai_scan         45930
mirai_syn         125715
mirai_udp         151879
mirai_udpplain     78244
Name: type, dtype: int64

In [None]:
# 분류의 고질적 문제인 클래스 불균형 해소를 위해 층화추출을 시도하여 학습을 시킨다.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_set, data_set['type']):
  strat_train_set = data_set.loc[train_index]
  strat_test_set = data_set.loc[test_index]

# 층화 추출 여부 확인 (클래스 비율이 유지된 샘플이 추출되었는지를 확인하는 과정)
print(strat_train_set['type'].value_counts()/len(strat_train_set), '\n')
print(strat_test_set['type'].value_counts()/len(strat_test_set))

mirai_udp         0.175978
mirai_syn         0.145663
mirai_ack         0.129169
gafgyt_udp        0.120178
gafgyt_tcp        0.102909
mirai_udpplain    0.090659
gafgyt_combo      0.062896
benign            0.053977
mirai_scan        0.053218
gafgyt_junk       0.033113
gafgyt_scan       0.032240
Name: type, dtype: float64 

mirai_udp         0.175978
mirai_syn         0.145662
mirai_ack         0.129168
gafgyt_udp        0.120177
gafgyt_tcp        0.102907
mirai_udpplain    0.090660
gafgyt_combo      0.062898
benign            0.053977
mirai_scan        0.053218
gafgyt_junk       0.033115
gafgyt_scan       0.032240
Name: type, dtype: float64


# Scaling

In [None]:
# pipeline 구축을 통한 scaling 진행
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

cat_col = ['type']
num_col = strat_train_set.drop('type', axis=1).columns

num_pipeline = Pipeline([
    ('std_scaler', MinMaxScaler())
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_col)
])

data_scaled = full_pipeline.fit_transform(strat_train_set)
test_scaled = full_pipeline.transform(strat_test_set)

# PCA 진행

In [None]:
# PCA 알고리즘 선택과 최적 차원 수 탐색
from sklearn.decomposition import PCA

# 차원 수 시각화를 위한 임의의 pca 선언
pca_test = PCA()
pca_test.fit(data_scaled) # 차원에 따른 설명된 분산 및 elbow 파악을 위하여 선언함.
cumsum = np.cumsum(pca_test.explained_variance_ratio_)

pca = PCA(n_components=0.95) # 분산 기준은 0.95로 설정함
data_prepared = pca.fit_transform(data_scaled)
test_prepared = pca.transform(test_scaled)
print(data_prepared.shape)

(690444, 6)


# Modeling

In [None]:
train_y = strat_train_set['type']
test_y = strat_test_set['type']

In [None]:
labels_full=pd.get_dummies(strat_train_set['type'], prefix='type')
labels_full2=pd.get_dummies(strat_test_set['type'], prefix='type')

labels=labels_full.values
labels2 = labels_full2.values

In [None]:
# 모델 학습
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()
model.add(Dense(10, input_dim=data_prepared.shape[1], activation='relu'))
model.add(Dense(40, input_dim=data_prepared.shape[1], activation='relu'))
model.add(Dense(10, input_dim=data_prepared.shape[1], activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(labels.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto')
model.fit(data_prepared,labels,validation_data=(test_prepared, labels2),
          callbacks=[monitor],verbose=2,epochs=500)

Epoch 1/500
21577/21577 - 76s - loss: 0.8770 - val_loss: 0.6674 - 76s/epoch - 4ms/step
Epoch 2/500
21577/21577 - 59s - loss: 0.6032 - val_loss: 0.5815 - 59s/epoch - 3ms/step
Epoch 3/500
21577/21577 - 49s - loss: 0.5603 - val_loss: 0.5440 - 49s/epoch - 2ms/step
Epoch 4/500
21577/21577 - 54s - loss: 0.5347 - val_loss: 0.5210 - 54s/epoch - 3ms/step
Epoch 5/500
21577/21577 - 54s - loss: 0.5176 - val_loss: 0.5169 - 54s/epoch - 3ms/step
Epoch 6/500
21577/21577 - 56s - loss: 0.5088 - val_loss: 0.5258 - 56s/epoch - 3ms/step
Epoch 7/500
21577/21577 - 53s - loss: 0.5016 - val_loss: 0.4997 - 53s/epoch - 2ms/step
Epoch 8/500
21577/21577 - 59s - loss: 0.4952 - val_loss: 0.4919 - 59s/epoch - 3ms/step
Epoch 9/500
21577/21577 - 43s - loss: 0.4900 - val_loss: 0.4859 - 43s/epoch - 2ms/step
Epoch 10/500
21577/21577 - 64s - loss: 0.4855 - val_loss: 0.4787 - 64s/epoch - 3ms/step
Epoch 11/500
21577/21577 - 42s - loss: 0.4810 - val_loss: 0.4788 - 42s/epoch - 2ms/step
Epoch 12/500
21577/21577 - 41s - loss: 0.

<keras.callbacks.History at 0x7f7cc74018d0>

In [None]:
 # 성능 평가
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score

pred_st = model.predict(test_prepared)
pred_st = np.argmax(pred_st,axis=1)
y_eval_st = np.argmax(labels2,axis=1)
score_st = metrics.accuracy_score(y_eval_st, pred_st)

f1_score = metrics.f1_score(y_eval_st, pred_st, average='weighted')
precision_score = metrics.precision_score(y_eval_st, pred_st, average='weighted')
recall_score = metrics.recall_score(y_eval_st, pred_st, average='weighted')

print(score_st)
print(f1_score)
print(precision_score)
print(recall_score)

0.8408337774893981
0.807592041570281
0.7969762723723293
0.8408337774893981


  _warn_prf(average, modifier, msg_start, len(result))
