### 환경 설정 - Drive 마운트 및 라이브러리 임포트

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory prefix for data files; file names are appended by plain string
# concatenation, which is why the value ends with a slash.
path = '/content/drive/MyDrive/security/'

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import re

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import *
from tensorflow.keras.optimizers import *
import tensorflow.keras.backend as K
from datetime import datetime

from sklearn.model_selection import train_test_split

### 데이터 로딩

In [None]:
# Read the three competition CSVs from the data directory in one pass.
train_df, test_df, submission_df = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Show (rows, columns) of the train, test and submission frames on one line.
print(*(frame.shape for frame in (train_df, test_df, submission_df)))

(472972, 3) (1418916, 2) (1418916, 2)


### y값 카운트

In [None]:
# Count how many training rows carry each `level` value.
train_df['level'].value_counts()

0    334065
1    132517
3      4141
5      2219
2        12
4        10
6         8
Name: level, dtype: int64

### 결측값 체크

In [None]:
# Display any training rows whose `full_log` text is missing.
train_df[train_df['full_log'].isna()]

Unnamed: 0,id,level,full_log


In [None]:
# One-hot encode the integer `level` column for categorical cross-entropy.
label = to_categorical(train_df['level'].to_numpy())

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
text_train, text_val, label_train, label_val = train_test_split(train_df['full_log'], label, test_size=0.2, random_state=42)

### 숫자 마스킹

In [None]:
# Replace every digit with the placeholder '<num>' so that the vocabulary is
# not inflated by distinct numbers. regex=True is required: pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the
# digits untouched.
text_train = text_train.str.replace(r'[0-9]', '<num>', regex=True)
text_val = text_val.str.replace(r'[0-9]', '<num>', regex=True)

### 데이터 전처리 - Dobby님 코드 참조

In [None]:
# Every sequence is padded (at the end) or cut to this many tokens.
max_length=300

# Build the vocabulary from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)
top_k = len(tokenizer.word_index)

# Map each log line to its list of integer token ids.
x_train = tokenizer.texts_to_sequences(text_train)
x_val = tokenizer.texts_to_sequences(text_val)

# Turn the ragged lists into fixed-width integer matrices.
x_train_vector = pad_sequences(x_train, maxlen=max_length, padding='post')
x_val_vector = pad_sequences(x_val, maxlen=max_length, padding='post')

In [None]:
# One more than the number of known words; presumably the extra slot covers
# index 0, which padding uses -- TODO confirm against the tokenizer's indexing.
vocab_size = top_k + 1

def below_threshold_len(max_length, nested_list):
    """Print the percentage of sequences no longer than ``max_length``.

    Args:
        max_length: Inclusive length limit.
        nested_list: Collection of sized sequences (e.g. lists of token ids).

    Returns:
        None. The percentage is written to stdout.
    """
    total = len(nested_list)
    cnt = sum(1 for s in nested_list if len(s) <= max_length)
    # An empty collection has nothing to measure; report 0.0 rather than
    # raising ZeroDivisionError.
    ratio = (cnt / total) * 100 if total else 0.0
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_length, ratio))

# Report what share of the (unpadded) training sequences fit within max_length.
below_threshold_len(max_length, x_train)

전체 샘플 중 길이가 300 이하인 샘플의 비율: 80.88810894953976


In [None]:
# Local directory for model artifacts. It gets its own name so that `path`
# keeps pointing at the data directory; rebinding `path` here would make the
# later `path + 'submission_128.csv'` resolve to './modelsubmission_128.csv'.
model_dir = './model'

### 모델 만들기 - MultiHead Self Attention
#### get_config 부분에서 파라미터 업데이트가 이루어집니다 (텐서플로 2버전)

In [None]:
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product self-attention.

    Query, key and value are all projected from the same input tensor.

    Args:
        embedding_dim: Size of the feature axis of the input and output
            (d_model).
        num_heads: Number of attention heads; must divide ``embedding_dim``.
        **kwargs: Forwarded to the base ``Layer`` (``name``, ``dtype``, ...),
            so that a config produced by ``get_config`` can be fed back to
            the constructor.
    """

    def __init__(self, embedding_dim, num_heads=8, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim # d_model
        self.num_heads = num_heads

        # The feature axis is split evenly across heads, so it must be
        # divisible by the number of heads.
        assert embedding_dim % self.num_heads == 0, (
            'embedding_dim must be divisible by num_heads')

        self.projection_dim = embedding_dim // num_heads
        self.query_dense = Dense(embedding_dim)
        self.key_dense = Dense(embedding_dim)
        self.value_dense = Dense(embedding_dim)
        self.dense = Dense(embedding_dim)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer.

        Only plain values are stored: sub-layers are recreated by
        ``__init__`` and are not serializable as config entries.
        """
        config = super().get_config().copy()
        config.update({
            'embedding_dim' : self.embedding_dim,
            'num_heads' : self.num_heads,
        })
        return config

    def scaled_dot_product_attention(self, query, key, value):
        """Return ``(softmax(Q K^T / sqrt(d)) V, attention_weights)``.

        ``d`` is the size of the last axis of ``key``.
        """
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        """Reshape to (batch, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` and return a same-width tensor."""
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embedding_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(query, key, value)
        # (batch_size, seq_len, num_heads, projection_dim)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Merge the heads back: (batch_size, seq_len, embedding_dim)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dim))
        outputs = self.dense(concat_attention)
        return outputs

In [None]:
class TransformerBlock(Layer):
    """Encoder block: self-attention and a feed-forward net, each followed by
    dropout, a residual connection and layer normalization.

    Args:
        embedding_dim: Size of the feature axis of the input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Forwarded to the base ``Layer`` (``name``, ``dtype``, ...).
    """

    def __init__(self, embedding_dim, num_heads, dff, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.att = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = Sequential(
            [Dense(dff, activation="relu"),
             Dense(embedding_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer.

        Sub-layers are rebuilt by ``__init__`` and therefore not stored.
        """
        config = super().get_config().copy()
        config.update({
            'embedding_dim' : self.embedding_dim,
            'num_heads' : self.num_heads,
            'dff' : self.dff,
            'rate' : self.rate,
        })
        return config

    def call(self, inputs, training=None):
        """Run the block; ``training`` is passed on to both dropout layers."""
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection around attention, then normalize.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection around the feed-forward net, then normalize.
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of positions the position embedding table holds.
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Width of both embeddings.
        **kwargs: Forwarded to the base ``Layer`` (``name``, ``dtype``, ...).
    """

    def __init__(self, max_len, vocab_size, embedding_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.token_emb = Embedding(vocab_size, embedding_dim)
        self.pos_emb = Embedding(max_len, embedding_dim)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer.

        The Embedding sub-layers are rebuilt by ``__init__`` and therefore
        not stored in the config.
        """
        config = super().get_config().copy()
        config.update({
            'max_len' : self.max_len,
            'vocab_size' : self.vocab_size,
            'embedding_dim' : self.embedding_dim,
        })
        return config

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position."""
        # Positions 0 .. (size of the last axis of x) - 1.
        max_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=max_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading axis of x.
        return x + positions

In [None]:
# Hyper-parameters of the single transformer encoder block.
embedding_dim = 128  # Embedding size for each token
num_heads = 8  # Number of attention heads
dff = 2048  # Hidden layer size in feed forward network inside transformer

# Functional-API graph: token ids -> embeddings -> transformer block ->
# average over the sequence axis -> small dense head -> 7-way softmax.
inputs = Input(shape=(max_length,))
embedding_layer = TokenAndPositionEmbedding(max_length, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# One output unit per one-hot label column (levels 0-6 in the training data).
outputs = Dense(7, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
token_and_position_embedding (None, 300, 128)          1506048   
_________________________________________________________________
transformer_block (Transform (None, 300, 128)          593024    
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0     

### F1 score를 계산하는 코드 추가

In [None]:
def f1(y_true, y_pred):
    """Batch-wise F1 metric averaged over the non-batch axis.

    Predictions are rounded to 0/1, true-positive / false-positive /
    false-negative counts are summed over axis 0, an F1 value is computed per
    remaining position, and the mean of those values is returned.
    """
    hard_pred = K.round(y_pred)

    true_pos = K.sum(K.cast(y_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - hard_pred), 'float'), axis=0)

    # K.epsilon() keeps the denominators away from zero.
    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Any NaN that still appears is counted as a score of 0.
    score = tf.where(tf.math.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

def fbeta_score_macro(y_true, y_pred, beta=1, threshold=0.5):
    """Batch-wise F-beta metric averaged over the non-batch axis.

    Args:
        y_true: Ground-truth indicator tensor.
        y_pred: Predicted scores; values strictly greater than ``threshold``
            count as positive.
        beta: Weight of recall relative to precision (1 gives F1).
        threshold: Decision threshold applied to ``y_pred``.

    Returns:
        Mean of the F-beta values computed after summing counts over axis 0.
    """
    truth = K.cast(y_true, 'float')
    hard_pred = K.cast(K.greater(K.cast(y_pred, 'float'), threshold), 'float')

    true_pos = K.sum(truth * hard_pred, axis=0)
    false_pos = K.sum((1 - truth) * hard_pred, axis=0)
    false_neg = K.sum(truth * (1 - hard_pred), axis=0)

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    beta_sq = beta ** 2
    numerator = (1 + beta_sq) * precision * recall
    score = numerator / (beta_sq * precision + recall + K.epsilon())
    # Any NaN that still appears is counted as a score of 0.
    score = tf.where(tf.math.is_nan(score), tf.zeros_like(score), score)

    return K.mean(score)

### 모델 학습 (fit)

In [None]:
# File that receives the best weights seen during training.
ckpt_1 = 'tf_chkpoint.ckpt'
model.compile(optimizer=Adam(1e-4), loss="categorical_crossentropy", metrics=[f1])
# Keep the weights of the epoch with the highest *validation* F1. Monitoring
# the training metric ('f1') would select the epoch that fits the training
# set best, which is exactly what a held-out split is meant to guard against.
mc = ModelCheckpoint(filepath = ckpt_1, monitor = 'val_f1', mode = 'max', save_best_only = True,verbose = 1, save_weights_only=True)
history = model.fit(x_train_vector, label_train, validation_data=(x_val_vector, label_val), batch_size=128, epochs=20, callbacks = [mc])

# Log a wall-clock timestamp once fitting has finished.
time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f'[{time_str}] : train(fitting) ended')

Epoch 1/20

Epoch 00001: f1 improved from -inf to 0.35952, saving model to tf_chkpoint.ckpt
Epoch 2/20

Epoch 00002: f1 improved from 0.35952 to 0.43469, saving model to tf_chkpoint.ckpt
Epoch 3/20

Epoch 00003: f1 improved from 0.43469 to 0.43990, saving model to tf_chkpoint.ckpt
Epoch 4/20

Epoch 00004: f1 improved from 0.43990 to 0.44017, saving model to tf_chkpoint.ckpt
Epoch 5/20

Epoch 00005: f1 improved from 0.44017 to 0.44068, saving model to tf_chkpoint.ckpt
Epoch 6/20

Epoch 00006: f1 improved from 0.44068 to 0.44370, saving model to tf_chkpoint.ckpt
Epoch 7/20

Epoch 00007: f1 improved from 0.44370 to 0.44440, saving model to tf_chkpoint.ckpt
Epoch 8/20

Epoch 00008: f1 improved from 0.44440 to 0.44621, saving model to tf_chkpoint.ckpt
Epoch 9/20

Epoch 00009: f1 did not improve from 0.44621
Epoch 10/20

Epoch 00010: f1 did not improve from 0.44621
Epoch 11/20

Epoch 00011: f1 did not improve from 0.44621
Epoch 12/20

### 최적 가중치 모델 로드

In [None]:
# Restore the weights stored at the checkpoint path `ckpt_1`.
model.load_weights(ckpt_1)

In [None]:
# Training vs. validation loss per epoch. The second curve comes from the
# validation split passed to fit(), so it is labelled as such, not 'test'.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

### confusion matrix 구현부분 - 예측 확률, 예측값 계산

In [None]:
# Class probabilities for the validation set, and the most likely class per row.
proba = model.predict(x_val_vector)
pred = proba.argmax(axis=-1)

In [None]:
# Convert the one-hot validation labels back to integer class ids.
true_label = label_val.argmax(axis=-1)

In [None]:
# Confusion matrix: true classes as rows ('real'), predicted classes as columns ('pred').
ct = pd.crosstab(true_label, pred, rownames=['real'], colnames = ['pred'])

In [None]:
# Display the confusion matrix as the cell output.
ct

### 예측 확률의 최댓값이 0.9 미만이면 새로운 클래스(7)로 분류

In [None]:
# Work on a copy: a plain assignment would alias `pred`, so the thresholding
# below would silently overwrite the argmax predictions as well.
preds_numpy = pred.copy()
# Rows whose highest class probability is below 0.9 are relabelled as class 7.
preds_numpy[np.max(proba, axis=1) < 0.9] = 7
# Number of validation rows moved to class 7.
print(np.sum(preds_numpy==7))
ct = pd.crosstab(true_label, preds_numpy, rownames=['real'], colnames=['pred'])
ct

### 새로운 값 예측

In [None]:
# Apply the same digit masking as for the training text. regex=True is
# required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the digits untouched.
text_test = test_df['full_log'].str.replace(r'[0-9]', '<num>', regex=True)

# Reuse the tokenizer fitted on the training split.
x_test = tokenizer.texts_to_sequences(text_test)

x_test_vector = tf.keras.preprocessing.sequence.pad_sequences(
   x_test , maxlen=max_length, padding='post')

### 새로운 클래스 레벨7 계산로직 - softmax 확률값과 클래스 예측값 계산

In [None]:
# Class probabilities for the test set and the most likely class per row.
proba_test = model.predict(x_test_vector)
pred_test = proba_test.argmax(axis=-1)

# Rows whose highest class probability is below 0.9 are relabelled as class 7.
pred_test[proba_test.max(axis=1) < 0.9] = 7

# Number of test rows assigned to class 7.
print((pred_test == 7).sum())

In [None]:
# Write the predicted class ids into the submission frame's `level` column.
submission_df['level'] = pred_test

In [None]:
# Preview the first rows of the submission frame.
submission_df.head()

In [None]:
# Save the submission without the index column. The target is built by plain
# string concatenation, so `path` must end with a separator.
# NOTE(review): confirm what `path` holds at this point in the notebook.
submission_df.to_csv(path+'submission_128.csv', index=False)