<a href="https://colab.research.google.com/github/euphoria96/KB_Smishing_dacon/blob/master/Model4_BiLSTM_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting
Mount Google Drive so that the data and the Mecab installer stored there are accessible from Colab.

In [0]:
# Mount Google Drive at /content/gdrive; later cells change into folders under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

### Download Mecab
Mecab for colab by dacon.io

In [0]:
# Move into the folder that holds the Mecab installer so the next cell can run the script by its relative name.
cd /content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab

/content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab


In [0]:
# Run the Mecab-ko installer script; it is referenced by relative name, so it must be in the current working directory.
! bash install_mecab-ko_on_colab190912.sh

### Import packages

In [0]:
# data preprocessing
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# tokenizer
import re
from konlpy.tag import Mecab
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Flatten, Input, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from sklearn.metrics import roc_auc_score
import pickle
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings(action='ignore')

### Load Data

In [0]:
# Switch to the project root; the data/, model/ and submission/ paths used in later cells are relative to it.
cd /content/gdrive/My Drive/Colab Notebooks/smishing/

/content/gdrive/My Drive/Colab Notebooks/smishing


In [0]:
# Read the labelled training set, the public test set and the submission
# template (in that order) from the data/ folder of the working directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/public_test.csv',
        'data/submission_제출양식.csv',
    )
)
print(train.shape, test.shape, submission.shape)
train.head(3)

(295945, 4) (1626, 3) (1626, 2)


Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0


## Data Preparation


### Tokenizing

In [0]:
def text_preprocessing(text_list):
    """Turn raw messages into space-joined morphemes and morpheme bigrams.

    Each message is lower-cased, every character that is not a Hangul
    syllable or a-z is replaced by a space, and every run of 'x' is replaced
    by a space (this also removes any 'x' inside Latin words). The cleaned
    string is split into morphemes with Mecab and the morphemes listed in the
    stopword collection are dropped.

    Args:
        text_list: iterable of message strings.

    Returns:
        A pair (token_list, ngram_list) of lists aligned with the input.
        token_list holds the kept morphemes joined by single spaces;
        ngram_list holds each pair of consecutive kept morphemes written as
        'left.right', again joined by single spaces.
    """
    drop_words = {'을', '를', '이', '가', '은', '는', 'null'}
    non_text = re.compile('[^가-힣a-z]')
    x_run = re.compile('x{1,}')
    mecab = Mecab()

    unigram_docs = []
    bigram_docs = []
    for raw in text_list:
        cleaned = x_run.sub(' ', non_text.sub(' ', raw.lower()))
        morphs = [m for m in mecab.morphs(cleaned) if m not in drop_words]
        unigram_docs.append(' '.join(morphs))
        # Pair every morpheme with its right-hand neighbour.
        pairs = [left + '.' + right for left, right in zip(morphs, morphs[1:])]
        bigram_docs.append(' '.join(pairs))
    return unigram_docs, bigram_docs

### Sampling
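- Class counts: Counter({0: 277242, 1: 18703}), i.e. the smishing ratio is 18703 / 295945 ≈ 0.0632.
- Mixed sampling: the non-smishing class is undersampled (without replacement) and the smishing class is oversampled (with replacement).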

In [0]:
def data_sampling(col='text_', seed=1234, a=3, b=3):
    """Draw a rebalanced training sample from the global `train` frame.

    The number of smishing rows found in `train` is the base unit. The
    non-smishing class is undersampled to `a` times that unit (without
    replacement) and the smishing class is oversampled to `b` times that
    unit (with replacement). The combined index list is then shuffled.

    Args:
        col: name of the `train` column to return as the feature.
        seed: value passed to `random.seed` before each of the two draws.
        a: multiplier for the non-smishing sample size.
        b: multiplier for the smishing sample size.

    Returns:
        (train_X, train_y): two single-column DataFrames ('col' and
        'smishing') with a fresh 0..n-1 index, in the same shuffled order.

    Raises:
        ValueError: from `random.sample` if `a` times the smishing count
            exceeds the number of non-smishing rows.
    """
    train_nsm_idx = list(train[train['smishing'] == 0].index)
    train_sm_idx = list(train[train['smishing'] == 1].index)
    # Base unit derived from the data instead of the previously hard-coded
    # 18703, so the function keeps working if the training file changes.
    n_sm = len(train_sm_idx)

    random.seed(seed)
    train_nsm_idx = random.sample(train_nsm_idx, k=n_sm * a)
    # Re-seeding makes the smishing draw independent of how many values the
    # previous draw consumed.
    random.seed(seed)
    train_sm_idx = random.choices(train_sm_idx, k=n_sm * b)

    train_idx = train_nsm_idx + train_sm_idx
    print(train_idx[:5])
    random.shuffle(train_idx)
    print(train_idx[:5])

    # NOTE(review): index labels are used positionally here, which assumes
    # `train` still has its default 0..n-1 index — confirm if it is ever re-indexed.
    train_X = train[[col]].iloc[train_idx].reset_index(drop=True)
    train_y = train[['smishing']].iloc[train_idx].reset_index(drop=True)
    print(train_X.shape, train_y.shape)
    return train_X, train_y

### Word Embedding

In [0]:
def data_preparation(train_text, test_text, max_len=1000):
    """Encode train and test texts as fixed-length integer sequences.

    A Keras `Tokenizer` is fitted on the training texts only and then used
    to convert both text collections to integer sequences, which are padded
    or truncated to `max_len` with `pad_sequences`.

    Args:
        train_text: training texts; also the only source of the vocabulary.
        test_text: texts encoded with the vocabulary learned from training.
        max_len: length every sequence is padded/truncated to.

    Returns:
        (X_train, X_test, vocab_size) where vocab_size is the number of
        entries in the tokenizer's word index plus one.
    """
    # NOTE(review): the Tokenizer is created with its default `filters`; if
    # those strip '.', the 'left.right' bigram tokens produced upstream are
    # split back into single morphemes before indexing — confirm.
    word_indexer = Tokenizer()
    word_indexer.fit_on_texts(train_text)

    def to_padded(texts):
        # Same encoding path for both splits so they share one vocabulary.
        return pad_sequences(word_indexer.texts_to_sequences(texts), maxlen=max_len)

    vocab_size = 1 + len(word_indexer.word_index)
    print('vocab size: ', vocab_size)

    X_train, X_test = to_padded(train_text), to_padded(test_text)
    print("train & test shape: ", X_train.shape, X_test.shape)
    return X_train, X_test, vocab_size

### scoring for modeling

In [0]:
# Last AUC that could be computed; returned again whenever scoring fails.
# Initialised as float32 (not the Python int 0) so that the fallback value has
# the same dtype as the 'float32' output declared by the py_func wrapper `auc`.
auc_ = np.float32(0.0)
def auc_score(y_true, y_pred):
    """Return the ROC AUC of a batch, falling back to the previous value.

    Args:
        y_true: true labels, passed straight to `roc_auc_score`.
        y_pred: predicted scores, passed straight to `roc_auc_score`.

    Returns:
        The AUC as float32. If `roc_auc_score` raises ValueError, the
        module-level `auc_` is left untouched and its current value (the
        last successful score, or 0.0 before any success) is returned.
    """
    global auc_
    try:
        auc_ = roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32')
    except ValueError:
        # Deliberate best effort: a batch that cannot be scored (presumably
        # one containing a single class) must not abort training, so the
        # previously stored value is reported instead.
        pass
    return auc_

def auc( y_true, y_pred ) :
    """Metric wrapper that evaluates `auc_score` through `tf.py_func`.

    Args:
        y_true: tensor of true labels.
        y_pred: tensor of predicted scores.

    Returns:
        The tensor produced by `tf.py_func`, declared as 'float32' and
        registered under the op name 'sklearnAUC'.
    """
    def _delegate(labels, scores):
        # Looked up at call time, so a later redefinition of auc_score is honoured.
        return auc_score(labels, scores)

    return tf.py_func(_delegate, [y_true, y_pred], 'float32', stateful=False, name='sklearnAUC')

In [0]:
def model_save(model, mname):
    """Persist a model's architecture and weights under the model/ folder.

    Writes `model.to_json()` to 'model/<mname>.json' and calls
    `model.save_weights` with 'model/<mname>.h5'. Paths are relative to the
    current working directory.

    Args:
        model: object exposing `to_json()` and `save_weights(path)`.
        mname: file stem used for both output files.
    """
    import os  # local import keeps this cell self-contained

    # Create the target folder up front; otherwise a missing model/ directory
    # raises FileNotFoundError only after training has already finished.
    os.makedirs('model', exist_ok=True)

    with open('model/' + mname + '.json', 'w') as json_file:
        json_file.write(model.to_json())
    model.save_weights('model/' + mname + '.h5')

## Modeling

In [0]:
# Add two derived columns to each frame from text_preprocessing's two return
# values: 'text_' (space-joined morphemes) and 'ngram' (space-joined
# 'left.right' pairs of adjacent morphemes).
train['text_'], train['ngram'] = text_preprocessing(train.text)
test['text_'], test['ngram'] = text_preprocessing(test.text)

In [0]:
# Sample the 'ngram' feature at a 3:2 non-smishing:smishing ratio
# (a=3 undersampled units, b=2 oversampled units) with a fixed seed.
train_X_n, train_y_n = data_sampling(col='ngram', seed=15026912, a=3, b=2)

[206864, 218560, 111768, 152524, 170588]
[25559, 185452, 293634, 175839, 247866]
(93515, 1) (93515, 1)


In [0]:
# Encode train and test from the SAME feature column. The test split was
# previously encoded from 'text_' (plain morphemes) while the vocabulary and
# the training sequences come from 'ngram', so the model was scored on inputs
# built differently from the ones it was trained on.
X_train_n, X_test_n, vocab_size_n = data_preparation(train_X_n.ngram, test.ngram, max_len=800)

vocab size:  22607
train & test shape:  (93515, 800) (1626, 800)


### 1. BiLSTM - 2gram

In [0]:
def BiLSTM(vocab_size, max_len=800):
    """Build and compile the bidirectional-LSTM binary classifier.

    Layer stack, in order: 128-dim embedding, dropout 0.3, bidirectional
    LSTM with 64 units per direction, dropout 0.5, a 64-unit tanh dense layer
    with L2(0.001) kernel regularisation, and a single sigmoid output unit.
    The model is compiled with Adam, binary cross-entropy and the custom
    `auc` metric, and its summary is printed.

    Args:
        vocab_size: size of the embedding's input vocabulary.
        max_len: length of the input sequences.

    Returns:
        The compiled Sequential model.
    """
    stack = [
        Embedding(vocab_size, 128, input_length = max_len),
        Dropout(0.3),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(64, activation='tanh', kernel_regularizer = regularizers.l2(0.001)),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])
    model.summary()
    return model

In [0]:
# Train a fresh BiLSTM for 7 epochs with 20% of the sample held out for validation.
model1 = BiLSTM(vocab_size_n)
history = model1.fit(X_train_n, train_y_n, epochs=7, batch_size=128, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h (presumably UTC -> KST)
# plus a label. strftime replaces the former isoformat()[5:-10] slice, which cut the
# wrong span whenever the microsecond field was exactly 0 (isoformat omits it then).
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep7'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 800, 128)          2893696   
_________________________________________________________________
dropout_2 (Dropout)          (None, 800, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 3,000,833
Trainable params: 3,000,833
Non-trainable params: 0
____________________________________________

In [0]:
# Continue training the existing model1 for 3 more epochs (no re-initialisation).
model1.fit(X_train_n, train_y_n, epochs=3, batch_size=128, validation_split=0.2)
# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep10'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Train on 74812 samples, validate on 18703 samples
Epoch 1/3

In [0]:
# Continue training the existing model1 for 1 more epoch.
# NOTE(review): the run label below says 'ep7' although this cell only adds one
# epoch to whatever model1 already had — confirm the intended cumulative count.
model1.fit(X_train_n, train_y_n, epochs=1, batch_size=128, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep7'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Train on 119699 samples, validate on 29925 samples


In [0]:
# Continue training the existing model1 for 3 more epochs.
model1.fit(X_train_n, train_y_n, epochs=3, batch_size=128, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep10'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Train on 119699 samples, validate on 29925 samples
Epoch 1/3

KeyboardInterrupt: ignored

In [0]:
# Continue training the existing model1 for 2 more epochs, keeping the history.
# NOTE(review): the run label below says 'ep5'; it is not evident from this cell
# how many epochs model1 had already been trained for — confirm.
history2 = model1.fit(X_train_n, train_y_n, epochs=2, batch_size=128, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep5'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Train on 119699 samples, validate on 29925 samples
Epoch 1/2
Epoch 2/2
 18048/119699 [===>..........................] - ETA: 37:44 - loss: 1.2393e-05 - auc: 1.0000Train on 119699 samples, validate on 29925 samples
Epoch 1/2
Epoch 2/2


In [0]:
# Prediction/export only: relies on `model1` and `mname` left over from an
# earlier cell, so it cannot run on a fresh kernel by itself.
# NOTE(review): the recorded output is a NameError, presumably because one of
# those names was undefined in that session — confirm or drop this cell.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

NameError: ignored

In [0]:
# Train a fresh BiLSTM for 10 epochs with a batch size of 64.
model1 = BiLSTM(vocab_size_n)
history = model1.fit(X_train_n, train_y_n, epochs=10, batch_size=64, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_BiLSTM_2gram_ep10_b64'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=64)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)

Train on 119699 samples, validate on 29925 samples


### 2. ConvLSTM - 2gram

In [0]:
def ConvLSTM(vocab_size, max_len=800):
    """Build and compile the Conv1D + bidirectional-LSTM binary classifier.

    Layer stack, in order: 128-dim embedding, Conv1D (64 filters, kernel 3,
    ReLU), max pooling (size 2), dropout 0.2, bidirectional LSTM with 128
    units per direction, dropout 0.2, flatten, a 64-unit ReLU dense layer,
    dropout 0.5, and a single sigmoid output unit. The model is compiled
    with RMSprop, binary cross-entropy and the custom `auc` metric, and its
    summary is printed.

    Args:
        vocab_size: size of the embedding's input vocabulary.
        max_len: length of the input sequences.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=max_len))
    model.add(Conv1D(64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.2))
    # NOTE(review): the LSTM is created without return_sequences, so this
    # Flatten presumably has nothing left to flatten — kept to leave the
    # architecture unchanged; confirm and remove if redundant.
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[auc])
    model.summary()
    # Previously missing: without it the caller received None and the
    # subsequent `.fit` call failed.
    return model

In [0]:
# Train a fresh ConvLSTM for 5 epochs with 20% of the sample held out for validation.
model1 = ConvLSTM(vocab_size_n)
history = model1.fit(X_train_n, train_y_n, epochs=5, batch_size=128, validation_split=0.2)

# Run name = 'MM-DDTHH:MM' of the local clock shifted by +9 h plus a label.
# strftime replaces the former isoformat()[5:-10] slice, which cut the wrong
# span whenever the microsecond field was exactly 0.
mname = (datetime.now() + timedelta(hours=9)).strftime('%m-%dT%H:%M') + '_ConvLSTM_2gram_v1_ep5_'
model_save(model1, mname)

# Score the test set and write the submission file under the same run name.
y_pred = model1.predict(X_test_n, batch_size=128)
submission['smishing'] = y_pred
submission.to_csv('submission/'+mname+'.csv',index=False)