<a href="https://colab.research.google.com/github/euphoria96/KB_Smishing_dacon/blob/master/Model8_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting
### Mounting Google Drive

In [0]:
# Mount Google Drive at /content/gdrive (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

### Install Mecab (Korean morphological analyzer)

In [2]:
# Change the working directory to the Mecab-ko installer folder on the mounted Drive.
# The bare `cd` relies on IPython automagic.
cd /content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab

/content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab


In [0]:
# Run the installer script found in the current working directory (IPython shell escape).
! bash install_mecab-ko_on_colab190912.sh

### Import packages

In [0]:
# data preprocessing
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# tokenizer
import re
from konlpy.tag import Mecab
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM, Bidirectional, Flatten, Input, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import roc_auc_score
import pickle
import time
import warnings
warnings.filterwarnings(action='ignore') 

### Load Data

In [2]:
# Project root on the mounted Drive; the data/, model/ and submission/ folders live below it.
path = '/content/gdrive/My Drive/Colab Notebooks/smishing/'

# Read the three CSV files in a fixed order: training set, public test set, submission template.
train, test, submission = (
    pd.read_csv(path + rel_path)
    for rel_path in ('data/train.csv', 'data/public_test.csv', 'data/submission_제출양식.csv')
)
print(train.shape, test.shape, submission.shape)
train.head(3)

(295945, 4) (1626, 3) (1626, 2)


Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0
3,4,2017-01,XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 ...,0
4,5,2017-01,1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다,0


## Data Preparation
### Sampling
- Train an ensemble of models, each on a differently resampled training set
- Class counts in `train`: non-smishing (0) = 277,242; smishing (1) = 18,703

In [0]:
# Seed used for the one-off shuffle of the non-smishing row labels.
RANDOM_SEED = 1234

# --- train: collect row labels per class ---
train_nsm_idx = train.index[train['smishing'] == 0].tolist()  # non-smishing rows
train_sm_idx = train.index[train['smishing'] == 1].tolist()   # smishing rows
# Shuffle the non-smishing labels once, reproducibly, so they can be sliced into folds later.
random.seed(RANDOM_SEED)
random.shuffle(train_nsm_idx)

# --- test: add a placeholder label column (value 2) and split into X / y frames ---
test['smishing'] = 2
test_X = test['text'].to_frame()
test_y = test['smishing'].to_frame()

In [0]:
def data_resampling(train, i, oversample=2, nsm_idx=None, sm_idx=None):
  """Build the i-th class-balanced resample of the training data.

  The (pre-shuffled) non-smishing row labels are cut into consecutive folds of
  ``oversample * len(sm_idx)`` labels. Fold ``i`` is paired with the same
  number of smishing labels drawn with replacement, so every fold is balanced.

  A fold that runs past the end of the non-smishing list (the last one) is
  topped up with random non-smishing labels until it reaches the full fold
  size. The previous hard-coded top-up (``3303*2``) left the last fold with
  fewer non-smishing than smishing rows.

  Args:
    train: DataFrame with 'text' and 'smishing' columns.
    i: zero-based fold number (must be >= 0).
    oversample: multiplier applied to the number of smishing rows to get the
      per-class fold size.
    nsm_idx: non-smishing row labels; defaults to the module-level
      ``train_nsm_idx``.
    sm_idx: smishing row labels; defaults to the module-level ``train_sm_idx``.

  Returns:
    (train_X, train_y): single-column DataFrames ('text' / 'smishing') with a
    fresh 0..n-1 index, rows in shuffled order.

  Raises:
    ValueError: if ``i`` is negative.
  """
  if i < 0:
    raise ValueError('fold number i must be >= 0')
  if nsm_idx is None:
    nsm_idx = train_nsm_idx
  if sm_idx is None:
    sm_idx = train_sm_idx

  fold_size = oversample * len(sm_idx)
  fold_nsm = list(nsm_idx[i * fold_size:(i + 1) * fold_size])

  # Top up a short fold so both classes contribute fold_size rows.
  shortfall = fold_size - len(fold_nsm)
  if shortfall > 0:
    if shortfall <= len(nsm_idx):
      fold_nsm += random.sample(nsm_idx, k=shortfall)
    else:
      # More filler needed than distinct labels exist: draw with replacement.
      fold_nsm += random.choices(nsm_idx, k=shortfall)

  fold_sm = random.choices(sm_idx, k=fold_size)  # oversample with replacement

  train_idx = fold_nsm + fold_sm  # merge index
  random.shuffle(train_idx)  # index shuffle
  print('resampled idx: ',train_idx[:5])

  # The labels come from train.index, so select by label (.loc) rather than by position.
  picked = train.loc[train_idx]
  train_X = picked[['text']].reset_index(drop=True)
  train_y = picked[['smishing']].reset_index(drop=True)
  return train_X, train_y

In [0]:
# Tokens dropped after morphological analysis.
stopwords = ['을', '를', '이', '가', '은', '는', 'null']

def tokenizing(text_list, tokenizer=None):
  """Clean and tokenize texts into space-joined morpheme strings.

  For each text: every character outside Hangul syllables and ASCII letters is
  replaced by a space, runs of capital 'X' are replaced by a space, the result
  is split with ``tokenizer.morphs`` and tokens listed in ``stopwords`` are
  dropped.

  Args:
    text_list: iterable of texts. Missing entries (None / NaN) are treated as
      empty text; other non-string values are converted with ``str``.
    tokenizer: object exposing ``morphs(str) -> list of str``. When omitted a
      new ``Mecab()`` is created; pass one in to reuse it across calls.

  Returns:
    List with one space-joined token string per input text.
  """
  if tokenizer is None:
    tokenizer = Mecab()
  # Compile the patterns once instead of once per text.
  non_letters = re.compile('[^가-힣a-zA-Z]')
  x_runs = re.compile('X{1,}')
  skip = set(stopwords)

  token_list = []
  for text in text_list:
    if not isinstance(text, str):
      # `text != text` is only true for NaN.
      text = '' if (text is None or text != text) else str(text)
    txt = x_runs.sub(' ', non_letters.sub(' ', text))
    kept = [t for t in tokenizer.morphs(txt) if t not in skip]
    token_list.append(' '.join(kept))
  return token_list

In [0]:
def data_preparation(train, test_X, i, max_len=1000):
  """Produce padded integer sequences for the i-th resampled training set and the test set.

  Steps: resample the training data (data_resampling), print summary statistics
  of the raw text lengths, tokenize both sets (tokenizing), fit a Keras
  Tokenizer on the training texts only, then convert and pad both sets to
  ``max_len``.

  Returns:
    (X_train, train_y, X_test, vocab_size), where vocab_size is the number of
    entries in the fitted word index plus one.
  """
  train_X, train_y = data_resampling(train, i)

  # Character-length statistics of the resampled raw texts.
  char_lengths = pd.DataFrame([len(x) for x in train_X.text])
  print(char_lengths.describe().transpose())

  train_X_txt = tokenizing(train_X.text)
  test_X_txt = tokenizing(test_X.text)

  # The vocabulary is learned from the training texts only.
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train_X_txt)
  vocab_size = len(tokenizer.word_index) + 1
  print('vocab size: ', vocab_size)

  X_train, X_test = (
      pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
      for texts in (train_X_txt, test_X_txt)
  )
  print("train data : ", X_train.shape, " test data : ", X_test.shape)
  return X_train, train_y, X_test, vocab_size

In [0]:
# Last AUC value that was computed successfully (starts at 1).
auc_score_ = 1

def auc_score(y_true, y_pred):
  """Return the ROC AUC of y_pred against y_true as float32.

  When roc_auc_score raises ValueError, the most recent successfully computed
  value (kept in the module-level ``auc_score_``) is returned instead.
  """
  global auc_score_
  try:
    fresh = roc_auc_score(y_true, y_pred, average='macro', sample_weight=None)
  except ValueError:
    return auc_score_
  auc_score_ = fresh.astype('float32')
  return auc_score_

def auc(y_true, y_pred):
    """Keras metric: evaluates auc_score on the tensors via tf.py_func and yields a float32 tensor."""
    return tf.py_func(
        func=auc_score,
        inp=[y_true, y_pred],
        Tout='float32',
        stateful=False,
        name='sklearnAUC',
    )

In [0]:
def model_save(model, mname, model_dir=None):
  """Save a model's architecture (JSON) and weights (HDF5) under one base name.

  Writes ``<model_dir>/<mname>.json`` from ``model.to_json()`` and
  ``<model_dir>/<mname>.h5`` via ``model.save_weights``. The directory is
  created first if it does not exist, so a missing folder no longer causes a
  failure after training has finished.

  Args:
    model: object exposing ``to_json()`` and ``save_weights(filepath)``.
    mname: base file name without extension.
    model_dir: target directory; defaults to ``path + 'model/'`` using the
      module-level ``path``.
  """
  import os  # local import keeps this cell independent of the notebook's import cell

  if model_dir is None:
    model_dir = path + 'model/'
  os.makedirs(model_dir, exist_ok=True)

  stem = os.path.join(model_dir, mname)
  with open(stem + '.json', 'w') as json_file:
    json_file.write(model.to_json())
  model.save_weights(stem + '.h5')

In [0]:
def build_model(max_len=1000):
  """Build, compile and summarize the bidirectional-LSTM binary classifier.

  Architecture: Embedding(vocab_size, 128) -> Bidirectional(LSTM(64)) ->
  Dense(64, relu) -> Dense(1, sigmoid); compiled with binary cross-entropy,
  rmsprop and the custom ``auc`` metric.

  NOTE: the embedding's vocabulary size is read from the module-level
  ``vocab_size``, which must be assigned before this function is called.

  Args:
    max_len: length of the padded input sequences.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      Embedding(vocab_size, 128, input_length=max_len),
      Bidirectional(LSTM(64)),
      Dense(64, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  model = Sequential(stack)

  model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[auc])
  model.summary()
  return model

## Ensemble Model
### With a bidirectional LSTM (BiLSTM)

In [0]:
N_FOLDS = 8    # number of resampled training sets, i.e. models in the ensemble
MAX_LEN = 800  # padded sequence length; must be the same for data preparation and the model
mname = 'Ensemble_BiLSTM_v1_ep3_'

fold_preds = {}
for i in range(N_FOLDS):
  print('========================== '+str(i+1)+' th prediction ==========================')
  # Release the Keras state left behind by the previous fold's model so it does
  # not accumulate over the folds.
  tf.keras.backend.clear_session()

  # vocab_size is deliberately assigned at module level: build_model reads it as a global.
  X_train, train_y, X_test, vocab_size = data_preparation(train, test_X, i, max_len=MAX_LEN)

  model1 = build_model(max_len=MAX_LEN)
  model1.fit(X_train, train_y, epochs=3, batch_size=128, validation_split=0.3)
  model_save(model1, mname+str(i))

  # predict() returns an (n, 1) array; flatten it so each fold becomes a plain 1-D column.
  fold_preds['pred'+str(i)] = model1.predict(X_test, batch_size=128).ravel()

# One column per fold, built in a single step.
prediction = pd.DataFrame(fold_preds)

submission['smishing'] = prediction.mean(axis=1) # ensemble with average
submission.to_csv(path+'submission/'+mname+'.csv',index=False)

resampled idx:  [176052, 101086, 70127, 141754, 233338]
     count        mean         std  min   25%    50%    75%     max
0  74812.0  467.523713  379.321034  1.0  73.0  408.0  877.0  1360.0
vocab size:  20169
train data :  (74812, 800)  test data :  (1626, 800)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 800, 128)          2581632   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,688,769
Trainable params: 2,688,769
Non-trainable params: 0
____________________________________________