<a href="https://colab.research.google.com/github/euphoria96/KB_Smishing_dacon/blob/master/14%ED%9A%8C_%EB%8C%80%ED%9A%8C_%ED%9B%88%EB%A0%A8_%EC%BD%94%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dacon 14회 KB 금융문자 분석 모델링 경진대회
### euphoria
### 2020년 1월 17일

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd /content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab

/content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab


In [0]:
! bash install_mecab-ko_on_colab190912.sh

## 1. 라이브러리 및 데이터
### 1-1. Import libraries

In [0]:
# data preprocessing
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
from datetime import datetime, timedelta
import pickle
# text tokenizing
import re
from konlpy.tag import Mecab
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SpatialDropout1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import roc_auc_score

### 1-2. Settings for reproducible results
> ref: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development

In [0]:
#random.randint(0,99999999)
sd = 15026912

np.random.seed(sd)
random.seed(sd)
os.environ['PYTHONHASHSEED']=str(sd)

config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
tf.set_random_seed(sd)

sess = tf.Session(graph=tf.get_default_graph(), config=config)
K.set_session(sess)

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
def now():
    return datetime.now().isoformat()[5:-7].replace('T',' ')

### 1-3. Load data

In [5]:
os.chdir('/content/gdrive/My Drive/Colab Notebooks/smishing/') 
train = pd.read_csv('0_Data/train.csv')
print(train.shape)
train.head(3)

(295945, 4)


Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0


## 2. 데이터 전처리
### 2-1. Data Cleaning
- tokenizing using Mecab
- make bi-gram

In [0]:
def text_preprocessing(text_list):
    '''
    args: text list that wants to tokenize
    return: token list, bigram list
    '''
    stopwords = ['을', '를', '이', '가', '은', '는', 'null'] #제거: 불용어, 한글 영문 외 문자, XXX 등 비식별처리된 문자
    tokenizer = Mecab()
    token_list, bigram_list = [], []
    for text in text_list:
        txt = re.sub('[^가-힣a-z]',' ',text.lower())
        txt = re.sub('x{1,}',' ',txt)
        token = tokenizer.morphs(txt)  # tokenizing
        token = [t for t in token if t not in stopwords or type(t)!= float] # text cleaning
        token_list.append(' '.join(token))
        bigram = [token[i]+'.'+token[i+1] for i in range(len(token)-1)]  # bi-gram
        bigram_list.append(' '.join(bigram))
    return token_list, bigram_list

In [0]:
train['token_txt'], train['bigram'] = text_preprocessing(train.text)

### 2-2. Data Sampling
- Counter({0: 277242, 1: 18703}) #0.0631975535994864
- Data Imbalance Problem!
- Using Mixed Sampling

In [0]:
def train_data_sampling(train, seed=1234, a=3, b=3):
    '''
    args: train data, seed number, a(under sampling), b(over sampling)
    return: sampling index
    '''
    train_nsm_idx=list(train[train['smishing']==0].index)
    train_sm_idx=list(train[train['smishing']==1].index)
    random.seed(seed)
    train_nsm_idx = random.sample(train_nsm_idx, k=18703*a)
    random.seed(seed)
    train_sm_idx = random.choices(train_sm_idx, k=18703*b)
    train_idx = train_nsm_idx + train_sm_idx
    print(train_idx[:5])
    random.shuffle(train_idx)
    print(train_idx[:5])
    return train_idx

In [9]:
trn_idx = train_data_sampling(train, seed=sd, a=3, b=2)
df_train = train.iloc[trn_idx].reset_index(drop=True)
print(df_train.shape)

[206864, 218560, 111768, 152524, 170588]
[25559, 185452, 293634, 175839, 247866]
(93515, 6)


### 2-3. text pre-processing
- text to sequences with tf.keras

In [0]:
def save_tokenizer(tokenizer):
    mname = 'tokenizer_'+now()
    with open('1_Model/'+mname+'.pickle', 'wb') as f:
        pickle.dump(tokenizer, f, protocol = pickle.HIGHEST_PROTOCOL)

In [0]:
def text2sequence(train_text, max_len=1000):
    '''
    args: text of train data, max length(for word embedding)
    return: train data(for modeling), vocabulary size
    '''
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    save_tokenizer(tokenizer)
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab size: ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen=max_len)
    return X_train, vocab_size

In [12]:
train_y = df_train['smishing']
train_X, vocab_size = text2sequence(df_train['token_txt'], max_len=660)
train_X2, vocab_size2 = text2sequence(df_train['bigram'], max_len=660)
print(train_X.shape, train_y.shape)

vocab size:  22614
vocab size:  22614
(93515, 660) (93515,)


## 3. 탐색적 자료 분석 (EDA)

In [0]:
x1 = pd.DataFrame(pd.Series([len(x.split()) for x in train['text_']]).describe(), columns=['text_']).transpose()
x2 = pd.DataFrame(pd.Series([len(x.split()) for x in train['2gram']]).describe(), columns=['2gram']).transpose()
x3 = pd.DataFrame(pd.Series([len(x.split()) for x in train['3gram']]).describe(), columns=['3gram']).transpose()
pd.concat([x1,x2,x3], axis=0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
text_,295945.0,70.010012,90.041335,1.0,16.0,35.0,82.0,664.0
2gram,295945.0,69.010056,90.041308,1.0,15.0,34.0,81.0,663.0
3gram,295945.0,68.010191,90.041213,1.0,14.0,33.0,80.0,662.0


In [0]:
x1 = pd.DataFrame(pd.Series([len(x) for x in train['text_']]).describe(), columns=['text_']).transpose()
x2 = pd.DataFrame(pd.Series([len(x) for x in train['2gram']]).describe(), columns=['2gram']).transpose()
x3 = pd.DataFrame(pd.Series([len(x) for x in train['3gram']]).describe(), columns=['3gram']).transpose()
pd.concat([x1,x2,x3], axis=0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
text_,295945.0,188.277727,241.091119,1.0,43.0,94.0,220.0,1660.0
2gram,295945.0,371.484614,482.234776,1.0,82.0,183.0,435.0,3315.0
3gram,295945.0,549.332724,723.312468,1.0,114.0,266.0,645.0,4965.0


In [13]:
train.iloc[95753]

id                111663
year_month       2017-06
text          XXX6ㅛㅗ8ㅛㅗ8
smishing               0
token_txt               
bigram                  
Name: 95753, dtype: object

## 4. 변수 선택 및 모델 구축

## 5. 모델 학습 및 검증
### 5-1. Build Model
- score: AUC
- Bi-LSTM (Bi-Directional Long Short Term Memory)

In [0]:
auc_ = 0
def auc_score(y_true, y_pred):
    global auc_
    try:
        auc_ = roc_auc_score( y_true, y_pred, average='macro', sample_weight = None).astype('float32')
    except ValueError:
        pass
    return auc_

def auc(y_true, y_pred):
    score = tf.py_func( lambda y_true, y_pred : auc_score(y_true, y_pred) , [y_true, y_pred], 'float32', stateful = False, name = 'sklearnAUC' )
    return score

In [0]:
def BiLSTM(vocab_size, max_len=1000):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_len))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='tanh', kernel_regularizer = regularizers.l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])
    model.summary()
    return model

In [0]:
def model_save(model, mname):
    model_json = model.to_json()
    with open('1_Model/'+mname+'.json', 'w') as json_file : 
        json_file.write(model_json)
    model.save_weights('1_Model/'+mname+'.h5')

### 5-2. Train Model
#### (1) Unigram

In [0]:
print('start time: ', datetime.now().isoformat())
model1 = BiLSTM(vocab_size, max_len=600)
early_stopping = EarlyStopping(patience=2, min_delta=0.00005)
history = model1.fit(train_X, train_y, epochs=5, batch_size=128, validation_split=0.3, callbacks=[early_stopping])

mname = now()+'_BiLSTM_unigram_660'
model_save(model1, mname)
print('end time: ', datetime.now().isoformat())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 64)           1726272   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 600, 64)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 1,800,641
Trainable params: 1,800,641
Non-trainable params: 0
____________________________________________

#### (2) Bi-gram

In [0]:
print('start time: ', datetime.now().isoformat())
model2 = BiLSTM(vocab_size2, max_len=660)
early_stopping = EarlyStopping(patience=2, min_delta=0.00005)
history = model2.fit(train_X2, train_y, epochs=50, batch_size=128, validation_split=0.3, callbacks=[early_stopping])

mname = now()+'_BiLSTM_bigram_660'
model_save(model2, mname)
print('end time: ', datetime.now().isoformat())

start time:  2020-01-16T05:06:23.299477
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 660, 128)          2894592   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 660, 128)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               98816     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 3,001,729
Trainable params: 3,001,729
Non-trainable params: 0
______