# Import libraries

In [56]:
import os
import pandas as pd
import numpy as np
import random
import warnings
warnings.filterwarnings(action = 'ignore')
from datetime import datetime
import pickle

import re
from eunjeon import Mecab

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SpatialDropout1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import roc_auc_score

# 같은 결과 생성을 위한 설정

In [10]:
sd = random.randint(0,99999999)
print(sd)

np.random.seed(sd)
random.seed(sd)
os.environ['PYTHONHASHEED']=str(sd)

config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
tf.set_random_seed(sd)

sess = tf.Session(graph=tf.get_default_graph(), config=config)
K.set_session(sess)

tf.logging.set_verbosity(tf.logging.ERROR)

15828500


# 데이터 불러오기

In [11]:
train = pd.read_csv("C:/Users/ASUS/Documents/data_competition/KB_smishing_classification/data/train.csv", encoding = 'ANSI')
print(train.shape)

(295945, 4)


# 데이터 전처리

## Data Cleaning(소문자, 특수문자 제거, xxx제거, Bi-gram)

In [58]:
def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null']
    tokenizer = Mecab()
    bigram_list = []
    
    for text in text_list:
        txt = re.sub('[^가-힣a-z]', ' ', text.lower())
        txt = re.sub('x{1,}', ' ', txt)
        token = tokenizer.morphs(txt)
        token = [t for t in token if t not in stopwords or type(t) != float]
        bigram = [token[i] + '.' + token[i+1] for i in range(len(token) - 1)]
        bigram_list.append(' '.join(bigram))
        
    return bigram_list
        

In [59]:
train['bigram'] = text_preprocessing(train.text)

## Data sampling

In [61]:
from collections import Counter
Counter(train['smishing']) #target unbalancing

Counter({0: 277242, 1: 18703})

In [63]:
def train_data_sampling(train, seed=1234, a=3, b=3):
    
    train_nsm_idx=list(train[train['smishing']==0].index)
    train_sm_idx=list(train[train['smishing']==1].index)
    random.seed(seed)
    train_nsm_idx = random.sample(train_nsm_idx, k=18703*a)
    random.seed(seed)
    train_sm_idx = random.choices(train_sm_idx, k=18703*b)
    train_idx = train_nsm_idx + train_sm_idx
    print(train_idx[:5])
    random.shuffle(train_idx)
    print(train_idx[:5])
    return train_idx

In [64]:
trn_idx = train_data_sampling(train, seed=sd, a=3, b=2)
df_train = train.iloc[trn_idx].reset_index(drop=True)
print(df_train.shape)

[75413, 22902, 71920, 47563, 79410]
[210196, 98059, 36717, 271066, 188673]
(93515, 5)


## pre-processing(bi-gram -> sequence)

In [67]:
def save_tokenizer(tokenizer, mname):
    with open('C:/Users/ASUS/Documents/PreModel/Tokenizer/' + mname + '.pickle', 'wb') as f:
        pickle.dump(tokenizer, f, protocol = pickle.HIGHEST_PROTOCOL)

In [73]:
def text2sequence(train_text, max_len=1000):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    save_tokenizer(tokenizer, 'tokenizer')
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len)
    return X_train, vocab_size

In [74]:
train_y = df_train['smishing']
train_X, vocab_size = text2sequence(df_train['bigram'], max_len = 660)
print(train_X.shape, train_y.shape)

vocab_size :  22261
(93515, 660) (93515,)


In [75]:
pd.Series([len(x.split()) for x in train['bigram']]).describe()

count    295945.000000
mean         68.920658
std          89.861211
min           0.000000
25%          15.000000
50%          34.000000
75%          81.000000
max         664.000000
dtype: float64

# Modeling

In [76]:
auc_ = 0
def auc_score(y_true, y_pred):
    global auc_
    try:
        auc_ = roc_auc_score( y_true, y_pred, average='macro', sample_weight = None).astype('float32')
    except ValueError:
        pass
    return auc_

def auc(y_true, y_pred):
    score = tf.py_func( lambda y_true, y_pred : auc_score(y_true, y_pred) , [y_true, y_pred], 'float32', stateful = False, name = 'sklearnAUC' )
    return score

In [77]:
def BiLSTM(vocab_size, max_len=1000):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_len))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='tanh', kernel_regularizer = regularizers.l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])
    model.summary()
    return model

In [78]:
def model_save(model, mname):
    model_json = model.to_json()
    with open('C:/Users/ASUS/Documents/PreModel/Tokenizer/'+mname+'.json', 'w') as json_file : 
        json_file.write(model_json)
    model.save_weights('C:/Users/ASUS/Documents/PreModel/Tokenizer/'+mname+'.h5')

In [79]:
print('START TIME: ', datetime.now().isoformat())
model = BiLSTM(vocab_size, max_len=660)
early_stopping = EarlyStopping(patience=3, min_delta=0.00001)
history = model.fit(train_X, train_y, epochs=50, batch_size=128, validation_split=0.3, callbacks=[early_stopping])

model_save(model, 'model') # save trained model
print('END TIME: ', datetime.now().isoformat())

START TIME:  2020-02-24T15:04:30.734996
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 660, 128)          2849408   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 660, 128)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               98816     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,956,545
Trainable params: 2,956,545
Non-trainable params: 0
______

In [80]:
def text2sequence_test(tokenizer, test_text, max_len=1000):
    test_seq = tokenizer.texts_to_sequences(test_text)
    X_test = pad_sequences(test_seq, maxlen=max_len)
    return X_test

def get_prediction(test_file_path):
    '''
    Args: String
    Return: Pandas DataFrame    
    '''
    
    '''1. load test dataset'''
    test = pd.read_csv(test_file_path)
    
    '''2. load model and tokenizer'''
    with open('1_Model/tokenizer.pickle', 'rb') as f:
        tokenizer_test = pickle.load(f)
    with open('1_Model/model.json', 'r') as ff:
        json_model = ff.read()
    model_test = model_from_json(json_model)
    model_test.load_weights('1_Model/model.h5')
    
    '''3. test data preprocessing'''
    test['bigram'] = text_preprocessing(test.text)
    test_X = text2sequence_test(tokenizer_test, test['bigram'], max_len=660)
    
    model_test.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])

    '''4. prediction'''
    y_pred = model_test.predict(test_X, batch_size=128)
    
    '''5. make submission'''
    test['smishing'] = y_pred
    submission = test[['id','smishing']]
    #submission.to_csv('submission.csv',index=False)

    return submission