<a href="https://colab.research.google.com/github/euphoria96/KB_Smishing_dacon/blob/master/Modeling_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting
### Mounting Google Drive

In [0]:
# Mount Google Drive into the Colab VM; every path used later in this notebook
# lives under /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

### Install Mecab
현재 다양한 자연어 처리 패키지 중에서 mecab는 윈도우에서는 설치가 힘든 패키지 중 하나이다.

이를 극복하기 위해, colab에서 mecab를 설치하고 활용할 수 있도록 colab 파일을 공유하고자 한다.

In [2]:
cd /content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab

/content/gdrive/My Drive/Colab Notebooks/smishing/Mecab-ko-for-Google-Colab


In [0]:
# Run the install script from the current directory (set by the previous cell).
# NOTE(review): judging by its file name it installs mecab-ko for Colab — confirm.
! bash install_mecab-ko_on_colab190912.sh

### Import packages

In [0]:
# data preprocessing
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# tokenizer
import re
from konlpy.tag import Mecab
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM
from tensorflow.keras.models import Sequential
from sklearn.metrics import roc_auc_score
import pickle
import warnings
warnings.filterwarnings(action='ignore') 

### Load Data

In [6]:
# Read the three competition CSV files from the shared data folder on Drive.
path = '/content/gdrive/My Drive/Colab Notebooks/smishing/data/'
train, test, submission = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "public_test.csv", "submission_제출양식.csv")
)
# Quick sanity check of the row/column counts, then preview the training rows.
print(train.shape, test.shape, submission.shape)
train.head()

(295945, 4) (1626, 3) (1626, 2)


Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0
3,4,2017-01,XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 ...,0
4,5,2017-01,1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다,0


## Data Preparation
### Sampling
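- Class counts: `Counter({0: 277242, 1: 18703})` — smishing messages are only about 6.32% of the data (18703 / 295945 ≈ 0.0632).
- Mixed sampling: under-sample the non-smishing class (without replacement) and over-sample the smishing class (with replacement) to 18703 × 3 = 56,109 rows each.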

In [36]:
RANDOM_SEED = 1234
# Each class is resampled to (number of smishing rows) * OVERSAMPLE_FACTOR rows.
OVERSAMPLE_FACTOR = 3

## for train data
# Index labels of each class.
train_nsm_idx = list(train[train['smishing']==0].index)
train_sm_idx = list(train[train['smishing']==1].index)

# Mixed sampling: derive the target size from the data instead of hard-coding
# the minority count, so the cell stays correct if the training file changes.
n_per_class = len(train_sm_idx) * OVERSAMPLE_FACTOR
# Under-sample the majority class without replacement ...
random.seed(RANDOM_SEED)
train_nsm_idx = random.sample(train_nsm_idx, k=n_per_class)
# ... and over-sample the minority class with replacement.
# NOTE(review): because duplicates are created before model.fit's
# validation_split, copies of the same smishing text can land in both the
# training and the validation part, which inflates validation scores.
random.seed(RANDOM_SEED)
train_sm_idx = random.choices(train_sm_idx, k=n_per_class)

# Concatenate both classes and shuffle; the prints show the order before/after.
train_idx = train_nsm_idx + train_sm_idx
print(train_idx[:5])
random.shuffle(train_idx)
print(train_idx[:5])

# Build feature/label frames. The collected values are index labels, so select
# with .loc (identical to .iloc for the default RangeIndex produced by read_csv).
train_X = train.loc[train_idx, ['text']].reset_index(drop=True)
train_y = train.loc[train_idx, ['smishing']].reset_index(drop=True)
print(train_X.shape, train_y.shape)

[242186, 63505, 4255, 49550, 19561]
[187347, 140625, 240425, 47135, 243703]
(112218, 1) (112218, 1)


In [37]:
## for test data
# Add a placeholder label (2) so the test frame has the same layout as the
# training data; per the original author's note it is meant to be replaced by
# the predicted smishing probability later.
test['smishing'] = 2
test_X = test['text'].to_frame()
test_y = test['smishing'].to_frame()
print(test_X.shape, test_y.shape)

(1626, 1) (1626, 1)


### Tokenizing

In [38]:
# Character length of every sampled training message.
tmp = list(map(len, train_X.text))
max_chars = max(tmp)
mean_chars = sum(tmp) / len(tmp)
print('문자최대길이: ', max_chars)
print('문자평균길이: ', mean_chars)

문자최대길이:  1478
문자평균길이:  467.7582027838671


In [0]:
# Tokens dropped after morphological analysis.
stopwords = ['을', '를', '이', '가', '은', '는', 'null']

def tokenizing(text_list):
  """Clean each text, split it into Mecab morphemes and drop stopwords.

  Args:
    text_list: iterable of strings.

  Returns:
    A list with one string per input text: the kept morphemes joined by a
    single space.
  """
  tokenizer = Mecab()

  def _clean_and_join(text):
    # Replace every character other than Hangul syllables and ASCII letters
    # with a space, then replace each run of 'X' with a space (the data shown
    # in this notebook uses "XXX" as a masking placeholder).
    txt = re.sub('X{1,}', ' ', re.sub('[^가-힣a-zA-Z]', ' ', text))
    kept = [t for t in tokenizer.morphs(txt) if t not in stopwords]
    return ' '.join(kept)

  return [_clean_and_join(text) for text in text_list]

In [55]:
# Tokenize the sampled training texts and the public test texts.
train_X_txt = tokenizing(train_X.text)
test_X_txt = tokenizing(test_X.text)
# Report the sizes of what this cell produced. The padded arrays X_train /
# X_test are only created in a later cell, so referencing them here fails on a
# fresh kernel (Restart & Run All).
print(len(train_X_txt), len(train_y), len(test_X_txt))

112218 112218 1626


In [56]:
# Show one raw message followed by its tokenized form.
idx = 0
print(train_X.text[idx], train_X_txt[idx], sep='\n')

2018년 새해가 밝았습니다.올 한 해에도 원하시는 일   모두 이루시고늘 행복하고 즐거운 일이   가득하길 바랍니다.새해 복 많이 받으세요.XXX은행과천XXX올림
년 새해 밝 았 습니다 올 한 해 에 도 원 하 시 일 모두 이루 시 고 늘 행복 하 고 즐거운 일 가득 하 길 바랍니다 새해 복 많이 받 으세요 은행 과 천 올림


In [58]:
# Fit the word index on the training corpus only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X_txt)
word_to_index = tokenizer.word_index
# One extra slot on top of the word index entries (used as the Embedding
# input dimension below).
vocab_size = len(word_to_index) + 1
print('vocab size: ', vocab_size)

# Every sequence is padded/truncated to this common length.
max_len = 1000

# Training texts -> integer sequences -> fixed-length matrix.
train_X_seq = tokenizer.texts_to_sequences(train_X_txt)
X_train = pad_sequences(train_X_seq, maxlen=max_len)
print("train data shape: ", X_train.shape)

# Test texts are encoded with the same, already fitted tokenizer.
test_X_seq = tokenizer.texts_to_sequences(test_X_txt)
X_test = pad_sequences(test_X_seq, maxlen=max_len)
print("test data shape: ", X_test.shape)

vocab size:  22637
train data shape:  (112218, 1000)
test data shape:  (1626, 1000)


## Modeling

In [0]:
def auc_score(y_true, y_pred):
  """Compute ROC AUC with scikit-learn and return it as a float32 scalar.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted scores.

  Returns:
    The macro-averaged ROC AUC as float32. If roc_auc_score raises ValueError
    (for example when a batch contains a single class), 0.5 is returned
    instead, so the metric never fails and never yields NaN.
  """
  try:
    return roc_auc_score(y_true, y_pred, average='macro', sample_weight=None).astype('float32')
  except ValueError:
    # Previously this branch fell through to `return auc` with `auc` unbound,
    # turning the ValueError into an UnboundLocalError.
    return np.float32(0.5)

def auc( y_true, y_pred ) :
    """Keras-compatible metric that wraps the scikit-learn based auc_score.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A float32 tensor holding the value computed by auc_score.
    """
    # tf.py_func is deprecated in TF 1.x and absent from the TF 2.x top-level
    # namespace; tf.numpy_function is its replacement and is available in both.
    score = tf.numpy_function(auc_score, [y_true, y_pred], tf.float32, name='sklearnAUC')
    return score

### 2. LSTM
##### (1) train

In [0]:
# Drive folder where model artifacts are written (the model JSON is saved here after training).
mpath = '/content/gdrive/My Drive/Colab Notebooks/smishing/model/'

In [84]:
# Embedding -> LSTM -> small dense head with one sigmoid output
# (binary classification).
model = Sequential([
    Embedding(vocab_size, 64),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss; the custom `auc` function is tracked as a metric.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[auc])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 64)          1448768   
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 1,483,905
Trainable params: 1,483,905
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train with a 20% validation split and keep the per-epoch history.
fit_options = dict(epochs=5, batch_size=256, validation_split=0.2)
history = model.fit(X_train, train_y, **fit_options)

# Persist the model architecture as JSON (to_json output only).
model_json = model.to_json()
with open(mpath+"LSTM_v3.json", "w") as json_file:
    json_file.write(model_json)

Train on 89774 samples, validate on 22444 samples
Epoch 1/5
20736/89774 [=====>........................] - ETA: 7:32 - loss: 0.1290 - auc: 0.9928

In [0]:
# Quick submission with hard 0/1 labels.
# `spath` is defined here because this cell runs before the later submission
# cell that also sets it; without it a fresh kernel raises NameError.
spath = '/content/gdrive/My Drive/Colab Notebooks/smishing/submission/'
# Sequential.predict_classes is removed in newer Keras releases; for a single
# sigmoid output, thresholding the predicted probability at 0.5 is equivalent.
y_pred = (model.predict(X_test, batch_size=128) > 0.5).astype('int32')
print(y_pred[:4])
submission['smishing'] = y_pred
submission.to_csv(spath+"LSTM_v3.csv",index=False)

##### (2) validation

In [0]:
# The public test set has no real labels (test_y is a placeholder) and `y_test`
# is never defined, so evaluate on the validation part of the training data.
# Keras' validation_split holds out the last fraction of the samples; 0.2 must
# match the value passed to model.fit.
val_start = int(len(X_train) * (1 - 0.2))
X_val = X_train[val_start:]
y_val = train_y.iloc[val_start:]
scores = model.evaluate(X_val, y_val, verbose=0)
# scores[0] is the loss; scores[1] is the only compiled metric (auc), not accuracy.
print("검증 AUC: %.4f" % scores[1])

In [0]:
# Plot training vs. validation loss per epoch.
# The model is compiled with metrics=[auc], so the history has no 'accuracy'
# key; the epoch count is taken from 'loss', which is always recorded.
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'])
plt.plot(epochs, history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

##### (3) find the best

In [0]:
# Model outputs for the public test set (sigmoid output layer); this overwrites
# the `y_pred` produced by the earlier class-prediction cell.
y_pred = model.predict(X_test, batch_size=64)
print(y_pred.shape)

### Submission
test data에서 각 text당 스미싱 확률을 구할 수 있으니 이를 제출양식의 smishing 변수에 넣고 csv 파일로 내보낸다

In [0]:
# Folder on Drive that collects submission files.
spath = '/content/gdrive/My Drive/Colab Notebooks/smishing/submission/'
# Fill the submission template with the predictions and export it as CSV.
submission['smishing'] = y_pred
submission.to_csv(spath+"LSTM_1.csv",index=False)

In [75]:
# Read the exported file back and preview the first rows as a sanity check.
s = pd.read_csv(spath+'LSTM_1.csv')
s.head()

Unnamed: 0,id,smishing
0,340000,0.0
1,340001,0.0
2,340002,0.0
3,340003,0.0
4,340004,0.0
