# RNN을 이용한 SMS Spam 분류
- 캐글에서 제공하는 [스팸메일 데이터](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [1]:
import numpy as np
import tensorflow as tf

# Use one shared seed for both the NumPy and the TensorFlow random generators;
# the same value is reused later as the random_state of the train/test split.
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from google.colab import files

# Ask the user to upload the dataset through Colab; the keys of the returned
# mapping are used below as file names.
uploaded = files.upload()
# Keep the name of the first uploaded file (a single CSV is expected).
uploaded_names = list(uploaded)
filename = uploaded_names[0]

Saving spam.csv to spam (2).csv


### 데이터 전처리

In [4]:
import pandas as pd

# Read the uploaded CSV, decoding it explicitly as latin1, and preview the
# first three rows to inspect the raw column layout.
df = pd.read_csv(filename, encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [5]:
# Drop the three unnamed columns (empty in the previewed rows) so that only
# the label column (v1) and the message text column (v2) remain.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
# Encode the labels as integers: ham -> 0, spam -> 1.
# An explicit mapping avoids the implicit object->int downcast that
# Series.replace performs (deprecated in recent pandas). The int64 cast makes
# any unexpected label fail loudly (it would map to NaN) instead of silently
# staying in the column as a string.
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1}).astype('int64')
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [6]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   int64 
 1   v2      5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [7]:
# Check for null values: count the missing entries in each column.
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [8]:
# Check for duplicated messages: a distinct-text count smaller than the
# number of rows means some messages are repeated.
df['v2'].nunique()

5169

In [9]:
# Remove duplicated messages, keeping the first occurrence of each text.
df = df.drop_duplicates(subset=['v2'], keep='first')

In [10]:
# Class balance after de-duplication (0 = ham, 1 = spam).
df['v1'].value_counts()

0    4516
1     653
Name: v1, dtype: int64

In [11]:
# Pull the raw message texts and their integer labels out as NumPy arrays.
X = df['v2'].to_numpy()
y = df['v1'].to_numpy()

In [12]:
# Show one raw message to see what the text looks like before cleaning.
X[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [13]:
# 구두점 제거, 소문자로 변환
from string import punctuation
def preprocessing(s):
    """Normalise one message for tokenisation.

    Non-ASCII characters are dropped, every character listed in
    ``string.punctuation`` is deleted (not replaced by a space, so the text on
    either side of a punctuation mark is joined), and the result is lowercased.

    Args:
        s: The raw message text.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    # Round-trip through bytes so that anything outside ASCII is discarded.
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    # A translation table with only a "delete" set strips punctuation in one pass.
    strip_punct = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punct).lower()

In [14]:
# Clean every message, then preview the first three results.
X_punct = list(map(preprocessing, X))
X_punct[:3]

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s']

In [15]:
# Build the vocabulary from the cleaned messages and report its size.
t = Tokenizer()
t.fit_on_texts(X_punct)
# Convert every message into its list of integer word indices.
sequences = t.texts_to_sequences(X_punct)
# One more than the number of known words — presumably so that index 0 stays
# free for the padding value; this is later used as the Embedding input size.
vocab_size = len(t.word_index) + 1
print(f'단어 집합의 크기: {vocab_size}')

단어 집합의 크기: 9480


### 전체 단어 집합에서 희소 단어의 비율

In [16]:
# Alias for the tokenizer's word -> index mapping, used below for the vocabulary size.
word_to_index = t.word_index

In [17]:
# Words occurring fewer than `threshold` times are treated as rare.
threshold = 2
total_cnt = len(word_to_index)  # number of distinct words in the vocabulary

# Per-word occurrence counts collected by the tokenizer.
word_frequencies = list(t.word_counts.values())
# Occurrence counts of only those words that fall below the threshold.
rare_frequencies = [freq for freq in word_frequencies if freq < threshold]

total_freq = sum(word_frequencies)  # total number of word occurrences
rare_cnt = len(rare_frequencies)    # how many distinct words are rare
rare_freq = sum(rare_frequencies)   # occurrences contributed by rare words

print(f'등장빈도가 {threshold-1}번 이하인 희귀단어의 수: {rare_cnt}')
print("단어 집합(vocabulary)에서 희귀단어의 비율:",(rare_cnt/total_cnt)*100)
print("전체 등장빈도에서 희귀단어 등장빈도 비율:",(rare_freq/total_freq)*100)

등장빈도가 1번 이하인 희귀단어의 수: 5644
단어 집합(vocabulary)에서 희귀단어의 비율: 59.542145795970036
전체 등장빈도에서 희귀단어 등장빈도 비율: 7.235897435897436


In [18]:
X_data = sequences
# Token count of every encoded message, computed once and reused below.
sequence_lengths = [len(seq) for seq in X_data]
# The longest message sets the padded sequence length used further down.
max_len = max(sequence_lengths)
print('SMS의 최대 길이 : %d' % max_len)
print('SMS의 평균 길이 : %f' % (sum(sequence_lengths)/len(X_data)))

SMS의 최대 길이 : 171
SMS의 평균 길이 : 15.089959


In [19]:
# Pad every sequence in the dataset to max_len. The library defaults are
# spelled out: padding (and any truncation) is applied at the front.
data = pad_sequences(
    X_data,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)
data.shape

(5169, 171)

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing; stratifying on y keeps the ham/spam
# ratio the same in both parts, and the shared seed fixes the shuffle.
split_parts = train_test_split(
    data, y, test_size=0.2, random_state=seed, stratify=y
)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape

((4135, 171), (1034, 171))

### 모델 정의/설정/학습
- Embedding: 32차원
- SimpleRNN: 32 노드

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [22]:
# Binary spam classifier assembled layer by layer:
# 32-dim word embedding -> 32-unit simple RNN -> single sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32))
model.add(SimpleRNN(units=32))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          303360    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 305,473
Trainable params: 305,473
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [24]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# Where the best model seen during training is written.
model_path = 'best-sms-rnn.hdf5'
# Save the model only when the validation loss improves.
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Stop training after 10 epochs without a better validation loss.
early_stopping = EarlyStopping(patience=10, monitor='val_loss')

In [25]:
# Train for at most 50 epochs, validating on 20% of the training data; the
# callbacks save the best model and stop early once validation loss stalls.
training_callbacks = [checkpoint, early_stopping]
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=60,
    epochs=50,
    verbose=0,
    validation_split=0.2,
    callbacks=training_callbacks,
)


Epoch 00001: val_loss improved from inf to 0.24384, saving model to best-sms-rnn.hdf5

Epoch 00002: val_loss improved from 0.24384 to 0.07921, saving model to best-sms-rnn.hdf5

Epoch 00003: val_loss improved from 0.07921 to 0.07195, saving model to best-sms-rnn.hdf5

Epoch 00004: val_loss improved from 0.07195 to 0.06067, saving model to best-sms-rnn.hdf5

Epoch 00005: val_loss did not improve from 0.06067

Epoch 00006: val_loss did not improve from 0.06067

Epoch 00007: val_loss did not improve from 0.06067

Epoch 00008: val_loss did not improve from 0.06067

Epoch 00009: val_loss did not improve from 0.06067

Epoch 00010: val_loss did not improve from 0.06067

Epoch 00011: val_loss did not improve from 0.06067

Epoch 00012: val_loss did not improve from 0.06067

Epoch 00013: val_loss did not improve from 0.06067

Epoch 00014: val_loss did not improve from 0.06067


In [26]:
from tensorflow.keras.models import load_model
# Reload the model saved by the checkpoint callback (the one with the lowest
# validation loss) rather than using the weights from the last epoch.
best_model = load_model(model_path)

In [27]:
# Accuracy on the held-out test set.
# NOTE: despite its name, `acc` holds the full evaluate() result; index 1 is
# read as the accuracy metric (index 0 is presumably the loss — confirm).
acc = best_model.evaluate(X_test, y_test)
print(f'정확도: {acc[1]:.4f}')

정확도: 0.9836
