## SimpleRNN을 이용한 SMS Spam 분류
- 캐글 데이터 : https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset (SMS Spam Collection Dataset)

In [1]:
import pandas as pd
# Raw GitHub URL of the SMS spam CSV file; the next cell reads it with pandas.
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [2]:
# Load the raw CSV straight from the URL. The file is decoded as Latin-1
# (presumably because it is not valid UTF-8 -- confirm against the source file).
df = pd.read_csv(filepath_or_buffer=url, encoding='latin1')
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 전처리

In [3]:
# Selection: keep only the label column (v1) and the message text column (v2).
# An explicit copy is taken so that the later clean-up cells (duplicate removal,
# label encoding) modify an independent frame rather than a slice of the raw
# one, which is what makes pandas emit SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

In [4]:
# Check for missing values: count NaNs per column, then add the counts up
# into one grand total for the whole frame.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [5]:
# Check for duplicated messages: show the frame's shape next to the number of
# distinct message texts; a smaller second number means repeated rows exist.
df.shape, df['v2'].nunique()

((5572, 2), 5169)

In [6]:
# Remove rows whose message text (v2) is repeated; pandas keeps the first
# occurrence of each text by default.
# The result is rebound to `df` instead of using inplace=True: mutating in
# place a frame that was produced by column selection can raise
# SettingWithCopyWarning, and inplace gives no benefit here.
df = df.drop_duplicates(subset=['v2'])
df.shape

(5169, 2)

In [7]:
# Encode the labels: ['ham', 'spam'] --> [0, 1]
#
# - `assign` builds a new frame instead of setting an attribute (`df.v1 = ...`)
#   on a frame derived from a slice, which avoids SettingWithCopyWarning.
# - `astype('int64')` makes the integer dtype explicit. Newer pandas releases
#   deprecate the silent object -> int downcast that `replace` used to perform;
#   without the cast the column would stay object-typed and the label array
#   handed to Keras later would not be numeric.
# - The cell stays safe to re-run: replacing on already-encoded 0/1 values is a
#   no-op, while any unexpected label string makes the cast fail loudly.
label_codes = {'ham': 0, 'spam': 1}
df = df.assign(v1=df['v1'].replace(label_codes).astype('int64'))
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [8]:
# Split the frame into model inputs and targets:
# x holds the message texts (v2), y holds the 0/1 labels (v1).
x = df['v2'].values
y = df['v1'].values

- 텍스트 전처리

In [9]:
import re
# Normalise the text: lower-case every message, then turn each character that
# is not a-z or 0-9 into a single space (punctuation, symbols, whitespace).
non_alnum = re.compile('[^a-z0-9]')
X_data = [non_alnum.sub(' ', message.lower()) for message in x]
# Preview the first three cleaned messages.
X_data[:3]

['go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   ',
 'ok lar    joking wif u oni   ',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005  text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s']

In [10]:
# Build the vocabulary and check its size
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

t = Tokenizer()
t.fit_on_texts(texts=X_data)
# One slot more than the number of learned words: word indices start at 1,
# and index 0 is left free for the zero padding applied further down.
vocab_size = 1 + len(t.word_index)
vocab_size

8659

In [11]:
# Convert every cleaned message into its list of vocabulary indices,
# then print the first encoded message as a sanity check.
sequences = t.texts_to_sequences(texts=X_data)
print(sequences[0])

[50, 426, 3927, 764, 694, 653, 70, 8, 1174, 96, 127, 413, 1175, 145, 2639, 1176, 63, 60, 3928, 129]


In [12]:
# Length (in tokens) of the longest encoded message; used as the padding target.
max_len = max(map(len, sequences))
max_len

190

In [13]:
# Zero-pad every sequence so that all of them have length max_len.
# `sequences` is rebound from a list of lists to the padded 2-D array.
sequences = pad_sequences(sequences, maxlen=max_len)

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded data as a test set. The split is stratified on the
# labels so both parts keep the same ham/spam ratio, and the random state is
# fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)
# Shapes of the four resulting arrays, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135, 190), (1034, 190), (4135,), (1034,))

#### 모델 정의/설정/학습

In [16]:
import numpy as np
import tensorflow as tf
# Fix the global TensorFlow and NumPy seeds to the same value before the model
# is created, so that repeated runs start from the same random state.
seed = 2023
tf.random.set_seed(seed)
np.random.seed(seed)

In [17]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [18]:
# Binary spam classifier, assembled layer by layer:
#   token indices -> 32-dim embeddings -> SimpleRNN (32 units) -> sigmoid output
model = Sequential()
# Embedding vector size: 32; one row per vocabulary index, inputs of length max_len.
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
# Number of SimpleRNN units: 32.
model.add(SimpleRNN(units=32))
# Single sigmoid unit for the two-class ham/spam decision.
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 190, 32)           277088    
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 279201 (1.07 MB)
Trainable params: 279201 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
# Configure training. Arguments are passed by keyword on purpose: the positional
# order of `compile` is not the same across Keras versions (in Keras 3 the third
# positional parameter is `loss_weights`, not `metrics`), so a positional
# ['accuracy'] could silently be bound to the wrong parameter.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Checkpoint file for the best model seen so far; it is reloaded for evaluation.
model_path = 'best-spam-rnn.h5'
# Save only when the monitored validation loss improves, and log each save.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training once validation loss has not improved for 10 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=10)

In [22]:
# Train for up to 100 epochs in batches of 64, validating on 20% of the
# training data. Per-epoch progress output is silenced (verbose=0); the
# checkpoint callback still reports saves and early stopping may end the run
# before the epoch limit.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=0,
    validation_split=0.2,
    callbacks=[mc, es],
)


Epoch 1: val_loss improved from inf to 0.13310, saving model to best-spam-rnn.h5


  saving_api.save_model(



Epoch 2: val_loss improved from 0.13310 to 0.08850, saving model to best-spam-rnn.h5

Epoch 3: val_loss improved from 0.08850 to 0.05742, saving model to best-spam-rnn.h5

Epoch 4: val_loss improved from 0.05742 to 0.05341, saving model to best-spam-rnn.h5

Epoch 5: val_loss improved from 0.05341 to 0.05099, saving model to best-spam-rnn.h5

Epoch 6: val_loss did not improve from 0.05099

Epoch 7: val_loss improved from 0.05099 to 0.04513, saving model to best-spam-rnn.h5

Epoch 8: val_loss did not improve from 0.04513

Epoch 9: val_loss improved from 0.04513 to 0.03848, saving model to best-spam-rnn.h5

Epoch 10: val_loss did not improve from 0.03848

Epoch 11: val_loss did not improve from 0.03848

Epoch 12: val_loss did not improve from 0.03848

Epoch 13: val_loss did not improve from 0.03848

Epoch 14: val_loss did not improve from 0.03848

Epoch 15: val_loss did not improve from 0.03848

Epoch 16: val_loss did not improve from 0.03848

Epoch 17: val_loss did not improve from 0.03

In [23]:
# Reload the best checkpoint written during training and score it on the
# held-out test split; the result is [loss, accuracy].
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.07456034421920776, 0.978723406791687]