# SimpleRNN을 이용한 SMS Spam 분류
    캐글 데이터: https://www.kaggle.com/uciml/sms-spam-collection-dataset
    "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv" 

In [None]:
import pandas as pd

# Raw CSV of the SMS spam dataset, fetched straight from GitHub.
download_url = (
    'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/'
    'main/10.%20RNN%20Text%20Classification/dataset/spam.csv'
)
# Decoded as Latin-1 (presumably because the file is not valid UTF-8 -- confirm).
df = pd.read_csv(download_url, encoding='latin1')
df.head(3)

### 데이터 전처리

In [None]:
# Keep only columns v1 and v2
# (alternative: drop the 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns).
df = df.loc[:, ['v1', 'v2']]
df.head(3)

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Duplicate check: compare the frame's shape with the number of distinct v2 values
(df.shape, df['v2'].nunique())

In [None]:
# Remove rows whose v2 value repeats an earlier row; df is modified in place.
df.drop_duplicates(subset='v2', inplace=True)
df.shape

In [None]:
# Encode the labels numerically: 'ham' -> 0, 'spam' -> 1.
# An explicit mapping plus an integer cast replaces Series.replace here:
# replace() relied on an implicit object -> int downcast that recent pandas
# deprecates, and without that downcast the labels stay object-typed.
# The identity entries (0 -> 0, 1 -> 1) keep the cell safe to re-run, and the
# cast fails fast if any label other than ham/spam is present.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
df['v1'] = df['v1'].map(label_codes).astype('int64')
df.head(3)

In [None]:
# Number of rows per v1 value (ham vs. spam)
df['v1'].value_counts()

In [None]:
# Inputs (x) come from column v2, targets (y) from column v1.
y = df['v1'].values
x = df['v2'].values
(x.shape, y.shape)

### 텍스트 전처리

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Remove punctuation and convert to lowercase
import re

# Matches every character that is not a lowercase ASCII letter, a digit or a space.
_DISALLOWED_CHARS = re.compile('[^a-z0-9 ]')


def preprocessing(s):
    """Return *s* reduced to lowercase ASCII letters, digits and spaces.

    Non-ASCII characters are dropped first (UTF-8 encode, then ASCII decode
    ignoring errors); the remainder is lowercased and every character other
    than a-z, 0-9 and the space is deleted without a replacement.
    """
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    return _DISALLOWED_CHARS.sub('', ascii_only.lower())

In [None]:
# Clean every message with preprocessing(); show the third one as a spot check.
X_data = list(map(preprocessing, x))
X_data[2]

In [None]:
# Build the vocabulary from the cleaned messages and check its size
t = Tokenizer()
t.fit_on_texts(X_data)
# NOTE(review): the +1 presumably accounts for index 0, which is used for
# padding rather than for a word -- confirm against the Keras Tokenizer docs.
vocab_size = len(t.word_index) + 1
vocab_size

In [None]:
# Convert each cleaned message to a sequence using the fitted tokenizer
sequences = t.texts_to_sequences(X_data)
print(sequences[2])  # spot-check the third message

In [None]:
# Length of the longest sequence
max_len = max(map(len, sequences))
max_len

In [None]:
# Bring every sequence in the dataset to length max_len, padding with zeros.
data = pad_sequences(sequences, maxlen=max_len)

- Train/test dataset 분리

In [None]:
import numpy as np
import tensorflow as tf

# One fixed seed, applied to both NumPy's and TensorFlow's random generators.
seed = 2022
tf.random.set_seed(seed)
np.random.seed(seed)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded data as a test set, stratified on the labels and
# seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=seed,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### 모델 정의/설정/학습

In [None]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Three-layer stack: 32-dimensional embedding over the vocabulary, a 32-unit
# SimpleRNN with tanh activation, and a single sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=max_len))
model.add(SimpleRNN(32, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy metric.
# Keyword arguments are required for portability: the positional order of
# compile() after `loss` differs between Keras versions (Keras 3 places
# `loss_weights` before `metrics`), so a positional ['accuracy'] can be taken
# as loss weights and no 'accuracy' entry would appear in the fit history.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# File the checkpoint callback writes to; only the best model so far is kept.
model_path = 'best-spam.h5'
early_stop = EarlyStopping(patience=10)
checkpoint = ModelCheckpoint(filepath=model_path, save_best_only=True, verbose=1)

In [None]:
# Train for at most 100 epochs in batches of 64, holding back 20% of the
# training data for validation; the checkpoint and early-stopping callbacks
# run during training.
hist = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    callbacks=[checkpoint, early_stop],
)

In [None]:
# Reload the model stored at model_path (the file the checkpoint callback
# writes to) and evaluate it on the held-out test split.
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)

- 훈련과정 시각화

In [None]:
# Per-epoch training accuracy and validation loss, plus a 1-based epoch axis.
y_acc, y_vloss = (hist.history[key] for key in ('accuracy', 'val_loss'))
xs = np.arange(1, 1 + len(y_acc))

In [None]:
import matplotlib.pyplot as plt

# Draw training accuracy and validation loss against the epoch number.
plt.figure(figsize=(12, 8))
for series, legend_label in ((y_acc, 'train accuracy'), (y_vloss, 'validation loss')):
    plt.plot(xs, series, label=legend_label)
plt.xlabel('Epoch')
plt.grid()
plt.legend()
plt.show()