In [3]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN

In [6]:
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN 

In [7]:
# 필요없는 row 삭제
df = df.iloc[:, 0:2]
df.rename(columns={"v1": "mail", "v2": "content"}, inplace=True)
print(df)

      mail                                            content
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [8]:
# 메일 내용 토큰화, 길이 맞춤
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["content"])
x = tokenizer.texts_to_sequences(df["content"])
x = pad_sequences(x)
print(x)

[[   0    0    0 ...   58 4411  144]
 [   0    0    0 ...  470    6 1929]
 [   0    0    0 ...  659  389 2988]
 ...
 [   0    0    0 ...  105  250 8919]
 [   0    0    0 ...  200   12   47]
 [   0    0    0 ...    2   61  268]]


In [9]:
# y 데이터를 0 1로 분류
y = df["mail"].replace({"ham": 0, "spam": 1}).to_numpy()
print(y)

[0 0 1 ... 0 0 0]


In [129]:
# 모델 생성 - RNN 모델
inputDim = len(tokenizer.word_index) + 1 # 단어장 단어 총 갯수
embedDim = 32 # 단어 임베딩 출력 차원 수

model = Sequential()
model.add(Embedding(inputDim, embedDim)) # 단어 임베딩을 통해 밀집 벡터로 변환

# model.add(Flatten())
model.add(SimpleRNN(32))
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid")) # 출력: 이진분류 = sigmoid

model.compile(
    loss="binary_crossentropy",
    metrics=["acc"],
    optimizer="adam"
)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 32)          285472    
                                                                 
 simple_rnn_7 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 287,585
Trainable params: 287,585
Non-trainable params: 0
_________________________________________________________________


In [130]:
# 모델 학습
history = model.fit(x, y, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [137]:
# 모델 내보내기
model.save("model.h5")
print("done!")

done!


In [4]:
# 모델 불러오기
model = keras.models.load_model("model.h5")

In [12]:
# 예측

# 사기 메시지
test_content = "Congratulations! You can win a prize of $3,000,000 as the 1000th visitor to this site! Click this button and enter your address to claim your winnings!"

# 업무 메시지
test_content = "Hello, this is tom from the HR team. To prepare for the untact era, we asked for a date when we could attend the meeting in order to prepare a gradual expansion plan for working from home, but no one has responded yet, so we are requesting it again by e-mail. I would like to have a meeting sometime next Tuesday or Wednesday. If you see the mail, please reply. So have a nice day!"

test_x = tokenizer.texts_to_sequences([test_content])
test_x = pad_sequences(test_x)

res = model.predict(test_x)
res = res[0][0]
if round(res) == 0: # 확률을 반올림해서 판단
    print(f"정상적인 메일입니다! - 스팸일 확률: {res * 100}%")
else:
    print(f"스팸 메일입니다! - 스팸일 확률: {res * 100}%")

정상적인 메일입니다! - 스팸일 확률: 0.4925984889268875%
