In [None]:
import numpy as np
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt

import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Download the SMS spam CSV into the working directory and load it.
# latin1 is passed explicitly (presumably the file is not valid UTF-8).
DATA_URL = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"
DATA_FILE = "spam.csv"

urllib.request.urlretrieve(DATA_URL, filename=DATA_FILE)
data = pd.read_csv(DATA_FILE, encoding="latin1")

In [None]:
# Report how many rows were loaded.
n_samples = len(data)
print('총 샘플의 수 : ', n_samples)

In [None]:
# Peek at the first five rows of the raw frame.
data.head(5)

In [None]:
# Drop the three unnamed columns and encode the label column `v1` as integers
# (ham -> 0, spam -> 1).
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Map the labels explicitly and cast to int64 rather than relying on
# Series.replace to downcast the object column implicitly (deprecated in
# recent pandas). Values that are already 0/1 pass through unchanged, so
# re-running is a no-op; any other label makes the cast fail loudly instead of
# leaving a mixed-type column behind.
label_to_id = {'ham': 0, 'spam': 1}
data['v1'] = data['v1'].map(lambda label: label_to_id.get(label, label)).astype('int64')
data[:5]

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Keep a single row per distinct message text (column `v2`).
data = data.drop_duplicates(subset=['v2'])

In [None]:
# Row count after removing duplicate messages.
n_unique_samples = len(data)
print('총 샘플의 수:', n_unique_samples)

In [None]:
# Bar chart of how many rows carry each label value in `v1`.
label_counts = data['v1'].value_counts()
label_counts.plot(kind='bar')

In [None]:
# The same label distribution as a small table: one row per label with its count.
label_table = data.groupby('v1').size().reset_index(name='count')
print(label_table)

In [None]:
# Separate the inputs (message text, `v2`) from the targets (labels, `v1`)
# and confirm both have the same number of entries.
X_data, y_data = data['v2'], data['v1']
n_texts = len(X_data)
n_labels = len(y_data)
print('본문의 갯수:{}'.format(n_texts))
print('레이블의 갯수:{}'.format(n_labels))

In [None]:
# Fit the tokenizer on the message texts, then convert every message into a
# sequence of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

In [None]:
# Inspect the first five encoded messages.
print(sequences[:5])

In [None]:
# Word -> integer index mapping built by the tokenizer.
word_to_index = tokenizer.word_index
# NOTE(review): this prints the entire mapping, which may be very long.
print(word_to_index)

In [None]:
# Display the (word, occurrence count) pairs collected by the tokenizer.
tokenizer.word_counts.items()

In [None]:

# Measure how much of the vocabulary is made up of rare words, i.e. words that
# occur fewer than `threshold` times in the corpus.
threshold = 2
total_cnt = len(word_to_index)  # number of distinct words in the vocabulary

# Occurrence count of every word, and of the rare words only.
word_freqs = list(tokenizer.word_counts.values())
rare_word_freqs = [freq for freq in word_freqs if freq < threshold]

rare_cnt = len(rare_word_freqs)   # number of distinct rare words
total_freq = sum(word_freqs)      # total occurrences over all words
rare_freq = sum(rare_word_freqs)  # total occurrences of rare words only

print("등장 빈도가 %s 번 이하인 희귀 단어 수: %s"%(threshold -1, rare_cnt))
print("단어집합(vocabulary)에서 희귀 단어 비율:",(rare_cnt/total_cnt)*100)
# Share of all occurrences that belong to rare words. The numerator must be
# the rare-word occurrence total (rare_freq), not the rare-word count.
print('전체 등장빈도에서 희귀단어 등장 빈도 비율:',(rare_freq/ total_freq)*100)


In [None]:
# Vocabulary size used for the embedding layer: one more than the number of
# known words (presumably so index 0 can serve as padding - confirm).
n_known_words = len(word_to_index)
vocab_size = n_known_words + 1
print('단어 집합의 크기:{}'.format(vocab_size))


In [None]:
# Size of the train/test partitions: 80% of the samples (rounded down) go to
# training, whatever remains goes to testing.
n_samples_total = len(sequences)
n_of_train = int(n_samples_total * 0.8)
n_of_test = n_samples_total - n_of_train
print('훈련 데이터의 개수:',n_of_train)
print('검증 데이터의 개수:',n_of_test)

In [None]:
# From here on X_data holds the integer-encoded messages instead of raw text.
X_data = sequences

# Length (in tokens) of every encoded message, computed once and reused below.
seq_lengths = [len(seq) for seq in X_data]

print("메일의 최대 길이:%d"%max(seq_lengths))
print('메일의 평균 길이:%d'%(sum(seq_lengths)/len(X_data)))

# Histogram of message lengths.
plt.hist(seq_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()


In [None]:
# Pad every encoded message to a common length so they stack into one 2-D array.
# The length is taken from the longest message rather than a hard-coded 189,
# so no message is silently truncated if the dataset or tokenizer changes.
max_len = max(len(seq) for seq in X_data)

# NOTE(review): this rebinds `data` from the DataFrame to the padded array.
# The name is kept because the following cell slices `data`.
data = pad_sequences(X_data, maxlen=max_len)
print('훈련 데이터 크기(shape):',data.shape)

# Original run: 5169 samples (messages); the longest message had length 189.

In [None]:
# Positional split without shuffling: the first n_of_train rows are used for
# training and the remainder is held out for testing. Labels are sliced the
# same way so they stay aligned with the padded inputs.
X_train, X_test = data[:n_of_train], data[n_of_train:]

y_train = np.array(y_data[:n_of_train])
y_test = np.array(y_data[n_of_train:])


In [None]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential


In [None]:
# Binary classifier: embedding -> simple recurrent layer -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train for 4 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Evaluate on the held-out test rows; element 1 of the result is the accuracy
# metric configured at compile time (element 0 is the loss).
test_metrics = model.evaluate(X_test, y_test)
print("\n 테스트 정확도:%.4f"%(test_metrics[1]))

In [None]:
# Plot training vs. validation loss per epoch (epochs numbered from 1).
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(history.history['acc']) + 1)

plt.plot(epochs, train_loss, label='train')
plt.plot(epochs, val_loss, label='val')
plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')

plt.legend(loc='upper left')
plt.show()
