In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

In [None]:
# Read the corpus file: every record is one line of the form "<label>\t<message>".
mes_type, mes = [], []
# An explicit encoding keeps decoding independent of the platform's default codec.
with open('SMSSpamCollection', encoding='utf-8') as f:
    for line in f:
        # Split on the first tab only, so tabs inside the message body are preserved.
        label, sep, text = line.rstrip('\n').partition('\t')
        if not sep:
            # No tab on this line (e.g. a blank trailing line): skip it instead of
            # failing on the missing message field.
            continue
        mes_type.append(label)
        mes.append(text)

In [None]:
# Combine the parallel label / text lists into a two-column frame,
# then display its summary statistics as the cell output.
messages = pd.DataFrame({'label': mes_type, 'message': mes})
messages.describe()

In [None]:
# Partition the corpus by class label using boolean masks.
is_spam = messages['label'] == 'spam'
is_ham = messages['label'] == 'ham'
spam_mes = messages.loc[is_spam]
notspam_mes = messages.loc[is_ham]

In [None]:
# Balance the classes: draw as many ham messages as there are spam messages.
# A fixed random_state makes the drawn subset identical on every run, so the
# downstream split, training and reported metrics are reproducible.
# NOTE(review): sampling without replacement requires at least len(spam_mes) ham rows.
notspam_mes_balanced = notspam_mes.sample(n=len(spam_mes), random_state=42)
spam_mes_balanced = spam_mes
print(notspam_mes_balanced.shape, spam_mes_balanced.shape)

In [None]:
# Stack the two balanced classes into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
mes_df = pd.concat([notspam_mes_balanced, spam_mes_balanced], ignore_index=True)
mes_df['text_length'] = mes_df['message'].apply(len)
# Encode the string label as a binary target: ham -> 0, spam -> 1.
mes_df['msg_type'] = mes_df['label'].map({'ham': 0, 'spam': 1})
mes_label = mes_df['msg_type'].values
# Hold out 20% of the messages; the seed keeps the split stable between runs.
train_mes, test_mes, train_labels, test_labels = train_test_split(
    mes_df['message'], mes_label, test_size=0.2, random_state=42
)

In [None]:
# Word-level tokenizer with a vocabulary cap of 500; words outside the kept
# vocabulary are mapped to the "<OOV>" token. The vocabulary is learned from
# the training messages only.
tokenizer = Tokenizer(
    num_words=500,
    oov_token="<OOV>",
    char_level=False,
)
tokenizer.fit_on_texts(train_mes)

In [None]:
# Sequence-shaping settings shared by the training and test sets:
# both padding and truncation use the "post" mode, and every sequence
# is brought to exactly max_len token ids.
trunc_type = "post"
padding_type = "post"
max_len = 50

pad_options = {"maxlen": max_len, "padding": padding_type, "truncating": trunc_type}

# Convert texts to integer id sequences, then pad/truncate to a fixed length.
training_sequences = tokenizer.texts_to_sequences(train_mes)
testing_sequences = tokenizer.texts_to_sequences(test_mes)
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Model hyper-parameters.
vocab_size = 500    # embedding rows; NOTE(review): should match the tokenizer's num_words
embeding_dim = 16   # size of each word vector
drop_value = 0.2    # dropout rate applied after the hidden layer
n_dense = 24        # units in the hidden dense layer

# Embedding -> average over the sequence axis -> small dense head -> sigmoid output.
# NOTE(review): newer Keras releases deprecate `input_length`; confirm against the
# installed version before removing it.
model = Sequential([
    Embedding(vocab_size, embeding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    # Use the n_dense constant rather than a duplicated literal so that tuning
    # the hyper-parameter above actually changes the layer.
    Dense(n_dense, activation='relu'),
    Dropout(drop_value),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 30
# Stop once val_loss has not improved for 3 consecutive epochs and roll the model
# back to the best epoch's weights; without restore_best_weights the model would
# keep the weights from the last (already degraded) epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# NOTE(review): the held-out test set doubles as the validation set here, so the
# early-stopping decision has seen the data later used for evaluation; consider a
# separate validation split if an unbiased test score is needed.
history = model.fit(
    training_padded,
    train_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, test_labels),
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Score the trained model on the held-out messages; the returned values
# are shown as the cell output.
model.evaluate(x=testing_padded, y=test_labels)

In [None]:
# Per-epoch training history as a frame, with display-friendly column names
# (the loss columns carry Russian labels: "training set" / "test set").
metrics = pd.DataFrame(history.history).rename(
    columns={
        'loss': 'Обучающая выборка',
        'accuracy': 'Training_Accuracy',
        'val_loss': 'Тестовая выборка',
        'val_accuracy': 'Validation_Accuracy',
    }
)

# Plot both loss curves on one Axes and label it (title: "loss function",
# x: "number of runs", y: "loss").
loss_ax = metrics[['Обучающая выборка', 'Тестовая выборка']].plot()
loss_ax.set_title('Функция ошибки')
loss_ax.set_xlabel('Количество прогонок')
loss_ax.set_ylabel('Ошибка')

In [None]:
def predict_spam(custom_message):
    """Return a human-readable spam probability for a message.

    Args:
        custom_message: Either a single message string or an iterable of
            message strings. When several messages are given, only the
            first one's probability is reported.

    Returns:
        A string of the form ``'Spam Probability: <percent>%'``.

    Raises:
        ValueError: If no message is supplied.
    """
    # texts_to_sequences iterates over its argument, so a bare string would be
    # tokenized character by character; wrap it into a one-element list instead.
    if isinstance(custom_message, str):
        texts = [custom_message]
    else:
        texts = list(custom_message)
    if not texts:
        raise ValueError('predict_spam requires at least one message')

    new_seq = tokenizer.texts_to_sequences(texts)
    # Shape the input exactly like the training data.
    padded = pad_sequences(new_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type)
    spam_prob = model.predict(padded)[0, 0]
    return 'Spam Probability: ' + str(spam_prob * 100) + '%'

In [None]:
# Hand-written samples for a sanity check. Each message is wrapped in its own
# one-element list because predict_spam hands its argument straight to the tokenizer.
test_messages = [
    ["Hi, Harry. I'm busy at the moment. Studying calculus. Call me later"],
    ["I don't really think I can attend the show"],
    ["Almost free Dominos pizza on Tuesdays. 2 for the price of 1. Call 777888"],
    ["Want to have a lot of cash almost for free? Follow the link"],
]

# Print the predicted spam probability for every sample, one per line.
for sample in test_messages:
    print(predict_spam(sample))