In [None]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from keras.models import Model, Sequential
from keras.initializers import Constant

from keras.optimizers import Adadelta

In [None]:
# --- Input locations --------------------------------------------------------
# Everything is resolved against the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())
_SPAM_DIR = os.path.join(BASE_DIR, 'data', 'spam')  # folder holding both corpora
SPAM_DATA_PATH = os.path.join(_SPAM_DIR, 'spam.txt')
NOT_SPAM_DATA_PATH = os.path.join(_SPAM_DIR, 'not-spam.txt')

# --- Preprocessing / training knobs -----------------------------------------
# Length every tokenized message is padded or truncated to.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap. NOTE: the tokenizer cell further down reassigns this to 747.
MAX_NUM_WORDS = 20000
# NOTE(review): not referenced by the visible cells (the model cell uses its
# own embed_dim) -- confirm whether it is still needed.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

In [None]:
# Read both corpora, one message per line. The `with` blocks guarantee the
# file handles are closed even if reading raises part-way through.
with open(SPAM_DATA_PATH, 'r') as spam_text:
    spam_lines = spam_text.readlines()
with open(NOT_SPAM_DATA_PATH, 'r') as non_spam_text:
    non_spam_lines = non_spam_text.readlines()

print("Spam total", len(spam_lines))
print("Non spam total", len(non_spam_lines))

# Build parallel lists: texts[i] is a raw line, labels[i] is its class id.
# Label convention used by the rest of the notebook: 0 = spam, 1 = not spam.
texts = spam_lines + non_spam_lines
labels = [0] * len(spam_lines) + [1] * len(non_spam_lines)

In [None]:
# NOTE: this reassigns MAX_NUM_WORDS, which the configuration cell set to
# 20000. Every later use (the tokenizer here and the Embedding layer's input
# size in the model cell) sees 747.
MAX_NUM_WORDS = 747
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# Fit the vocabulary on every message, then encode each message with it.
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Vocabulary mapping exposed by the fitted tokenizer; report its size.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every encoded message to a common length of MAX_SEQUENCE_LENGTH so the
# result can be handled as a single 2-D array (its shape is printed below).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Inspect the padded array (the notebook renders the cell's last expression).
data

In [None]:
# Convert the integer class ids (0 = spam, 1 = not spam) to categorical form.
# NOTE(review): this rebinds `labels` from itself, so re-running the cell
# without re-running the loading cell would encode the already-encoded array.
labels = to_categorical(np.asarray(labels))

In [None]:
# Sanity check: both arrays should have the same number of rows (samples).
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Shuffle samples and labels with the same permutation. A locally seeded
# generator makes the train/validation split reproducible on a fresh kernel
# without touching NumPy's global random state.
SPLIT_SEED = 42
rng = np.random.RandomState(SPLIT_SEED)
indices = np.arange(data.shape[0])
rng.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on an explicit index rather than negative slicing: with zero
# validation samples, `data[:-0]` would be empty and `data[-0:]` would be the
# whole array, i.e. the two sets would be swapped.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# NOTE(review): no embedding matrix is built in the visible cells (the model
# cell creates a plain Embedding layer); message kept unchanged.
print('Preparing embedding matrix.')

In [None]:
# Model hyper-parameters.
embed_dim = 128   # size of each learned word vector
lstm_out = 196    # number of LSTM units

optimizer = Adadelta()

# Embedding -> spatial dropout -> LSTM -> 2-way softmax.
model = Sequential()
# Input size matches the tokenizer's vocabulary cap (MAX_NUM_WORDS); the
# sequence length is taken from the padded training data.
model.add(Embedding(MAX_NUM_WORDS, embed_dim, input_length=x_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two output units, one per class (0 = spam, 1 = not spam).
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()

In [None]:
# TODO: add a callback (placeholder cell -- nothing implemented yet)

In [None]:
batch_size = 32
epochs = 1
# Validate on the hold-out set carved out in the split cell. The previous
# `validation_split=0.2` ignored x_val / y_val and took a second validation
# slice out of x_train, which shrank the training set for no benefit.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    verbose=2,
    epochs=epochs,
    validation_data=(x_val, y_val),
)

In [None]:
# Make sure the target directory exists before saving; the path is relative to
# the current working directory and the save is not expected to create
# missing parent folders.
os.makedirs("models", exist_ok=True)
model.save("models/test-spam-filter.h5")

In [None]:
def predict(text):
    """Classify a single message with the notebook's trained model.

    The message is encoded with the global ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and passed through the global ``model``.

    Args:
        text: The raw message to classify.

    Returns:
        A ``(probs, label)`` tuple: ``probs`` is the model output for this one
        message and ``label`` is ``np.argmax(probs)``. With the labels assigned
        when the data is loaded, 0 means spam and 1 means not spam.
    """
    # The tokenizer and the model both work on batches, so wrap the message
    # in a one-element list and unwrap the single result afterwards.
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH, dtype='int32', value=0)
    batch_probs = model.predict(padded, batch_size=1, verbose=2)
    probs = batch_probs[0]
    return probs, np.argmax(probs)

In [None]:
# Try an ordinary message; the cell shows (probs, label), label 0 = spam, 1 = not spam.
predict("What a nice surprise!")

In [None]:
# Try a promotional-style message; label 0 = spam, 1 = not spam.
predict("Last minute sale on all CELL phones in the UK now. Get urs free")

In [None]:
# Another promotional-style message; label 0 = spam, 1 = not spam.
predict("XXX data plan is now Get urs free")