In [1]:
# !pip install keras tensorflow-gpu

In [2]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from keras.models import Model, Sequential
from keras.initializers import Constant

from keras.optimizers import Adadelta

Using TensorFlow backend.


In [3]:
# Project root is the parent of the current working directory, so the data
# paths below only resolve when the notebook is started from a direct
# sub-folder of the project.
BASE_DIR            = os.path.dirname(os.getcwd())
SPAM_DATA_PATH      = os.path.join(BASE_DIR, 'data', 'spam', 'spam.txt')
NOT_SPAM_DATA_PATH  = os.path.join(BASE_DIR, 'data', 'spam', 'not-spam.txt')

# Length (in tokens) that every message is padded to before training/prediction.
MAX_SEQUENCE_LENGTH = 1000
# NOTE(review): reassigned to 747 in the tokenizer cell before it is first
# used, so this value never takes effect.
MAX_NUM_WORDS = 20000
# NOTE(review): not referenced by the visible cells; the model cell uses its
# own `embed_dim = 128` instead.
EMBEDDING_DIM = 100
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.2

In [4]:
# Read both corpora, one message per line. The `with` blocks guarantee the
# handles are closed even if reading raises.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm the data files decode correctly on other machines.
with open(SPAM_DATA_PATH, 'r') as spam_text:
    spam_lines = spam_text.readlines()
with open(NOT_SPAM_DATA_PATH, 'r') as non_spam_text:
    non_spam_lines = non_spam_text.readlines()

print("Spam total", len(spam_lines))
print("Non spam total", len(non_spam_lines))

# Label convention used by the rest of the notebook: 0 = spam, 1 = not spam.
# Spam messages come first, followed by the non-spam ones.
texts = spam_lines + non_spam_lines
labels = [0] * len(spam_lines) + [1] * len(non_spam_lines)

Spam total 747
Non spam total 4827


In [5]:
# NOTE(review): this overrides the MAX_NUM_WORDS = 20000 from the configuration
# cell. 747 equals the number of spam lines reported when the data was loaded —
# confirm that a vocabulary cap of this size is intentional. The Embedding
# layer is sized from the same constant, so the two must stay in sync.
MAX_NUM_WORDS = 747
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# Build the vocabulary from every message, then encode each message as a list
# of integer word ids.
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [6]:
# word_index maps each token seen while fitting to its integer id. It is not
# limited by num_words: the count printed here can exceed MAX_NUM_WORDS,
# because the cap is applied only when texts are converted to sequences.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9012 unique tokens.


In [7]:
# Bring every encoded message to exactly MAX_SEQUENCE_LENGTH ids so the whole
# corpus forms one rectangular integer matrix.
# NOTE(review): with MAX_SEQUENCE_LENGTH = 1000 the matrix shown in the next
# cell is mostly padding zeros — consider whether a shorter length would do.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
# Inspect the padded matrix: the leading zeros are padding, the word ids sit
# at the end of each row.
data

array([[  0,   0,   0, ..., 174, 660, 393],
       [  0,   0,   0, ..., 324, 232,   2],
       [  0,   0,   0, ..., 517, 582,  64],
       ...,
       [  0,   0,   0, ...,  23, 104, 250],
       [  0,   0,   0, ..., 202,  12,  47],
       [  0,   0,   0, ...,   2,  61, 271]])

In [9]:
# One-hot encode the integer class labels (0 = spam, 1 = not spam).
# The ndim guard keeps this cell idempotent: running it a second time would
# otherwise one-hot encode the already encoded matrix again. num_classes is
# given explicitly so the width stays 2 even if one class were missing.
labels = np.asarray(labels)
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=2)

In [10]:
# Report the dimensions of the feature matrix and of the label matrix.
for _caption, _tensor in (('Shape of data tensor:', data),
                          ('Shape of label tensor:', labels)):
    print(_caption, _tensor.shape)

Shape of data tensor: (5574, 1000)
Shape of label tensor: (5574, 2)


In [11]:
# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Use an explicit boundary instead of negative slicing: with zero validation
# samples, `data[:-0]` is `data[:0]` and would leave the training set empty.
split_at = data.shape[0] - num_validation_samples
train_indices = indices[:split_at]
val_indices = indices[split_at:]

# Select rows through the shuffled indices without rebinding `data`/`labels`,
# so re-running this cell always yields the same split.
x_train = data[train_indices]
y_train = labels[train_indices]
x_val = data[val_indices]
y_val = labels[val_indices]

# NOTE(review): message kept as-is, but no embedding matrix is built in the
# visible cells — the Embedding layer is created without pre-set weights.
print('Preparing embedding matrix.')

Preparing embedding matrix.


In [12]:
# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
embed_dim = 128
lstm_out = 196

optimizer = Adadelta()
model = Sequential()
# The embedding's vocabulary size is the same MAX_NUM_WORDS that caps the
# tokenizer, and its input length is the padded sequence length.
model.add(Embedding(MAX_NUM_WORDS, embed_dim, input_length=x_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two output units to match the one-hot labels and categorical cross-entropy.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 128)         95616     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1000, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 350,810
Trainable params: 350,810
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
import os
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

def get_callbacks(name='spam-filter', monitor='val_loss', patience=7, min_delta=0.0001):
    """Build the training callbacks: early stopping plus weight checkpointing.

    Args:
        name: Sub-directory of ``<BASE_DIR>/models/checkpoints`` that receives
            the weight files. It is created if it does not exist yet.
        monitor: Metric passed to both callbacks as the quantity to watch.
        patience: ``patience`` forwarded to ``EarlyStopping``.
        min_delta: ``min_delta`` forwarded to ``EarlyStopping``.

    Returns:
        The list ``[early_stopping, checkpoint]``.
    """
    early_stopping = EarlyStopping(monitor=monitor,
                                   patience=patience,
                                   min_delta=min_delta)

    checkpoint_path = os.path.join(BASE_DIR, 'models', 'checkpoints', name)
    os.makedirs(checkpoint_path, exist_ok=True)
    # The placeholders are left unformatted here; they are passed verbatim to
    # ModelCheckpoint. The file name always embeds val_loss, whatever
    # `monitor` is set to.
    filepath = os.path.join(checkpoint_path,
                            'weights.{epoch:02d}-{val_loss:.2f}.hdf5')
    checkpoint = ModelCheckpoint(filepath,
                                 monitor=monitor,
                                 save_best_only=True,
                                 save_weights_only=True)
    return [early_stopping, checkpoint]

# Default callbacks: early stopping plus checkpoints written under
# <BASE_DIR>/models/checkpoints/spam-filter.
callbacks = get_callbacks()

In [14]:
batch_size = 32
# Upper bound only: `callbacks` includes early stopping, which can end
# training before this many epochs have run.
epochs = 100
# Bind the returned History object so the per-epoch metrics remain available
# to later cells instead of being reachable only through the cell's Out[] entry.
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    callbacks=callbacks,
)

Train on 4460 samples, validate on 1114 samples
Epoch 1/100
 - 173s - loss: 0.2380 - acc: 0.9168 - val_loss: 0.1624 - val_acc: 0.9497
Epoch 2/100
 - 172s - loss: 0.0897 - acc: 0.9731 - val_loss: 0.1081 - val_acc: 0.9704
Epoch 3/100
 - 172s - loss: 0.0671 - acc: 0.9812 - val_loss: 0.0638 - val_acc: 0.9820
Epoch 4/100
 - 172s - loss: 0.0557 - acc: 0.9832 - val_loss: 0.0614 - val_acc: 0.9829
Epoch 5/100
 - 172s - loss: 0.0495 - acc: 0.9836 - val_loss: 0.0590 - val_acc: 0.9820
Epoch 6/100
 - 173s - loss: 0.0473 - acc: 0.9861 - val_loss: 0.0556 - val_acc: 0.9856
Epoch 7/100
 - 171s - loss: 0.0426 - acc: 0.9872 - val_loss: 0.0516 - val_acc: 0.9847
Epoch 8/100
 - 171s - loss: 0.0397 - acc: 0.9879 - val_loss: 0.0513 - val_acc: 0.9865
Epoch 9/100
 - 171s - loss: 0.0355 - acc: 0.9897 - val_loss: 0.0518 - val_acc: 0.9847
Epoch 10/100
 - 171s - loss: 0.0348 - acc: 0.9901 - val_loss: 0.0606 - val_acc: 0.9847
Epoch 11/100
 - 170s - loss: 0.0310 - acc: 0.9917 - val_loss: 0.0519 - val_acc: 0.9883
Epoc

<keras.callbacks.History at 0x251151f3048>

In [15]:
save_path = os.path.join(BASE_DIR, 'models', 'test-spam-filter.h5')
# Create the target directory explicitly so this cell does not rely on
# get_callbacks() having created `models/` as a side effect.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
model.save(save_path)

In [37]:
def predict(text):
    """Score a single message with the trained model.

    The text is encoded with the fitted ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and passed through ``model``.

    Returns:
        A tuple ``(probs, best, summary)``: the model's output vector for the
        message, the index of its largest entry, and that entry formatted as
        a percentage string.
    """
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH, dtype='int32', value=0)
    # predict() returns one row per input; only one message was supplied.
    probs = model.predict(padded, batch_size=1, verbose=2)[0]
    best = np.argmax(probs)
    summary = "%.2f%% likelihood" % (probs[best] * 100)
    return probs, best, summary

In [38]:
# Spot check on an everyday message. The second element of the result is the
# predicted class index (0 = spam, 1 = not spam, as assigned at load time).
predict("What a nice surprise!")

(array([2.6508796e-04, 9.9973494e-01], dtype=float32), 1, '99.97% likelihood')

In [39]:
# Spot check on a promotional message (class index 0 is the spam class).
predict("Last minute sale on all CELL phones in the UK now. Get urs free")

(array([0.9731119 , 0.02688812], dtype=float32), 0, '97.31% likelihood')

In [40]:
# Spot check on a short message; compare the two class scores in the output.
predict("call for a free double plan")

(array([0.5048461 , 0.49515387], dtype=float32), 0, '50.48% likelihood')