In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#pd.set_option('display.max_colwidth', 1000)

# Load the raw CSV; the file is decoded as latin-1 (the encoding passed below).
spamdata = pd.read_csv('./spam.csv', encoding='latin-1')
# Preview the first rows of the freshly loaded frame.
spamdata.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [27]:
# Keep only the label/text columns. errors="ignore" lets this cell be re-run
# after the "Unnamed" columns are already gone instead of raising KeyError.
spamdata = spamdata.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
                         errors="ignore")
spamdata.columns = ["label", "text"]

# Binary target: 1 where the label is 'spam', 0 otherwise.
labels = np.where(spamdata['label'] == 'spam', 1, 0)

# A fixed seed makes the 80/20 split reproducible across kernel restarts;
# stratifying on the labels keeps the spam/ham ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    spamdata['text'], labels,
    test_size=0.2, random_state=42, stratify=labels)

In [28]:
# Show the 0/1 label array built from the 'label' column (1 = spam).
labels

array([0, 0, 1, ..., 0, 0, 0])

In [29]:
# Peek at the first five training texts and the labels that go with them.
print(X_train.iloc[:5])
print(y_train[:5])

2385    Someone has contacted our dating service and e...
334     Valentines Day Special! Win over å£1000 in our...
3187    This is the 2nd time we have tried 2 contact u...
2239    Every day i use to sleep after  &lt;#&gt;  so ...
1049    18 days to Euro2004 kickoff! U will be kept in...
Name: text, dtype: object
[1 1 1 0 1]


In [30]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [31]:
# Build the tokenizer from the training texts only (X_test is not passed here).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [32]:
# Convert both text splits to integer sequences with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [33]:
# Bring every sequence to a common length of 50 tokens.
X_train_ped_seq = pad_sequences(X_train_seq, maxlen=50)
X_test_ped_seq = pad_sequences(X_test_seq, maxlen=50)

In [34]:
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

In [35]:
def custom_f1(y_true, y_pred):
    """Return the F1 score of `y_pred` against `y_true` using backend ops.

    The element-wise product `y_true * y_pred`, `y_true` itself and `y_pred`
    itself are each clipped to [0, 1], rounded and summed to obtain the
    true-positive, actual-positive and predicted-positive counts.
    `K.epsilon()` is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    # Counts that precision and recall are built from.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (flagged + eps)
    recall = hits / (actual + eps)

    # Harmonic mean of precision and recall.
    harmonic = (precision * recall) / (precision + recall + eps)
    return 2 * harmonic

In [36]:
# Embedding input size: one more than the number of words the tokenizer indexed.
vocab_size = len(tokenizer.index_word) + 1

# Embedding -> LSTM -> hidden Dense (relu) -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 32),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          252992    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 262,401
Trainable params: 262,401
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Train with Adam on binary cross-entropy; report accuracy and the
# custom_f1 metric defined earlier in this notebook.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', custom_f1])

In [41]:
# Train for 10 epochs in mini-batches of 32, keeping the returned history.
# NOTE(review): the test split is passed as validation data here and is also
# the split scored with f1_score later, so there is no separate hold-out set.
history = model.fit(X_train_ped_seq, y_train, 
                    batch_size=32, epochs=10,
                    validation_data=(X_test_ped_seq, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
from sklearn.metrics import f1_score

# Raw model outputs for the padded test sequences.
# NOTE(review): y_pred is reassigned in the next cell, so this value is not used afterwards.
y_pred = model.predict(X_test_ped_seq)
# Number of test samples.
len(X_test_ped_seq)

1115

In [52]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding predict() at 0.5, which
# yields the same int32 0/1 array.
y_pred = (model.predict(X_test_ped_seq, verbose=1) > 0.5).astype("int32")



In [53]:
# Sanity check: number of predictions produced for the test inputs.
len(y_pred)

1115

In [54]:
# Score the predicted classes against the test labels with scikit-learn's f1_score.
f1 = f1_score(y_test, y_pred)

In [55]:
# Report the F1 score computed in the previous cell.
print(f1)

0.939655172413793
