# Spam Detection Using CNN

In [1]:
from keras.utils import to_categorical
from keras.preprocessing import sequence, text
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd

MAX_WORDS_IN_SEQ = 1000
EMBED_DIM = 100
MODEL_NAME = "/model/spam_detect"

Using TensorFlow backend.


## Load Data

In [3]:
data = pd.read_csv("~/Development/datasets/enron.csv")
print(f"Total emails: {len(data)}")
data.head()

Total emails: 33716


Unnamed: 0,label,index,msg,dataset,file
0,spam,0,Subject: dobmeos with hgh my energy level has ...,1,enron1/spam/0006.2003-12-18.GP.spam.txt
1,spam,1,Subject: your prescription is ready . . oxwq s...,1,enron1/spam/0008.2003-12-18.GP.spam.txt
2,ham,2,Subject: christmas tree farm pictures,1,enron1/ham/0001.1999-12-10.farmer.ham.txt
3,ham,3,"Subject: vastar resources , inc .gary , produc...",1,enron1/ham/0002.1999-12-13.farmer.ham.txt
4,ham,4,Subject: calpine daily gas nomination- calpine...,1,enron1/ham/0003.1999-12-14.farmer.ham.txt


In [4]:
emails = data['msg'].values
labels = [1 if x == "spam" else 0 for x in data['label'].values]

In [5]:
max_len = max(map(lambda x: len(x), emails))
max_len

226609

## Pre-Process Data

In [89]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(emails)
sequences = tokenizer.texts_to_sequences(emails)
word2index = tokenizer.word_index
num_words = len(word2index)
print(f"Found {num_words} unique tokens")


Found 309362 unique tokens


In [90]:
data = sequence.pad_sequences(sequences, maxlen=MAX_WORDS_IN_SEQ, padding='post', truncating='post')
print(labels[:10])
labels = to_categorical(labels)
print(labels[:10])

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

[1, 1, 0, 0, 0, 0, 0, 1, 0, 1]
[[ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]
Shape of data tensor: (33716, 1000)
Shape of label tensor: (33716, 2)


## Building the Model: Basic CNN

In [91]:
input_seq = Input(shape=[MAX_WORDS_IN_SEQ, ], dtype='int32')
embed_seq = Embedding(num_words, EMBED_DIM, embeddings_initializer='glorot_uniform', input_length=MAX_WORDS_IN_SEQ)(
    input_seq)
conv_1 = Conv1D(128, 5, activation='relu')(embed_seq)
conv_1 = MaxPooling1D(pool_size=5)(conv_1)
conv_2 = Conv1D(128, 5, activation='relu')(conv_1)
conv_2 = MaxPooling1D(pool_size=5)(conv_2)
conv_3 = Conv1D(128, 5, activation='relu')(conv_2)
conv_3 = MaxPooling1D(pool_size=35)(conv_3)
flat = Flatten()(conv_3)
# flat = Dropout(0.25)(flat)
fc1 = Dense(128, activation='relu')(flat)
# dense_1 = Dropout(0.25)(flat)
fc2 = Dense(2, activation='softmax')(fc1)

model = Model(input_seq, fc2)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [92]:
# Testing ---------------------------------------
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 1000, 100)         30936200  
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 39, 128)           0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 35, 128)           82048     
__________

In [None]:
model.fit()