<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/SMS_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim.downloader as api
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import confusion_matrix

In [2]:
def download_and_read(url):
  local_file = url.split('/')[-1]
  p = tf.keras.utils.get_file(local_file,url,extract=True,cache_dir='.')
  labels, texts = [], []
  local_file = os.path.join('datasets','SMSSpamCollection')
  with open(local_file, 'r') as fin:
    for line in fin:
      label, text = line.strip().split('\t')
      labels.append(1 if label == 'spam' else 0)
      texts.append(text)
  return labels,texts

In [3]:
DATASET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
labels,texts = download_and_read(DATASET_URL)

In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
text_sequences = tokenizer.texts_to_sequences(texts)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences)
num_records = len(text_sequences)
max_seqlen = len(text_sequences[0])
print('{:d} senteces, max len :{:d}'.format(num_records,max_seqlen))

5574 senteces, max len :189


In [5]:
cat_labels = tf.keras.utils.to_categorical(labels,2)

In [6]:
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
word2idx['PAD'] = 0
idx2word[0] = 'PAD'
vocab_size = len(word2idx)
print('vocab size: ', vocab_size)

vocab size:  9010


In [7]:
dataset = tf.data.Dataset.from_tensor_slices((text_sequences,cat_labels))
dataset = dataset.shuffle(10000)
test_size = num_records // 10
val_size = num_records // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

BATCH_SIZE = 128
test_dataset = test_dataset.batch(BATCH_SIZE,drop_remainder=True)
val_dataset = val_dataset.batch(BATCH_SIZE,drop_remainder=True)
train_dataset = train_dataset.batch(BATCH_SIZE,drop_remainder=True)

In [8]:
def build_embedding_matrix(sequences,word2idx,embedding_dim,embedding_file):
  if os.path.exists(embedding_file):
    print('Loading cached file ...')
    E = np.load(embedding_file)
    
  
  else:
    vocab_size = len(word2idx)
    E = np.zeros((vocab_size,embedding_dim))
    word_vectors = api.load(EMBEDDING_MODEL)
    for word, idx, in word2idx.items():
      try:
         E[idx] = word_vectors.word_vec(word)
      except KeyError:
        pass
    np.save(embedding_file,E)
  return E

In [9]:
EMBEDDING_DIM = 300
DATA_DIR = 'data'
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR,'E.npy')
EMBEDDING_MODEL = 'glove-wiki-gigaword-300'
E = build_embedding_matrix(text_sequences,word2idx,EMBEDDING_DIM,EMBEDDING_NUMPY_FILE)
print('Embedding matrix: ', E.shape)

Loading cached file ...
Embedding matrix:  (9010, 300)


In [10]:
class SpamClassifierModel(tf.keras.Model):
  def __init__(self, vocab_sz, embed_sz, input_length, num_filters, kernel_sz,
               output_sz, run_mode, embedding_weights, **kwargs):
    super(SpamClassifierModel,self).__init__(**kwargs)
    if run_mode == 'scratch':
      self.embedding = tf.keras.layers.Embedding(vocab_sz,embed_sz,
                                                 input_length=input_length, 
                                                 trainable=True)
    else:
      self.embedding = tf.keras.layers.Embedding(vocab_sz, embed_sz,
                                                 input_length = input_length,
                                                 weights=[embedding_weights],
                                                 trainable=False)
    self.conv = tf.keras.layers.Conv1D(filters=num_filters,kernel_size=kernel_sz,
                                       activation='relu')
    self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
    self.pool = tf.keras.layers.GlobalMaxPool1D()
    self.dense = tf.keras.layers.Dense(output_sz,activation='softmax')
  
  def call(self, x):
    x = self.embedding(x)
    x = self.conv(x)
    x = self.dropout(x)
    x = self.pool(x)
    x = self.dense(x)
    return x

We will make the next part a function, so that we can run the code with a GPU in the same tensorflow graph

In [11]:

def get_model():
  model = SpamClassifierModel(vocab_size,EMBEDDING_DIM,max_seqlen,256,3,2,' ',E)
  model.build(input_shape=(None,max_seqlen))
  model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model


In [12]:
EPOCHS = 3

CLASS_WEIGHTS = {0:1,1:8}

with tf.device('gpu:0'):
  model = get_model()
  model.fit(train_dataset,epochs=EPOCHS,validation_data = val_dataset, class_weight=CLASS_WEIGHTS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
labels, predictions = [], []
for xtest, ytest in test_dataset:
  ytest_ = model.predict_on_batch(xtest)
  ytest = np.argmax(ytest,axis=1)
  ytest_ = np.argmax(ytest_,axis=1)
  labels.extend(ytest.tolist())
  predictions.extend(ytest_.tolist())

print('confusion matrix')
print(confusion_matrix(labels,predictions))

confusion matrix
[[446   3]
 [  0  63]]
