In [None]:
!pip install tensorflow-datasets
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import os

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from os.path import exists

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Local filenames of the two dataset splits used by this notebook.
TRAIN_PATH = "train-data.tsv"
TEST_PATH = "valid-data.tsv"

# Column names applied to the TSV files when they are read with pandas.
CLASS, MESSAGE = "class", "message"
columns = [CLASS, MESSAGE]

# Label strings: HAM is encoded as 0 further down, everything else as 1 (SPAM).
HAM, SPAM = "ham", "spam"

In [None]:
if not exists(TRAIN_PATH):
  !wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv

if not exists(TEST_PATH):
  !wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv


In [None]:
# Training split: tab-separated file, read with the names from `columns`
# (header=None is what pandas already does when `names` is supplied).
train_df = pd.read_csv(
    TRAIN_PATH,
    sep="\t",
    header=None,
    names=columns,
)
train_df.head()

Unnamed: 0,class,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [None]:
# Validation split: read the same way as the training split
# (header=None is what pandas already does when `names` is supplied).
test_df = pd.read_csv(
    TEST_PATH,
    sep="\t",
    header=None,
    names=columns,
)
test_df.head()

Unnamed: 0,class,message
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?


In [None]:
# Raw message texts as plain Python lists.
train_msg = train_df[MESSAGE].tolist()
test_msg = test_df[MESSAGE].tolist()

# Binary targets: 0 where the class column equals HAM, 1 for anything else.
train_label = np.array([int(label != HAM) for label in train_df[CLASS]])
test_label = np.array([int(label != HAM) for label in test_df[CLASS]])

In [None]:
# Count how often each whitespace-separated token occurs in the training messages.
vocabulary_dict = {}

for msg in train_msg:
  for voc in msg.split():
    # A token seen for the first time starts from 0 before being incremented.
    vocabulary_dict[voc] = vocabulary_dict.get(voc, 0) + 1

In [None]:
# Number of distinct whitespace-separated tokens seen in the training messages.
VOC_SIZE = len(vocabulary_dict)
# Word count (by whitespace split) of the longest training message.
MAX_LEN = max(len(msg_text.split()) for msg_text in train_msg)

In [None]:
# Encode every message as a list of integer word ids via one_hot, using
# VOC_SIZE as the size of the id space.
enc_train_msg, enc_test_msg = (
    [one_hot(text, VOC_SIZE) for text in messages]
    for messages in (train_msg, test_msg)
)

# Bring every sequence to MAX_LEN entries, padding at the end ('post').
padded_train_msg, padded_test_msg = (
    pad_sequences(encoded, maxlen=MAX_LEN, padding='post')
    for encoded in (enc_train_msg, enc_test_msg)
)

In [None]:

# Hyperparameters kept together so they are easy to find and tune.
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
MAX_EPOCHS = 1000    # upper bound on epochs; early stopping may end training sooner
PATIENCE = 25        # epochs without val_acc improvement tolerated before stopping

# Embedding -> Flatten -> single sigmoid unit: the output is one score per
# message, trained against the 0 (ham) / 1 (spam) labels.
model = Sequential()
embedding_layer = Embedding(VOC_SIZE, EMBEDDING_DIM, input_length=MAX_LEN)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

# Stop once validation accuracy has not improved by at least min_delta for
# PATIENCE epochs, and roll back to the best weights seen.
es = EarlyStopping(monitor='val_acc',
                   min_delta=1e-4,
                   patience=PATIENCE,
                   verbose=1,
                   mode='max',
                   restore_best_weights=True)

# The valid-data split serves as the validation set that early stopping monitors.
# The returned History is kept so the training curves can be inspected later.
history = model.fit(padded_train_msg,
                    train_label,
                    validation_data=(padded_test_msg, test_label),
                    callbacks=[es],
                    epochs=MAX_EPOCHS,
                    verbose=2)

Epoch 1/1000
131/131 - 2s - loss: 0.2324 - acc: 0.9141 - val_loss: 0.0980 - val_acc: 0.9777 - 2s/epoch - 13ms/step
Epoch 2/1000
131/131 - 1s - loss: 0.0542 - acc: 0.9840 - val_loss: 0.0568 - val_acc: 0.9835 - 1s/epoch - 8ms/step
Epoch 3/1000
131/131 - 1s - loss: 0.0263 - acc: 0.9928 - val_loss: 0.0442 - val_acc: 0.9864 - 946ms/epoch - 7ms/step
Epoch 4/1000
131/131 - 1s - loss: 0.0132 - acc: 0.9966 - val_loss: 0.0419 - val_acc: 0.9849 - 1s/epoch - 9ms/step
Epoch 5/1000
131/131 - 1s - loss: 0.0076 - acc: 0.9988 - val_loss: 0.0403 - val_acc: 0.9856 - 969ms/epoch - 7ms/step
Epoch 6/1000
131/131 - 1s - loss: 0.0048 - acc: 0.9995 - val_loss: 0.0406 - val_acc: 0.9864 - 1s/epoch - 8ms/step
Epoch 7/1000
131/131 - 1s - loss: 0.0033 - acc: 0.9998 - val_loss: 0.0412 - val_acc: 0.9864 - 1s/epoch - 8ms/step
Epoch 8/1000
131/131 - 1s - loss: 0.0025 - acc: 0.9998 - val_loss: 0.0422 - val_acc: 0.9856 - 1s/epoch - 11ms/step
Epoch 9/1000
131/131 - 1s - loss: 0.0019 - acc: 0.9998 - val_loss: 0.0399 - val_

<keras.callbacks.History at 0x7f5421192c10>

In [None]:
def pred_msg(pred_text):
  """Classify a single message as ham or spam with the trained model.

  Args:
    pred_text: The raw message text to classify.

  Returns:
    A two-element list ``[score, label]``: ``score`` is the model's sigmoid
    output for the message (labels were encoded as 0 = ham, 1 = spam) and
    ``label`` is HAM or SPAM, chosen by rounding ``score``.
  """
  class_dict = {
      0 : HAM,
      1 : SPAM,
  }

  # Encode and pad the text the same way the training messages were prepared.
  encoded_msg = [one_hot(pred_text, VOC_SIZE)]
  padded_msg = pad_sequences(encoded_msg, maxlen=MAX_LEN, padding='post')

  # Run inference once and reuse the score for both returned elements.
  score = model.predict(padded_msg)[0][0]

  # Convert the rounded score to a plain int so the lookup matches the dict's
  # integer keys directly instead of relying on a NumPy float hashing like an int.
  label = class_dict[int(np.round(score))]
  return [score, label]

# Try the classifier on one sample message; `prediction` holds the
# [score, label] list returned by pred_msg.
pred_text = "how are you doing today?"

prediction = pred_msg(pred_text)
print(prediction)

[7.777649e-05, 'ham']


In [None]:
def test_pred():
  test_msg = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, answer in zip(test_msg, test_answers):
    pred = pred_msg(msg)

    if prediction[1] != answer:
      passed = False

  if passed:
    print("Passed")
  else:
    print("Not passed")

# Run the check on the sample messages and print the verdict.
test_pred()


Not passed
