In [None]:
# import libraries
#try:
  # %tensorflow_version only exists in Colab.
 # !pip install tf-nightly
#except Exception:
#  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# !pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Record the TensorFlow version in the notebook output so results can be tied to it.
print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Both files are header-less TSVs, so the column names are supplied here:
# 'Label' first, then the message text as 'Feature'.
train_dataset, test_dataset = (
    pd.read_csv(tsv_path, sep='\t', names=['Label', 'Feature'])
    for tsv_path in (train_file_path, test_file_path)
)

In [None]:
# Show the first five training rows to check that the columns parsed as expected.
train_dataset.head(n=5)

In [None]:
# Class distribution before balancing.
train_dataset.loc[:, 'Label'].value_counts()

In [None]:
# Balance the two classes: keep every 'spam' row and draw an equally sized,
# reproducible (random_state=42) sample of the 'ham' rows.
spam_df = train_dataset.loc[train_dataset['Label'] == 'spam']
ham_df = train_dataset.loc[train_dataset['Label'] == 'ham'].sample(
    n=spam_df.shape[0], random_state=42
)

# Stack ham then spam, and shuffle the rows so the classes are interleaved.
train_dataset = pd.concat([ham_df, spam_df]).sample(frac=1, random_state=42)

In [None]:
# Class distribution after balancing.
train_dataset.loc[:, 'Label'].value_counts()

In [None]:
# Separate the message text (model input) from the label (target).
# Columns are selected rather than pop()'d: pop() removes the column from the
# DataFrame in place, so running this cell twice -- or re-running an earlier
# cell that reads 'Label' -- would raise KeyError.
train_features = train_dataset['Feature']
train_labels = train_dataset['Label']
test_features = test_dataset['Feature']
test_labels = test_dataset['Label']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids.
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only...
train_labels = label_encoder.fit_transform(train_labels)
# ...and reuse that same mapping for the validation labels. Re-fitting here
# could silently produce a different mapping if the validation set had a
# different set of labels; transform() raises on an unseen label instead.
test_labels = label_encoder.transform(test_labels)

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.

# Tokenizer / vocabulary
vocab_size = 500      # Tokenizer num_words and Embedding input dimension
oov_tok = "<OOV>"     # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 60       # every sequence is padded/truncated to this many tokens
trunc_type = 'post'   # sequences longer than max_length are cut at the end

# Model
embedding_dim = 16    # size of each learned token vector

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# The vocabulary is built from the training messages only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_features)
word_index = tokenizer.word_index

# Convert the training and test messages to lists of token ids with the same tokenizer.
sequences, testing_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_features, test_features)
)

# Bring every sequence to exactly max_length tokens.
padded, testing_padded = (
    pad_sequences(token_ids, maxlen=max_length, truncating=trunc_type)
    for token_ids in (sequences, testing_sequences)
)

In [None]:
# Build the model: token ids -> embeddings -> flatten -> small dense head.
# The input shape is declared with an explicit Input instead of Embedding's
# `input_length` argument, which is deprecated in Keras 3.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: one score in [0, 1] per message.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Number of full passes over the training data.
num_epochs = 15

# Fit on the padded training sequences; the padded test set is used as
# validation data for each epoch.
model.fit(
    x=padded,
    y=train_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, test_labels),
)

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text, threshold=0.5):
  """Classify a single message as "ham" or "spam" with the trained model.

  The text is tokenized and padded the same way as the training data
  (same tokenizer, max_length and trunc_type) before being scored.

  Args:
    pred_text: The message to classify, as a string.
    threshold: Scores strictly greater than this value are labelled "spam".
      Defaults to 0.5.

  Returns:
    A two-element list [score, label], where score is the model's sigmoid
    output as a plain Python float and label is "spam" or "ham".
  """
  encoded = tokenizer.texts_to_sequences([pred_text])
  model_input = pad_sequences(encoded, maxlen=max_length, truncating=trunc_type)

  # verbose=0 keeps Keras from printing a progress bar for every message.
  # float() turns the NumPy float32 scalar into a plain Python float, so the
  # returned list matches the documented example.
  score = float(model.predict(model_input, verbose=0).flatten()[0])

  label = "spam" if score > threshold else "ham"
  return [score, label]

# Try the classifier on one sample message and print [score, label].
pred_text = "how are you doing today?"
# Other sample messages (swap one in for pred_text to try it):
#   "sale today! to stop texts call 98912460324"
#   "urgent! call 09066350750 from your landline. your complimentary 4* ibiza holiday or 10,000 cash await collection sae t&cs po box 434 sk3 8wp 150 ppm 18+"
# NOTE(review): the values 0.00091705 and 0.00137486 were written next to these
# messages, presumably scores from an earlier run -- confirm before relying on them.
prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven sample messages with known labels.

  Each prediction is printed. The check passes only if, for every message,
  element 1 of the list returned by predict_message equals the expected
  "ham"/"spam" answer; a pass or "keep trying" line is printed at the end.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(prediction)
    # Only the label is compared; the numeric score is not checked.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
