<a href="https://colab.research.google.com/github/gitos-h/FreeCodeCamp_Machine-Learning-with-Python/blob/main/fcc_sms_text_classification2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np

print(tf.__version__)

2.18.0


In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-07-17 08:51:15--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-07-17 08:51:15 (5.26 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-07-17 08:51:16--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-07-17 08:51:16 (3.56 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [3]:
# Read in data from tsv file
train_data = pd.read_csv(train_file_path, sep="\t", names=["class", "text"])
test_data = pd.read_csv(test_file_path, sep="\t", names=["class", "text"])

In [4]:
train_data.head()

Unnamed: 0,class,text
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [5]:
MAXLEN = 250
BATCH_SIZE = 64
BUFFER_SIZE = 10000

In [6]:
# Create text encoder
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_data['text'].map(lambda text: text))

In [7]:
# Convert pandas DataFrames to TensorFlow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_data['text'], train_data['class']))
test_dataset = tf.data.Dataset.from_tensor_slices((test_data['text'], test_data['class']))

# Map text to encoded sequences and convert labels to numerical format
def encode_text_and_label(text, label):
  encoded_text = encoder(text)
  # Convert 'ham' to 0 and 'spam' to 1
  numerical_label = tf.cast(label == 'spam', tf.int32)
  return encoded_text, numerical_label

train_dataset = train_dataset.map(encode_text_and_label)
test_dataset = test_dataset.map(encode_text_and_label)

# Batch and prefetch the datasets for training using padded_batch
train_dataset = train_dataset.padded_batch(BATCH_SIZE).prefetch(BUFFER_SIZE)
test_dataset = test_dataset.padded_batch(BATCH_SIZE).prefetch(BUFFER_SIZE)

In [8]:
# Setup model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [9]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [16]:
# Train the model
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset)

Epoch 1/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 123ms/step - accuracy: 0.9883 - loss: 0.0400 - val_accuracy: 0.9842 - val_loss: 0.0621
Epoch 2/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 113ms/step - accuracy: 0.9897 - loss: 0.0348 - val_accuracy: 0.9835 - val_loss: 0.0602
Epoch 3/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 106ms/step - accuracy: 0.9915 - loss: 0.0302 - val_accuracy: 0.9849 - val_loss: 0.0594
Epoch 4/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 125ms/step - accuracy: 0.9939 - loss: 0.0262 - val_accuracy: 0.9864 - val_loss: 0.0593
Epoch 5/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 118ms/step - accuracy: 0.9953 - loss: 0.0228 - val_accuracy: 0.9864 - val_loss: 0.0595
Epoch 6/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 103ms/step - accuracy: 0.9957 - loss: 0.0198 - val_accuracy: 0.9864 - val_loss: 0.0603
Epoch 7/10
[1m66/66[0m [

In [17]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.9863 - loss: 0.0764
Test Loss: 0.07061877101659775
Test Accuracy: 0.9841954112052917


In [18]:
# Preprocess the input string
encoded_text = encoder(["Hi, how are you"]) # Pass as a list to add a batch dimension

prediction = model.predict(encoded_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[[0.02367932]]


In [19]:
# function to predict messages based on model
def predict_message(pred_text):
  # Preprocess the input string
  encoded_text = encoder([pred_text]) # Pass as a list to add a batch dimension

  # Get the prediction from the model
  prediction = model.predict(encoded_text)[0][0]

  # Determine the label based on the prediction (e.g., threshold at 0.5)
  label = 'spam' if prediction > 0.5 else 'ham'

  return [prediction, label]

pred_text = "how are you doing today?"

prediction_result = predict_message(pred_text)
print(prediction_result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[np.float32(0.002713909), 'ham']


In [20]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
You passed the challenge. Great job!
