<a href="https://colab.research.google.com/github/emanueleiacca/Machine-Learning-with-Python/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # Installs the nightly TensorFlow build through an IPython shell escape.
  # NOTE(review): a failing `!pip` presumably does not raise a Python exception,
  # so the except branch below is likely never taken — confirm.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# tensorflow-datasets is installed here, immediately before its import.
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

# Record which TensorFlow version this run actually used.
print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"


# Load and Explore the Data

In [None]:
import pandas as pd

# The files are read as headerless, tab-separated, two-column tables, so the
# column names are supplied here. Paths come from the download cell
# (train_file_path / test_file_path) instead of repeating the file names.
column_names = ['label', 'message']
train_data = pd.read_csv(train_file_path, sep='\t', header=None, names=column_names)
test_data = pd.read_csv(test_file_path, sep='\t', header=None, names=column_names)

# Display the first few rows of each dataset
print(train_data.head())
print(test_data.head())

In [None]:
# Tally the training labels in a single pass, then pull out the two classes.
label_counts = train_data['label'].value_counts()
ham_count = int(label_counts.get('ham', 0))
spam_count = int(label_counts.get('spam', 0))

# Report the class balance of the training set
print(f"Ham count: {ham_count}")
print(f"Spam count: {spam_count}")
print(f"Ratio (Ham/Spam): {ham_count / spam_count:.2f}")


# Preprocess the Data
Convert text messages into numerical data that can be fed into a neural network.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the word index from the training messages only. The tokenizer is capped
# at num_words=10000 and uses '<OOV>' as the stand-in for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['message'])

# Encode each message as a list of integer word ids (train first, then validation)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['message'])
    for frame in (train_data, test_data)
)

# The network needs fixed-size input: every sequence is brought to max_length,
# with shorter ones zero-padded at the end (padding='post').
max_length = 100
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, test_sequences)
)

# Binary targets as TensorFlow tensors: 1 for 'spam', 0 for everything else ('ham')
train_labels, test_labels = (
    tf.convert_to_tensor([int(label == 'spam') for label in frame['label']])
    for frame in (train_data, test_data)
)


# Build and Train the Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Size the embedding table from the tokenizer's own vocabulary cap so the two
# cannot drift apart if the cap is changed in the preprocessing cell.
vocab_size = tokenizer.num_words

# Define a simple model
model = Sequential([
    # Explicit input shape: one row of max_length word ids per message. This replaces
    # Embedding's `input_length` argument, which current Keras versions deprecate.
    Input(shape=(max_length,)),
    Embedding(vocab_size, 16),  # maps each word id to a 16-dimensional vector
    Flatten(),  # (max_length, 16) embedding matrix -> one flat vector for the Dense layers
    Dense(16, activation='relu'),  # small fully connected hidden layer
    Dense(1, activation='sigmoid'),  # single probability output: closer to 1 means spam
])

# Compile the model
model.compile(optimizer='adam',  # adaptive learning rate optimization algorithm
              loss='binary_crossentropy',  # standard loss for a 0/1 target with a sigmoid output
              metrics=['accuracy'])

# Spam is the minority class, so its errors are weighted twice as heavily as ham
# errors to keep the model from favouring the majority class.
class_weights = {0: 1.0, 1: 2.0}

# Train the model; the validation split is evaluated after every epoch
history = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
    batch_size=32,
    class_weight=class_weights,
    verbose=2
)


In [None]:
import matplotlib.pyplot as plt

# One panel per metric: (train key, validation key, panel title, y-axis label)
panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]

# Draw the training and validation curves side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (train_key, val_key, title, y_label) in zip(axes, panels):
    ax.plot(history.history[train_key], label='Train')
    ax.plot(history.history[val_key], label='Validation')
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend()

plt.show()

# Watch the validation curves: they show whether the heavier spam class weight is
# causing overfitting or hurting performance on ham messages.


In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    '''
    Classify a single SMS message with the trained model.

    Args:
        pred_text: the raw message text, as one string.

    Returns:
        A two-element list [probability, label]. probability is the model's
        sigmoid output converted to a Python float; label is 'spam' when that
        output is greater than 0.5 and 'ham' otherwise.
    '''
    # Encode the text the same way the training data was encoded: same tokenizer,
    # same target length, same padding side.
    sequence = tokenizer.texts_to_sequences([pred_text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')

    # The batch holds one message and the model has one output unit, hence [0][0].
    # verbose=0 suppresses the per-call progress bar, which otherwise clutters the
    # output when this function is called in a loop.
    prediction = model.predict(padded, verbose=0)[0][0]

    # Determine label based on prediction
    label = 'spam' if prediction > 0.5 else 'ham'

    return [float(prediction), label]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Sanity check: compare the model's output with the true label for the first
# five training messages.
predictions = model.predict(train_padded)

first_five = zip(train_data['message'][:5], predictions[:5], train_labels[:5])
for message, predicted, actual in first_five:
    print(f"Message: {message}")
    print(f"Prediction: {predicted[0]} | Actual: {actual}\n")


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Each message is passed to predict_message; the returned label (element 1) is
  compared with the expected answer and the probability (element 0) is printed
  alongside it. A single mismatch fails the whole run. Nothing is returned; the
  outcome is only printed.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(f"Message: '{msg}'\nPredicted: {prediction[1]} ({prediction[0]:.4f}) | Actual: {ans}\n")
    # Keep going after a miss so every message's prediction is still printed
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
