<b> Notebook that uses PyTorch to build a simple feedforward network and train it on an SMS dataset for binary classification (spam detection). The data is available at the following link: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data </b>

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn_utils
from sklearn.metrics import classification_report
import nltk
import re
import string


# Download necessary NLTK resources (for tokenization).
# One-time setup: uncomment the next line on an environment where the tokenizer data
# needed by nltk.word_tokenize has not been downloaded yet.
#nltk.download('punkt')

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter would also hide any warnings raised by pandas or torch
# in later cells — consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

<b> SECTION 1: First, we open the dataset file and explore the data to get a better understanding of it </b>

In [2]:
# Step 1: Load the dataset.
# Reading the file with the default encoding fails with a Unicode error,
# so the encoding 'latin-1' is used to handle the text properly.
df = pd.read_csv("spam.csv", encoding='latin-1')

# Step 2: Display the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Display the column names to understand the structure of the dataset:
# 'v1' holds the label, 'v2' the message text; the 'Unnamed: *' columns are dropped in a later cell.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [4]:
# Distinct label values that occur in column 'v1'
set(df["v1"].unique())

{'ham', 'spam'}

In [5]:
# The dataset has unnecessary columns named 'Unnamed: 2', 'Unnamed: 3', and 'Unnamed: 4'
# that contain mostly NaN values, so only the label ('v1') and message ('v2') columns are kept.
# rename() returns a brand-new DataFrame, so df_cleaned is an independent frame rather than a
# slice of df; adding columns to it later does not write into a copy-of-a-slice.
df_cleaned = df[['v1', 'v2']].rename(columns={'v1': 'Label', 'v2': 'Text'})

# Display the first few rows of the cleaned dataset
df_cleaned.head()


Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Check how many 'ham' and 'spam' messages we have in the dataset.
# The classes are imbalanced (many more ham than spam messages), which matters when
# reading plain accuracy numbers later on.
df_cleaned['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
# Split the frame by label so each class can be inspected on its own
spam_messages = df_cleaned.loc[df_cleaned['Label'].eq('spam')]
ham_messages = df_cleaned.loc[df_cleaned['Label'].eq('ham')]

# Preview the first 5 spam messages
spam_messages.head()

Unnamed: 0,Label,Text
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [8]:
# Display first 5 ham messages (for comparison with the spam preview in the previous cell)
ham_messages.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


<b> SECTION 2: next step is pre-processing of the text data and preparing it for training </b>

In [9]:
# Clean the text data
def clean_text(text):
    """
    Normalize a message: lowercase it and keep only the letters a-z and whitespace.

    Punctuation, digits and every other symbol are deleted (not replaced by a space),
    so "don't" becomes "dont" and a removed number can leave two adjacent spaces.

    Args:
    text (str): Input text string.

    Returns:
    str: Cleaned text.
    """
    lowered = text.lower()
    # Every character in string.punctuation is an ASCII non-letter, so this single
    # pass strips punctuation together with digits and any other non a-z symbol.
    return re.sub(r'[^a-z\s]', '', lowered)

# Apply the cleaning function to the 'Text' column.
# assign() returns a new DataFrame instead of writing a column into the existing one,
# so nothing is written into a frame that was sliced from the raw data and the cell
# can be re-run safely.
df_cleaned = df_cleaned.assign(Cleaned_Text=df_cleaned['Text'].apply(clean_text))

# Display the first few rows of the cleaned data
df_cleaned.head()

Unnamed: 0,Label,Text,Cleaned_Text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [10]:
# Step 3: Convert labels ('ham' and 'spam') into binary values
label_encoder = LabelEncoder()

# Fit the label encoder and transform labels.
# assign() produces a new frame rather than writing a column into the existing one.
df_cleaned = df_cleaned.assign(Label_Encoded=label_encoder.fit_transform(df_cleaned['Label']))

# Display the mapping of labels.
# The codes are cast to plain Python ints so the dict prints as {'ham': 0, 'spam': 1}
# independently of how the installed NumPy version renders its integer scalars.
print(f"Label Encoding: {dict((str(label), int(code)) for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

# Check the dataset after encoding the labels
df_cleaned[['Label', 'Label_Encoded']].head()


Label Encoding: {'ham': 0, 'spam': 1}


Unnamed: 0,Label,Label_Encoded
0,ham,0
1,ham,0
2,spam,1
3,ham,0
4,ham,0


In [11]:
# Split the dataset into training and test sets.
# .copy() turns each split into an independent DataFrame, so the columns added to
# train_data / test_data in the following cells are not written into a slice of df_cleaned.
# NOTE(review): the classes are imbalanced; passing stratify=df_cleaned['Label_Encoded'] would keep
# the ham/spam ratio identical in both splits, but it changes which rows land in each split.
train_data, test_data = (
    split.copy()
    for split in train_test_split(
        df_cleaned[['Cleaned_Text', 'Label_Encoded']], test_size=0.2, random_state=42
    )
)

# Make sure the labels are plain integers (0 or 1)
train_data['Label_Encoded'] = train_data['Label_Encoded'].astype(int)
test_data['Label_Encoded'] = test_data['Label_Encoded'].astype(int)


# Check the shape of the training and test data
print(f"Training Data Shape: {train_data.shape}")
print(f"Test Data Shape: {test_data.shape}")

Training Data Shape: (4457, 2)
Test Data Shape: (1115, 2)


In [12]:
#Tokenization using NLTK
def tokenize(text):
    """
    Tokenize the input text into a list of words (tokens) using nltk's word_tokenize.

    If the tokenizer data is not installed (e.g. on a fresh environment), it is
    downloaded once and the call is retried, so the notebook runs from a clean kernel.

    Args:
    text (str): Input text string.

    Returns:
    list: List of tokens (words).

    Raises:
    LookupError: If the tokenizer data is missing and could not be downloaded.
    """
    try:
        return nltk.word_tokenize(text)
    except LookupError:
        # word_tokenize raises LookupError when its tokenizer models are missing.
        # Older NLTK releases look for 'punkt', newer ones for 'punkt_tab'; request both.
        # nltk.download() reports failures via its return value, so an unavailable
        # resource simply makes the retry below raise the original LookupError again.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)
        return nltk.word_tokenize(text)

# Apply the tokenization function to both training and test data.
# assign() returns a new frame, so no column is written into the slices
# produced by train_test_split.
train_data = train_data.assign(Tokenized_Text=train_data['Cleaned_Text'].apply(tokenize))
test_data = test_data.assign(Tokenized_Text=test_data['Cleaned_Text'].apply(tokenize))

# Display the tokenized text
train_data.head()

Unnamed: 0,Cleaned_Text,Label_Encoded,Tokenized_Text
1978,no im in the same boat still here at my moms c...,0,"[no, im, in, the, same, boat, still, here, at,..."
3989,bank of granite issues strongbuy explosive pic...,1,"[bank, of, granite, issues, strongbuy, explosi..."
3935,they r giving a second chance to rahul dengra,0,"[they, r, giving, a, second, chance, to, rahul..."
4078,o i played smash bros ltgt religiously,0,"[o, i, played, smash, bros, ltgt, religiously]"
4086,private your account statement for shows un...,1,"[private, your, account, statement, for, shows..."


In [13]:
# What columns do we have now? ('Tokenized_Text' was added by the tokenization step)
print(train_data.columns)

Index(['Cleaned_Text', 'Label_Encoded', 'Tokenized_Text'], dtype='object')


<b> SECTION 3: define a simple feedforward network </b>

In [14]:
# Gather every token of every training message into one flat list
all_tokens = [tok for message_tokens in train_data['Tokenized_Text'] for tok in message_tokens]

# Tally the tokens; the distinct keys of this Counter make up the vocabulary
vocab = Counter()
vocab.update(all_tokens)
# One slot on top of the distinct training words (taken by the '<UNK>' entry added in a later cell)
vocab_size = 1 + len(vocab)
print(f"length of vocabulary is: {vocab_size}")


class FeedForwardNNWithEmbedding(nn.Module):
    """
    Bag-of-embeddings classifier: embed each token, average the embeddings of a
    message, then apply one hidden layer and a linear output layer.

    The network returns raw scores (logits); no sigmoid is applied to the output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, padding_idx=None):
        """
        Initializes the feedforward neural network with an embedding layer.

        Args:
        vocab_size (int): The size of the vocabulary (number of rows in the embedding table).
        embedding_dim (int): The size of the word embedding vectors.
        hidden_size (int): The size of the hidden layer.
        output_size (int): The size of the output layer (for binary classification, this is 1).
        padding_idx (int, optional): Index used to pad sequences. When given, padded
            positions are excluded from the average in forward(). Defaults to None, which
            averages over every position of the (padded) sequence.
        """
        super().__init__()

        self.padding_idx = padding_idx

        # Embedding layer to convert word indices into dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        # Define the layers of the network
        self.fc1 = nn.Linear(embedding_dim, hidden_size)  # Fully connected layer (input from embedding layer)
        self.sigmoid = torch.sigmoid  # Hidden activation function. Try ReLU?
        self.fc2 = nn.Linear(hidden_size, output_size)  # Output layer

    def forward(self, x):
        """
        Defines the forward pass of the neural network.

        Args:
        x (torch.Tensor): Integer tensor of word indices with shape (batch, sequence_length).

        Returns:
        torch.Tensor: Raw output scores with shape (batch, output_size).
        """
        # (batch, seq) -> (batch, seq, embedding_dim)
        embedded = self.embedding(x)

        if self.padding_idx is None:
            # Plain average over the sequence dimension
            pooled = embedded.mean(dim=1)
        else:
            # Average only over real tokens so the amount of padding in a batch
            # does not change a message's representation.
            mask = (x != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids a division by zero for rows that consist only of padding
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (embedded * mask).sum(dim=1) / token_counts

        # Hidden layer + activation, then the output layer
        hidden = self.sigmoid(self.fc1(pooled))
        return self.fc2(hidden)

# Step 3: Set hyperparameters. Play around with them?
embedding_dim = 10  # The size of the word embedding vectors
hidden_size = 60   # Number of hidden units in the hidden layer
output_size = 1     # Output size for binary classification (1 output neuron)

# Seed PyTorch's random number generator so the weight initialisation (and the later
# DataLoader shuffling, which draws from the same generator) is repeatable on a re-run.
# NOTE(review): some GPU kernels are non-deterministic, so runs on CUDA may still differ slightly.
torch.manual_seed(42)

# Create the model
model = FeedForwardNNWithEmbedding(vocab_size, embedding_dim, hidden_size, output_size)

# Display the model architecture
print(model)

length of vocabulary is: 7546
FeedForwardNNWithEmbedding(
  (embedding): Embedding(7546, 10)
  (fc1): Linear(in_features=10, out_features=60, bias=True)
  (fc2): Linear(in_features=60, out_features=1, bias=True)
)


In [15]:
#  Define the loss function and optimizer.
# The model's forward() returns the output layer's raw scores (no sigmoid),
# which is the input BCEWithLogitsLoss expects.
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss with logits (since output is raw score)
optimizer = optim.Adam(model.parameters(), lr=0.05)  # Adam optimizer with a learning rate of 0.05

<b> SECTION 4: Create DataLoader object </b>

In [16]:
# Add <UNK> token to the vocabulary.
# The value stored here is only a placeholder count inside the Counter; the index of every
# word (including <UNK>) is assigned by the enumerate() below, in key insertion order.
vocab['<UNK>'] = len(vocab)  # Adds '<UNK>' as a key of the Counter

# Step 2: Create a mapping from words to indices (0-based).
# NOTE(review): index 0 therefore belongs to a real word, while 0 is also the padding value
# used when batches are padded — confirm whether a dedicated <PAD> index should be reserved.
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Convert tokenized text into lists of indices
def tokens_to_indices(tokenized_text):
    """
    Convert tokenized text into indices based on the vocabulary.

    Tokens that are not present in word_to_idx are mapped to the index of '<UNK>'.

    Args:
    tokenized_text (list of str): Tokenized text (list of words).

    Returns:
    list: List of word indices.

    Raises:
    KeyError: If word_to_idx has no '<UNK>' entry.
    """
    # Look the fallback index up once instead of once per token
    unk_idx = word_to_idx['<UNK>']
    return [word_to_idx.get(token, unk_idx) for token in tokenized_text]

# Apply the conversion to the tokenized text in both train and test datasets.
# assign() returns a new frame, so no column is written into the slices
# produced by train_test_split.
train_data = train_data.assign(Indices=train_data['Tokenized_Text'].apply(tokens_to_indices))
test_data = test_data.assign(Indices=test_data['Tokenized_Text'].apply(tokens_to_indices))

# Display the first few rows to see the converted indices
train_data.head()


Unnamed: 0,Cleaned_Text,Label_Encoded,Tokenized_Text,Indices
1978,no im in the same boat still here at my moms c...,0,"[no, im, in, the, same, boat, still, here, at,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3989,bank of granite issues strongbuy explosive pic...,1,"[bank, of, granite, issues, strongbuy, explosi...","[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 2..."
3935,they r giving a second chance to rahul dengra,0,"[they, r, giving, a, second, chance, to, rahul...","[37, 38, 39, 35, 40, 41, 42, 43, 44]"
4078,o i played smash bros ltgt religiously,0,"[o, i, played, smash, bros, ltgt, religiously]","[45, 46, 47, 48, 49, 50, 51]"
4086,private your account statement for shows un...,1,"[private, your, account, statement, for, shows...","[52, 53, 54, 55, 25, 56, 57, 58, 46, 59, 60, 6..."


In [17]:
class SpamDataset(Dataset):
    """
    Map-style dataset that pairs each message (a list of word indices) with its label.

    Sequences are stored with their original, varying lengths; no padding happens here.
    """

    def __init__(self, texts, labels):
        """
        Store the index sequences and their labels.

        Args:
        texts (list of lists): List of tokenized text converted to indices.
        labels (list of int): List of encoded labels (0 for ham, 1 for spam).
        """
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Number of messages in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Return sample `idx` as a pair of tensors.

        Returns:
        tuple: (LongTensor of word indices, scalar FloatTensor label). The label is a
        float because BCEWithLogitsLoss expects floating-point targets.
        """
        indices, target = self.texts[idx], self.labels[idx]
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(target, dtype=torch.float),
        )


# Create datasets for training and testing
train_dataset = SpamDataset(train_data['Indices'].tolist(), train_data['Label_Encoded'].tolist())
test_dataset = SpamDataset(test_data['Indices'].tolist(), test_data['Label_Encoded'].tolist())

# No DataLoader is created here: the sequences have different lengths, so PyTorch's default
# batching cannot stack them into one tensor. The loaders are built in the following cell,
# together with the collate function that pads each batch.

In [18]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(device)

# Place the model's parameters on the selected device
model = model.to(device)

def collate_fn(batch):
    """
    Combine a list of (word_indices, label) samples into one padded batch.

    Args:
    batch (list of tuple): Samples as returned by the dataset's __getitem__; the
        indices and labels may be tensors or plain Python lists / numbers.

    Returns:
    tuple: (texts_padded, labels) — a LongTensor of shape
    (batch_size, longest_sequence_in_batch) and a FloatTensor of shape (batch_size,),
    both moved to `device`.
    """
    texts, labels = zip(*batch)

    # as_tensor reuses a tensor that already has the requested dtype instead of
    # copy-constructing it (torch.tensor(existing_tensor) copies and emits a UserWarning),
    # while still accepting plain lists.
    texts = [torch.as_tensor(text, dtype=torch.long) for text in texts]

    # Pad the sequences to have the same length.
    # NOTE(review): 0 is used as the padding value, but word_to_idx is built with enumerate()
    # starting at 0, so index 0 is also a real vocabulary word — confirm whether a dedicated
    # padding index should be reserved.
    texts_padded = rnn_utils.pad_sequence(texts, batch_first=True, padding_value=0)

    # Stack the per-sample labels into one 1-D float tensor
    labels = torch.stack([torch.as_tensor(label, dtype=torch.float) for label in labels])

    # Move the data to the appropriate device (GPU if available)
    texts_padded = texts_padded.to(device)
    labels = labels.to(device)

    return texts_padded, labels

# Build the DataLoaders with the custom collate function so every batch is padded to a
# common length; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


cuda


<b> SECTION 5: creating the training loop and training the model </b>

In [19]:
# Set the number of epochs
num_epochs = 10

# Training loop: one pass over the training data per epoch, followed by an
# accuracy check on the test data.
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Loop over batches of training data
    for texts, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass: compute the model output, shape (batch, 1)
        outputs = model(texts)

        # Compute the loss.
        # squeeze(-1) drops only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension when a batch holds a single
        # sample, leaving a 0-d tensor that no longer matches the (1,)-shaped labels.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass: compute the gradients
        loss.backward()

        # Update the model parameters
        optimizer.step()

        # Accumulate the loss for monitoring
        running_loss += loss.item()

    # Print the mean batch loss of this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

    # Evaluate the model on the test data
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for texts, labels in test_loader:
            outputs = model(texts)
            predicted = torch.sigmoid(outputs).squeeze(-1) > 0.5  # Convert logits to binary output
            total += labels.size(0)
            # Labels are 0.0 / 1.0 floats; compare them as booleans against the predictions
            correct += (predicted == labels.bool()).sum().item()

    accuracy = correct / total
    print(f"Test Accuracy after Epoch {epoch+1}: {accuracy:.4f}")


Epoch [1/10], Loss: 0.1829
Test Accuracy after Epoch 1: 0.9686
Epoch [2/10], Loss: 0.0339
Test Accuracy after Epoch 2: 0.9740
Epoch [3/10], Loss: 0.0171
Test Accuracy after Epoch 3: 0.9785
Epoch [4/10], Loss: 0.0111
Test Accuracy after Epoch 4: 0.9713
Epoch [5/10], Loss: 0.0058
Test Accuracy after Epoch 5: 0.9668
Epoch [6/10], Loss: 0.0024
Test Accuracy after Epoch 6: 0.9758
Epoch [7/10], Loss: 0.0172
Test Accuracy after Epoch 7: 0.9695
Epoch [8/10], Loss: 0.0047
Test Accuracy after Epoch 8: 0.9758
Epoch [9/10], Loss: 0.0034
Test Accuracy after Epoch 9: 0.9731
Epoch [10/10], Loss: 0.0017
Test Accuracy after Epoch 10: 0.9776


In [20]:
# Collect predictions and true labels for the entire test set
all_preds = []
all_labels = []

model.eval()  # Set model to evaluation mode
with torch.no_grad():  # Disable gradient computation for evaluation
    for texts, labels in test_loader:
        outputs = model(texts)
        # squeeze(-1) keeps the batch dimension even for a single-sample batch, so the
        # result is always 1-D and can be extended into the lists below.
        predicted = torch.sigmoid(outputs).squeeze(-1) > 0.5  # Convert logits to binary output
        # Store both sides as plain Python ints (0 = ham, 1 = spam) on the CPU
        all_preds.extend(predicted.long().cpu().tolist())
        all_labels.extend(labels.long().cpu().tolist())

# Generate a classification report
print(classification_report(all_labels, all_preds, target_names=['ham', 'spam']))

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       965
        spam       0.94      0.89      0.91       150

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



<b> Tasks: experiment with changes to the following: hyperparameters, embedding, activation function, optimizer, dataset, multiclass dataset, etc. </b>