In [4]:
import urllib.request
import zipfile
import os
from pathlib import Path
# Zip archive of the SMS Spam Collection hosted by the UCI repository.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file name the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final location of the data file inside the extraction directory.
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [5]:
def download_and_unzip_spam_data(
    url, zip_path, extracted_path, data_file_path):
    """Download the zipped dataset, extract it, and rename the data file.

    The function is a no-op when ``data_file_path`` already exists, so the
    cell can be re-run without hitting the network again.

    Args:
        url: Location of the zip archive (any scheme ``urllib`` can open).
        zip_path: Where the downloaded archive is stored (``str`` or ``Path``).
        extracted_path: Directory the archive is extracted into
            (``str`` or ``Path``).
        data_file_path: Final path of the data file (``str`` or ``Path``).
            The extracted ``SMSSpamCollection`` file is renamed to this path.

    Returns:
        None.
    """
    import shutil

    # Accept plain strings as well as Path objects for the path arguments.
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download "
            "and extraction."
        )
        return

    # Stream the response to disk in chunks instead of holding the whole
    # archive in memory at once.
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file has no extension; move it to the requested name.
    original_file_path = extracted_path / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch the dataset using the paths configured above; the function returns
# early when the .tsv file is already present.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


#### Load into a Pandas DF

In [9]:
import pandas as pd
# The file is tab-separated and has no header row, so column names are
# supplied here: the first column is the label, the second the message text.
# NOTE(review): read_csv's default quoting treats '"' as a quote character,
# so messages containing unbalanced double quotes may be merged with a
# neighbouring row. Consider quoting=csv.QUOTE_NONE and confirm the row count
# against the dataset description.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#### Examine value distribution

In [10]:
# Count how many messages carry each label.
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


There is a class imbalance. For simplicity, we undersample the majority class (ham) to obtain a 50:50 class split.

In [20]:
def create_balanced_dataset(df, random_state=123):
    """Undersample "ham" so that both classes have the same number of rows.

    Args:
        df: DataFrame with a "Label" column holding "spam" and "ham".
        random_state: Seed used when drawing the "ham" subset. The default
            (123) reproduces the previously hard-coded behaviour.

    Returns:
        A new DataFrame containing every "spam" row followed by an equally
        sized random sample of "ham" rows. Original index values are kept.

    Raises:
        ValueError: Raised by pandas when there are fewer "ham" rows than
            "spam" rows, because sampling is done without replacement.
    """
    spam_df = df[df["Label"] == "spam"]
    ham_df = df[df["Label"] == "ham"].sample(
        n=len(spam_df), random_state=random_state
    )
    return pd.concat([spam_df, ham_df])

# Balance the classes, then print the per-label counts to confirm that both
# classes now have the same number of rows.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
spam    747
ham     747
Name: count, dtype: int64


Map the string labels to integers: ham → 0, spam → 1.

In [21]:
# Encode the string labels as integer class ids: ham -> 0, spam -> 1.
# Values that are not keys of the mapping (e.g. the 0/1 ids left behind by a
# previous run of this cell) are passed through unchanged. A plain dict-based
# `.map` would turn them into NaN when the cell is executed a second time.
# `.assign` builds a new frame instead of writing into the existing one.
balanced_df = balanced_df.assign(
    Label=balanced_df["Label"].map(
        lambda label: {"ham": 0, "spam": 1}.get(label, label)
    )
)
# Fail loudly if anything other than the two known classes slipped through.
assert balanced_df["Label"].isin([0, 1]).all(), "unexpected label value"
balanced_df

Unnamed: 0,Label,Text
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
4707,0,Wow so healthy. Old airport rd lor. Cant thk o...
3293,0,Dear good morning how you feeling dear
1278,0,Dont put your phone on silent mode ok
4079,0,Gam gone after outstanding innings.


##### Create random split for training, validation, test (70%, 10%, 20%)

In [22]:
def random_split(df, train_frac=0.7, val_frac=0.1, random_state=123):
    """Shuffle ``df`` and cut it into train, validation, and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training split.
        val_frac: Fraction of rows assigned to the validation split.
        random_state: Seed for the shuffle.

    Returns:
        A tuple ``(df_train, df_val, df_test)``. The test split receives
        whatever rows remain after the train and validation splits. The
        shuffled frame's index is reset before slicing.
    """
    # Shuffle the DataFrame.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(shuffled)

    def boundary(fraction):
        # Round before truncating to absorb floating-point noise:
        # 10 * (0.7 + 0.1) evaluates to 7.999999999999999, which a bare int()
        # would cut down to 7 and leave the validation split one row short.
        return int(round(n_rows * fraction, 8))

    train_end = boundary(train_frac)
    val_end = boundary(train_frac + val_frac)

    df_train = shuffled[:train_end]
    df_val = shuffled[train_end:val_end]
    df_test = shuffled[val_end:]
    return df_train, df_val, df_test

# Split the balanced data with the default fractions and seed, then report
# how many rows each partition received.
df_train, df_val, df_test = random_split(balanced_df)
print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_val)}")
print(f"Test size: {len(df_test)}")

Train size: 1045
Validation size: 150
Test size: 299


In [24]:
# Save each split to its own CSV file in the working directory. The index is
# not written, so the files contain only the DataFrame's columns.
df_train.to_csv("train.csv", index=None)
df_val.to_csv("validation.csv", index=None)
df_test.to_csv("test.csv", index=None)

### Create Dataloader

We now need to create a dataloader to ingest the data into the LLM. 

Note that the text messages have different lengths, so we pad every message to a common length: the longest message in the dataset, or `max_length` if one is given (in which case longer messages are truncated).

In [42]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Tokenized text messages, truncated/padded to one common length.

    Reads a CSV file with "Text" and "Label" columns, encodes every text with
    ``tokenizer``, optionally truncates to ``max_length``, and right-pads each
    sequence with the token id of "<|endoftext|>".

    Args:
        csv_file: Path of the CSV file to load.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids
            and which accepts an ``allowed_special`` keyword.
        max_length: Common sequence length. ``None`` uses the length of the
            longest encoded text in this file.
    """

    def __init__(self, csv_file, tokenizer, max_length=None):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        # encode() returns a *list* of ids. Keep the single integer so that
        # padding extends sequences with ints; padding with the list itself
        # would append nested lists that torch.tensor cannot convert.
        self.pad_token_id = tokenizer.encode(
            "<|endoftext|>", allowed_special={"<|endoftext|>"}
        )[0]

        # Encode all texts in the dataset.
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]
        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate texts that are longer than max_length.
            self.encoded_texts = [enc[:max_length] for enc in self.encoded_texts]

        # Pad all encoded texts to max_length.
        self.encoded_texts = [
            enc + [self.pad_token_id] * (self.max_length - len(enc))
            for enc in self.encoded_texts
        ]

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for row ``idx`` as ``torch.long`` tensors."""
        text = torch.tensor(self.encoded_texts[idx], dtype=torch.long)
        # Index the column first to avoid building a whole row Series.
        label = torch.tensor(self.data["Label"].iloc[idx], dtype=torch.long)
        return text, label

    def __len__(self):
        """Return the number of rows loaded from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        return max((len(enc) for enc in self.encoded_texts), default=0)

In [43]:
import tiktoken

# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# max_length=None makes the dataset use the longest encoded message in
# train.csv as the common sequence length.
train_dataset = SpamDataset("train.csv", max_length=None, tokenizer=tokenizer)
print( "Number of tokens in the longest sequence:", train_dataset.max_length)

Number of tokens in the longest sequence: 120
