In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path


## Dataset

### Download

In [2]:
# Source archive for the SMS Spam Collection dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"

# Local artifacts: the downloaded archive, the folder it is unpacked into,
# and the tab-separated data file that ends up inside that folder.
zip_path = 'sms_spam_collection.zip'
extracted_path = 'sms_spam_collection'
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

In [3]:
def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the data file.

    Does nothing (apart from printing a message) when ``data_file_path``
    already exists, so the calling cell can be re-run without downloading
    the archive again.

    Args:
        url: Location of the zip archive, opened with
            ``urllib.request.urlopen``.
        zip_path: Where the downloaded archive is written.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final location of the data file, as a ``str`` or
            ``Path``. The extracted ``SMSSpamCollection`` member is renamed
            to this path.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Accept plain strings as well as Path objects; a str has no `.exists()`.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f'{data_file_path} already exists.')
        return

    # Fetch the archive and store it on disk.
    with urllib.request.urlopen(url) as response, open(zip_path, 'wb') as f:
        f.write(response.read())

    with zipfile.ZipFile(zip_path, 'r') as f:
        f.extractall(extracted_path)

    # The archive member has no extension; the rename adds ".tsv".
    og_file_path = Path(extracted_path) / 'SMSSpamCollection'
    os.rename(og_file_path, data_file_path)
    print(f'Downloaded as {data_file_path}')

In [4]:
# Fetch and unpack the dataset; the function returns early if the .tsv already exists.
download_and_unzip(
    url=url,
    zip_path=zip_path,
    extracted_path=extracted_path,
    data_file_path=data_file_path,
)

Downloaded as sms_spam_collection\SMSSpamCollection.tsv


### Explore + Prepare

In [5]:
import pandas as pd

# The file is read as headerless, tab-separated text; the two columns are
# named 'Label' and 'Text' here.
# NOTE(review): with pandas' default quoting, a message containing an
# unbalanced double quote may get merged with the following line(s). Worth
# comparing len(df) with the raw line count (or trying quoting=csv.QUOTE_NONE)
# -- confirm against the data.
df = pd.read_csv(data_file_path, sep='\t', header=None, names=['Label', 'Text'])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Class balance check: number of rows per label.
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [7]:
# undersample
def create_balanced_dataset(df, random_state=123):
    """Undersample the 'ham' rows so both classes have the same number of rows.

    Args:
        df: DataFrame with a 'Label' column holding 'ham' / 'spam'.
        random_state: Seed passed to ``DataFrame.sample`` when drawing the
            'ham' subset. Defaults to 123.

    Returns:
        A new DataFrame made of the sampled 'ham' rows followed by every
        'spam' row. Index values from ``df`` are kept.

    Note:
        Expects at least as many 'ham' rows as 'spam' rows, because the
        'ham' subset is drawn with ``sample``'s default settings.
    """
    spam_rows = df[df['Label'] == 'spam']
    ham_rows = df[df['Label'] == 'ham']

    # Draw exactly as many 'ham' rows as there are 'spam' rows.
    ham_subset = ham_rows.sample(len(spam_rows), random_state=random_state)

    return pd.concat([ham_subset, spam_rows])

In [8]:
# Build the undersampled frame and print the per-label row counts.
balanced_df = create_balanced_dataset(df)
print(balanced_df['Label'].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
# encode labels as 0/1
# Values that are not keys of the mapping (e.g. labels that are already 0/1)
# pass through unchanged, so re-running this cell does not turn every label
# into NaN the way a plain dict-based `.map` would.
label_to_id = {'ham': 0, 'spam': 1}
balanced_df['Label'] = balanced_df['Label'].map(lambda label: label_to_id.get(label, label))

In [10]:
# train/val/test split
def random_split(df, train_frac, val_frac, random_state=123):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training set.
        val_frac: Fraction of rows assigned to the validation set.
        random_state: Seed for the shuffle. Defaults to 123.

    Returns:
        Tuple ``(train_df, val_df, test_df)``. The test set is whatever rows
        remain after the training and validation rows are taken. The index is
        reset after shuffling, so index values run 0..len(df)-1 across the
        three parts.
    """
    # Shuffle all rows and discard the old index.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    val_end = train_end + int(n_rows * val_frac)

    train_df = shuffled.iloc[:train_end]
    val_df = shuffled.iloc[train_end:val_end]
    test_df = shuffled.iloc[val_end:]

    return train_df, val_df, test_df


In [11]:
# 70% train, 10% validation; the remaining rows form the test set.
train_df, val_df, test_df = random_split(balanced_df, train_frac=0.7, val_frac=0.1)

In [12]:
# Write each split to its own CSV (no index column) so it can be reloaded later.
for split_df, csv_name in ((train_df, 'train.csv'), (val_df, 'val.csv'), (test_df, 'test.csv')):
    split_df.to_csv(csv_name, index=None)

### Data Loaders

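Messages vary in length, but all sequences in a batch must have the same length. We have 2 options:
- Truncate all messages to a common, shorter length
- Pad all messages to a common, longer length

Padding is used here to preserve the entire content of every message.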

In [13]:
import tiktoken

# Padding uses the <|endoftext|> token; print its id in the GPT-2 encoding.
tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={'<|endoftext|>'}))

[50256]


In [14]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Dataset of tokenized, fixed-length messages paired with their labels.

    Every message in the CSV is tokenized once in ``__init__`` and then
    truncated and/or right-padded so that all sequences have ``self.max_len``
    tokens.

    Args:
        csv_file: Passed to ``pd.read_csv``; the file must provide 'Text' and
            'Label' columns.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            token ids.
        max_len: Target sequence length. ``None`` uses the longest encoded
            message in this file (0 for an empty file); otherwise longer
            messages are truncated to ``max_len``.
        pad_token_id: Token id appended to sequences shorter than the target
            length. Defaults to 50256.
    """

    def __init__(self, csv_file, tokenizer, max_len=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        # Read the labels out once so __getitem__ is a plain list lookup
        # instead of building a row Series through `iloc` on every access.
        self.labels = self.data['Label'].tolist()
        self.encoded = [tokenizer.encode(text) for text in self.data['Text']]

        if max_len is None:
            self.max_len = self._longest_encoded_length()
        else:
            self.max_len = max_len
            # Truncate anything longer than the requested length.
            self.encoded = [e[:self.max_len] for e in self.encoded]

        # Right-pad every sequence up to max_len.
        self.encoded = [
            e + [pad_token_id] * (self.max_len - len(e))
            for e in self.encoded
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for one row as two ``torch.long`` tensors."""
        return (
            torch.tensor(self.encoded[index], dtype=torch.long),
            torch.tensor(self.labels[index], dtype=torch.long),
        )

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded message, or 0 if there are none."""
        return max(map(len, self.encoded), default=0)

In [None]:
train_dataset = SpamDataset(csv_file='train.csv', tokenizer=tokenizer, max_len=None)
# max_len=None pads every message to the longest encoded message in train.csv.
# The model can handle up to 1024 tokens, so we don't need to set max_len=1024.
print(train_dataset.max_len)

120


In [16]:
# Reuse the training length so all three splits share one sequence length;
# val/test messages longer than that are truncated to it. Providing max_len
# isn't strictly needed, because the model can handle up to 1024 tokens.
val_dataset = SpamDataset('val.csv', tokenizer, train_dataset.max_len)
test_dataset = SpamDataset('test.csv', tokenizer, train_dataset.max_len)