In [1]:
import os
import shutil
import urllib.request
import zipfile
from pathlib import Path


## Dataset

### Download

In [2]:
# Remote archive of the SMS Spam Collection and the local paths derived from it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
extracted_path = 'sms_spam_collection'  # directory the archive is unpacked into
zip_path = f'{extracted_path}.zip'  # local copy of the downloaded archive
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

In [3]:
def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the data file to ``.tsv``.

    All work is skipped when ``data_file_path`` already exists, so the cell
    can be re-run without downloading again.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path (str or Path) the archive is saved to.
        extracted_path: Directory (str or Path) the archive is extracted into.
        data_file_path: Final path (str or Path) of the data file; the
            extracted ``SMSSpamCollection`` file is renamed to this.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Accept plain strings as well as Path objects for every path argument.
    zip_path = Path(zip_path)
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f'{data_file_path} already exists.')
        return

    # Stream the response to disk in chunks instead of holding it all in memory.
    with urllib.request.urlopen(url) as response, open(zip_path, 'wb') as f:
        shutil.copyfileobj(response, f)

    with zipfile.ZipFile(zip_path, 'r') as f:
        f.extractall(extracted_path)

    og_file_path = extracted_path / 'SMSSpamCollection'
    og_file_path.rename(data_file_path)  # adds ".tsv"
    print(f'Downloaded as {data_file_path}')

In [4]:
download_and_unzip(url, zip_path, extracted_path, data_file_path)

Downloaded as sms_spam_collection\SMSSpamCollection.tsv


### Explore + Prepare

In [5]:
import pandas as pd

# The file is tab-separated and has no header row, so the two columns are named here.
df = pd.read_csv(data_file_path, sep='\t', header=None, names=['Label', 'Text'])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [7]:
# undersample
def create_balanced_dataset(df):
    """Undersample 'ham' so both classes have the same number of rows.

    Returns a new DataFrame made of a random sample of 'ham' rows (as many
    as there are 'spam' rows, drawn with a fixed seed of 123) followed by
    every 'spam' row. The input frame is not modified.
    """
    spam_rows = df[df['Label'] == 'spam']
    ham_rows = df[df['Label'] == 'ham']

    n_spam = spam_rows.shape[0]
    sampled_ham = ham_rows.sample(n_spam, random_state=123)

    return pd.concat([sampled_ham, spam_rows])

In [8]:
balanced_df = create_balanced_dataset(df)
print(balanced_df['Label'].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
# encode labels as 0/1
# Only map while the column still holds the string labels: re-running this cell
# on the already-encoded column would otherwise turn every 0/1 into NaN.
if not pd.api.types.is_integer_dtype(balanced_df['Label']):
    balanced_df['Label'] = balanced_df['Label'].map({'ham': 0, 'spam': 1})

In [10]:
# train/val/test split
def random_split(df, train_frac, val_frac, random_state=123):
    """Shuffle ``df`` and split it into train, validation and test partitions.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows assigned to the training partition.
        val_frac: Fraction of rows assigned to the validation partition.
        random_state: Seed for the shuffle (defaults to 123, the value that
            was previously hard-coded).

    Returns:
        Tuple ``(train_df, val_df, test_df)``. The test partition receives
        whatever rows remain after the first two.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so sums like 0.9 + 0.1 are not rejected by float rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            'train_frac and val_frac must be non-negative and sum to at most 1, '
            f'got train_frac={train_frac}, val_frac={val_frac}'
        )

    # shuffle
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_end = int(len(shuffled) * train_frac)
    val_end = train_end + int(len(shuffled) * val_frac)

    train_df = shuffled.iloc[:train_end]
    val_df = shuffled.iloc[train_end:val_end]
    test_df = shuffled.iloc[val_end:]

    return train_df, val_df, test_df


In [11]:
train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [12]:
# save for reuse
# Each split is written to '<name>.csv' without the index column.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(f'{split_name}.csv', index=None)

### Data Loaders

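Messages vary in length, but all sequences in a batch must have the same length. We have 2 options:
- Truncate all messages to the length of the shortest one
- Pad all messages to the length of the longest one

Padding is done here to preserve the entire content of all messages.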

In [13]:
import tiktoken

# pad with <|endoftext|>, check its token id
tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={'<|endoftext|>'}))

[50256]


In [14]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Tokenized messages from a CSV file, padded to a common length.

    Reads a CSV with ``Text`` and ``Label`` columns, encodes every text with
    ``tokenizer``, optionally truncates each sequence to ``max_len`` tokens,
    and right-pads with ``pad_token_id`` so that every sequence has exactly
    ``self.max_len`` tokens.

    Args:
        csv_file: Path of the CSV file to load.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            token ids.
        max_len: Target sequence length. ``None`` uses the longest encoded
            text in this file, so nothing is truncated.
        pad_token_id: Token id appended as padding.
    """

    def __init__(self, csv_file, tokenizer, max_len=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded = [tokenizer.encode(text) for text in self.data['Text']]

        if max_len is None:
            self.max_len = self._longest_encoded_length()
        else:
            self.max_len = max_len
            # Truncate anything longer than the requested length.
            self.encoded = [e[:self.max_len] for e in self.encoded]

        # pad
        self.encoded = [
            e + [pad_token_id] * (self.max_len - len(e))
            for e in self.encoded
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as ``torch.long`` tensors."""
        encoded = self.encoded[index]
        # Index the column first so no full row Series is built per lookup.
        label = self.data['Label'].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

    def __len__(self):
        """Number of rows loaded from the CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text, or 0 when there are no rows."""
        # default=0 keeps an empty CSV from raising ValueError inside max().
        return max(map(len, self.encoded), default=0)

In [None]:
# max_len=None: every message is padded to the longest one in the training set
train_dataset = SpamDataset(csv_file='train.csv', tokenizer=tokenizer, max_len=None)
# the model's context length is 1024 tokens, so the longest message must fit within that
print(train_dataset.max_len)

120


In [16]:
# reuse the training max_len so val/test sequences get the same padded length;
# longer val/test messages are truncated to it. This is optional: any length up to
# the model's context length of 1024 would also work.
val_dataset = SpamDataset('val.csv', tokenizer, train_dataset.max_len)
test_dataset = SpamDataset('test.csv', tokenizer, train_dataset.max_len)

In [17]:
from torch.utils.data import DataLoader

# Loader settings shared by all three splits.
n_workers = 0  # ensures compatibility with most computers
batch_size = 8
torch.manual_seed(123)

# Training batches are shuffled and a trailing partial batch is dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=n_workers,
    drop_last=True,
)

# Validation and test keep their order and every example.
val_loader, test_loader = (
    DataLoader(
        dataset=eval_dataset,
        batch_size=batch_size,
        num_workers=n_workers,
        drop_last=False,
    )
    for eval_dataset in (val_dataset, test_dataset)
)

In [18]:
# Smoke test: run through the whole training loader once. After the loop the
# two variables hold the last batch, whose shapes are printed below.
for input_batch, target_batch in train_loader:
    pass
print(f'Input shape: {input_batch.shape}')
print(f'Label shape: {target_batch.shape}')

Input shape: torch.Size([8, 120])
Label shape: torch.Size([8])


In [19]:
print(f'{len(train_loader)} train batches')
print(f'{len(val_loader)} val batches')
print(f'{len(test_loader)} test batches')

130 train batches
19 val batches
38 test batches


## Pretrained Model

In [20]:
MODEL = 'gpt2-small (124M)'
prompt = 'Every effort moves'

# Architecture hyperparameters that differ between the GPT-2 sizes.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every size, followed by the entries of the selected model.
BASE_CONFIG = {
    'vocab_size': 50257,
    'context_len': 1024,
    'drop_rate': 0.0,
    'qkv_bias': True,
    **model_configs[MODEL],
}

In [21]:
from gpt_download import download_and_load_gpt2

# pull the size tag out of the model name, e.g. 'gpt2-small (124M)' -> '124M'
model_size = MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir='gpt2')

checkpoint: 100%|██████████| 77.0/77.0 [00:00<?, ?iB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:13<00:00, 79.4kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 6.95kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [21:51<00:00, 379kiB/s]    
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<?, ?iB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:01<00:00, 309kiB/s]  
vocab.bpe: 100%|██████████| 456k/456k [00:01<00:00, 254kiB/s]  


In [22]:
from gpt_model import GPTModel
from load_weights import load_weights_into_gpt

# build the architecture described by BASE_CONFIG, then fill it from the downloaded params
# (presumably the pretrained GPT-2 weights; see load_weights_into_gpt)
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# evaluation mode for the sanity checks that follow
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_feat

In [23]:
from generate_text import generate_text_simple
from text_token_id_conversion import text_to_token_ids, token_ids_to_text

# sanity check: generate 15 new tokens from a short prompt with the loaded model
text_1 = "Every effort moves you"
token_ids = generate_text_simple(model, text_to_token_ids(text_1, tokenizer),
                                max_new_tokens=15,
                                context_size=BASE_CONFIG['context_len'])
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [24]:
# check whether the pretrained model can already classify when simply asked to
# (the output below shows it only echoes the prompt instead of answering)
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

token_ids = generate_text_simple(model, text_to_token_ids(text_2, tokenizer),
                                max_new_tokens=23,
                                context_size=BASE_CONFIG['context_len'])
print(token_ids_to_text(token_ids, tokenizer))

Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner


## Classification Head

In [25]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_feat

In [26]:
# freeze the model: turn off gradients for every existing parameter;
# selected parts are made trainable again in the cells below
for param in model.parameters():
    param.requires_grad = False

In [27]:
# replace output layer with a 2-class classification head

# seed first so the new layer's random initialisation is reproducible
torch.manual_seed(123)
n_classes = 2
# created after the freeze above, so the new layer's parameters are trainable by default
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG['emb_dim'],
    out_features=n_classes,
)

In [28]:
# besides the new head, also make the last transformer block and the final
# layer norm trainable for improved performance

for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

In [29]:
# the modified model is called the same way as before: a batch of token ids goes in

inputs = tokenizer.encode('Do you have time')
# unsqueeze(0) adds the batch dimension -> shape (1, number of tokens)
inputs = torch.tensor(inputs).unsqueeze(0)

print(f'Inputs: {inputs}')
print(f'Inputs Shape: {inputs.shape}')

Inputs: tensor([[5211,  345,  423,  640]])
Inputs Shape: torch.Size([1, 4])


In [30]:
# forward pass without gradient tracking; with the new head each token position
# yields n_classes (2) values, as the printed shape shows
with torch.no_grad():
    outputs = model(inputs)

print(f'Outputs:\n{outputs}')
print(f'Outputs Shape: {outputs.shape}')

Outputs:
tensor([[[-1.5854,  0.9904],
         [-3.7235,  7.4548],
         [-2.2661,  6.6049],
         [-3.5983,  3.9902]]])
Outputs Shape: torch.Size([1, 4, 2])


In [32]:
# only the last token's output is used for classification (and therefore for fine-tuning),
# because with causal attention it is the only position that attends to all tokens

print(f'Last: {outputs[:,-1,:]}')

Last: tensor([[-3.5983,  3.9902]])


## Loss

In [33]:
# turn the last token's logits into class probabilities
probas = torch.softmax(outputs[:,-1,:], dim=-1)
# argmax over the class dimension; without dim= it indexes the flattened tensor,
# which only equals the class id when there is a single example
label = torch.argmax(probas, dim=-1)
print(f'Class Label: {label.item()}')

Class Label: 1


In [34]:
# don't need softmax because largest logit corresponds to highest proba
# argmax over the class dimension; without dim= it indexes the flattened tensor,
# which only equals the class id when there is a single example
label = torch.argmax(outputs[:,-1,:], dim=-1)
print(f'Class Label: {label.item()}')

Class Label: 1


In [36]:
def calc_accuracy_loader(data_loader, model, device, n_batches=None):
    """Return the fraction of correctly classified examples in ``data_loader``.

    The prediction for an example is the argmax of the logits at the last
    token position. The model is switched to evaluation mode and left there.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` tensor pairs
            that supports ``len()``.
        model: Module whose output is indexed as ``[batch, token, class]``.
        device: Device the batches are moved to before the forward pass.
        n_batches: Evaluate at most this many batches; ``None`` means all.

    Returns:
        Accuracy as a float in ``[0, 1]``, or ``nan`` when no example was
        evaluated (empty loader or ``n_batches`` of 0).
    """
    model.eval()

    if n_batches is None:
        n_batches = len(data_loader)
    else:
        n_batches = min(n_batches, len(data_loader))

    correct_preds, n_examples = 0, 0

    with torch.no_grad():
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= n_batches:
                break

            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            # Only the last token's logits are used for classification.
            logits = model(input_batch)[:, -1, :]
            predicted = torch.argmax(logits, dim=-1)

            n_examples += predicted.shape[0]
            correct_preds += (predicted == target_batch).sum().item()

    # Nothing evaluated: report nan instead of dividing by zero.
    if n_examples == 0:
        return float('nan')

    # proportion
    return correct_preds / n_examples

In [37]:
# use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# baseline accuracy before any fine-tuning, estimated on the first 10 batches of each split;
# the seed is set first because the training loader shuffles
torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(train_loader, model, device, 10)
val_accuracy = calc_accuracy_loader(val_loader, model, device, 10)
test_accuracy = calc_accuracy_loader(test_loader, model, device, 10)

In [38]:
print(f'Training accuracy: {train_accuracy*100:.2f}%')
print(f'Val accuracy: {val_accuracy*100:.2f}%')
print(f'Test accuracy: {test_accuracy*100:.2f}%')

Training accuracy: 46.25%
Val accuracy: 45.00%
Test accuracy: 48.75%


In [39]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of one batch, computed from the last token's logits.

    Both tensors are moved to ``device`` before the forward pass. The loss
    is returned as a tensor so the caller can backpropagate through it.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    last_token_logits = model(inputs)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, targets)

In [40]:
def calc_loss_loader(data_loader, model, device, n_batches=None):
    """Average ``calc_loss_batch`` over the first ``n_batches`` batches.

    Gradient tracking is not disabled here; wrap the call in
    ``torch.no_grad()`` when the result is only used for reporting.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs that
            supports ``len()``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        n_batches: Use at most this many batches; ``None`` means all.

    Returns:
        Mean batch loss as a Python float, or ``nan`` when there is nothing
        to average (empty loader or ``n_batches`` of 0 or less).
    """
    if n_batches is None:
        n_batches = len(data_loader)
    else:
        n_batches = min(n_batches, len(data_loader))

    # Covers an empty loader as well as n_batches <= 0.
    if n_batches <= 0:
        return float('nan')

    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= n_batches:
            break

        loss = calc_loss_batch(input_batch, target_batch, model, device)
        # .item() keeps a plain float, so no tensor (or autograd graph) is
        # accumulated and the return type matches the nan case.
        total_loss += loss.item()

    return total_loss / n_batches

In [41]:
# baseline loss before fine-tuning, averaged over the first 5 batches of each split;
# gradients are not needed because nothing is being trained here
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, 5)
    val_loss = calc_loss_loader(val_loader, model, device, 5)
    test_loss = calc_loss_loader(test_loader, model, device, 5)

In [42]:
print(f'Training loss: {train_loss:.3f}')
print(f'Val loss: {val_loss:.3f}')
print(f'Test loss: {test_loss:.3f}')

Training loss: 2.453
Val loss: 2.583
Test loss: 2.322


## Fine-Tuning

In [44]:
def train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        n_epochs, eval_freq, eval_iter,
):
    """Fine-tune ``model`` as a classifier while tracking loss and accuracy.

    Args:
        model: Model to train; it is updated in place.
        train_loader: Loader of ``(input_batch, target_batch)`` training pairs.
        val_loader: Loader of validation pairs, used for evaluation only.
        optimizer: Optimizer that holds the model's trainable parameters.
        device: Device passed to the loss and accuracy helpers.
        n_epochs: Number of passes over ``train_loader``.
        eval_freq: Losses are evaluated and printed every ``eval_freq``
            optimizer steps, starting with the first step.
        eval_iter: Number of batches used for each loss/accuracy estimate.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs)``. Losses get one
        entry per evaluation, accuracies one entry per epoch.
    """
    train_losses, val_losses = [], []
    # accuracies
    train_accs, val_accs = [], []

    # Starts at -1 so that the first optimizer step is step 0 and gets evaluated.
    global_step = -1

    for epoch in range(n_epochs):
        # The accuracy helper at the end of each epoch leaves the model in
        # evaluation mode, so training mode is re-enabled here.
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader,
                    device, eval_iter,
                )

                train_losses.append(train_loss)
                val_losses.append(val_loss)

                # Separators added: the pieces used to be glued together,
                # e.g. '...):Train Loss 2.153Val Loss 2.392'.
                print(
                    f'Ep {epoch+1} (Step {global_step:06d}): '
                    f'Train Loss {train_loss:.3f}, '
                    f'Val Loss {val_loss:.3f}'
                )

        train_acc = calc_accuracy_loader(train_loader, model, device, n_batches=eval_iter)
        val_acc = calc_accuracy_loader(val_loader, model, device, n_batches=eval_iter)

        train_accs.append(train_acc)
        val_accs.append(val_acc)

        print(f'Training accuracy: {train_acc*100:.2f}%')
        print(f'Val accuracy: {val_acc*100:.2f}%')

    return train_losses, val_losses, train_accs, val_accs

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate training and validation loss on ``eval_iter`` batches each.

    The model is put into evaluation mode and gradient tracking is disabled
    for the measurement. Afterwards the model is returned to whichever mode
    it was in before the call, also when the evaluation raises.

    Returns:
        Tuple ``(train_loss, val_loss)`` as produced by ``calc_loss_loader``.
    """
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, n_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, n_batches=eval_iter)
    finally:
        # Restore the caller's mode instead of forcing training mode.
        model.train(was_training)

    return train_loss, val_loss