In [4]:
# Report the installed PyTorch version by reading the package metadata.
from importlib.metadata import version
print("torch version:", version("torch"))

torch version: 2.8.0+cu126


In [5]:
# Shell escape: print the version banner of the CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


# Fine-tuning for classification

## 6.1 - Different categories of fine-tuning

- The most common ways to fine-tune a language model:
  - instruction fine-tuning - training a language model on a set of tasks using specific instructions to improve its ability to understand and execute tasks described in natural language prompts
  - classification fine-tuning - the model is trained to recognize a specific set of class labels, such as "spam" and "not spam."  The key point is that a classification fine-tuned model is restricted to predicting classes it has encountered during its training.

## 6.2 - Preparing the dataset

- Modify the GPT model we previously implemented and pretrained.

In [6]:
#   First step is to download the dataset.
import urllib.request
import zipfile
import os
from pathlib import Path

# Address of the zipped SMS spam dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file name the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final location of the data file inside that directory (with a .tsv suffix).
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(
    url, zip_path, extracted_path, data_file_path):
  """Download the zipped dataset, extract it, and rename the data file.

  The work is skipped entirely when ``data_file_path`` already exists, so the
  cell can be re-run without hitting the network again.

  Args:
    url: Address of the zip archive to fetch.
    zip_path: Local path the downloaded archive is written to.
    extracted_path: Directory the archive is extracted into (str or Path).
    data_file_path: Final path of the data file (str or Path).  The extracted
      "SMSSpamCollection" file is renamed to this path.

  Raises:
    urllib.error.URLError: If the archive cannot be downloaded.
    FileNotFoundError: If the archive does not contain "SMSSpamCollection".
  """
  import shutil  # local import: only needed for the streamed download below

  # Accept plain strings as well as Path objects for both locations.
  data_file_path = Path(data_file_path)
  extracted_path = Path(extracted_path)

  if data_file_path.exists():
    print(f"{data_file_path} already exists.  Skipping download and extraction.")
    return

  # Stream the response to disk instead of holding the whole archive in memory.
  with urllib.request.urlopen(url) as response, \
      open(zip_path, "wb") as out_file:
    shutil.copyfileobj(response, out_file)

  with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extracted_path)

  # The archive ships the data without a file extension; give it the final name.
  original_file_path = extracted_path / "SMSSpamCollection"
  # Make sure the destination directory exists before moving the file there.
  data_file_path.parent.mkdir(parents=True, exist_ok=True)
  os.rename(original_file_path, data_file_path)
  print(f"File downloaded and saved as {data_file_path}")

# Fetch and extract the dataset (skipped when the .tsv file is already present).
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [7]:
#   Load the data into a pandas dataframe.  The file is tab-separated and has
# no header row, so the two column names are supplied explicitly.
# NOTE(review): the raw file reportedly contains stray double-quote characters;
# with pandas' default quoting a few rows may get merged.  Consider passing
# quoting=csv.QUOTE_NONE and confirm the expected row count.
import pandas as pd

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#   Let's examine the class label distribution (number of rows per label):
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [9]:
#   To speed up training we will undersample the dataset to include 747 instances
def create_balanced_dataset(df):
  """Return a frame with as many "ham" rows as there are "spam" rows.

  All "spam" rows are kept.  The "ham" rows are randomly sampled (fixed seed
  123) down to the number of "spam" rows.  The sampled "ham" rows come first
  in the result, followed by the "spam" rows; original index labels are kept.
  """
  spam_rows = df[df["Label"] == "spam"]
  ham_rows = df[df["Label"] == "ham"]

  # Draw exactly one ham message per spam message.
  sampled_ham = ham_rows.sample(len(spam_rows), random_state=123)

  return pd.concat([sampled_ham, spam_rows])

# Build the balanced frame and print the per-label counts as a check.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())


Label
ham     747
spam    747
Name: count, dtype: int64


In [10]:
#   Next convert the string class labels "ham" and "spam" to the integer
# labels 0 and 1.  The mapping is applied only while the column still holds
# the string labels, so re-running this cell does not turn the already
# converted values into NaN.
label_to_id = {"ham": 0, "spam": 1}
if balanced_df["Label"].isin(list(label_to_id)).all():
  balanced_df["Label"] = balanced_df["Label"].map(label_to_id)

In [11]:
#   Next create a random_split function
def random_split(df, train_frac, validation_frac):
  """Shuffle ``df`` and cut it into train, validation, and test partitions.

  The rows are shuffled with a fixed seed (123) and re-indexed from 0.  The
  first ``train_frac`` share of the rows becomes the training set, the next
  ``validation_frac`` share the validation set, and whatever is left over the
  test set.  Partition sizes are rounded down to whole rows.

  Returns:
    A ``(train_df, validation_df, test_df)`` tuple of dataframes.
  """
  shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

  n_rows = len(shuffled)
  train_stop = int(n_rows * train_frac)
  validation_stop = train_stop + int(n_rows * validation_frac)

  return (
      shuffled[:train_stop],
      shuffled[train_stop:validation_stop],
      shuffled[validation_stop:],
  )

# 70% of the rows for training, 10% for validation; the remainder is the test set.
train_df, validation_df, test_df = random_split(
    balanced_df, 0.7, 0.1
)

In [12]:
#   Save the three splits as CSV files (index=False leaves the row index out)
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

## 6.3 - Creating data loaders

- We are now working with a spam dataset that contains text messages of varying lengths.  To batch these messages as we did with the text chunks, we have two primary options:
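  - Truncate - cut every message down to a common, shorter length
  - Pad - extend every message to the length of the longest message (the approach used below)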

In [13]:
#   We are going to use "<|endoftext|>" as the padding token.  Instead of
# appending the string to the end of each text message, we append its token
# ID to the encoded message; the line below prints that ID.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [19]:
#  Set up a pytorch dataset class
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
  """Dataset of tokenized messages, all padded to one common length.

  The CSV file must provide a "Text" column (the messages) and a "Label"
  column (the class labels).  Every message is encoded with ``tokenizer``,
  optionally truncated, and then right-padded with ``pad_token_id``.

  Args:
    csv_file: Path of the CSV file to read.
    tokenizer: Object with an ``encode(text)`` method returning a list of
      token IDs.
    max_length: Common sequence length.  When ``None``, the length of the
      longest encoded message in the file is used; otherwise longer messages
      are truncated to this length.
    pad_token_id: Token ID used to fill shorter messages up to ``max_length``.
  """

  def __init__(self, csv_file, tokenizer, max_length=None,
               pad_token_id=50256):
    self.data = pd.read_csv(csv_file)

    # Tokenize every message once, up front.
    self.encoded_texts = list(map(tokenizer.encode, self.data["Text"]))

    if max_length is not None:
      # A fixed length was requested: clip anything longer than that.
      self.max_length = max_length
      self.encoded_texts = [
          token_ids[:max_length] for token_ids in self.encoded_texts
      ]
    else:
      self.max_length = self._longest_encoded_length()

    # Right-pad every sequence so all of them share the same length.
    padded = []
    for token_ids in self.encoded_texts:
      n_missing = self.max_length - len(token_ids)
      padded.append(token_ids + [pad_token_id] * n_missing)
    self.encoded_texts = padded

  def __getitem__(self, index):
    """Return ``(token_ids, label)`` for one row as ``torch.long`` tensors."""
    token_ids = self.encoded_texts[index]
    label = self.data.iloc[index]["Label"]
    inputs = torch.tensor(token_ids, dtype=torch.long)
    target = torch.tensor(label, dtype=torch.long)
    return inputs, target

  def __len__(self):
    """Number of rows in the underlying CSV file."""
    return len(self.data)

  def _longest_encoded_length(self):
    """Length of the longest encoded message (0 when there are none)."""
    return max(map(len, self.encoded_texts), default=0)

In [20]:
#   The SpamDataset class loads data from the CSV files.  With max_length=None
# every message is padded to the longest tokenized message in train.csv.
train_dataset = SpamDataset(
    csv_file="train.csv",
    tokenizer=tokenizer,
    max_length=None
)

In [21]:
#  Print the length (in tokens) of the longest training sequence
print(train_dataset.max_length)

120


In [22]:
#   Next we build the validation and test sets with the training set's
# max_length: longer messages are truncated to it, shorter ones padded to it
val_dataset = SpamDataset(
    csv_file="validation.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)

In [23]:
#   Create the training, validation, and test data loaders, which load
# messages and labels in batches of 8.  Only the training loader is created
# with shuffle=True and drop_last=True; the other two keep every batch.
from torch.utils.data import DataLoader

# NOTE(review): "num_workrs" looks like a typo for "num_workers"; the name is
# left unchanged here in case other cells reference it.
num_workrs = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workrs,
    drop_last=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workrs,
    drop_last=False
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workrs,
    drop_last=False
)

In [24]:
#   Ensure the data loaders are working: iterate once over the whole training
# loader; after the loop the two variables hold the last batch it yielded
for input_batch, target_batch in train_loader:
  pass
print("Input batch dimensions: ", input_batch.shape)
print("Target batch dimensions: ", target_batch.shape)

Input batch dimensions:  torch.Size([8, 120])
Target batch dimensions:  torch.Size([8])


In [25]:
#   Print the total number of batches in each data loader:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
19 validation batches
38 test batches
