In [1]:
from importlib.metadata import version
# Look up the installed torch distribution's version, then report it.
torch_version = version("torch")
print("torch version:", torch_version)

torch version: 2.8.0+cu126


In [2]:
# Shell escape: print the version of the CUDA compiler driver (nvcc) found on the PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


# Fine-tuning for classification

## 6.1 - Different categories of fine-tuning

- The most common ways to fine-tune a language model:
  - instruction fine-tuning - training a language model on a set of tasks using specific instructions to improve its ability to understand and execute tasks described in natural language prompts
  - classification fine-tuning - the model is trained to recognize a specific set of class labels, such as "spam" and "not spam."  The key point is that a classification fine-tuned model is restricted to predicting classes it has encountered during its training.

## 6.2 - Preparing the dataset

- Modify the GPT model we previously implemented and pretrained.

In [3]:
#   First step is to download the dataset.
import urllib.request
import zipfile
import os
from pathlib import Path

# Address of the zipped SMS spam collection on the UCI archive host.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final location of the data file once it has been renamed; later read as tab-separated text.
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(
    url, zip_path, extracted_path, data_file_path):
  """Fetch a zip archive, unpack it, and rename the extracted data file.

  The whole routine is skipped when ``data_file_path`` already exists.

  Args:
    url: Address of the zip archive to download.
    zip_path: Local path the downloaded archive is written to.
    extracted_path: Directory the archive contents are extracted into.
    data_file_path: ``pathlib.Path`` that the extracted ``SMSSpamCollection``
      file is renamed to.

  Returns:
    None. Progress is reported through ``print``.
  """
  already_present = data_file_path.exists()
  if already_present:
    print(f"{data_file_path} already exists.  Skipping download and extraction.")
    return

  # Pull the archive into memory and store it on disk.
  with urllib.request.urlopen(url) as response, open(zip_path, "wb") as archive_file:
    archive_file.write(response.read())

  # Unpack every member of the archive into the target directory.
  with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(extracted_path)

  # The extracted file has no extension; move it to the requested path.
  extracted_file = Path(extracted_path) / "SMSSpamCollection"
  os.rename(extracted_file, data_file_path)
  print(f"File downloaded and saved as {data_file_path}")

# Fetch and unpack the dataset; the function returns early when the data file is already on disk.
download_and_unzip_spam_data(
    url=url,
    zip_path=zip_path,
    extracted_path=extracted_path,
    data_file_path=data_file_path,
)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [5]:
#   Load the data into a pandas dataframe
import pandas as pd

# The file is tab-separated with no header row, so the column names are supplied here.
# NOTE(review): with pandas' default quoting, a stray double quote inside a
# message could merge neighbouring lines into one row — TODO confirm the row
# count against the dataset's documentation (quoting=csv.QUOTE_NONE disables it).
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#   Let's examine the class label distribution:
# Count how many rows carry each label.
label_counts = df["Label"].value_counts()
print(label_counts)

Label
ham     4825
spam     747
Name: count, dtype: int64


In [8]:
#   To speed up training we will undersample the dataset to include 747 instances of each class
def create_balanced_dataset(df):
  """Undersample the larger class so "ham" and "spam" have equal row counts.

  The original implementation assumed "ham" was the majority class and raised
  a ``ValueError`` from ``DataFrame.sample`` otherwise. This version trims
  whichever class is larger. When "ham" has at least as many rows as "spam",
  the result is identical to the previous behaviour.

  Args:
    df: DataFrame with a "Label" column holding the strings "ham" / "spam".

  Returns:
    A new DataFrame with the selected "ham" rows first, followed by the
    "spam" rows. The original index values are kept.
  """
  spam_rows = df[df["Label"] == "spam"]
  ham_rows = df[df["Label"] == "ham"]
  target_size = min(len(spam_rows), len(ham_rows))

  # "ham" is always drawn through a seeded sample (even when no trimming is
  # needed) so the row order matches what earlier runs produced.
  ham_subset = ham_rows.sample(target_size, random_state=123)

  # "spam" is only sampled when it is the larger class; otherwise it is kept
  # whole and in its original order.
  if len(spam_rows) > target_size:
    spam_rows = spam_rows.sample(target_size, random_state=123)

  return pd.concat([ham_subset, spam_rows])

# Build the balanced frame and confirm both classes now have the same count.
balanced_df = create_balanced_dataset(df)
balanced_counts = balanced_df["Label"].value_counts()
print(balanced_counts)


Label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
#   Next convert the string class labels "ham" and "spam" to the integers 0 and 1.
#   Series.map with a plain dict turns every value missing from the dict into
#   NaN, so re-running that on already-encoded labels would wipe the column.
#   Looking each value up with a fallback to itself keeps this cell safe to
#   re-run; values outside the mapping are left unchanged rather than set to NaN.
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(
    lambda label: label_to_id.get(label, label)
)

In [10]:
#   Next create a random_split function
def random_split(df, train_frac, validation_frac):
  """Shuffle ``df`` with a fixed seed and cut it into three consecutive parts.

  Args:
    df: DataFrame to split; it is not modified.
    train_frac: Fraction of rows assigned to the training part.
    validation_frac: Fraction of rows assigned to the validation part.

  Returns:
    A ``(train_df, validation_df, test_df)`` tuple. The test part receives
    every row left over after the first two cuts.
  """
  # Shuffle all rows reproducibly and renumber them from zero.
  shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

  total_rows = len(shuffled)
  train_end = int(total_rows * train_frac)
  validation_end = train_end + int(total_rows * validation_frac)

  return (
      shuffled.iloc[:train_end],
      shuffled.iloc[train_end:validation_end],
      shuffled.iloc[validation_end:],
  )

# 70% of the rows go to training and 10% to validation; the rest becomes the test set.
train_df, validation_df, test_df = random_split(
    balanced_df, train_frac=0.7, validation_frac=0.1
)

In [11]:
#   Save the dataset as CSV
# Write each split to its own CSV file, leaving the index out.
csv_targets = {
    "train.csv": train_df,
    "validation.csv": validation_df,
    "test.csv": test_df,
}
for csv_name, split_df in csv_targets.items():
  split_df.to_csv(csv_name, index=False)

## 6.3 - Creating data loader

-