In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

## Setup Datset And Dataloader

In [1]:
from datasets import load_dataset
import pandas as pd

# Load dataset from Hugging Face
dataset = load_dataset('ucirvine/sms_spam')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})

In [2]:
combined_df = dataset['train'].to_pandas()
combined_df

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5569,This is the 2nd time we have tried 2 contact u...,1
5570,Will ü b going to esplanade fr home?\n,0
5571,"Pity, * was in mood for that. So...any other s...",0
5572,The guy did some bitching but I acted like i'd...,0


In [3]:
# Convert numeric labels to text labels and rename columns
combined_df['Label'] = combined_df['label']
combined_df = combined_df.rename(columns={'sms': 'Text'})[['Label', 'Text']]
combined_df

Unnamed: 0,Label,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?\n
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [4]:
# Balance the dataset by undersampling the majority class
ham_samples = combined_df[combined_df['Label'] == 0]
spam_samples = combined_df[combined_df['Label'] == 1]
min_class_count = min(len(ham_samples), len(spam_samples))

balanced_ham = ham_samples.sample(min_class_count, random_state=42)
balanced_spam = spam_samples.sample(min_class_count, random_state=42)
balanced_df = pd.concat([balanced_ham, balanced_spam])

In [5]:
# Shuffle the dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df

Unnamed: 0,Label,Text
0,1,You are a winner U have been specially selecte...
1,1,Urgent! Please call 09061743811 from landline....
2,1,"I want some cock! My hubby's away, I need a re..."
3,1,"Latest News! Police station toilet stolen, cop..."
4,1,BangBabes Ur order is on the way. U SHOULD rec...
...,...,...
1489,1,SplashMobile: Choose from 1000s of gr8 tones e...
1490,1,"January Male Sale! Hot Gay chat now cheaper, c..."
1491,1,Please call our customer service representativ...
1492,1,Free msg: Single? Find a partner in your area!...


In [6]:
# Split into train/validation/test (70%/10%/20%)
total_samples = len(balanced_df)
train_end = int(0.7 * total_samples)
val_end = int(0.8 * total_samples)  # 70% + 10%

train_df = balanced_df.iloc[:train_end]
validation_df = balanced_df.iloc[train_end:val_end]
test_df = balanced_df.iloc[val_end:]

# Save to CSV files
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [7]:
!pip install torch -q
!pip install transformers -q
!pip install tiktoken -q

In [8]:
import torch
from torch.utils.data import Dataset
import tiktoken
import pandas as pd

class CustomSpamDataset(Dataset):
    def __init__(self, csv_path, max_length=None, tokenizer=None):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer or tiktoken.get_encoding("gpt2")
        self.pad_token = self.tokenizer.eot_token  # Using end-of-text token for padding

        # Tokenize all texts
        self.texts = [self.tokenizer.encode(text) for text in self.df['Text']]

        # Determine max length
        if max_length is None:
            self.max_length = max(len(t) for t in self.texts)
        else:
            self.max_length = max_length

        # Process labels
        self.labels = torch.tensor(self.df['Label'].values, dtype=torch.long)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get tokenized text and label
        tokens = self.texts[idx]
        label = self.labels[idx]

        # Truncate or pad to max_length
        if len(tokens) > self.max_length:
            tokens = tokens[:self.max_length]
        else:
            tokens = tokens + [self.pad_token] * (self.max_length - len(tokens))

        return torch.tensor(tokens), label

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Create datasets
train_dataset = CustomSpamDataset("train.csv", max_length=None, tokenizer=tokenizer)
val_dataset = CustomSpamDataset("validation.csv",
                               max_length=train_dataset.max_length,
                               tokenizer=tokenizer)
test_dataset = CustomSpamDataset("test.csv",
                                max_length=train_dataset.max_length,
                                tokenizer=tokenizer)

In [9]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

## Load The Model And Change The Output Layer

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model_name = "gpt2-xl"

# Load tokenizer and model directly from Hugging Face
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [13]:
model.lm_head = torch.nn.Linear(in_features=1600, out_features=2)

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=2, bias=True)
)

## Setup And Replace LoRa Layers

In [15]:
@torch.no_grad()
def accuracy(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            # Get model outputs and extract logits
            outputs = model(input_batch)
            logits = outputs.logits[:, -1, :]  # Access logits from outputs object

            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples if num_examples > 0 else 0.0

In [16]:
train_acc = accuracy(train_loader, model, device, num_batches=10)
val_acc = accuracy(val_loader, model, device, num_batches=10)
test_acc = accuracy(test_loader, model, device, num_batches=10)

print(f"Training accuracy: {train_acc*100}%")
print(f"Validation accuracy: {val_acc*100}%")
print(f"Test accuracy: {test_acc*100}%")

Training accuracy: 56.25%
Validation accuracy: 53.75%
Test accuracy: 58.75%


In [17]:
"""
Alpha -> Scaling parameter noted in the paper.
It is responsible for conrtrolling the extent to which the adapted layer's output is allowed to influence the original output of the layer being adapted.
"""
class LoRA(torch.nn.Module):
  def __init__(self, in_dim, out_dim, rank, alpha):
    super().__init__()
    self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
    torch.nn.init.normal_(self.A, mean = 0.0, std = 0.1)
    self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
    self.rank = rank
    self.alpha = alpha

  def forward(self, x):
    return self.alpha * x @ self.A @ self.B

In [18]:
class LinearLoRA(torch.nn.Module):
  def __init__(self, linear, rank, alpha):
    super().__init__()
    self.linear = linear
    self.lora = LoRA(linear.in_features, linear.out_features, rank, alpha)

  def forward(self, x):
    return self.linear(x) + self.lora(x)

In [19]:
def replace_linear_layers(model, rank, alpha):
  for name, module in model.named_children():
    if isinstance(module, torch.nn.Linear):
      setattr(model, name, LinearLoRA(module, rank, alpha))

In [20]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters before: {total_params:,}")

Total trainable parameters before: 1,557,614,402


In [21]:
for param in model.parameters():
    param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

Total trainable parameters after: 0


In [22]:
replace_linear_layers(model, rank=1, alpha=8)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable LoRA parameters: {total_params:,}")

Total trainable LoRA parameters: 1,602


In [23]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): LinearLoRA(
    (linear): Linear(in_features=1600, out_features=2, bias=True)
    (lora): LoRA()


In [24]:
train_acc = accuracy(train_loader, model, device, num_batches=10)
val_acc = accuracy(val_loader, model, device, num_batches=10)
test_acc = accuracy(test_loader, model, device, num_batches=10)

print(f"Training accuracy: {train_acc*100}%")
print(f"Validation accuracy: {val_acc*100}%")
print(f"Test accuracy: {test_acc*100}%")

Training accuracy: 56.25%
Validation accuracy: 53.75%
Test accuracy: 58.75%


## Fine-tune For Classification

In [25]:
def calc_loss(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    outputs = model(input_ids=input_batch)  # Explicitly use input_ids parameter
    logits = outputs.logits[:, -1, :]  # Properly access logits from model outputs
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def train(model, train_loader, val_loader, optimizer, device, num_epochs, eval_iter=5):
    train_accs, val_accs = [], []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

        # Evaluation phase
        model.eval()
        with torch.no_grad():
            # Train accuracy
            train_correct, train_total = 0, 0
            for i, (inputs, labels) in enumerate(train_loader):
                if i >= eval_iter: break
                inputs, labels = inputs.to(device), labels.to(device)
                logits = model(inputs).logits[:, -1, :]
                train_correct += (logits.argmax(-1) == labels).sum().item()
                train_total += labels.size(0)

            # Validation accuracy
            val_correct, val_total = 0, 0
            for i, (inputs, labels) in enumerate(val_loader):
                if i >= eval_iter: break
                inputs, labels = inputs.to(device), labels.to(device)
                logits = model(inputs).logits[:, -1, :]
                val_correct += (logits.argmax(-1) == labels).sum().item()
                val_total += labels.size(0)

        # Calculate and store accuracies
        train_acc = train_correct / train_total
        val_acc = val_correct / val_total
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Acc: {train_acc*100:.2f}%")

    return train_accs, val_accs

In [27]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)
train_accs, val_accs = train(
    model, train_loader, val_loader, optimizer, device, num_epochs=10
)


Epoch 1/10
Train Acc: 45.00%
Epoch 2/10
Train Acc: 47.50%
Epoch 3/10
Train Acc: 45.00%
Epoch 4/10
Train Acc: 55.00%
Epoch 5/10
Train Acc: 55.00%
Epoch 6/10
Train Acc: 57.50%
Epoch 7/10
Train Acc: 55.00%
Epoch 8/10
Train Acc: 60.00%
Epoch 9/10
Train Acc: 67.50%
Epoch 10/10
Train Acc: 70.00%

