# Pip

In [1]:
!pip install sentencepiece -q
!pip install transformers -q
!pip install accelerate -q
!pip install peft -q
!pip install bitsandbytes -q
!pip install lightning -q
!pip install flash-attn --no-build-isolation -q

# Imports

In [2]:
import numpy as np
import pandas as pd

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)

import os
import time
import zipfile
import urllib.request
from pathlib import Path
from tqdm.auto import tqdm

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import lightning as L

from peft import (
    get_peft_config,
    get_peft_model,
    LoraConfig,
    TaskType
)

tqdm.pandas()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Get data

In [3]:
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip(url, zip_path, extracted_path, data_file_path)

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [4]:
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["target", "text"])
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
def create_balanced_dataset(df):

    # Count the instances of "spam"
    num_spam = df[df["target"] == "spam"].shape[0]

    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["target"] == "ham"].sample(num_spam, random_state=123)

    # Combine ham "subset" with "spam"
    balanced_df = pd.concat([ham_subset, df[df["target"] == "spam"]])

    return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df["target"].value_counts())

target
ham     747
spam    747
Name: count, dtype: int64


In [6]:
balanced_df['target'] = df.target.map({'spam': 1, 'ham': 0})

In [7]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

# Load tokenizer

In [8]:
# Initialize tokenizer with model ID and authentication token
model_id = 'h2oai/h2o-danube-1.8b-chat'
hf_token = 'hf_' # Replace your token here on huggingface

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set padding token to end-of-sequence token and configure padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Split data

In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('validation.csv')

In [10]:
train['text'] = tokenizer.bos_token + train['text']
test['text'] = tokenizer.bos_token + test['text']
val['text'] = tokenizer.bos_token + val['text']

In [11]:
sample = tokenizer(train.text[0], add_special_tokens=False).input_ids
tokenizer.decode(sample)

'<s> Dude how do you like the buff wind.'

# Dataset and DataLoader

In [12]:
class CustomDataset(Dataset):
    def __init__(self, texts, targets):
        self.texts = texts
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        return text, target

    def __len__(self):
        return len(self.targets)

In [13]:
# Set seed for reproducibility
L.seed_everything(seed=252)

# Create train dataset and dataloader
train_dataset = CustomDataset(
    texts=train['text'].values.tolist(),
    targets=train['target'].values.tolist()
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    drop_last=True
)

# Create test dataset and dataloader
test_dataset = CustomDataset(
    texts=test['text'].values.tolist(),
    targets=test['target'].values.tolist()
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
    drop_last=False
)

# Create validation dataset and dataloader
val_dataset = CustomDataset(
    texts=val['text'].values.tolist(),
    targets=val['target'].values.tolist()
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
    drop_last=False
)

INFO: Seed set to 252
INFO:lightning.fabric.utilities.seed:Seed set to 252


# Tokenization function

In [14]:
def tokenize_text(text):
    """
    Tokenize the text and return PyTorch tensors with dynamic padding
    """
    encodings = tokenizer(
        text,
        return_tensors='pt',
        padding='longest',  # Dynamically pad each batch to the length of the longest sequence
        add_special_tokens=False
    )

    return encodings


# Architecture

In [15]:
def disable_dropout(model: torch.nn.Module):
    """Disable dropout in a model."""
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        # Get LLM configuration
        config = AutoConfig.from_pretrained(model_id)

        # LoRA config
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            target_modules='all-linear',
            lora_dropout=0.
        )

        # Load pre-trained language model with specific configurations
        self.backbone = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="cuda",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        # Replace language model head with an identity function
        self.backbone.lm_head = nn.Identity()

        # Apply LoRA
        self.backbone = get_peft_model(self.backbone, peft_config)
        self.backbone.print_trainable_parameters()

        # Define classification head
        self.cls_head = nn.Sequential(
            nn.Linear(config.hidden_size, 768),
            nn.ReLU(),
            nn.LayerNorm(768),
            nn.Linear(768, 2)
        )

    def forward(self, input_ids, attention_mask):
        x = self.backbone(input_ids, attention_mask).logits  # get last hidden state
        logits = self.cls_head(x)[:, -1, :]  # Apply classification head to the last token's output
        return logits


# Optimizer and Scheduler

In [16]:
def get_optimizer(model, learning_rate=0.0001, diff_lr=0.00001, weight_decay=0.01):
    """
    Get optimizer with different learning rates for specified layers.

    Args:
        model (torch.nn.Module): The neural network model.
        learning_rate (float): Learning rate for non-differential layers.
        diff_lr (float): Learning rate for differential layers.
        weight_decay (float): Weight decay (decoupled from L2 penalty) for optimizer.

    Returns:
        torch.optim.AdamW: Optimizer for the model.
    """

    # Define parameters with different learning rates and weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    differential_layers = ['backbone']

    optimizer = torch.optim.AdamW(
            [
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (not any(layer in name for layer in differential_layers))
                        and (not any(nd in name for nd in no_decay))
                    ],
                    "lr": learning_rate,
                    "weight_decay": weight_decay,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (not any(layer in name for layer in differential_layers))
                        and (any(nd in name for nd in no_decay))
                    ],
                    "lr": learning_rate,
                    "weight_decay": 0,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (any(layer in name for layer in differential_layers))
                        and (not any(nd in name for nd in no_decay))
                    ],
                    "lr": diff_lr,
                    "weight_decay": weight_decay,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (any(layer in name for layer in differential_layers))
                        and (any(nd in name for nd in no_decay))
                    ],
                    "lr": diff_lr,
                    "weight_decay": 0,
                },
            ],
            lr=learning_rate,
            weight_decay=weight_decay,
    )

    return optimizer

# Hyperameters

In [17]:
num_epochs = 2
learning_rate = 0.0002
diff_lr = 0.00001
warmup_steps = 0
seed = 252
weight_decay = 0.01
acc_steps = 2

# Fine-tuning

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [19]:
# Set seed for reproducibility
L.seed_everything(seed=seed)

# Instantiate the neural network model
model = Net()
model.to(device)  # Move model to the device

# Display the names of trainable parameters
print('Here are the trainable parameters:')
for n, p in model.named_parameters():
    if p.requires_grad:
        print(n)

# Get the optimizer
optimizer = get_optimizer(
    model,
    learning_rate=learning_rate,
    diff_lr=diff_lr,
    weight_decay=weight_decay
)

# Set up the scheduler for learning rate adjustment
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_epochs*len(train_dataloader)
)

INFO: Seed set to 252
INFO:lightning.fabric.utilities.seed:Seed set to 252


trainable params: 8,650,752 || all params: 1,757,932,032 || trainable%: 0.49209820644533314
Here are the trainable parameters:
backbone.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight
back

In [20]:
scaler = GradScaler()

start_time = time.time()
for epoch in range(num_epochs):

    for batch_idx, batch in enumerate(train_dataloader):

        model.train()

        prompt, targets = batch

        encodings = tokenize_text(prompt)

        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        targets = targets.to(device)

        # Perform forward pass with autocast for mixed precision training
        with autocast():
            logits = model(input_ids, attention_mask)
            loss = F.cross_entropy(logits, targets) / acc_steps

        if not batch_idx % acc_steps:
        # Backward pass, optimization step, and learning rate adjustment
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        # Logging training progress
        print(
            f'Epoch: {epoch+1} / {num_epochs}'
            f'| Batch: {batch_idx+1}/{len(train_dataloader)}'
            f'| Loss: {loss.item():.4f}'
        )

end_time = time.time()
training_time = (end_time - start_time) / 60
print(f'Total training time: {training_time:.2f} min')

Epoch: 1 / 2| Batch: 1/522| Loss: 0.3683
Epoch: 1 / 2| Batch: 2/522| Loss: 0.3977
Epoch: 1 / 2| Batch: 3/522| Loss: 0.3281
Epoch: 1 / 2| Batch: 4/522| Loss: 0.1803
Epoch: 1 / 2| Batch: 5/522| Loss: 0.4722
Epoch: 1 / 2| Batch: 6/522| Loss: 0.1276
Epoch: 1 / 2| Batch: 7/522| Loss: 0.0361
Epoch: 1 / 2| Batch: 8/522| Loss: 0.4598
Epoch: 1 / 2| Batch: 9/522| Loss: 0.2556
Epoch: 1 / 2| Batch: 10/522| Loss: 0.0484
Epoch: 1 / 2| Batch: 11/522| Loss: 0.0332
Epoch: 1 / 2| Batch: 12/522| Loss: 0.0087
Epoch: 1 / 2| Batch: 13/522| Loss: 0.0603
Epoch: 1 / 2| Batch: 14/522| Loss: 0.3647
Epoch: 1 / 2| Batch: 15/522| Loss: 0.0083
Epoch: 1 / 2| Batch: 16/522| Loss: 0.0299
Epoch: 1 / 2| Batch: 17/522| Loss: 0.0049
Epoch: 1 / 2| Batch: 18/522| Loss: 0.1150
Epoch: 1 / 2| Batch: 19/522| Loss: 0.0122
Epoch: 1 / 2| Batch: 20/522| Loss: 0.0019
Epoch: 1 / 2| Batch: 21/522| Loss: 0.1002
Epoch: 1 / 2| Batch: 22/522| Loss: 0.9186
Epoch: 1 / 2| Batch: 23/522| Loss: 0.1254
Epoch: 1 / 2| Batch: 24/522| Loss: 0.0265
E

# Evaluation

In [21]:
def calc_accuracy(dataloader, type):
    with torch.no_grad():
        model.eval()
        pred_scores = []
        actual_scores = []
        for batch in tqdm(dataloader, total=len(dataloader), desc=f'Calc {type} accuracy'):
            prompt, targets = batch
            encodings = tokenize_text(prompt)

            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            with autocast():
                logits = model(input_ids, attention_mask)
                pred_score = F.softmax(logits, dim=-1).argmax(dim=-1).cpu().detach().numpy().tolist()
                pred_scores.extend(pred_score)
                actual_scores.extend(targets.numpy().tolist())

        pred_scores = np.array(pred_scores)
        accuracy = accuracy_score(actual_scores, pred_scores)

        return accuracy

In [22]:
train_acc = calc_accuracy(train_dataloader, type='train')
test_acc = calc_accuracy(test_dataloader, type='test')
val_acc = calc_accuracy(val_dataloader, type='val')

print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)
print('Val accuracy:', val_acc)

Calc train accuracy:   0%|          | 0/522 [00:00<?, ?it/s]

Calc test accuracy:   0%|          | 0/19 [00:00<?, ?it/s]

Calc val accuracy:   0%|          | 0/10 [00:00<?, ?it/s]

Train accuracy: 0.9971264367816092
Test accuracy: 0.9733333333333334
Val accuracy: 0.9932885906040269


| Model        | Weights   | Trainable token | Trainable layers        | Context length                            | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
|--------------|-----------|-----------------|-------------------------|-------------------------------------------|---------|---------------|--------------|----------------|-----------|
| h20-danube (1.8 B)| instruct   | last            | LoRA                    | dynamic padding (batch-wise)        | T4 (Colab free)    | 3.50 min      | 99.71%       | 99.32%         | 97.33%    |