# Pip

In [1]:
!pip install sentencepiece -q
!pip install transformers -q
!pip install accelerate -q
!pip install bitsandbytes -q
!pip install lightning -q
!pip install flash-attn --no-build-isolation -q

# Imports

In [2]:
import numpy as np
import pandas as pd

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)

import os
import time
import zipfile
import urllib.request
from pathlib import Path
from tqdm.auto import tqdm

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import lightning as L

# Enable tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()
# Set the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Get data

In [3]:
# Location of the zipped SMS spam dataset and the local paths used to store it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"  # local filename for the downloaded archive
extracted_path = "sms_spam_collection"  # directory the archive is extracted into
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"  # extracted data file after it is renamed with a .tsv suffix

def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    """Fetch the zipped dataset, unpack it, and give the data file a .tsv suffix.

    If ``data_file_path`` is already present, only a notice is printed and
    nothing is downloaded or extracted.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local filename the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final, renamed data file.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Read the remote archive in one go and store it on disk.
    with urllib.request.urlopen(url) as remote, open(zip_path, "wb") as archive_out:
        archive_out.write(remote.read())

    # Unpack everything into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The archive ships the data file without an extension; rename it to .tsv.
    unpacked_file = Path(extracted_path, "SMSSpamCollection")
    unpacked_file.rename(data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Download and unpack the dataset; a no-op when the .tsv file is already present.
download_and_unzip(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [4]:
# The file is tab-separated with no header row; name the two columns explicitly.
# NOTE(review): read_csv's default quote handling can merge lines that contain
# unbalanced double quotes — confirm the row count against the dataset's documentation.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["target", "text"])
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
def create_balanced_dataset(df):
    """Undersample the "ham" class so both classes have the same number of rows.

    Args:
        df: DataFrame with a "target" column holding "ham" / "spam" labels.

    Returns:
        DataFrame made of the sampled "ham" rows followed by every "spam" row.
        Original index labels are kept.
    """
    spam_rows = df[df["target"] == "spam"]
    ham_rows = df[df["target"] == "ham"]

    # Draw exactly as many "ham" rows as there are "spam" rows (fixed seed).
    ham_sample = ham_rows.sample(len(spam_rows), random_state=123)

    return pd.concat([ham_sample, spam_rows])

# Build the class-balanced frame and print the per-class row counts as a check.
balanced_df = create_balanced_dataset(df)
print(balanced_df["target"].value_counts())

target
ham     747
spam    747
Name: count, dtype: int64


In [7]:
# Encode labels as integers (spam -> 1, ham -> 0). The mapped Series is computed from the
# full `df`; assignment aligns on the index, so each row of `balanced_df` (which keeps
# df's index labels) receives the value for its own index label.
balanced_df['target'] = df.target.map({'spam': 1, 'ham': 0})

In [8]:
def random_split(df, train_frac, validation_frac):
    """Shuffle a DataFrame and cut it into train / validation / test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``; the test part is whatever
        remains after the first two parts are taken.
    """
    # Fixed-seed shuffle with a fresh 0..n-1 index.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    n_rows = len(shuffled)
    train_stop = int(n_rows * train_frac)
    validation_stop = train_stop + int(n_rows * validation_frac)

    return (
        shuffled[:train_stop],
        shuffled[train_stop:validation_stop],
        shuffled[validation_stop:],
    )

# 70% train, 10% validation; the remaining 20% becomes the test split
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Persist each split as CSV without writing the index column
for csv_name, split_frame in (
    ("train.csv", train_df),
    ("validation.csv", validation_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(csv_name, index=False)

# Load tokenizer

In [9]:
# Model checkpoint on the Hugging Face Hub
model_id = 'microsoft/Phi-3-mini-128k-instruct'

# Read the access token from the environment instead of hard-coding a secret in the
# notebook (export HF_TOKEN=hf_... before starting the kernel). When the variable is
# unset this is None and the library falls back to its default credential handling.
hf_token = os.environ.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

# Use the end-of-sequence token for padding and pad on the left, so that position -1
# of every sequence in a padded batch is a real token (the classifier reads that position).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Split data

In [10]:
# Read the three splits back from the CSV files written earlier in the notebook.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('validation.csv')

In [11]:
# Following the Phi-3 model card on Hugging Face:
# "Some applications/frameworks might not include a BOS token (<s>) at the start of the conversation.
# Please ensure that it is included since it provides more reliable results."
#
# The tokenizer is called with add_special_tokens=False elsewhere in this notebook, so the
# BOS token is prepended to the raw text here. Rows that already start with BOS are left
# untouched, which keeps this cell idempotent (re-running it does not stack a second <s>).
for _split in (train, test, val):
    _has_bos = _split['text'].str.startswith(tokenizer.bos_token, na=False)
    _split['text'] = _split['text'].where(_has_bos, tokenizer.bos_token + _split['text'])

In [12]:
# Sanity check: encode the first training text without tokenizer-added special tokens,
# then decode the ids back to a string.
sample = tokenizer(train.text[0], add_special_tokens=False).input_ids
tokenizer.decode(sample)

'<s> Dude how do you like the buff wind.'

# Dataset and DataLoader

In [13]:
class CustomDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels."""

    def __init__(self, texts, targets):
        """Store the texts as given and the targets as one int64 tensor."""
        self.targets = torch.tensor(targets, dtype=torch.long)
        self.texts = texts

    def __len__(self):
        """Number of examples, taken from the target tensor."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(text, target)`` pair stored at position ``idx``."""
        return self.texts[idx], self.targets[idx]

In [14]:
# Seed all RNGs before any loader is built
L.seed_everything(seed=252)


def _make_loader(frame, batch_size, shuffle, drop_last):
    """Wrap one split's 'text' / 'target' columns in a CustomDataset plus a DataLoader."""
    dataset = CustomDataset(
        texts=frame['text'].values.tolist(),
        targets=frame['target'].values.tolist(),
    )
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )
    return dataset, loader


# Training split: shuffled batches of 8, incomplete final batch dropped
train_dataset, train_dataloader = _make_loader(train, batch_size=8, shuffle=True, drop_last=True)

# Test split: fixed order, batches of 16, every example kept
test_dataset, test_dataloader = _make_loader(test, batch_size=16, shuffle=False, drop_last=False)

# Validation split: same loader settings as the test split
val_dataset, val_dataloader = _make_loader(val, batch_size=16, shuffle=False, drop_last=False)

Seed set to 252


# Tokenization function

In [15]:
def tokenize_text(text):
    """
    Encode text with the notebook-level tokenizer and return PyTorch tensors.

    Padding is dynamic: each call pads only up to the longest sequence it was
    given. The tokenizer is told not to add special tokens itself.
    """
    return tokenizer(
        text,
        add_special_tokens=False,
        padding='longest',
        return_tensors='pt',
    )


# Architecture

In [16]:
def disable_dropout(model: torch.nn.Module):
    """Set the drop probability of every ``torch.nn.Dropout`` module in ``model`` to zero."""
    dropout_layers = (m for m in model.modules() if isinstance(m, torch.nn.Dropout))
    for layer in dropout_layers:
        layer.p = 0

# Sequence classifier: frozen causal-LM backbone + small trainable classification head
class Net(nn.Module):
    """Binary text classifier built on a frozen pre-trained causal language model.

    The backbone's ``lm_head`` is replaced with ``nn.Identity``, so the tensor the
    backbone exposes as ``.logits`` is its final hidden state instead of vocabulary
    scores. All backbone parameters are frozen; only ``cls_head`` is trainable.

    Relies on the notebook-level ``model_id`` and ``hf_token`` and loads the backbone
    onto CUDA (``device_map="cuda"``), so a GPU is required.
    """

    def __init__(self):
        super().__init__()

        # Load the pre-trained language model in half precision
        self.llm = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="cuda",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            token=hf_token,
            # attn_implementation='flash_attention_2'
        )

        # Replace the language-model head with an identity function
        self.llm.lm_head = nn.Identity()

        # Freeze every parameter of the backbone
        for param in self.llm.parameters():
            param.requires_grad = False

        # Take the hidden size from the config of the model that was actually loaded,
        # rather than fetching a second config object separately.
        hidden_size = self.llm.config.hidden_size

        # Classification head mapping a hidden state to 2 class logits
        self.cls_head = nn.Sequential(
            nn.Linear(hidden_size, 768),
            nn.ReLU(),
            nn.LayerNorm(768),
            nn.Linear(768, 2)
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch, 2) computed from the last sequence position.

        Position -1 is used as the sequence summary, which assumes the batch is
        left-padded so that the final position holds a real token.
        """
        # With lm_head == Identity, `.logits` carries the final hidden states.
        hidden_states = self.llm(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits

        # Select the last position BEFORE the head so the head processes one vector
        # per sequence instead of every token.
        last_hidden = hidden_states[:, -1, :]

        # Match the head's parameter dtype so the forward pass also works outside autocast.
        last_hidden = last_hidden.to(self.cls_head[0].weight.dtype)

        return self.cls_head(last_hidden)


# Optimizer and Scheduler

In [17]:
def get_optimizer(model, learning_rate=0.0001, diff_lr=0.00001, weight_decay=0.01):
    """
    Get an AdamW optimizer with per-group learning rates and weight decay.

    Parameters are split into four groups, in this order:
      0. non-differential layers, weight decay applied   (lr = learning_rate)
      1. non-differential layers, no weight decay        (lr = learning_rate)
      2. differential layers, weight decay applied       (lr = diff_lr)
      3. differential layers, no weight decay            (lr = diff_lr)

    A parameter belongs to the "differential" layers when its name contains 'llm'.
    A parameter is exempt from weight decay when it is at most 1-dimensional
    (biases and normalization weights) or when its name matches one of the
    ``no_decay`` patterns. The dimensionality check is needed because normalization
    layers inside ``nn.Sequential`` get index-based names (e.g. ``cls_head.2.weight``)
    that a name pattern such as 'LayerNorm.weight' does not match.

    Args:
        model (torch.nn.Module): The neural network model.
        learning_rate (float): Learning rate for non-differential layers.
        diff_lr (float): Learning rate for differential layers.
        weight_decay (float): Weight decay (decoupled from L2 penalty) for optimizer.

    Returns:
        torch.optim.AdamW: Optimizer for the model.
    """
    no_decay = ['bias', 'LayerNorm.weight']
    differential_layers = ['llm']

    # Key: (is_differential, skips_decay) -> parameters in that bucket.
    buckets = {
        (False, False): [],
        (False, True): [],
        (True, False): [],
        (True, True): [],
    }

    # Single pass over the parameters instead of one full scan per group.
    for name, param in model.named_parameters():
        is_differential = any(layer in name for layer in differential_layers)
        skips_decay = param.ndim <= 1 or any(nd in name for nd in no_decay)
        buckets[(is_differential, skips_decay)].append(param)

    optimizer = torch.optim.AdamW(
        [
            {
                "params": buckets[(False, False)],
                "lr": learning_rate,
                "weight_decay": weight_decay,
            },
            {
                "params": buckets[(False, True)],
                "lr": learning_rate,
                "weight_decay": 0,
            },
            {
                "params": buckets[(True, False)],
                "lr": diff_lr,
                "weight_decay": weight_decay,
            },
            {
                "params": buckets[(True, True)],
                "lr": diff_lr,
                "weight_decay": 0,
            },
        ],
        lr=learning_rate,
        weight_decay=weight_decay,
    )

    return optimizer

# Hyperparameters

In [18]:
# Training configuration consumed by the fine-tuning cells.
num_epochs = 1  # number of passes over the training dataloader
learning_rate = 0.0002  # learning rate passed to get_optimizer for the non-'llm' parameters
diff_lr = 0.00001 # learning rate for the 'llm' parameters; has no effect while the backbone is frozen
warmup_steps = 0  # number of scheduler warmup steps
seed = 252  # seed passed to L.seed_everything before the model is built
weight_decay = 0.01  # weight decay passed to get_optimizer

# Fine-tuning

In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [20]:
# Set seed for reproducibility (done before the model is built, so the
# classification head is initialized after seeding)
L.seed_everything(seed=seed)

# Instantiate the neural network model
model = Net()
model.to(device)  # Move model to the device

# Display the names of trainable parameters (those with requires_grad=True)
print('Here are the trainable parameters:')
for n, p in model.named_parameters():
    if p.requires_grad:
        print(n)

# Get the optimizer
optimizer = get_optimizer(
    model,
    learning_rate=learning_rate,
    diff_lr=diff_lr,
    weight_decay=weight_decay
)

# Set up the scheduler for learning rate adjustment.
# Total step count = epochs * batches per epoch, matching one scheduler step per batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_epochs*len(train_dataloader)
)

Seed set to 252


config.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

Here are the trainable parameters:
cls_head.0.weight
cls_head.0.bias
cls_head.2.weight
cls_head.2.bias
cls_head.3.weight
cls_head.3.bias


In [21]:
# Gradient scaler used together with autocast for mixed-precision training
scaler = GradScaler()

start_time = time.time()
for epoch in range(num_epochs):

    for batch_idx, batch in enumerate(train_dataloader):

        # Put the model in training mode (re-applied on every batch)
        model.train()

        # Each batch is a (texts, targets) pair produced by the dataloader
        prompt, targets = batch

        # Tokenize the raw texts of this batch, padded to the batch's longest sequence
        encodings = tokenize_text(prompt)

        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        targets = targets.to(device)

        # Perform forward pass with autocast for mixed precision training
        with autocast():
            logits = model(input_ids, attention_mask)
            loss = F.cross_entropy(logits, targets)

        # Backward pass, optimization step, and learning rate adjustment:
        # clear old gradients, backpropagate the scaled loss, step the optimizer
        # through the scaler, update the scale factor, then advance the LR
        # schedule once per batch.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Logging training progress
        print(
            f'Epoch: {epoch+1} / {num_epochs}'
            f'| Batch: {batch_idx+1}/{len(train_dataloader)}'
            f'| Loss: {loss.item():.4f}'
        )

# Report wall-clock training time in minutes
end_time = time.time()
training_time = (end_time - start_time) / 60
print(f'Total training time: {training_time:.2f} min')

You are not running the flash-attention implementation, expect numerical differences.


Epoch: 1 / 1| Batch: 1/130| Loss: 0.9639
Epoch: 1 / 1| Batch: 2/130| Loss: 0.5885
Epoch: 1 / 1| Batch: 3/130| Loss: 0.1971
Epoch: 1 / 1| Batch: 4/130| Loss: 0.2017
Epoch: 1 / 1| Batch: 5/130| Loss: 0.0810
Epoch: 1 / 1| Batch: 6/130| Loss: 0.7381
Epoch: 1 / 1| Batch: 7/130| Loss: 0.0914
Epoch: 1 / 1| Batch: 8/130| Loss: 0.0290
Epoch: 1 / 1| Batch: 9/130| Loss: 0.0236
Epoch: 1 / 1| Batch: 10/130| Loss: 0.4679
Epoch: 1 / 1| Batch: 11/130| Loss: 0.1236
Epoch: 1 / 1| Batch: 12/130| Loss: 0.0742
Epoch: 1 / 1| Batch: 13/130| Loss: 0.0056
Epoch: 1 / 1| Batch: 14/130| Loss: 0.0109
Epoch: 1 / 1| Batch: 15/130| Loss: 0.0084
Epoch: 1 / 1| Batch: 16/130| Loss: 0.7428
Epoch: 1 / 1| Batch: 17/130| Loss: 0.0224
Epoch: 1 / 1| Batch: 18/130| Loss: 0.0745
Epoch: 1 / 1| Batch: 19/130| Loss: 0.2782
Epoch: 1 / 1| Batch: 20/130| Loss: 0.0510
Epoch: 1 / 1| Batch: 21/130| Loss: 0.7222
Epoch: 1 / 1| Batch: 22/130| Loss: 0.0885
Epoch: 1 / 1| Batch: 23/130| Loss: 0.0070
Epoch: 1 / 1| Batch: 24/130| Loss: 0.0647
E

# Evaluation

In [22]:
def calc_accuracy(dataloader, type):
    """Compute the classification accuracy of the notebook-level ``model`` on a dataloader.

    Args:
        dataloader: Iterable yielding ``(texts, targets)`` batches.
        type (str): Name of the split; used only in the progress-bar description.

    Returns:
        float: Fraction of examples whose predicted class equals the target.

    Note:
        Switches ``model`` to evaluation mode and leaves it in that mode.
    """
    model.eval()
    predicted_labels = []
    actual_labels = []

    with torch.no_grad():
        for prompt, targets in tqdm(dataloader, total=len(dataloader), desc=f'Calc {type} accuracy'):
            encodings = tokenize_text(prompt)

            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            # Only the forward pass needs mixed precision
            with autocast():
                logits = model(input_ids, attention_mask)

            # Softmax is monotonic, so the argmax of the raw logits already is the
            # predicted class; no probability computation is needed.
            predicted_labels.extend(logits.argmax(dim=-1).cpu().tolist())
            actual_labels.extend(targets.cpu().tolist())

    return accuracy_score(actual_labels, predicted_labels)

In [23]:
# Accuracy on each split. The training loader shuffles and drops the last
# incomplete batch, so the training figure covers only the full batches.
train_acc = calc_accuracy(train_dataloader, type='train')
test_acc = calc_accuracy(test_dataloader, type='test')
val_acc = calc_accuracy(val_dataloader, type='val')

print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)
print('Val accuracy:', val_acc)

Calc train accuracy:   0%|          | 0/130 [00:00<?, ?it/s]

Calc test accuracy:   0%|          | 0/19 [00:00<?, ?it/s]

Calc val accuracy:   0%|          | 0/10 [00:00<?, ?it/s]

Train accuracy: 0.9951923076923077
Test accuracy: 0.9666666666666667
Val accuracy: 0.9932885906040269


| Model        | Weights   | Trainable token | Trainable layers        | Context length                            | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
|--------------|-----------|-----------------|-------------------------|-------------------------------------------|---------|---------------|--------------|----------------|-----------|
| Phi-3 (3.8 B)| instruct  | last            | head                    | dynamic padding (batch-wise)        | T4 (Colab free)    | 0.63 min      | 99.52%       | 99.33%         | 96.67%    |