
## Step 1: Loading and Evaluating a Foundation Model


In [14]:
# Step 1.1: Load the sms_spam dataset

from datasets import load_dataset

# NOTE: to use a bigger dataset, raise DATA_SET_RATE. The test split takes half as
# many rows again, and the train and test fractions together must not exceed 1.0,
# so the largest usable value is 2/3 (not 1.0).
DATA_SET_RATE = 0.02
TEST_SET_RATE = DATA_SET_RATE / 2.0

# Fail early with a clear message instead of an error from inside the split call.
if not (DATA_SET_RATE > 0.0 and DATA_SET_RATE + TEST_SET_RATE <= 1.0):
    raise ValueError(
        f"DATA_SET_RATE={DATA_SET_RATE} is out of range: it must be > 0 and "
        "DATA_SET_RATE * 1.5 must be <= 1.0"
    )

# Split the dataset; the fixed seed keeps the sampled rows reproducible
dataset = load_dataset("sms_spam", split="train").train_test_split(
    train_size=DATA_SET_RATE, test_size=TEST_SET_RATE, shuffle=True, seed=23
)

splits = ["train", "test"]

# View the dataset characteristics
print(dataset["train"])
print(dataset["test"])

Dataset({
    features: ['sms', 'label'],
    num_rows: 111
})
Dataset({
    features: ['sms', 'label'],
    num_rows: 56
})


In [15]:
# Inspect the first example from both test and train
# Show one raw record per split: train first, then test
for split_name in ("train", "test"):
    print(dataset[split_name][0])

{'sms': "I'm so in love with you. I'm excited each day i spend with you. You make me so happy.\n", 'label': 0}
{'sms': 'Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke \n', 'label': 0}


In [16]:
# Step 1.2: Tokenize datasets

from transformers import AutoTokenizer

tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token. Reuse the end-of-sequence token instead of
# registering a new '[PAD]' token: a new token would grow the tokenizer vocabulary
# past the model's embedding table unless the embeddings were resized to match.
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token


def tokenize_batch(batch):
    """Tokenize a batch of SMS texts, padded and truncated to the maximum length."""
    # NOTE(review): padding to max_length makes every row full-length; dropping it
    # and letting the data collator pad per batch would likely be faster - confirm
    # nothing else relies on fixed-length rows before changing.
    return tokenizer_gpt(batch["sms"], padding="max_length", truncation=True)


# Tokenize every split
tokenized_dataset_gpt = {}
for split in splits:
    tokenized_dataset_gpt[split] = dataset[split].map(tokenize_batch, batched=True)

# Inspect the available columns in the dataset
print(tokenized_dataset_gpt["train"])
print(tokenized_dataset_gpt["test"])


Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 111
})
Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 56
})


In [17]:
# Step 1.3: Load pre trained GPT2 Model

from transformers import AutoModelForSequenceClassification
import torch 

def get_device_name():
    """Return the torch device name to run on.

    Prefers "cuda" when CUDA is available, then "mps" when the MPS backend is
    available, and falls back to "cpu" otherwise.
    """
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"
    
# GPT-2 backbone with a two-class (not spam / spam) classification head
model_gpt = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

# Make every parameter of the model trainable
model_gpt.requires_grad_(True)

# Use the end-of-sequence token id as the padding token id
model_gpt.config.pad_token_id = model_gpt.config.eos_token_id

# Place the model on the selected device
model_gpt.to(get_device_name())

print(model_gpt)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [18]:
# Step 1.4: Evaluate and check accuracy from the pre-trained GPT2 model

import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

# Hyperparameters shared by every Trainer built in this notebook
train_args = TrainingArguments(
    output_dir="./data/spam_not_spam/peft",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, and load the best model at the end.
    # NOTE(review): newer transformers releases may expect `eval_strategy` instead
    # of `evaluation_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    """Compute accuracy from a `(predictions, labels)` pair.

    The predicted class for each row is the argmax over axis 1 of the
    predictions; accuracy is the fraction of rows whose predicted class
    equals the label.

    Returns:
        A dict with the single key "accuracy".
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    is_correct = predicted_classes == labels
    return {"accuracy": is_correct.mean()}

def make_trainer_from_model(model):
    """Build a HuggingFace `Trainer` for `model` with the notebook's shared setup.

    Reads the module-level `train_args`, `tokenized_dataset_gpt`,
    `tokenizer_gpt` and `compute_metrics`; only the model varies between calls.
    """
    trainer_kwargs = dict(
        model=model,
        args=train_args,
        train_dataset=tokenized_dataset_gpt["train"],
        eval_dataset=tokenized_dataset_gpt["test"],
        tokenizer=tokenizer_gpt,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer_gpt),
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)
    
# Wrap the pre-trained GPT2 model in a HuggingFace Trainer
trainer = make_trainer_from_model(model_gpt)

# Baseline: accuracy on the test split before any fine-tuning
print("Evaluate before finetuning:", "-------------------------------------", sep="\n")
trainer.evaluate()


Evaluate before finetuning:
-------------------------------------


{'eval_loss': 0.4706331789493561,
 'eval_accuracy': 0.9107142857142857,
 'eval_runtime': 4.9497,
 'eval_samples_per_second': 11.314,
 'eval_steps_per_second': 0.808}


## Step 2: Performing Parameter-Efficient Fine-Tuning


In [19]:
# Step 2.1: Convert pre-trained GPT2 model to PEFT model 

from peft import LoraConfig
from peft import get_peft_model
from peft import TaskType
from peft import AutoPeftModelForSequenceClassification

# GPT-2 layers to wrap with LoRA adapters: the attention projections (c_attn,
# c_proj) and the MLP projections (c_fc, c_proj). "c_proj" names a layer in both
# sub-blocks, so it only needs to be listed once.
module_list = ["c_attn", "c_proj", "c_fc"]
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=module_list,
    # The targeted GPT-2 layers are Conv1D, which stores weights as
    # (fan_in, fan_out); state that explicitly rather than using the default.
    fan_in_fan_out=True,
)

# Wrap the base model with the LoRA adapters and report how much of it is trainable
lora_model = get_peft_model(model_gpt, config)
lora_model.print_trainable_parameters()


trainable params: 1,181,184 || all params: 125,622,528 || trainable%: 0.9402644723086611




In [20]:

# Step 2.2: Perform PEFT on pre-trained GPT2 model

# Same trainer setup as the baseline, but wrapping the LoRA model
lora_trainer = make_trainer_from_model(lora_model)

print("Training starts ...")
lora_trainer.train()  # runs the training loop configured by train_args
print("Training ends ...")


Training starts ...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.456884,0.910714


Training ends ...


In [21]:
# Step 2.3: Save the fine-tuned model

# Name under which the fine-tuned PEFT model is saved and later reloaded
TUNED_MODEL_NAME = "my_peft_model"

lora_model.save_pretrained(TUNED_MODEL_NAME)



## Step 3: Performing Inference with a PEFT Model


In [27]:
# Step 3.1: Load the fine-tuned model

# Reload the PEFT model from where it was saved under TUNED_MODEL_NAME
tuned_model = AutoPeftModelForSequenceClassification.from_pretrained(
    TUNED_MODEL_NAME
)
print(tuned_model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict(

In [29]:

# Step 3.2: Evaluate and check the fine-tuned model's accuracy

# Correct padding token id. Take it from the tokenizer that pads the batches
# rather than from another model object's config, so this cell only depends on
# the reloaded model and the shared tokenizer.
tuned_model.config.pad_token_id = tokenizer_gpt.pad_token_id

tuned_trainer = make_trainer_from_model(tuned_model)

# Get the fine-tuned evaluation results
results = tuned_trainer.evaluate()
print(results)


{'eval_loss': 0.4568840563297272, 'eval_accuracy': 0.9107142857142857, 'eval_runtime': 12.4306, 'eval_samples_per_second': 4.505, 'eval_steps_per_second': 0.322}
