
## Step 1: Loading and Evaluating a Foundation Model


In [1]:
# Step 1.1: Load the sms_spam dataset

from datasets import load_dataset

DATA_SET_RATE = 0.01

# split the datasets
dataset = load_dataset("sms_spam", split="train").train_test_split(
    train_size=DATA_SET_RATE, test_size=DATA_SET_RATE, shuffle=True, seed=23
)

splits = ["train", "test"]

# View the dataset characteristics
print(dataset["train"])
print(dataset["test"])



Dataset({
    features: ['sms', 'label'],
    num_rows: 55
})
Dataset({
    features: ['sms', 'label'],
    num_rows: 56
})


In [2]:
# Inspect the first example from both test and train
print(dataset["train"][0])
print(dataset["test"][0])

{'sms': "I'm so in love with you. I'm excited each day i spend with you. You make me so happy.\n", 'label': 0}
{'sms': 'Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke \n', 'label': 0}


In [3]:
# Step 1.2: Tokenize datasets

from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")
tokenizer_gpt.add_special_tokens({'pad_token': '[PAD]'})
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token

# Let's use a lambda function to tokenize all the examples
tokenized_dataset_gpt = {}
for split in splits:
    tokenized_dataset_gpt[split] = dataset[split].map(
        lambda x: tokenizer_gpt(x["sms"], padding="max_length", truncation=True), batched=True,
    )

# Inspect the available columns in the dataset
print(tokenized_dataset_gpt["train"])
print(tokenized_dataset_gpt["test"])


Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 55
})
Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 56
})


In [8]:
# Step 1.3: Load pre trained GPT2 Model

from transformers import AutoModelForSequenceClassification
import torch 

def get_device_name(): 
    device_name = "cpu"
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    return device_name
    
model_gpt = AutoModelForSequenceClassification.from_pretrained(
    "gpt2", 
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

for param in model_gpt.parameters():
    param.requires_grad = True
    
model_gpt.config.pad_token_id = model_gpt.config.eos_token_id
model_gpt.to(get_device_name())
print(model_gpt)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [9]:
# Step 1.4: Evaluate and check accuracy from the pre-trained GPT2 model

import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

train_args = TrainingArguments(
        output_dir="./data/spam_not_spam/peft",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    )

# Create HuggingFace Trainer from the pre-trained GPT2 model 
trainer = Trainer(
    model= model_gpt,
    args= train_args,
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer=tokenizer_gpt,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_gpt),
    compute_metrics=compute_metrics,
)

# Inspect the accuracy before fine tuning
print("Evaluate before finetuning:")
print("-------------------------------------")
trainer.evaluate()


Evaluate before finetuning:
-------------------------------------


{'eval_loss': 0.32273033261299133,
 'eval_accuracy': 0.9107142857142857,
 'eval_runtime': 3.4962,
 'eval_samples_per_second': 16.018,
 'eval_steps_per_second': 1.144}


## Step 2: Performing Parameter-Efficient Fine-Tuning


In [10]:
# Step 2.1: Convert pre-trained GPT2 model to PEFT model 

from peft import LoraConfig
from peft import get_peft_model
from peft import TaskType
from peft import AutoPeftModelForSequenceClassification

module_list = ["c_attn", "c_proj", "c_fc", "c_proj"]
config = LoraConfig(task_type=TaskType.SEQ_CLS, 
                    inference_mode=False, 
                    r=8, 
                    lora_alpha=32,
                    lora_dropout=0.1,
                    target_modules = module_list)

lora_model = get_peft_model(model_gpt, config)
lora_model.print_trainable_parameters()


trainable params: 1,181,184 || all params: 125,622,528 || trainable%: 0.9402644723086611


In [11]:

# Step 2.2: Perform PEFT on pre-trained GPT2 model 

lora_trainer = Trainer(
    model=lora_model,
    args= train_args,
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer=tokenizer_gpt,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_gpt),
    compute_metrics=compute_metrics,
)

print("Training starts ...")
lora_trainer.train()
print("Training ends ...")


Training starts ...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.318013,0.910714


Checkpoint destination directory ./data/spam_not_spam/peft/checkpoint-4 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Training ends ...


In [13]:
# Step 2.3: Save the fine-tuned model 

TUNED_MODEL_NAME = "my_peft_model"
lora_model.save_pretrained(TUNED_MODEL_NAME)



## Step 3: Performing Inference with a PEFT Model


In [14]:
# Step 3.1: Load the fine-tuned model 

# load newly fine-tuned model
tuned_model = AutoModelForSequenceClassification.from_pretrained(TUNED_MODEL_NAME)
print(tuned_model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D()
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D()
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1

In [15]:

# Step 3.2: Evaluate and check the fine-tuned model's accuracy
    
tuned_trainer = Trainer(
    model = tuned_model,
    args = train_args,
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer = tokenizer_gpt,
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer_gpt),
    compute_metrics = compute_metrics,
)

# get fine-tuned evaulation results
results = tuned_trainer.evaluate()
print(results)


AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.