In [1]:
# Load the sms_spam dataset
# See: https://huggingface.co/datasets/sms_spam

from datasets import load_dataset

# The sms_spam dataset only has a train split, so we use the train_test_split method to split it into train and test
dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)

splits = ["train", "test"]

# View the dataset characteristics
print(dataset["train"])
print(dataset["test"])



Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})
Dataset({
    features: ['sms', 'label'],
    num_rows: 1115
})


In [2]:
# Inspect the first example. Do you think this is spam or not?
print(dataset["train"][0])
print(dataset["test"][0])

{'sms': 'Had your mobile 10 mths? Update to the latest Camera/Video phones for FREE. KEEP UR SAME NUMBER, Get extra free mins/texts. Text YES for a call\n', 'label': 1}
{'sms': 'Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke \n', 'label': 0}


In [3]:
# Tokenize datasets
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")
#GPT2ForSequenceClassification.from_pretrained("gpt2") 
#
#tokenizer_gpt.add_special_tokens({'pad_token': '[PAD]'})
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token

# Let's use a lambda function to tokenize all the examples
tokenized_dataset_gpt = {}
for split in splits:
    tokenized_dataset_gpt[split] = dataset[split].map(
        lambda x: tokenizer_gpt(x["sms"], padding="max_length", truncation=True), batched=True,
    )

# Inspect the available columns in the dataset
print(tokenized_dataset_gpt["train"])
print(tokenized_dataset_gpt["test"])


Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})
Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1115
})


In [4]:
#Load GPT2 Model
from transformers import AutoModelForSequenceClassification

model_gpt = AutoModelForSequenceClassification.from_pretrained(
    "gpt2", 
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

model_gpt.config.pad_token_id = model_gpt.config.eos_token_id
print(model_gpt)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [5]:
# Evaluate accuracy before PE Fine Tuning

import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model= model_gpt,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam/peft",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer=tokenizer_gpt,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_gpt),
    compute_metrics=compute_metrics,
)

print("Evaluate before finetuning:")
print("-------------------------------------")
trainer.evaluate()


Evaluate before finetuning:
-------------------------------------


{'eval_loss': 7.3595499992370605,
 'eval_accuracy': 0.12914798206278028,
 'eval_runtime': 68.6647,
 'eval_samples_per_second': 16.238,
 'eval_steps_per_second': 1.019}

In [14]:

# Perform PEFT 

from peft import LoraConfig
from peft import get_peft_model
from peft import TaskType
from peft import AutoPeftModelForSequenceClassification

#PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.7

# ["lin1", "lin2", "q_lin", "k_lin", "v_lin"]
module_list = ["c_attn", "c_proj"] #, "c_fc", "c_proj"]
# config = LoraConfig(target_modules = module_list)
"""
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

config = LoraConfig(
    r=8, # Rank
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS #Specify the task as Sequence Classification.
)
"""

config = LoraConfig(task_type=TaskType.SEQ_CLS, 
                    inference_mode=False, 
                    r=8, 
                    lora_alpha=32,
                    lora_dropout=0.1,
                    target_modules = module_list)

lora_model = get_peft_model(model_gpt, config)
lora_model.print_trainable_parameters()

for param in lora_model.parameters():
    param.requires_grad = True

lora_trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam/lora",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        eval_accumulation_steps = 1
    ),
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer=tokenizer_gpt,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_gpt),
    compute_metrics=compute_metrics,
)

print("Training starts ...")
lora_trainer.train()
print("Training ends ...")


trainable params: 812,544 || all params: 125,253,888 || trainable%: 0.6487175871139426
Training starts ...


RuntimeError: MPS backend out of memory (MPS allocated: 35.10 GB, other allocations: 1.16 GB, max allowed: 36.27 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [7]:
# Show the performance of the model on the test set
# What do you think the evaluation accuracy will be?

print("Evaluate after finetuning:")
print("-------------------------------------")
lora_trainer.evaluate()


Evaluate after finetuning:
-------------------------------------


RuntimeError: MPS backend out of memory (MPS allocated: 35.83 GB, other allocations: 1.02 GB, max allowed: 36.27 GB). Tried to allocate 256 bytes on shared pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [10]:
# Make a dataframe with the predictions and the text and the labels
import pandas as pd

items_for_manual_review = tokenized_dataset_gpt["test"].select(
    [0, 1, 22, 31, 43, 292, 448, 487]
)

results = lora_trainer.predict(items_for_manual_review)
df = pd.DataFrame(
    {
        "sms": [item["sms"] for item in items_for_manual_review],
        "predictions": results.predictions.argmax(axis=1),
        "labels": results.label_ids,
    }
)
# Show all the cell
pd.set_option("display.max_colwidth", None)
df

RuntimeError: MPS backend out of memory (MPS allocated: 36.04 GB, other allocations: 201.55 MB, max allowed: 36.27 GB). Tried to allocate 72.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


'\ndef find_all_linear_names(model):\n    lora_module_names = set()\n    for name, module in model.named_modules():\n        if isinstance(module, bnb.nn.Linear4bit):\n            names = name.split(".")\n            lora_module_names.add(names[0] if len(names) == 1 else names[-1])\n\n    if "lm_head" in lora_module_names:  # needed for 16-bit\n        lora_module_names.remove("lm_head")\n    return list(lora_module_names)\n\nprint("Linear modules: ")\nprint(find_all_linear_names(model_gpt2))\n'

In [11]:
from peft import LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_model


"""
peft_model = AutoModelForSeq2SeqLM.from_pretrained(
    "bigscience/mt0-large", 
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)  
"""

peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, 
                         inference_mode=False, 
                         r=8, 
                         lora_alpha=32, 
                         lora_dropout=0.1)
                         #target_modules = ["c_fc", "c_proj"])    

lora_model = get_peft_model(model_gpt2, peft_config)
lora_model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 125,619,456 || trainable%: 0.939064725769868
