# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import AutoPeftModelForSequenceClassification
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
from datasets import load_dataset
import numpy as np
from peft import LoraConfig, get_peft_model
import torch
import pandas as pd

In [2]:
dataset = load_dataset("IbrahimAmin/egyptian-arabic-hate-speech")
for split in ['train', 'test']:
    dataset[split].shuffle(seed=42)
print(dataset)
id2label   = { i: name for i, name in enumerate(set(dataset['test']['label'])) }
label2id   = { name: i for i, name in enumerate(set(dataset['test']['label'])) }
print(id2label)
print(label2id)

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data: 100%|██████████| 420k/420k [00:00<00:00, 2.11MB/s]
Downloading data: 100%|██████████| 108k/108k [00:00<00:00, 1.43MB/s]


Generating train split:   0%|          | 0/6535 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1634 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6535
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1634
    })
})
{0: 'Religious Discrimination', 1: 'Offensive', 2: 'Racism', 3: 'Neutral', 4: 'Sexism'}
{'Religious Discrimination': 0, 'Offensive': 1, 'Racism': 2, 'Neutral': 3, 'Sexism': 4}


In [3]:
print("Labels information")
num_labels = len(set(dataset['train']['label']))
print(f"Number of classes - train: {num_labels}")
num_labels = len(set(dataset['test']['label']))
print(f"Number of classes - test : {num_labels}")
print(f"Labels - train: {set(dataset['train']['label'])}")
print(f"Labels - test : {set(dataset['test']['label'])}")

Labels information
Number of classes - train: 5
Number of classes - test : 5
Labels - train: {'Religious Discrimination', 'Offensive', 'Racism', 'Neutral', 'Sexism'}
Labels - test : {'Religious Discrimination', 'Offensive', 'Racism', 'Neutral', 'Sexism'}


In [4]:
base_model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
base_model.config.pad_token_id = tokenizer.eos_token_id


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
base_model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=5, bias=False)
)

In [6]:
head = base_model.score
for n, p in head.named_parameters():
    print(f"{n}: {p}")

weight: Parameter containing:
tensor([[ 0.0031,  0.0055,  0.0034,  ...,  0.0051, -0.0155, -0.0438],
        [ 0.0113,  0.0241, -0.0040,  ...,  0.0078, -0.0217, -0.0076],
        [-0.0054, -0.0023, -0.0147,  ...,  0.0127,  0.0315, -0.0270],
        [-0.0137,  0.0243, -0.0260,  ..., -0.0130,  0.0027,  0.0454],
        [ 0.0168,  0.0175, -0.0219,  ..., -0.0233, -0.0272,  0.0018]],
       requires_grad=True)


In [7]:
def add_label_id(example):
    example["label_id"] = label2id[example["label"]]
    return example

def tokenize_and_attach_id(example):
    tokenized = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        return_tensors='pt'
    )
    tokenized["labels"] = example["label_id"]
    return tokenized

dataset = dataset.map(add_label_id)
tokenized_dataset = {}
for split in ['train', 'test']:
    tokenized_dataset[split] = dataset[split].map(tokenize_and_attach_id,
                                                  batched=True,
                                                  remove_columns=dataset[split].column_names)

Map:   0%|          | 0/6535 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

Map:   0%|          | 0/6535 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

In [8]:
sample = dataset["train"][3445]
print(f"{dataset['train'][3445]['text']}\n")
tokenized_sample = tokenized_dataset["train"][3445]["input_ids"]
print(f"{tokenized_sample[0:70]}\n")
if hasattr(tokenized_sample, "tolist"):
    tokenized_sample = tokenized_sample.tolist()
tokens = tokenizer.convert_ids_to_tokens(tokenized_sample)
print(f"{tokens[0:70]}\n")
reconstructed = tokenizer.convert_tokens_to_string(tokens)[0:73]
print(reconstructed)

البنات مش نافعين لا في كوره ولا في السياسه ولا في شغل ولا في أي حاجه خالص

[23525, 39848, 23338, 34247, 103, 47048, 148, 112, 18923, 228, 12919, 149, 223, 44690, 22654, 23338, 220, 13862, 12919, 18923, 223, 22654, 18923, 225, 30335, 26897, 29519, 42092, 13862, 12919, 18923, 223, 22654, 28981, 45692, 22654, 34247, 111, 29519, 42092, 13862, 12919, 18923, 223, 22654, 17550, 112, 148, 118, 13862, 42092, 13862, 12919, 18923, 223, 22654, 17550, 96, 22654, 17550, 255, 34247, 105, 29519, 17550, 106, 23525, 148, 113, 50256]

['Ø§ÙĦ', 'Ø¨', 'ÙĨ', 'Ø§Ø', 'ª', 'ĠÙħ', 'Ø', '´', 'ĠÙ', 'Ĩ', 'Ø§', 'Ù', 'ģ', 'Ø¹', 'ÙĬ', 'ÙĨ', 'Ġ', 'ÙĦ', 'Ø§', 'ĠÙ', 'ģ', 'ÙĬ', 'ĠÙ', 'ĥ', 'ÙĪ', 'Ø±', 'Ùĩ', 'ĠÙĪ', 'ÙĦ', 'Ø§', 'ĠÙ', 'ģ', 'ÙĬ', 'ĠØ§ÙĦ', 'Ø³', 'ÙĬ', 'Ø§Ø', '³', 'Ùĩ', 'ĠÙĪ', 'ÙĦ', 'Ø§', 'ĠÙ', 'ģ', 'ÙĬ', 'ĠØ', '´', 'Ø', 'º', 'ÙĦ', 'ĠÙĪ', 'ÙĦ', 'Ø§', 'ĠÙ', 'ģ', 'ÙĬ', 'ĠØ', '£', 'ÙĬ', 'ĠØ', 'Ń', 'Ø§Ø', '¬', 'Ùĩ', 'ĠØ', '®', 'Ø§ÙĦ', 'Ø', 'µ', '<|endoftext|>']

البنات مش نافعين لا في كوره ولا في السياسه ولا في شغل

In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


In [10]:
base_model_trainer = Trainer(
    model= base_model,
    args= TrainingArguments(
    output_dir= "./base_model_evaluation",
    per_device_eval_batch_size= 16,
    do_train= False,
    do_eval= True,
),
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

base_model_metrics = base_model_trainer.evaluate()
print(base_model_metrics)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.2939870357513428, 'eval_accuracy': 0.1835985312117503, 'eval_runtime': 131.1589, 'eval_samples_per_second': 12.458, 'eval_steps_per_second': 0.785}


In [11]:
raw_test = dataset["test"]
pred_out = base_model_trainer.predict(tokenized_dataset["test"])
pred_ids = np.argmax(pred_out.predictions, axis=1)

df = pd.DataFrame({
    "text": raw_test["text"],
    "label": raw_test["label"],
    "predicted_label": [ id2label[i] for i in pred_ids ],
})

df.head(5)

Unnamed: 0,text,label,predicted_label
0,الخليجيين مبيستحموش واكتر ناس معفنا ونتنا ممكن...,Racism,Racism
1,طريق زحمه قوي مش عارف هيفضل زحمه كده لحد امتي,Neutral,Racism
2,ده حتي فيه زنوج بيترياو علي بعض من كتر ما هما ...,Racism,Racism
3,انا بزعل امي كتير ربنا يسامحني,Neutral,Racism
4,انا نفسي في موبايل جديد بس مش معايا فلوس,Neutral,Racism


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [12]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.02,
    task_type="SEQ_CLS",
)

lora_model = get_peft_model(base_model, lora_config)



In [13]:
lora_model.print_trainable_parameters()

trainable params: 597,504 || all params: 125,037,312 || trainable%: 0.4778605605341228


In [14]:
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.02, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddi

In [15]:
head = lora_model.model.score
for n, p in head.named_parameters():
    print(f"{n}: {p}")

original_module.weight: Parameter containing:
tensor([[ 0.0031,  0.0055,  0.0034,  ...,  0.0051, -0.0155, -0.0438],
        [ 0.0113,  0.0241, -0.0040,  ...,  0.0078, -0.0217, -0.0076],
        [-0.0054, -0.0023, -0.0147,  ...,  0.0127,  0.0315, -0.0270],
        [-0.0137,  0.0243, -0.0260,  ..., -0.0130,  0.0027,  0.0454],
        [ 0.0168,  0.0175, -0.0219,  ..., -0.0233, -0.0272,  0.0018]],
       device='cuda:0', requires_grad=True)
modules_to_save.default.weight: Parameter containing:
tensor([[ 0.0031,  0.0055,  0.0034,  ...,  0.0051, -0.0155, -0.0438],
        [ 0.0113,  0.0241, -0.0040,  ...,  0.0078, -0.0217, -0.0076],
        [-0.0054, -0.0023, -0.0147,  ...,  0.0127,  0.0315, -0.0270],
        [-0.0137,  0.0243, -0.0260,  ..., -0.0130,  0.0027,  0.0454],
        [ 0.0168,  0.0175, -0.0219,  ..., -0.0233, -0.0272,  0.0018]],
       device='cuda:0', requires_grad=True)


In [16]:
lora_trainer = Trainer(
    model= lora_model,
    args= TrainingArguments(
    output_dir= "lora-gpt2",
    per_device_train_batch_size= 2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps= 2,
    evaluation_strategy= "epoch",
    save_strategy= "epoch",
    num_train_epochs= 6,
    learning_rate=5e-4,
    warmup_steps=100,
    weight_decay=0.01,
    load_best_model_at_end= True,
),
    train_dataset= tokenized_dataset["train"],
    eval_dataset= tokenized_dataset["test"],
    tokenizer= tokenizer,
    data_collator= DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics= compute_metrics,
)

print(lora_trainer.args.optim)

OptimizerNames.ADAMW_TORCH


In [17]:
lora_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3103,1.050025,0.591799
2,0.9335,0.930773,0.671359
3,0.7753,0.737174,0.753366
4,0.6592,0.717998,0.771726
5,0.5961,0.739166,0.793758
6,0.5248,0.733569,0.805386


TrainOutput(global_step=9804, training_loss=0.8413012927524998, metrics={'train_runtime': 9208.0135, 'train_samples_per_second': 4.258, 'train_steps_per_second': 1.065, 'total_flos': 2.063446359146496e+16, 'train_loss': 0.8413012927524998, 'epoch': 6.0})

###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [18]:
# Saving the model
lora_model.save_pretrained("temp/lora-gpt2/best")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [24]:
finetuned_model = AutoPeftModelForSequenceClassification.from_pretrained(
    "temp/lora-gpt2/best",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    is_trainable=False
)
finetuned_model.config.pad_token_id = tokenizer.eos_token_id
finetuned_model.eval()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.02, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddi

In [25]:
finetuned_model_trainer = Trainer(
    model=finetuned_model,
    args=TrainingArguments(
        output_dir="./finetuned_model_evaluation",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
    ),
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

finetuned_model_metrics = finetuned_model_trainer.evaluate()
print(finetuned_model_metrics)

{'eval_loss': 0.7179978489875793, 'eval_accuracy': 0.7717258261933905, 'eval_runtime': 139.6509, 'eval_samples_per_second': 11.701, 'eval_steps_per_second': 0.738}


In [28]:
finetuned_pred_out = finetuned_model_trainer.predict(tokenized_dataset["test"])
finetuned_pred_ids = np.argmax(finetuned_pred_out.predictions, axis=1)

df_finetuned = pd.DataFrame({
    "text": raw_test["text"],
    "label": raw_test["label"],
    "predicted_label": [ id2label[i] for i in finetuned_pred_ids ],
})

df_finetuned.head(5)

Unnamed: 0,text,label,predicted_label
0,الخليجيين مبيستحموش واكتر ناس معفنا ونتنا ممكن...,Racism,Racism
1,طريق زحمه قوي مش عارف هيفضل زحمه كده لحد امتي,Neutral,Neutral
2,ده حتي فيه زنوج بيترياو علي بعض من كتر ما هما ...,Racism,Racism
3,انا بزعل امي كتير ربنا يسامحني,Neutral,Neutral
4,انا نفسي في موبايل جديد بس مش معايا فلوس,Neutral,Neutral


In [29]:
print("Pretrained gpt2 model accuracy:    ", base_model_metrics["eval_accuracy"])
print("Finetuned LoRA-gpt2 model accuracy:", finetuned_model_metrics["eval_accuracy"])


Pretrained gpt2 model accuracy:     0.1835985312117503
Finetuned LoRA-gpt2 model accuracy: 0.7717258261933905
