In [44]:
#pip install transformers datasets peft

In [32]:
from datasets import load_dataset

In [45]:
# Load dataset
dataset = load_dataset("imdb")

In [46]:
# Split the training set into 80% train and 20% validation
train_test_split = dataset["train"].train_test_split(test_size=0.2)
train_dataset    = train_test_split["train"]
val_dataset      = train_test_split["test"]

In [47]:
val_dataset.shape

(5000, 2)

In [48]:
small_val_dataset = val_dataset.shuffle(seed=42).select(range(100))

In [49]:
from transformers import DistilBertTokenizerFast

In [50]:
# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


In [51]:
# Prepare validation dataset for predictions
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)


In [52]:
val_tokenized = small_val_dataset.map(preprocess_function, batched=True)
val_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [53]:
from torch.utils.data import DataLoader
import torch
from sklearn.metrics import accuracy_score

In [54]:
val_loader = DataLoader(val_tokenized, batch_size=16)
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [55]:
preds, labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, label = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
        labels.extend(label.tolist())

In [56]:
val_accuracy = accuracy_score(labels, preds)
print(f"Validation Accuracy (Pretrained Model): {val_accuracy:.4f}")

Validation Accuracy (Pretrained Model): 0.3900


In [58]:
#pip install peft

In [59]:
from peft import LoraConfig, get_peft_model

In [62]:
# Modify LoRA Configuration for DistilBERT
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    target_modules=["q_lin", "k_lin", "v_lin"],  # Correct names for DistilBERT attention layers
    task_type="SEQ_CLS",
)

In [63]:
# Add LoRA to the model
model = get_peft_model(model, lora_config)

In [64]:
# Verify LoRA integration
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [65]:
small_train_dataset = train_dataset.shuffle(seed=42).select(range(100))
train_tokenized = small_train_dataset.map(preprocess_function, batched=True)
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [66]:
from transformers import Trainer, TrainingArguments

In [67]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results_lora",
    evaluation_strategy="epoch",
    learning_rate=2e-4,  # Slightly higher for LoRA fine-tuning
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs_lora",
    logging_steps=10,
    save_strategy="no",  # No checkpoint saving for simplicity
)



In [68]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [69]:
# Fine-tune LoRA parameters
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.686641
2,0.694200,0.677399
3,0.671700,0.67526


TrainOutput(global_step=21, training_loss=0.6805570920308431, metrics={'train_runtime': 1211.3207, 'train_samples_per_second': 0.248, 'train_steps_per_second': 0.017, 'total_flos': 40489769779200.0, 'train_loss': 0.6805570920308431, 'epoch': 3.0})

In [70]:
model.eval()
preds, labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, label = batch["input_ids"], batch["attention_mask"], batch["label"]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
        labels.extend(label.tolist())


In [71]:
val_accuracy_lora = accuracy_score(labels, preds)
print(f"Validation Accuracy (Fine-Tuned LoRA): {val_accuracy_lora:.4f}")

Validation Accuracy (Fine-Tuned LoRA): 0.5500


#### Key Points to Understand

1. **Efficient Fine-Tuning**:
   - LoRA aims to make the fine-tuning process more efficient by focusing on adapting a small number of parameters (the `A` and `B` matrices) instead of retraining the entire model. This is ideal when you need a lightweight solution for deploying a model across different tasks with minimal computational cost.

2. **Deploying LoRA Models**:
   Once you fine-tune the model with LoRA, you can deploy the model in two ways:
   
   - **Fine-tuned model**:
     - If you fine-tune LoRA on different tasks or data, you can deploy the updated LoRA parameters alongside the base model.
   
   - **Separate Fine-Tuning with Different Data**:
     - Since LoRA doesn't affect the base model's parameters significantly, you can fine-tune the same base model with different data (and adapt LoRA) without re-training the entire model.
