In [1]:
import os
os.chdir("../")

In [3]:
from src.hf_dataset.dataset import get_banking_77
from src.hf_models.get_model import get_distilbert
from src.config.configuration import ConfigurationManager


In [4]:
ds = get_banking_77()

model, tokenizer, config = get_distilbert(task="SequenceClassification")

/home/tess/work/deep_learning/transformers/models


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /home/tess/work/deep_learning/transformers/models/distilbert/model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([77]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([77, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


In [None]:
model.num_labels = 77

In [8]:
model.classifier.out_features=77

In [10]:
model.num_labels = 77

In [5]:
config_manager = ConfigurationManager()
params = config_manager.get_distilbert_config()


/home/tess/work/deep_learning/transformers/models


In [6]:

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

# Encode the input data
ds = ds.map(encode_batch, batched=True)
# The transformers model expects the target class column to be named "labels"
ds = ds.rename_column(original_column_name="label", new_column_name="labels")
# Transform to pytorch tensors and only output the required columns
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

ds["train"] = ds["train"].remove_columns(["text"])
ds["test"] = ds["test"].remove_columns(["text"])

In [7]:
from peft import LoraConfig


lora_config = LoraConfig(
    r = 7, # low rank means smaller update matrices with fewer trainable parameters
    target_modules = ["q_lin", "k_lin"],
    bias="none",
    lora_alpha=32,
    
)


In [8]:
from peft import get_peft_model
model = get_peft_model(model, lora_config)

In [9]:
model

PeftModel(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=7, bias=False)
             

In [10]:
model.print_trainable_parameters()

trainable params: 129,024 || all params: 67,141,709 || trainable%: 0.1922


In [11]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import EvalPrediction
import numpy as np

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    compute_metrics=compute_accuracy,
)

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3080
    })
})

In [13]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [15]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3080
    })
})

In [16]:
model.num_labels

77