In [1]:
import os

# Move to the project root so that the `src.*` imports used below resolve.
# Guarded so that re-running this cell does not climb one directory higher
# each time (assumes the notebook lives one level below the folder holding `src/`).
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:
from peft import PrefixTuningConfig, TaskType
from src.hf_models.get_model import get_distilbert, get_distilgpt2


# Prefix-tuning setup for a sequence-classification task.
# The width / head / layer values match the model config shown further down
# (n_embd=768, n_head=12, n_layer=6).
prefix_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=10,
    token_dim=768,
    num_attention_heads=12,
    num_layers=6,
)


In [4]:
# Load distilgpt2 with a 77-way sequence-classification head via the project helper.
model, tokenizer, config = get_distilgpt2(
    task="SequenceClassification",
    num_labels=77,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at /home/tess/work/deep_learning/transformers/models/distilgpt2/model and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([77, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


In [5]:
# Display the module tree of the freshly loaded classifier.
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=77, bias=False)
)

In [6]:
# Display the configuration object returned alongside the model.
config

GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [7]:
from peft import PeftModel, get_peft_model

# Wrap the base model with the prefix-tuning adapter.
# `model` is rebound to the wrapped model, so guard against wrapping it a
# second time when this cell is re-executed in the same kernel.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config=prefix_config)


In [8]:
from src.hf_dataset.dataset import get_banking_77
# Load the dataset via the project helper; the result is indexed by
# "train" / "test" further down.
ds = get_banking_77()
def encode_batch(batch):
    """Tokenize the "text" field of a batch, truncating and padding each example to 80 tokens."""
    texts = batch["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=80,
    )

# Tokenize every split, then rename the target column to "labels",
# which is the column name the transformers model expects.
ds = ds.map(encode_batch, batched=True).rename_column(
    original_column_name="label", new_column_name="labels"
)

# Return PyTorch tensors, restricted to the columns the model consumes.
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Drop the raw text column from both splits.
for split in ("train", "test"):
    ds[split] = ds[split].remove_columns(["text"])

In [9]:
# Keep the model config's pad token id in sync with the tokenizer's pad token
# (which was set to the EOS token when the model was loaded).
model.config.pad_token_id = tokenizer.pad_token_id

In [10]:
from transformers import DataCollatorWithPadding
# Collator that pads each batch using the model tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Display the model again, now wrapped by PEFT.
model

PeftModelForSequenceClassification(
  (base_model): GPT2ForSequenceClassification(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-5): 6 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (sco

In [40]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import EvalPrediction
import numpy as np
# Training configuration for the prefix-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    learning_rate=1e-4,
    num_train_epochs=1,
    fp16=False,
    # `evaluation_strategy` is deprecated; `eval_strategy` is the current name.
    eval_strategy="steps",
)


def compute_accuracy(p: "EvalPrediction"):
    """Compute classification accuracy for the Trainer.

    Args:
        p: Evaluation output exposing `predictions` (the logits, or a tuple
            whose first element is the logits) and `label_ids`.

    Returns:
        A dict with the single key "acc" holding the fraction of correct
        predictions as a plain Python float.
    """
    # Some models return extra outputs; the logits are then the first element.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Class scores live on the last axis.
    preds = np.argmax(logits, axis=-1)
    return {"acc": float((preds == p.label_ids).mean())}

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    # Evaluate on held-out data; scoring on the training split would
    # overstate accuracy during training.
    eval_dataset=ds["test"],
    compute_metrics=compute_accuracy,
)



In [34]:
# Sanity check: number of output classes reported by the model.
model.num_labels

77

In [None]:
# Run the training loop with the arguments and datasets given to the Trainer.
trainer.train()

In [35]:
# Score the model on the test split.
trainer.evaluate(eval_dataset=ds["test"])

{'eval_loss': 4.173234462738037,
 'eval_model_preparation_time': 0.0028,
 'eval_acc': 0.05,
 'eval_runtime': 20.9176,
 'eval_samples_per_second': 147.245,
 'eval_steps_per_second': 18.406}

In [21]:
# Single example used to smoke-test inference.
prompt = "I want to check my balance"
# NOTE: despite its name, this holds the tokenizer's full output
# (it is unpacked with ** when the model is called).
input_ids = tokenizer(text=prompt, return_tensors="pt")

In [26]:
# Move the encoded inputs onto the same device as the model.
input_ids = input_ids.to(model.device)

In [27]:
import torch

# Switch off dropout explicitly: otherwise the result depends on whether a
# previously executed cell happened to leave the model in train or eval mode.
model.eval()
# No gradients are needed for a forward-only prediction.
with torch.no_grad():
    logits = model(**input_ids).logits

In [28]:
# argmax() is called without a dim, so it indexes the flattened logits.
# NOTE(review): that equals the class id only for a single-example batch — confirm if batching is added.
predicted_class_id = logits.argmax().item()

In [29]:
# Show the predicted class index for the prompt.
predicted_class_id

26