In [1]:
import random
import numpy as np
import os
import torch
import hydra
from omegaconf import DictConfig, OmegaConf
from lightning.pytorch import LightningModule, Trainer, seed_everything
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

from dats import build_dataset
from model import build_model
# from tools.vis import save_attention_loc
from tools.validator import build_validator

@hydra.main(config_path="conf", config_name="finetune", version_base="1.3")
def run(cfg: DictConfig):
    save_dir = (f"{cfg.dataset.name}_result/{cfg.task.name}/{cfg.model.name}")
    save_path = os.path.join(cfg.root_dir, save_dir)
    meta_dataloader = build_dataset(cfg)
    train_loader = meta_dataloader['train']
    val_loader = meta_dataloader['val']
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    ckpt_cb = ModelCheckpoint(
        dirpath=os.path.join(save_path, "weight"),
        filename="best",
        monitor='val/loss',
        mode='min',
        save_top_k=1,
        save_last=True)
    callbacks = [lr_monitor, ckpt_cb]

  warn(


In [2]:
from hydra.core.global_hydra import GlobalHydra
from hydra import compose, initialize

if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()
# Initialize Hydra and compose the config manually
initialize(config_path="conf")  # Make sure 'conf' is the correct path relative to your notebook
cfg = compose(config_name="finetune")  # Load the "train.yaml" config

# Print the default configuration
print(OmegaConf.to_yaml(cfg))  # Optional: To see the loaded configuration

root_dir: datasets
batch_size: 8
num_workers: 2
num_epochs: 10
eval_steps: 1000
learning_rate: 5.0e-05
weight_decay: 0.0001
scheduler:
  milestones:
  - 6
  - 8
  gamma: 0.1
apply_lora: false
lora_r: 16
lora_alpha: 32
wandb:
  project: deberta-glue
  entity: eddie880509
  tags:
  - lora
gpu: 0
load_pretrained: true
dataset:
  name: glue
  root_dir: datasets
  overwrite_cache: false
  use_tokenizer: true
task:
  name: cola
  max_seq_length: 64
  num_labels: 2
  learning_rate: 5.0e-05
model:
  name: deberta_base
  model_name: microsoft/deberta-base
  use_hf_weights: true
  output_attentions: true



The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path="conf")  # Make sure 'conf' is the correct path relative to your notebook


In [3]:
from model import build_model
model = build_model(cfg)
print(model)

DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.49.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (

In [5]:
from transformers import ViTForImageClassification, ViTConfig
ways = 5
model_name='google/vit-base-patch16-224-in21k'
config = ViTConfig.from_pretrained(model_name, num_labels=ways)
model = ViTForImageClassification.from_pretrained(model_name, config=config)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from peft import get_peft_model, LoraConfig
lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["query","value"],
        lora_dropout=0.0,
        bias="none",
        modules_to_save=["classifier"],
        lora_nums=1,
    )
model = get_peft_model(model, lora_config)
print(model)

PeftModel(
  (base_model): LoraModel(
    (model): ViTForImageClassification(
      (vit): ViTModel(
        (embeddings): ViTEmbeddings(
          (patch_embeddings): ViTPatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): ViTEncoder(
          (layer): ModuleList(
            (0-11): 12 x ViTLayer(
              (attention): ViTSdpaAttention(
                (attention): ViTSdpaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_route): Linear(in_features=768, out_features=1, bias=False)
                    (lora_A0): Linear(in_features=768, out_features=16, bias=False)
                    (lora_B0): Linear(in_features=16, out_features=768, bias=False)
                  )
                  (key): Linear(in_features=768, out_features=768, bias=True)
              

In [7]:
from dats.dataset import create_miniimgnat, create_omniglot
ways = 5
shots = 5
tasksets = create_miniimgnat(train_samples=2*shots,
                                    train_ways=ways,
                                    test_samples=2*shots,
                                    test_ways=ways,)


In [8]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

device = torch.device('cpu')
cuda = torch.cuda.is_available()
seed = 42
if cuda and torch.cuda.device_count():
    torch.cuda.manual_seed(seed)
    device = torch.device('cuda')
model.to(device)

base_model.model.vit.encoder.layer.0.attention.attention.query.lora_route.weight torch.Size([1, 768])
base_model.model.vit.encoder.layer.0.attention.attention.query.lora_A0.weight torch.Size([16, 768])
base_model.model.vit.encoder.layer.0.attention.attention.query.lora_B0.weight torch.Size([768, 16])
base_model.model.vit.encoder.layer.0.attention.attention.value.lora_route.weight torch.Size([1, 768])
base_model.model.vit.encoder.layer.0.attention.attention.value.lora_A0.weight torch.Size([16, 768])
base_model.model.vit.encoder.layer.0.attention.attention.value.lora_B0.weight torch.Size([768, 16])
base_model.model.vit.encoder.layer.1.attention.attention.query.lora_route.weight torch.Size([1, 768])
base_model.model.vit.encoder.layer.1.attention.attention.query.lora_A0.weight torch.Size([16, 768])
base_model.model.vit.encoder.layer.1.attention.attention.query.lora_B0.weight torch.Size([768, 16])
base_model.model.vit.encoder.layer.1.attention.attention.value.lora_route.weight torch.Size([1

PeftModel(
  (base_model): LoraModel(
    (model): ViTForImageClassification(
      (vit): ViTModel(
        (embeddings): ViTEmbeddings(
          (patch_embeddings): ViTPatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): ViTEncoder(
          (layer): ModuleList(
            (0-11): 12 x ViTLayer(
              (attention): ViTSdpaAttention(
                (attention): ViTSdpaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_route): Linear(in_features=768, out_features=1, bias=False)
                    (lora_A0): Linear(in_features=768, out_features=16, bias=False)
                    (lora_B0): Linear(in_features=16, out_features=768, bias=False)
                  )
                  (key): Linear(in_features=768, out_features=768, bias=True)
              

In [9]:
import learn2learn as l2l
from torch import optim
from torch import nn
meta_lr = 0.001
fast_lr = 0.1
maml = l2l.algorithms.MAML(model, lr=fast_lr, first_order=False)
opt = optim.Adam(maml.parameters(), meta_lr)
loss_fn = nn.CrossEntropyLoss(reduction='mean')
for name, param in maml.named_parameters():
    print(f"Before cloning - {name}: requires_grad={param.requires_grad}, grad_fn={param.grad_fn}")

learner = maml.clone()
for name, param in learner.named_parameters():
    print(f"After cloning - {name}: requires_grad={param.requires_grad}, grad_fn={param.grad_fn}")
tx, ty=tasksets.train.sample()
tx, ty=tx.to(device), ty.to(device)
# gradients did not propagate normally on the replicate
loss=loss_fn(learner(tx)[0], ty)
diff_param=[p for p in learner.parameters() if p.requires_grad]
grad=torch.autograd.grad(loss, diff_param, retain_graph=True, create_graph=True, allow_unused=True)
print("-------------------")
print(grad)

Before cloning - module.base_model.model.vit.embeddings.cls_token: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.embeddings.position_embeddings: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.embeddings.patch_embeddings.projection.weight: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.embeddings.patch_embeddings.projection.bias: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.encoder.layer.0.attention.attention.query.weight: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.encoder.layer.0.attention.attention.query.bias: requires_grad=False, grad_fn=None
Before cloning - module.base_model.model.vit.encoder.layer.0.attention.attention.query.lora_route.weight: requires_grad=True, grad_fn=None
Before cloning - module.base_model.model.vit.encoder.layer.0.attention.attention.query.lora_A0.weight: requires_grad=True, grad_fn=None
Before 