# Transformer 

In [71]:
%pwd

'/Users/gaohn/gaohn/learning-agency-lab-automated-essay-scoring-2'

## LLRD (Layer-wise Learning Rate Decay)

In [72]:
from transformers import AutoModelForSequenceClassification, DebertaV2ForSequenceClassification
from torch import nn
from typing import List, Dict, Tuple, Iterator, Literal
from rich.pretty import pprint
from torch.optim import AdamW
from omnivault.utils.torch_utils.model_utils import get_named_parameters

In [74]:
# Example model used by every cell below: the "microsoft/deberta-v3-small" checkpoint loaded
# through the sequence-classification auto class.
BASE_MODEL: nn.Module = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small")
# Module types whose parameters get_decay_parameter_names excludes from weight decay.
ALL_LAYERNORM_LAYERS = [nn.LayerNorm]


def get_parameter_names(model, forbidden_layer_types):
    """
    List the dotted names of all parameters of ``model`` that are not owned by
    a module of a forbidden type.

    Args:
        model: Module whose parameter names are collected (recursively).
        forbidden_layer_types: Iterable of module classes; any child that is an
            instance of one of them is skipped together with its whole subtree.

    Returns:
        Names in traversal order: for each child (in ``named_children`` order)
        its prefixed parameter names, followed by the parameters registered
        directly on ``model`` itself.
    """
    excluded_types = tuple(forbidden_layer_types)
    collected = []
    for child_name, child_module in model.named_children():
        # A forbidden child contributes nothing, so there is no need to descend into it.
        if isinstance(child_module, excluded_types):
            continue
        for nested_name in get_parameter_names(child_module, forbidden_layer_types):
            collected.append(f"{child_name}.{nested_name}")
    # Parameters declared directly on this module (nn.Parameter attributes) belong to no child.
    collected.extend(model._parameters.keys())
    return collected


def get_decay_parameter_names(model) -> List[str]:
    """
    Return the names of the parameters that weight decay should be applied to.

    A parameter qualifies when it is not inside an ``nn.LayerNorm`` module
    (see ``ALL_LAYERNORM_LAYERS``) and its name does not contain ``"bias"``.

    Note: only instances of ``nn.LayerNorm`` are filtered out. Models that
    implement their own layer norm without using ``nn.LayerNorm`` will still
    have those parameters reported as decay parameters.
    """
    candidate_names = get_parameter_names(model, ALL_LAYERNORM_LAYERS)
    return list(filter(lambda parameter_name: "bias" not in parameter_name, candidate_names))


# Names of the parameters that should be decayed (no biases, nothing inside nn.LayerNorm).
decay_parameters = get_decay_parameter_names(BASE_MODEL)
# Set copy for O(1) membership tests; `decay_parameters` itself stays a list for later cells.
decay_parameter_names = set(decay_parameters)

WEIGHT_DECAY = 0.01

# Walk the model once and keep only trainable parameters.
trainable_named_parameters = [(n, p) for n, p in BASE_MODEL.named_parameters() if p.requires_grad]

# Each group must carry its own "weight_decay": a group without that key inherits the
# optimizer-level default, which would decay biases and LayerNorm parameters as well and
# make the decay / no-decay split meaningless.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in trainable_named_parameters if n in decay_parameter_names],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in trainable_named_parameters if n not in decay_parameter_names],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=1e-5,
    weight_decay=WEIGHT_DECAY,  # default only; every group above overrides it explicitly
    betas=(0.9, 0.999),
    eps=1e-6,
    capturable=False,
    differentiable=False,
    maximize=False,
    amsgrad=False,
)

Passing the optimizer above to the `Trainer` should yield the same results as
configuring `TrainingArguments` with `optim="adamw_torch"` and the same
hyperparameters. The decay / no-decay split follows the way Hugging Face's
`transformers` library builds its parameter groups.

In [75]:
# The attribute paths used below (.deberta, .pooler, .classifier) are specific to this model class.
assert isinstance(BASE_MODEL, DebertaV2ForSequenceClassification)


# (name, parameter) pairs for each coarse section of the model.
# The pairs are materialised into lists on purpose: `named_parameters()` returns a one-shot
# generator, so storing the generator itself would leave a group empty after its first use
# (and after any cell re-run that iterates it again).
named_parameters_grouped: Dict[
    Literal["EMBEDDINGS", "BACKBONE", "POOLER", "HEAD"], List[Tuple[str, nn.Parameter]]
] = {
    "EMBEDDINGS": list(BASE_MODEL.deberta.embeddings.named_parameters()),
    "BACKBONE": list(BASE_MODEL.deberta.encoder.named_parameters()),
    "POOLER": list(BASE_MODEL.pooler.named_parameters()),
    "HEAD": list(BASE_MODEL.classifier.named_parameters()),
}
# Substrings that mark a parameter name as exempt from weight decay.
NO_DECAY: List[str] = ["bias", "LayerNorm.weight", "LayerNorm.bias"]  # CHANGE AS YOU WISH

# Show only the parameter names per group; printing the tensors themselves would flood the output.
pprint({group_name: [name for name, _ in pairs] for group_name, pairs in named_parameters_grouped.items()})

In [88]:
# Modules ordered from the last encoder layer down to the embeddings.
embeddings_and_backbone = list(BASE_MODEL.deberta.encoder.layer)[::-1] + [BASE_MODEL.deberta.embeddings]

In [86]:
BASE_MODEL

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [76]:
# Optimizer hyperparameters per parameter group, stored as (weight_decay, lr).
group_settings = dict(
    EMBEDDINGS=(0.01, 1e-5),
    BACKBONE=(0.03, 1e-4),
    POOLER=(0.0, 1e-4),
    HEAD=(0.0, 1e-4),
)

# Unpack once instead of indexing the tuple by position.
embeddings_weight_decay, embeddings_lr = group_settings["EMBEDDINGS"]

# Keep only the embedding parameters whose name matches none of the NO_DECAY patterns.
EMBEDDINGS_PARAM_GROUP = {
    "params": [
        parameter
        for parameter_name, parameter in named_parameters_grouped["EMBEDDINGS"]
        if all(pattern not in parameter_name for pattern in NO_DECAY)
    ],
    "lr": embeddings_lr,
    "weight_decay": embeddings_weight_decay,
}

In [77]:
EMBEDDINGS_PARAM_GROUP["params"]

[Parameter containing:
 tensor([[ 1.1034e-03, -3.6097e-04, -1.9512e-03,  ..., -5.6000e-03,
          -6.8550e-03,  3.3997e-02],
         [ 1.1765e-02,  1.6769e-02, -1.3268e-02,  ...,  1.4679e-02,
          -3.7575e-03,  4.2969e-02],
         [ 2.1271e-02,  1.5610e-02, -1.2688e-02,  ...,  1.9821e-02,
          -1.6876e-02,  3.2959e-02],
         ...,
         [ 4.1008e-05, -4.1733e-03, -1.3351e-04,  ..., -1.5106e-02,
          -5.4779e-03,  3.0121e-02],
         [-6.1836e-03, -3.7823e-03, -6.9580e-03,  ..., -6.7368e-03,
          -5.2834e-03,  3.0716e-02],
         [-5.6953e-03, -4.5681e-04,  1.1883e-03,  ..., -6.3858e-03,
          -8.9340e-03,  3.2684e-02]], requires_grad=True)]

In [78]:
def categorize_parameters(
    named_parameters: Iterator[Tuple[str, nn.Parameter]],
    weight_decay: float,
    lr: float
) -> Dict[str, List[Dict[str, object]]]:
    """
    Split ``(name, parameter)`` pairs into decayed and non-decayed optimizer groups.

    A parameter is decayed only when its name matches no ``NO_DECAY`` pattern,
    contains ``"weight"`` and does not contain ``"bias"``. Everything else goes
    to the no-decay bucket with ``weight_decay`` forced to ``0.0``.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs.
        weight_decay: Weight decay assigned to the decayed parameters.
        lr: Learning rate assigned to every parameter.

    Returns:
        ``{"decay": [...], "no_decay": [...]}`` where each list holds one
        param-group dict (``params``, ``weight_decay``, ``lr``) per parameter,
        in iteration order.
    """
    buckets: Dict[str, List[Dict[str, object]]] = {"decay": [], "no_decay": []}
    for parameter_name, parameter in named_parameters:
        blacklisted = any(pattern in parameter_name for pattern in NO_DECAY)
        is_plain_weight = "weight" in parameter_name and "bias" not in parameter_name
        if is_plain_weight and not blacklisted:
            buckets["decay"].append({"params": parameter, "weight_decay": weight_decay, "lr": lr})
        else:
            buckets["no_decay"].append({"params": parameter, "weight_decay": 0.0, "lr": lr})
    return buckets

def set_parameters_by_group(group_settings: Dict[str, Tuple[float, float]]) -> List[Dict[str, object]]:
    """
    Build the optimizer param groups for every group listed in ``group_settings``.

    The ``(name, parameter)`` pairs of each group are read from the notebook-level
    ``named_parameters_grouped`` mapping and split by ``categorize_parameters``.

    Args:
        group_settings: Maps a group name (a key of ``named_parameters_grouped``)
            to its ``(weight_decay, lr)`` pair.

    Returns:
        A flat list of param-group dicts; for each group, the decayed entries
        come first, followed by the non-decayed ones.

    Raises:
        KeyError: If a group name is not present in ``named_parameters_grouped``.
        ValueError: If a group yields no parameters at all, which most likely
            means its entry is a generator that has already been consumed.
    """
    param_groups: List[Dict[str, object]] = []
    for group_name, (weight_decay, lr) in group_settings.items():
        try:
            named_parameters = named_parameters_grouped[group_name]
        except KeyError:
            raise KeyError(
                f"Unknown parameter group {group_name!r}; expected one of {sorted(named_parameters_grouped)}"
            ) from None
        group_params = categorize_parameters(named_parameters, weight_decay, lr)
        decay, no_decay = group_params["decay"], group_params["no_decay"]
        if not decay and not no_decay:
            # Silently returning nothing would leave this part of the model out of the optimizer.
            raise ValueError(
                f"Parameter group {group_name!r} yielded no parameters; "
                "its named-parameter iterator may already be exhausted"
            )
        param_groups.extend(decay)
        param_groups.extend(no_decay)
    return param_groups

In [79]:
# Print the first (name, requires_grad) entry of every mapping returned by the helper.
model_named_parameters = get_named_parameters(BASE_MODEL)
for module_info in model_named_parameters:
    first_entry = next(iter(module_info.items()), None)
    if first_entry is not None:
        name, param = first_entry
        print(name, param.requires_grad)

deberta.embeddings.word_embeddings.weight True
deberta.embeddings.LayerNorm.weight True
deberta.embeddings.LayerNorm.bias True
deberta.encoder.layer.0.attention.self.query_proj.weight True
deberta.encoder.layer.0.attention.self.query_proj.bias True
deberta.encoder.layer.0.attention.self.key_proj.weight True
deberta.encoder.layer.0.attention.self.key_proj.bias True
deberta.encoder.layer.0.attention.self.value_proj.weight True
deberta.encoder.layer.0.attention.self.value_proj.bias True
deberta.encoder.layer.0.attention.output.dense.weight True
deberta.encoder.layer.0.attention.output.dense.bias True
deberta.encoder.layer.0.attention.output.LayerNorm.weight True
deberta.encoder.layer.0.attention.output.LayerNorm.bias True
deberta.encoder.layer.0.intermediate.dense.weight True
deberta.encoder.layer.0.intermediate.dense.bias True
deberta.encoder.layer.0.output.dense.weight True
deberta.encoder.layer.0.output.dense.bias True
deberta.encoder.layer.0.output.LayerNorm.weight True
deberta.encode

In [80]:
# Trainer-style parameter grouping, adapted so that it runs in this notebook: the pasted
# snippet referred to `opt_model` and `self.args`, neither of which exists here.
opt_model = BASE_MODEL
# Reuse the decay value the optimizer above was configured with.
weight_decay = optimizer.defaults["weight_decay"]
# Set copy for O(1) membership tests.
decay_parameter_names = set(decay_parameters)

optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in opt_model.named_parameters() if (n in decay_parameter_names and p.requires_grad)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            p for n, p in opt_model.named_parameters() if (n not in decay_parameter_names and p.requires_grad)
        ],
        "weight_decay": 0.0,
    },
]