In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn

In [3]:
from soft_embedding import SoftEmbedding

# Soft-prompt settings consumed by later cells.
# n_tokens is passed to SoftEmbedding and is also the number of filler ids
# prepended to input_ids before the forward pass.
n_tokens = 20
# Forwarded unchanged to SoftEmbedding; presumably seeds the learned prompt
# from existing vocabulary embeddings -- TODO confirm in soft_embedding.py.
initialize_from_vocab = True

In [4]:
# Load the tokenizer and the LM-head model from the same pretrained checkpoint name.
checkpoint_name = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint_name)
model = GPT2LMHeadModel.from_pretrained(checkpoint_name)

In [5]:
# Freeze every parameter currently registered on the model so that none of
# them is reported as trainable; the soft-prompt embedding is attached in a
# later cell.
for weight in model.parameters():
    weight.requires_grad_(False)

In [6]:
def count_parameters(model):
    """Return the total element count of the parameters that require gradients.

    Args:
        model: object exposing ``parameters()`` that yields tensors with
            ``requires_grad`` and ``numel()``.

    Returns:
        int: sum of ``numel()`` over every parameter whose ``requires_grad``
        is true (0 when there is none).
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Every parameter was frozen in the previous cell, so the trainable count printed here should be 0.
print('Freeze parameters :', count_parameters(model))

Freeze parameters : 0


In [7]:
# Wrap the model's current input-embedding module in a SoftEmbedding
# configured with the soft-prompt settings defined earlier in the notebook.
base_wte = model.get_input_embeddings()
s_wte = SoftEmbedding(
    base_wte,
    n_tokens=n_tokens,
    initialize_from_vocab=initialize_from_vocab,
)

In [8]:
# Replace the model's input-embedding module with the SoftEmbedding wrapper built in the previous cell.
model.set_input_embeddings(s_wte)

In [9]:
# count_parameters is already defined in an earlier cell with an identical
# body; reuse it instead of redefining (and silently shadowing) it here.
# The trainable count now reflects the parameters introduced by the
# SoftEmbedding that was installed as the input embedding.
print('Add Prompt parameters :', count_parameters(model))

Add Prompt parameters : 15360


In [10]:
def get_nb_trainable_parameters(module=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        module: Object exposing `named_parameters()` whose parameters are counted.
            When omitted, the notebook-level `model` is used, so existing
            zero-argument calls keep working.

    Returns:
        A `(trainable_params, all_param)` tuple of element counts.
    """
    if module is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        module = model

    trainable_params = 0
    all_param = 0
    for _, param in module.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: The model to report on. It is forwarded to
            `get_nb_trainable_parameters`, so the printed numbers describe
            this argument rather than whatever global happens to be named
            `model`.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}")

In [11]:
# Report trainable vs. total parameter counts now that the SoftEmbedding is installed on the model.
print_trainable_parameters(model)

trainable params: 15360 || all params: 124455168 || trainable%: 0.012341793632868666


In [12]:
inputs = tokenizer("May the force be", return_tensors="pt")

# need to pad attention_mask and input_ids to be full seq_len + n_learned_tokens
# even though it does not matter what you pad input_ids with, it's just to make HF happy
token_ids = inputs['input_ids']
attn_mask = inputs['attention_mask']
batch_size = token_ids.shape[0]  # derive the batch size instead of hard-coding 1
filler_id = tokenizer.eos_token_id  # 50256 for the "gpt2" vocabulary; replaces the magic number
# new_full / new_ones inherit dtype and device from the tokenizer output, so
# torch.cat never sees mismatched tensors.
inputs['input_ids'] = torch.cat([token_ids.new_full((batch_size, n_tokens), filler_id), token_ids], 1)
inputs['attention_mask'] = torch.cat([attn_mask.new_ones((batch_size, n_tokens)), attn_mask], 1)

outputs = model(**inputs)

In [13]:
# The sequence dimension of the logits covers the n_tokens filler positions plus the tokenized text.
outputs.logits.shape

torch.Size([1, 24, 50257])