In [2]:
import torch
from transformers import AutoModelForCausalLM

In [3]:
import os
# Point both proxy environment variables at the same local endpoint in one call.
os.environ.update(dict.fromkeys(("http_proxy", "https_proxy"), "http://127.0.0.1:7890"))

In [4]:
# Load the pretrained 'gpt2' checkpoint through the Auto class.
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')

```
# modeling_utils._load_state_dict_into_model
def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
    ...
```

- lm_head 结构是在 `GPT2LMHeadModel` 内部定义和创建的，
    - 但在 from_pretrained 加载预训练参数时，其参数是从 gpt2model（一个Transformer架构）的 wte 里来的；

## model arch

In [15]:
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
type(gpt2)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

### output_embeddings()


- input_embeddings: wte
    - Word Token Embeddings (cf. wpe: Word Position Embeddings)
- output_embeddings: lm_head

```
class GPT2LMHeadModel(GPT2PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
    
    ...
    
    def get_input_embeddings(self):
        return self.transformer.wte
        
    def get_output_embeddings(self):
        return self.lm_head
```

## word embedding & lm_head

- The GPT2 Model transformer with a **language modeling head on top** (`lm_head`) (linear layer with weights **tied** to the input embeddings).
    - https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2LMHeadModel


In [6]:
gpt2.transformer.wte.weight

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]],
       requires_grad=True)

In [7]:
gpt2.lm_head.weight

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]],
       requires_grad=True)

In [8]:
type(gpt2.lm_head.weight)

torch.nn.parameter.Parameter

In [9]:
gpt2.lm_head.weight is gpt2.transformer.wte.weight

True

In [12]:
gpt2.lm_head.weight.data_ptr??

In [13]:
# Returns the address of the first element of :attr:`self` tensor.
gpt2.lm_head.weight.data_ptr() == gpt2.transformer.wte.weight.data_ptr()

True

In [18]:
print(gpt2.state_dict()['transformer.wte.weight'].shape)
print(gpt2.state_dict()['lm_head.weight'].shape)

torch.Size([50257, 768])
torch.Size([50257, 768])


In [14]:
# Both entries share a single block of memory (only one copy is stored)
print(gpt2.state_dict()['transformer.wte.weight'])
print(gpt2.state_dict()['lm_head.weight'])

tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]])
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]])


### how tied

```

class PreTrainedModel(...):
    ...
    def tie_weights(self):

        if getattr(self.config, "tie_word_embeddings", True):
            output_embeddings = self.get_output_embeddings()
            if output_embeddings is not None:
                self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())

    def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
        """Tie or clone module weights depending of whether we are using TorchScript or not"""
        if self.config.torchscript:
            output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
        else:
            output_embeddings.weight = input_embeddings.weight
```

### tied or shared tensors

- https://huggingface.co/docs/safetensors/torch_shared_tensors

    - Pytorch uses shared tensors for some computation. This is extremely interesting to reduce memory usage in general.

    - One very classic use case is in transformers the embeddings are shared with lm_head. By using the same matrix, the model uses less parameters, and gradients flow much better to the embeddings (which is the start of the model, so they don’t flow easily there, whereas lm_head is at the tail of the model, so gradients are extremely good over there, since they are the same tensors, they both benefit)

In [32]:

from torch import nn

class Model(nn.Module):
    """Two independent ``nn.Linear(100, 100)`` layers applied in sequence: ``a`` then ``b``."""

    def __init__(self):
        super().__init__()
        # Each attribute gets its own, separately constructed layer (no sharing).
        self.a, self.b = nn.Linear(100, 100), nn.Linear(100, 100)

    def forward(self, x):
        """Return ``b(a(x))``."""
        hidden = self.a(x)
        return self.b(hidden)


model = Model()
# Show only the parameter names; the tensor values are large and not needed here.
print(model.state_dict().keys())
# odict_keys(['a.weight', 'a.bias', 'b.weight', 'b.bias'])
torch.save(model.state_dict(), "model.bin")
# Baseline (untied) case: `a` and `b` are two independent Linear(100, 100) layers, so both
# weight matrices are written and this file is ~80k. Compare with the tied variant saved as
# "model2.bin", where `a` and `b` point to the same buffer and the file is only ~41k.

odict_keys(['a.weight', 'a.bias', 'b.weight', 'b.bias'])


In [31]:
class Model(nn.Module):
    """A single ``nn.Linear(100, 100)`` registered under two names, ``a`` and ``b``."""

    def __init__(self):
        super().__init__()
        shared = nn.Linear(100, 100)
        # Tied/shared: both attributes refer to the very same module object.
        self.a = shared
        self.b = shared

    def forward(self, x):
        """Return ``b(a(x))``, i.e. the shared layer applied twice."""
        hidden = self.a(x)
        return self.b(hidden)


model = Model()
print(model.state_dict())
# keys: odict_keys(['a.weight', 'a.bias', 'b.weight', 'b.bias'])
torch.save(model.state_dict(), "model2.bin")
# This file is now ~41k instead of ~80k: `a` and `b` are the same weight, so only one copy is saved on disk, with both `a` and `b` pointing to the same buffer


OrderedDict([('a.weight', tensor([[ 0.0777,  0.0683,  0.0710,  ...,  0.0692,  0.0377,  0.0951],
        [ 0.0986,  0.0207, -0.0691,  ...,  0.0168,  0.0718, -0.0220],
        [ 0.0806, -0.0145, -0.0851,  ...,  0.0653,  0.0729, -0.0443],
        ...,
        [ 0.0818, -0.0725,  0.0595,  ..., -0.0729,  0.0758,  0.0752],
        [ 0.0983, -0.0282, -0.0066,  ..., -0.0467, -0.0237, -0.0505],
        [-0.0291, -0.0101, -0.0842,  ..., -0.0088, -0.0748,  0.0641]])), ('a.bias', tensor([ 0.0377,  0.0652,  0.0465, -0.0092,  0.0909,  0.0700, -0.0753, -0.0464,
        -0.0905, -0.0142,  0.0044,  0.0673, -0.0510,  0.0401,  0.0207, -0.0703,
         0.0661, -0.0329, -0.0917,  0.0600,  0.0594,  0.0968,  0.0822, -0.0912,
         0.0221,  0.0809, -0.0047, -0.0823,  0.0861,  0.0808, -0.0131, -0.0903,
        -0.0515,  0.0507, -0.0054,  0.0317, -0.0846, -0.0964,  0.0124, -0.0123,
         0.0576,  0.0543, -0.0357,  0.0272,  0.0058, -0.0178,  0.0899,  0.0117,
         0.0805, -0.0146, -0.0219,  0.0898,  0.