## Replicating & Copying Parameters for GPT

In [1]:
import torch as t
from torch import nn
from torch.utils.data import Dataset, DataLoader
import plotly.express as px
from IPython.display import display
import pandas as pd
import numpy as np
import transformers
from fancy_einsum import einsum
from dataclasses import dataclass
from tqdm.notebook import tqdm_notebook
import matplotlib

from einops import rearrange, reduce, repeat

In [3]:
import sys 
sys.path.append('../common')

import gpt_modules as gpt
import utils

In [4]:
from transformer_modules import Dropout, LayerNorm, MLP, TransformerConfig, Embedding
from general_modules import Linear

In [5]:
class GPTAttention(nn.Module):
    """
    Causal (masked) multi-head self-attention block.

    A single fused projection produces Q, K and V from the input; the per-head
    attention result is passed through an output projection. Dropout is
    applied to the attention probabilities and to the block output.
    """

    # Fused query/key/value projection: hidden_size -> 3 * hidden_size.
    qkv: nn.Module
    # Output projection: hidden_size -> hidden_size.
    ff: nn.Module

    def __init__(self, hidden_size: int, num_heads: int, dropout: float):
        """
        hidden_size: width of the input/output features
        num_heads: number of attention heads; must evenly divide hidden_size
        dropout: probability handed to both Dropout layers

        Raises ValueError if hidden_size cannot be split evenly across heads.
        """
        super().__init__()
        if num_heads <= 0 or hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be a positive multiple of "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        # Integer division is exact here thanks to the check above.
        self.query_size = hidden_size // num_heads

        self.qkv = Linear(hidden_size, 3 * hidden_size)
        self.ff = Linear(hidden_size, hidden_size)

        self.dropout1 = Dropout(p=dropout)
        self.dropout2 = Dropout(p=dropout)

    def multihead_masked_attention(
        self, Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int
    ):
        """
        Implements multihead masked attention on the matrices Q, K and V.

        Q: shape (batch, seq, nheads*headsize)
        K: shape (batch, seq, nheads*headsize)
        V: shape (batch, seq, nheads*headsize)

        returns: shape (batch, seq, nheads*headsize)
        """
        Q = rearrange(
            Q, "B S (nheads headsize) -> B S nheads headsize", nheads=num_heads
        )
        K = rearrange(
            K, "B S (nheads headsize) -> B S nheads headsize", nheads=num_heads
        )
        V = rearrange(
            V, "B S (nheads headsize) -> B S nheads headsize", nheads=num_heads
        )

        seq_len, headsize = Q.shape[1], Q.shape[-1]
        scores = einsum(
            "B Qseq nheads headsize, B Kseq nheads headsize -> B nheads Qseq Kseq", Q, K
        )
        # scaled dot-product: divide by sqrt(headsize)
        scores = scores / headsize ** 0.5

        # Boolean lower-left triangle (diagonal included), built directly on
        # the target device: True marks key positions a query may attend to.
        causal_mask = t.tril(
            t.ones(seq_len, seq_len, dtype=t.bool, device=Q.device), diagonal=0
        )
        # Mask out the upper-right triangle with the most negative value the
        # score dtype can represent, so this also works for half precision
        # (a fixed -1e9 does not fit in float16).
        scores = scores.masked_fill(~causal_mask, t.finfo(scores.dtype).min)

        scores = t.softmax(scores, dim=-1)

        scores = self.dropout1(scores)

        Z = einsum(
            "B nheads Qseq Kseq, B Kseq nheads headsize -> B Qseq nheads headsize",
            scores,
            V,
        )
        Z = rearrange(Z, "B Qseq nheads headsize -> B Qseq (nheads headsize)")
        return Z

    def forward(self, x: t.Tensor) -> t.Tensor:
        """
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        """
        # One fused projection, then split the last dim into equal Q/K/V thirds.
        out = self.qkv(x)
        Q, K, V = t.tensor_split(out, 3, dim=-1)

        Z = self.multihead_masked_attention(Q, K, V, self.num_heads)
        out = self.ff(Z)
        out = self.dropout2(out)
        return out

In [6]:
class GPTDecoder(nn.Module):
    """
    One decoder layer: LayerNorm -> attention -> residual add, followed by
    LayerNorm -> MLP -> residual add.
    """

    def __init__(self, config: TransformerConfig):
        """Build the layer's sub-modules from the shared transformer config."""
        super().__init__()
        hidden = config.hidden_size
        eps = config.layer_norm_epsilon
        drop_p = config.dropout

        # Sub-modules are created in the same order as before so that the
        # parameter registration order of the module is unchanged.
        self.lnorm1 = LayerNorm(hidden, eps=eps)
        self.attn = GPTAttention(hidden, config.num_heads, drop_p)
        self.lnorm2 = LayerNorm(hidden, eps=eps)
        self.mlp = MLP(hidden, drop_p)

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Apply the attention and MLP sub-blocks, each with a skip connection."""
        after_attn = self.attn(self.lnorm1(x)) + x
        return self.mlp(self.lnorm2(after_attn)) + after_attn

In [7]:
class GPT(nn.Module):
    """
    Decoder-only transformer: token + position embeddings, a stack of
    GPTDecoder layers, a final LayerNorm, and an output projection that
    reuses the token embedding matrix.
    """

    def __init__(self, config: TransformerConfig):
        """Build the model's sub-modules from the shared transformer config."""
        super().__init__()
        self.emb = Embedding(config.vocab_size, config.hidden_size)
        self.pos_emb = Embedding(config.max_seq_len, config.hidden_size)
        self.dropout = Dropout(p=config.dropout)

        self.decoders = nn.Sequential(
            *(GPTDecoder(config) for _ in range(config.num_layers))
        )

        # Use the configured epsilon here too, so the final norm agrees with
        # the LayerNorms inside every decoder layer.
        self.post_norm = LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)

    def forward(self, x: t.Tensor) -> t.Tensor:
        """
        x: token ids; dimension 1 is used as the sequence length
           (presumably shape (batch, seq) - confirm against Embedding)

        Return: logits over the vocabulary, shape (batch, seq, vocab_size)
        """
        # Position ids 0..seq-1, created on the same device as the input.
        pos = t.arange(x.shape[1], device=x.device)
        embedding = self.emb(x) + self.pos_emb(pos)
        embedding = self.dropout(embedding)

        out = self.decoders(embedding)
        out = self.post_norm(out)

        # Output projection reuses the token embedding weights.
        out = einsum("B S E, V E -> B S V", out, self.emb.weight)

        return out

In [8]:
# Hyperparameters for the local model; the vocab, hidden and sequence sizes
# line up with the parameter shapes reported for the pretrained "gpt2"
# checkpoint in the comparison further down.
config = TransformerConfig(
    num_layers=12,
    num_heads=12,
    vocab_size=50257,
    hidden_size=768,
    max_seq_len=1024,
    dropout=0.1,
    layer_norm_epsilon=1e-5,
)

In [10]:
# Randomly initialised local implementation, put in training mode.
my_gpt = GPT(config)
my_gpt.train()

# Reference Hugging Face checkpoint, also put in training mode.
# NOTE(review): this rebinds `gpt`, shadowing the `gpt_modules` alias that was
# imported under the same name in an earlier cell.
gpt = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
gpt.train()

# Side-by-side parameter names, shapes and counts of both models.
utils.print_param_count(my_gpt, gpt)

Model 1, total params = 124439808


Unnamed: 0,name_1,shape_1,num_params_1
0,emb.weight,"(50257, 768)",38597376
1,pos_emb.weight,"(1024, 768)",786432
2,decoders.0.lnorm1.weight,"(768,)",768
3,decoders.0.lnorm1.bias,"(768,)",768
4,decoders.0.attn.qkv.weight,"(2304, 768)",1769472
...,...,...,...
143,decoders.11.mlp.linear1.bias,"(3072,)",3072
144,decoders.11.mlp.linear2.weight,"(768, 3072)",2359296
145,decoders.11.mlp.linear2.bias,"(768,)",768
146,post_norm.weight,"(768,)",768


Model 2, total params = 124439808


Unnamed: 0,num_params_2,shape_2,name_2
0,38597376,"(50257, 768)",transformer.wte.weight
1,786432,"(1024, 768)",transformer.wpe.weight
2,768,"(768,)",transformer.h.0.ln_1.weight
3,768,"(768,)",transformer.h.0.ln_1.bias
4,1769472,"(768, 2304)",transformer.h.0.attn.c_attn.weight
...,...,...,...
143,3072,"(3072,)",transformer.h.11.mlp.c_fc.bias
144,2359296,"(3072, 768)",transformer.h.11.mlp.c_proj.weight
145,768,"(768,)",transformer.h.11.mlp.c_proj.bias
146,768,"(768,)",transformer.ln_f.weight


All parameter counts match!


Unnamed: 0,name_1,shape_1,num_params_1,num_params_2,shape_2,name_2
0,emb.weight,"(50257, 768)",38597376,38597376,"(50257, 768)",transformer.wte.weight
1,pos_emb.weight,"(1024, 768)",786432,786432,"(1024, 768)",transformer.wpe.weight
2,decoders.0.lnorm1.weight,"(768,)",768,768,"(768,)",transformer.h.0.ln_1.weight
3,decoders.0.lnorm1.bias,"(768,)",768,768,"(768,)",transformer.h.0.ln_1.bias
4,decoders.0.attn.qkv.weight,"(2304, 768)",1769472,1769472,"(768, 2304)",transformer.h.0.attn.c_attn.weight
5,decoders.0.attn.qkv.bias,"(2304,)",2304,2304,"(2304,)",transformer.h.0.attn.c_attn.bias
6,decoders.0.attn.ff.weight,"(768, 768)",589824,589824,"(768, 768)",transformer.h.0.attn.c_proj.weight
7,decoders.0.attn.ff.bias,"(768,)",768,768,"(768,)",transformer.h.0.attn.c_proj.bias
8,decoders.0.lnorm2.weight,"(768,)",768,768,"(768,)",transformer.h.0.ln_2.weight
9,decoders.0.lnorm2.bias,"(768,)",768,768,"(768,)",transformer.h.0.ln_2.bias


In [12]:
def copy_weights_from_gpt(my_gpt: "GPT", gpt) -> "GPT":
    '''
    Copy over the weights from pretrained GPT to this implementation of gpt.

    Parameters are paired purely by position: the i-th entry of
    my_gpt.named_parameters() receives the i-th entry of
    gpt.named_parameters(). A 2-D pretrained weight is transposed whenever its
    transposed shape matches the target's shape. This covers weights stored as
    (in, out) in the pretrained model but (out, in) locally. Square matrices
    are ambiguous under this rule and are always transposed.

    my_gpt: model whose parameters are overwritten in place
    gpt: pretrained model to read the parameters from

    Returns my_gpt (the same object), with the loaded weights.

    Raises ValueError if the two models expose a different number of
    parameters, or if a positional pair has incompatible shapes.
    '''
    my_params = list(my_gpt.named_parameters())
    pretrained_params = list(gpt.named_parameters())

    # zip() would silently stop at the shorter list and leave part of the
    # mapping undone, so fail loudly instead.
    if len(my_params) != len(pretrained_params):
        raise ValueError(
            f"Parameter count mismatch: target model has {len(my_params)} "
            f"parameters, pretrained model has {len(pretrained_params)}"
        )

    # Maps each of my_gpt's parameter names to the tensor it should receive.
    state_dict_to_load = {}

    for (my_name, my_param), (pt_name, pt_param) in zip(my_params, pretrained_params):
        source = pt_param.detach()
        if source.ndim == 2 and my_param.shape == source.T.shape:
            source = source.T

        # Catch a misaligned pairing here, where both names can be reported.
        if source.shape != my_param.shape:
            raise ValueError(
                f"Shape mismatch: cannot load {pt_name} {tuple(pt_param.shape)} "
                f"into {my_name} {tuple(my_param.shape)}"
            )

        state_dict_to_load[my_name] = source

    my_gpt.load_state_dict(state_dict_to_load)

    return my_gpt

In [13]:
# Overwrites my_gpt's parameters in place with the pretrained values. The
# function returns my_gpt, so this cell's output is the model's module repr.
copy_weights_from_gpt(my_gpt, gpt)

GPT(
  (emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (dropout): Dropout(p=0.1)
  (decoders): Sequential(
    (0): GPTDecoder(
      (lnorm1): LayerNorm(normalized_shape=(768,), eps=1e-05, elementwise_affine=True)
      (attn): GPTAttention(
        (qkv): Linear()
        (ff): Linear()
        (dropout1): Dropout(p=0.1)
        (dropout2): Dropout(p=0.1)
      )
      (lnorm2): LayerNorm(normalized_shape=(768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (linear1): Linear()
        (gelu): GELU()
        (linear2): Linear()
        (dropout): Dropout(p=0.1)
      )
    )
    (1): GPTDecoder(
      (lnorm1): LayerNorm(normalized_shape=(768,), eps=1e-05, elementwise_affine=True)
      (attn): GPTAttention(
        (qkv): Linear()
        (ff): Linear()
        (dropout1): Dropout(p=0.1)
        (dropout2): Dropout(p=0.1)
      )
      (lnorm2): LayerNorm(normalized_shape=(768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (linear1)

In [14]:
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

# my_gpt was put in training mode when it was created. Switch to eval mode for
# this inference check so the Dropout layers are inactive (assuming they follow
# the usual nn.Module train/eval semantics) and the predictions are repeatable.
my_gpt.eval()
utils.test_load_pretrained_weights(my_gpt, tokenizer)

Prompt:  Former President of the United States of America, George
Your model's top 10 predictions:  [' W', ' H', ' Bush', ' Washington', ' HW', ' Herbert', ' Pat', ' S', ' Soros', ' Wallace']


## TODO: Finetuning