<a href="https://colab.research.google.com/github/graviraja/100-Days-of-NLP/blob/architectures/architectures/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-2 Model 

GPT-2 is a decoder-only Transformer; the smallest released model, implemented here, has 12 layers.

> *From the paper's abstract:* Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on task-specific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state of the art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.

Go through the following resources:

- [Illustrated GPT-2 by Jay Alammar](http://jalammar.github.io/illustrated-gpt2/)

- [Annotated GPT-2](https://amaarora.github.io/2020/02/18/annotatedGPT2.html)

- [GPT-2 Paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)

In [0]:
import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


In [0]:
class Conv1D(nn.Module):
    """Affine projection over the last dimension, with weight stored as [nx, nf].

    Functionally a linear layer, but the weight matrix is kept in
    (in_features, out_features) layout so the input is right-multiplied by it.

    Args:
        nx: size of the incoming feature dimension.
        nf: size of the produced feature dimension.
    """

    def __init__(self, nx, nf):
        super().__init__()
        self.nf = nf
        # Weights are drawn from N(0, 0.02^2); the bias starts at zero.
        self.weight = nn.Parameter(torch.empty(nx, nf).normal_(mean=0.0, std=0.02))
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        """Project ``x`` from [..., nx] to [..., nf]."""
        leading_dims = x.size()[:-1]

        # Collapse all leading dimensions so a single 2-D matmul does the work.
        flat = x.view(-1, x.size(-1))
        # flat => [prod(leading_dims), nx]

        projected = torch.addmm(self.bias, flat, self.weight)
        # projected => [prod(leading_dims), nf]

        # Restore the leading dimensions around the new feature size.
        return projected.view(leading_dims + (self.nf,))

In [0]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand to ``dff``, GELU, dropout, project back.

    Args:
        dropout: dropout probability applied to the activated hidden state.
        d_model: input and output feature size.
        dff: width of the hidden layer.
    """

    def __init__(self, dropout, d_model=768, dff=768*4):
        super().__init__()

        self.c_fc = Conv1D(d_model, dff)
        self.c_proj = Conv1D(dff, d_model)
        self.act = F.gelu
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, d_model] to the same shape."""
        hidden = self.c_fc(x)
        # hidden => [batch_size, seq_len, dff]

        hidden = self.act(hidden)
        hidden = self.dropout(hidden)

        # project back down to the model width
        return self.c_proj(hidden)
        # => [batch_size, seq_len, d_model]

In [0]:
class Attention(nn.Module):
    """Multi-head causal (masked) self-attention.

    A single ``Conv1D`` produces queries, keys and values; each position may
    only attend to itself and to earlier positions.

    Args:
        d_model: model width; must be divisible by ``n_head``.
        n_head: number of attention heads.
        n_ctx: maximum sequence length; sets the size of the causal mask buffer.
        d_head: accepted for interface compatibility; the per-head size is
            always derived as ``d_model // n_head``.
        bias: accepted for interface compatibility; not used.
        scale: if True, divide attention scores by ``sqrt(head_dim)``.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, d_model=768, n_head=12, n_ctx=1024, d_head=64, bias=True, scale=False):
        super().__init__()

        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )

        self.n_head = n_head
        self.d_model = d_model
        self.c_attn = Conv1D(d_model, d_model * 3)
        self.scale = scale
        self.softmax = nn.Softmax(dim=-1)
        # Lower-triangular matrix of ones: entry (i, j) is 1 when query i may see key j.
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.dropout = nn.Dropout(0.1)
        self.c_proj = Conv1D(d_model, d_model)

    def split_heads(self, x):
        """Reshape [batch_size, seq_len, d_model] to [batch_size, n_heads, seq_len, head_dim]."""
        new_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        # new_shape => [batch_size, seq_len, n_heads, head_dim]

        x = x.view(*new_shape)
        return x.permute(0, 2, 1, 3)

    def _attn(self, q, k, v, attn_mask=None):
        """Compute causally masked attention.

        Args:
            q, k, v: tensors of shape [batch_size, n_heads, seq_len, head_dim].
            attn_mask: optional additive mask broadcastable to
                [batch_size, n_heads, seq_len, seq_len]; it is added to the
                scores before the softmax.

        Returns:
            Tensor of shape [batch_size, n_heads, seq_len, head_dim].
        """
        scores = torch.matmul(q, k.transpose(-2, -1))
        # q   => [batch_size, n_heads, seq_len, head_dim]
        # k^t => [batch_size, n_heads, head_dim, seq_len]
        # scores => [batch_size, n_heads, seq_len, seq_len]

        if self.scale:
            scores = scores / math.sqrt(v.size(-1))

        # Causal mask: slice the precomputed triangle to the current lengths
        # and push the scores of future positions to the most negative value
        # so they receive zero probability after the softmax.
        nd, ns = scores.size(-2), scores.size(-1)
        causal = self.bias[:, :, ns - nd:ns, :ns]
        scores = scores.masked_fill(causal == 0, torch.finfo(scores.dtype).min)

        if attn_mask is not None:
            scores = scores + attn_mask
        scores = self.softmax(scores)
        scores = self.dropout(scores)
        # scores => [batch_size, n_heads, seq_len, seq_len]
        # v      => [batch_size, n_heads, seq_len, head_dim]

        outputs = torch.matmul(scores, v)
        # outputs => [batch_size, n_heads, seq_len, head_dim]

        return outputs

    def merge_heads(self, x):
        """Reshape [batch_size, n_heads, seq_len, head_dim] back to [batch_size, seq_len, d_model]."""
        # permute makes the tensor non-contiguous, so copy before calling view
        x = x.permute(0, 2, 1, 3).contiguous()
        # x => [batch_size, seq_len, n_heads, head_dim]

        new_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        # new_shape => [batch_size, seq_len, d_model]

        return x.view(*new_shape)

    def forward(self, x, attn_mask=None):
        """Apply causal self-attention.

        Args:
            x: tensor of shape [batch_size, seq_len, d_model].
            attn_mask: optional additive mask forwarded to ``_attn``.

        Returns:
            Tensor of shape [batch_size, seq_len, d_model].
        """
        x = self.c_attn(x)
        # x => [batch_size, seq_len, d_model * 3]

        q, k, v = x.split(self.d_model, dim=-1)
        # q, k, v => [batch_size, seq_len, d_model]

        q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v)
        # q, k, v => [batch_size, n_heads, seq_len, head_dim]

        out = self._attn(q, k, v, attn_mask=attn_mask)
        # out => [batch_size, n_heads, seq_len, head_dim]

        out = self.merge_heads(out)
        # out => [batch_size, seq_len, d_model]

        out = self.c_proj(out)
        # out => [batch_size, seq_len, d_model]

        return out

In [0]:
class TransformerBlock(nn.Module):
    """One pre-LayerNorm Transformer decoder block: self-attention, then an MLP.

    Each sub-layer is applied to a layer-normalized copy of its input and the
    result is added back to the input (residual connection).

    Args:
        d_model: model width.
        n_head: number of attention heads.
        dropout: dropout probability used inside the feed-forward network.
        n_ctx: maximum sequence length handled by the attention layer.
        scale: whether attention scores are divided by ``sqrt(head_dim)``.
            GPT-2 uses scaled attention, so this defaults to True; pretrained
            GPT-2 weights expect it.
    """

    def __init__(self, d_model=768, n_head=12, dropout=0.1, n_ctx=1024, scale=True):
        super().__init__()

        self.attn = Attention(
            d_model,
            n_head,
            d_head=d_model // n_head,
            n_ctx=n_ctx,
            bias=True,
            scale=scale,
        )
        self.feedforward = FeedForward(dropout, d_model, d_model*4)
        self.ln_1 = nn.LayerNorm(d_model)
        self.ln_2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Transform ``x`` of shape [batch_size, seq_len, d_model]; the shape is preserved."""
        # layer-normalize, self-attend, then add the residual
        x = x + self.attn(self.ln_1(x))

        # layer-normalize, run the feed-forward network, then add the residual
        x = x + self.feedforward(self.ln_2(x))
        return x

In [0]:
def _get_clones(module, n):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


class GPT2(nn.Module):
    """GPT-2 language model: token + position embeddings, a stack of
    Transformer blocks, a final LayerNorm and a vocabulary projection whose
    weight is tied to the token embedding.

    Args:
        n_layers: number of Transformer blocks.
        n_ctx: number of learned position embeddings (maximum sequence length).
        d_model: model width.
        vcb_sz: vocabulary size.
        n_head: number of attention heads in every block.
        dropout: dropout probability for the embeddings and the blocks'
            feed-forward networks.
    """

    def __init__(self, n_layers=12, n_ctx=1024, d_model=768, vcb_sz=50257, n_head=12, dropout=0.1):
        super().__init__()

        self.n_layers = n_layers
        # Build the blocks from the requested width and head count instead of
        # fixed constants, so non-default configurations stay consistent with
        # the embeddings and the final LayerNorm.
        block = TransformerBlock(d_model=d_model, n_head=n_head, dropout=dropout)
        self.h = _get_clones(block, n_layers)
        self.wte = nn.Embedding(vcb_sz, d_model)
        self.wpe = nn.Embedding(n_ctx, d_model)
        self.drop = nn.Dropout(dropout)
        self.ln_f = nn.LayerNorm(d_model)
        self.out = nn.Linear(d_model, vcb_sz, bias=False)
        self.loss_fn = nn.CrossEntropyLoss()
        self.init_weights()

    def init_weights(self):
        """Tie the output projection to the token embedding, then initialise every sub-module."""
        self.out.weight = self.wte.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module: N(0, 0.02^2) weights, zero biases, unit LayerNorm gain."""
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, src, labels=None, pos_ids=None):
        """Run the model.

        Args:
            src: token ids of shape [batch_size, seq_len].
            labels: optional target ids of shape [batch_size, seq_len]. When
                given, the next-token cross-entropy loss is computed.
            pos_ids: optional position ids broadcastable to ``src``; defaults
                to ``0 .. seq_len - 1``.

        Returns:
            ``logits`` of shape [batch_size, seq_len, vocab_size] when
            ``labels`` is None; otherwise the tuple
            ``(loss, logits, hidden_states)``.
        """
        # create positional ids if not present, on the same device as the
        # input so the embedding lookup also works when the model is on GPU
        if pos_ids is None:
            pos_ids = torch.arange(
                src.size(-1), dtype=torch.long, device=src.device
            ).unsqueeze(0)
            # pos_ids => [1, seq_len]

        hidden = self.drop(self.wte(src) + self.wpe(pos_ids))
        # hidden => [batch_size, seq_len, d_model]

        for block in self.h:
            hidden = block(hidden)

        # final layer normalization
        hidden = self.ln_f(hidden)

        # prediction layer
        logits = self.out(hidden)
        # logits => [batch_size, seq_len, vocab_size]

        if labels is None:
            return logits

        # The logit at position t predicts the token at t + 1, so drop the
        # last logit and the first label before comparing them.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss = self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        # => (loss, logits, hidden_states)
        return (loss, logits, hidden)

In [14]:
# Download the pretrained GPT-2 weights published by Hugging Face
!wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin


--2020-06-08 17:04:40--  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.40.190
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.40.190|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 548118077 (523M) [application/octet-stream]
Saving to: ‘gpt2-pytorch_model.bin.1’


2020-06-08 17:04:48 (69.8 MB/s) - ‘gpt2-pytorch_model.bin.1’ saved [548118077/548118077]



In [48]:
model = GPT2()

# Map every tensor to CPU so the checkpoint loads without a GPU.
state_dict = torch.load('./gpt2-pytorch_model.bin', map_location='cpu')

# The Hugging Face state dict names the feed-forward network `mlp`;
# rename those keys to `feedforward` so they match this implementation.
state_dict = {
    key.replace("mlp", "feedforward"): value
    for key, value in state_dict.items()
}

# Keep only the tensors this model actually defines.
model_keys = set(model.state_dict())
pretrained_dict = {k: v for k, v in state_dict.items() if k in model_keys}

# strict=False because `out.weight` is not stored in the checkpoint: it is
# tied to `wte.weight` and is filled in when the embedding is loaded.
load_result = model.load_state_dict(pretrained_dict, strict=False)

# Anything else that is missing would silently stay randomly initialised,
# so report it instead of hiding it.
not_loaded = [k for k in load_result.missing_keys if k != "out.weight"]
if not_loaded:
    print(f"Warning: {len(not_loaded)} parameters were not found in the checkpoint: {not_loaded}")

model.eval()


GPT2(
  (h): ModuleList(
    (0): TransformerBlock(
      (attn): Attention(
        (c_attn): Conv1D()
        (softmax): Softmax(dim=-1)
        (dropout): Dropout(p=0.1, inplace=False)
        (c_proj): Conv1D()
      )
      (feedforward): FeedForward(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (attn): Attention(
        (c_attn): Conv1D()
        (softmax): Softmax(dim=-1)
        (dropout): Dropout(p=0.1, inplace=False)
        (c_proj): Conv1D()
      )
      (feedforward): FeedForward(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (2): 

In [21]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |▌                               | 10kB 16.4MB/s eta 0:00:01[K     |█                               | 20kB 2.2MB/s eta 0:00:01[K     |█▌                              | 30kB 2.5MB/s eta 0:00:01[K     |██                              | 40kB 2.7MB/s eta 0:00:01[K     |██▍                             | 51kB 2.5MB/s eta 0:00:01[K     |███                             | 61kB 2.8MB/s eta 0:00:01[K     |███▍                            | 71kB 3.0MB/s eta 0:00:01[K     |███▉                            | 81kB 3.2MB/s eta 0:00:01[K     |████▍                           | 92kB 3.1MB/s eta 0:00:01[K     |████▉                           | 102kB 3.2MB/s eta 0:00:01[K     |█████▍                          | 112kB 3.2MB/s eta 0:00:01[K     |█████▉                          | 122kB 3.2

In [22]:
# Load the pretrained "gpt2" tokenizer; it is used below to encode prompts
# into token ids and to decode generated ids back into text.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [0]:
def generate(context, ntok=20, top_k=10):
    """Extend ``context`` by sampling ``ntok`` tokens with top-k sampling.

    Uses the module-level ``model``. At every step only the ``top_k`` most
    likely next tokens are kept, and one of them is sampled according to its
    renormalised probability.

    Args:
        context: token ids of shape [batch_size, seq_len].
        ntok: number of tokens to append.
        top_k: number of highest-scoring candidates kept at each step
            (clamped to the vocabulary size).

    Returns:
        Tensor of shape [batch_size, seq_len + ntok].
    """
    # Sampling needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for _ in range(ntok):
            # logits for the last position only => [batch_size, vocab_size]
            logits = model(context)[:, -1, :]

            # Smallest score that is still inside the top-k, per batch row.
            k = min(top_k, logits.size(-1))
            kth_best = torch.topk(logits, k)[0][..., -1, None]

            # Everything below that threshold gets zero probability.
            logits = logits.masked_fill(logits < kth_best, float("-inf"))

            next_tok = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            # next_tok => [batch_size, 1]

            context = torch.cat([context, next_tok], dim=-1)
    return context

In [133]:
# Encode the prompt as a batch containing one sequence, sample 15 more
# tokens, and decode the resulting ids back into text.
context = torch.tensor([tokenizer.encode("The world is ")])
out = generate(context, ntok=15)
tokenizer.decode(out[0])

'The world is full of surprises.\n\n\n\n\n\n\n\n\n"\n'

In [134]:
# Same prompt again: generation is sampled, so the continuation can differ
# from run to run.
context = torch.tensor([tokenizer.encode("The world is ")])
out = generate(context, ntok=15)
tokenizer.decode(out[0])

'The world is so rich in its own country, it is poor, it is. in'

In [143]:
# A different prompt, again extended by 15 sampled tokens and decoded.
context = torch.tensor([tokenizer.encode("The planet earth is ")])
out = generate(context, ntok=15)
tokenizer.decode(out[0])

'The planet earth is a beautiful place where you live, you have your heart. is my is'