In [2]:
import torch
from torch import nn
import torch.nn.functional as F

In [11]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTConfig

In [4]:
from datasets import load_dataset

# Test Pretrained GTP and FineTune on a Downstream Classification Task

In [8]:
bookcorpus_ds = load_dataset("bookcorpus/bookcorpus", split=['train'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [10]:
bookcorpus_ds[0]['text']

['usually , he would be tearing around the living room , playing with his toys .',
 'but just one look at a minion sent him practically catatonic .',
 "that had been megan 's plan when she got him dressed earlier .",
 "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
 "`` are n't you being a good boy ? ''",
 'she said .',
 'mason barely acknowledged her .',
 'instead , his baby blues remained focused on the television .',
 'since the movie was almost over , megan knew she better slip into the bedroom and finish getting ready .',
 "each time she looked into mason 's face , she was grateful that he looked nothing like his father .",
 'his platinum blond hair and blue eyes were completely hers .',
 'it was only his 

In [15]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

In [16]:
model = OpenAIGPTModel.from_pretrained("openai-community/openai-gpt")

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

In [25]:
model.config

OpenAIGPTConfig {
  "_name_or_path": "openai-community/openai-gpt",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "vocab_size": 40478
}

In [17]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

In [39]:
tokenizer.get_vocab()

{'animously</w>': 39373,
 'mu</w>': 36758,
 'freeze</w>': 9233,
 'cayden</w>': 40309,
 'scrolled</w>': 18085,
 'gordy</w>': 22286,
 'vagrant</w>': 38184,
 'joy': 7378,
 'raved</w>': 40404,
 'roles</w>': 18476,
 'affliction</w>': 30217,
 'chariah</w>': 31689,
 'whipla': 34852,
 'lishment</w>': 11941,
 'pock': 13669,
 'rima</w>': 36035,
 'apprehensively</w>': 29717,
 'swapped</w>': 27015,
 'weepy</w>': 39376,
 'barber</w>': 26784,
 'schem': 14433,
 'screw': 20634,
 'pastures</w>': 29508,
 'medic': 3412,
 'seem</w>': 2153,
 'nachi': 35569,
 'lunacy</w>': 37458,
 'besieged</w>': 35784,
 'juarez</w>': 33999,
 'cynically</w>': 39511,
 'georg': 19775,
 'longer</w>': 1872,
 'natural</w>': 3342,
 'sidhe</w>': 16634,
 'shackled</w>': 26255,
 'commanded</w>': 7389,
 'elected</w>': 15188,
 'haughtily</w>': 33030,
 'looked</w>': 816,
 'names</w>': 4009,
 'nicolae</w>': 24818,
 'assaulting</w>': 30156,
 'showed</w>': 2699,
 'beet</w>': 31578,
 'hag</w>': 19192,
 'figuring</w>': 11436,
 'ali</w>': 67

In [53]:
print(tokenizer.__dict__)



In [21]:
outputs = model(**inputs)

In [26]:
last_hidden_states = outputs.last_hidden_state

In [27]:
last_hidden_states.shape

torch.Size([1, 6, 768])

In [54]:
yelp_review_ds = load_dataset("yelp_review_full")

Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Implement a GPT from scratch

In [55]:
import math

In [77]:
n_position = 32
d_model = 512
n_head = 2
initializer_range = 0.02
dropout = 0.1
vocab_size = len(tokenizer.get_vocab())
batch_size = 8
n_layer = 3

In [59]:
embedding = torch.rand(batch_size, n_positions, d_model)

In [60]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_head == 0, "The dimension of model must be divisible by the number of heads"
        
        self.d_model = d_model
        self.n_head = n_head
        self.d_head = d_model // n_head

        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.proj = nn.Linear(d_model, d_model)

    def split_into_heads(self, x, batch_size):
        x = x.contiguous().view(batch_size, -1, self.n_head, self.d_head)
        return x.permute(0,2,1,3)

    def forward(self, v, k, q, mask=None):
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)
        batch_size = q.size(0)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        attention = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
        if mask is not None:
            attention = attention.masked_fill_(mask, '-inf')
        attention = F.softmax(attention, dim=-1)

        output = torch.matmul(attention, v)
        output = output.transpose(1, 2)
        output = output.contiguous().view(batch_size, -1, self.d_model)
        
        output = self.proj(output)
        return output

In [61]:
mla = MultiHeadAttention(d_model, n_head)

In [62]:
mla(embedding, embedding, embedding).size()

torch.Size([8, 32, 512])

In [65]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dropout):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, 2*d_model)
        self.linear2 = nn.Linear(2*d_model, d_model)
        self.relu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [66]:
ff = FeedForward(d_model, dropout)

In [67]:
ff(embedding).size()

torch.Size([8, 32, 512])

In [73]:
class GPTBlock(nn.Module):
    def __init__(self, d_model, n_head, dropout):
        super(GPTBlock, self).__init__()
        self.mla = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ff = FeedForward(d_model, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.mla(x, x, x, mask)
        x = x + self.dropout1(attn_out)
        x = self.norm1(x)

        ff_out = self.ff(x)
        x = x + self.dropout2(ff_out)
        x = self.norm2(x)
        return x

In [74]:
bptb = GPTBlock(d_model, n_head, dropout)

In [75]:
bptb(embedding).size()

torch.Size([8, 32, 512])

In [90]:
class GPTBase(nn.Module):
    def __init__(self, d_model, n_head, vocab_size, n_position, n_layer, dropout=0.1):
        super(GPTBase, self).__init__()
        self.d_model = d_model
        self.n_head = n_head

        self.eb = nn.Embedding(vocab_size, d_model)
        self.pe = nn.Parameter(torch.zeros(1, n_position, d_model))
        self.gptbs = nn.ModuleList([GPTBlock(d_model, n_head, dropout) for _ in range(n_layer)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        x = self.eb(x)
        x = x + self.pe
        for gptb in self.gptbs:
            x = gptb(x, mask)
        return self.norm(x)

In [91]:
gpt_model = GPTBase(d_model, n_head, vocab_size, n_position, n_layer, dropout)

In [92]:
tk = torch.randint(0, vocab_size, (batch_size, n_position))

In [93]:
gpt_model(tk).size()

torch.Size([8, 32, 512])

In [94]:
embedding.shape

torch.Size([8, 32, 512])

In [95]:
class AutoRegressiveHead(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(AutoRegressiveHead, self).__init__()
        self.linear = nn.Linear(d_model, vocab_size)
    
    def forward(self, x):
        return self.linear(x)

In [96]:
arh = AutoRegressiveHead(d_model, vocab_size)

In [97]:
arh(embedding).size()

torch.Size([8, 32, 40478])