In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim

In [67]:
import sacrebleu

In [2]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTConfig, AutoTokenizer
from tokenizers.processors import TemplateProcessing

In [3]:
from datasets import load_dataset

In [4]:
from tqdm.notebook import tqdm, trange

In [5]:
assert torch.cuda.device_count() > 1, "This script requires at least 2 GPUs"
device_ids = [i for i in range(torch.cuda.device_count())]

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Test Pretrained GTP and FineTune on a Downstream Classification Task

In [7]:
bookcorpus_ds = load_dataset("bookcorpus/bookcorpus", split=['train'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [8]:
bookcorpus_ds[0]['text']

['usually , he would be tearing around the living room , playing with his toys .',
 'but just one look at a minion sent him practically catatonic .',
 "that had been megan 's plan when she got him dressed earlier .",
 "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
 "`` are n't you being a good boy ? ''",
 'she said .',
 'mason barely acknowledged her .',
 'instead , his baby blues remained focused on the television .',
 'since the movie was almost over , megan knew she better slip into the bedroom and finish getting ready .',
 "each time she looked into mason 's face , she was grateful that he looked nothing like his father .",
 'his platinum blond hair and blue eyes were completely hers .',
 'it was only his 

In [9]:
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-community/openai-gpt", padding_side="right", truncation_side="left", pad_token="<pad>", eos_token="</s>", bos_token="<s>", unk_token="<unk>", cls_token="<cls>", maks_token="<mask>", sep_token="<sep>")

In [10]:
tokenizer.__dict__

{'nlp': <spacy.tokenizer.Tokenizer at 0x7f39dd770ca0>,
 'fix_text': <function ftfy.fix_text(text, fix_entities='auto', remove_terminal_escapes=True, fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True, uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True, remove_control_chars=True, remove_bom=True, normalization='NFC', max_decode_length=1000000)>,
 'encoder': {'.': 1,
  ',': 2,
  't': 3,
  'h': 4,
  'e': 5,
  '"': 6,
  'o': 7,
  'a': 8,
  'n': 9,
  'd': 10,
  'i': 11,
  'f': 12,
  'w': 13,
  's': 14,
  'y': 15,
  'u': 16,
  'r': 17,
  "'": 18,
  '?': 19,
  'm': 20,
  'b': 21,
  '-': 22,
  'v': 23,
  'p': 24,
  'c': 25,
  'l': 26,
  'k': 27,
  'j': 28,
  '!': 29,
  'g': 30,
  '*': 31,
  ';': 32,
  ':': 33,
  'x': 34,
  'q': 35,
  'z': 36,
  ')': 37,
  '(': 38,
  '1': 39,
  '/': 40,
  '_': 41,
  '2': 42,
  '3': 43,
  '4': 44,
  '~': 45,
  '5': 46,
  '#': 47,
  '0': 48,
  '6': 49,
  '7': 50,
  '$': 51,
  '>': 52,
  '9': 53,
  '8': 54,
  '[': 55,
  ']': 56,
  '<'

In [11]:
model = OpenAIGPTModel.from_pretrained("openai-community/openai-gpt")

In [12]:
model.config

OpenAIGPTConfig {
  "_name_or_path": "openai-community/openai-gpt",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "vocab_size": 40478
}

In [13]:
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="pt", truncation=True, padding=True, add_special_tokens=True)

In [14]:
inputs

tensor([[3570,  240,  547, 2585,  544, 4957]])

In [15]:
outputs = tokenizer.batch_decode(inputs)

In [16]:
outputs

['hello, my dog is cute']

In [17]:
tokenizer.get_vocab()

{'.': 1,
 ',': 2,
 't': 3,
 'h': 4,
 'e': 5,
 '"': 6,
 'o': 7,
 'a': 8,
 'n': 9,
 'd': 10,
 'i': 11,
 'f': 12,
 'w': 13,
 's': 14,
 'y': 15,
 'u': 16,
 'r': 17,
 "'": 18,
 '?': 19,
 'm': 20,
 'b': 21,
 '-': 22,
 'v': 23,
 'p': 24,
 'c': 25,
 'l': 26,
 'k': 27,
 'j': 28,
 '!': 29,
 'g': 30,
 '*': 31,
 ';': 32,
 ':': 33,
 'x': 34,
 'q': 35,
 'z': 36,
 ')': 37,
 '(': 38,
 '1': 39,
 '/': 40,
 '_': 41,
 '2': 42,
 '3': 43,
 '4': 44,
 '~': 45,
 '5': 46,
 '#': 47,
 '0': 48,
 '6': 49,
 '7': 50,
 '$': 51,
 '>': 52,
 '9': 53,
 '8': 54,
 '[': 55,
 ']': 56,
 '<': 57,
 '&': 58,
 '%': 59,
 '¨': 60,
 '`': 61,
 'é': 62,
 '»': 63,
 '«': 64,
 '=': 65,
 '•': 66,
 '@': 67,
 '+': 68,
 '©': 69,
 '¡': 70,
 '{': 71,
 '}': 72,
 'ª': 73,
 'ñ': 74,
 'ï': 75,
 '‖': 76,
 'ç': 77,
 'í': 78,
 '^': 79,
 '£': 80,
 '§': 81,
 '♥': 82,
 '−': 83,
 'à': 84,
 '|': 85,
 '°': 86,
 '¦': 87,
 'ł': 88,
 'ĩ': 89,
 'ü': 90,
 '®': 91,
 'ù': 92,
 'á': 93,
 'â': 94,
 'ó': 95,
 'è': 96,
 '∞': 97,
 'ë': 98,
 'ä': 99,
 '♪': 100,
 'ò': 10

In [18]:
len(tokenizer.get_vocab())

40483

In [19]:
tokenizer.decode(tokenizer.pad_token_id)

'<pad>'

In [20]:
print(tokenizer.__dict__)



In [21]:
outputs = model(inputs)

In [22]:
last_hidden_states = outputs.last_hidden_state

In [23]:
last_hidden_states.shape

torch.Size([1, 6, 768])

In [24]:
yelp_review_ds = load_dataset("yelp_review_full")

# Implement a GPT from scratch

In [25]:
import math

In [26]:
n_position = 64
max_generation_length = 50
d_model = 512
n_head = 2
initializer_range = 0.02
dropout = 0.1
vocab_size = len(tokenizer.get_vocab())
batch_size = 64
n_layer = 3

In [27]:
def create_causal_mask(size, device='cuda'):
    mask = torch.triu(torch.ones(size, size, device=device), diagonal=1).bool()
    return mask

In [28]:
def create_token_mask(sequence, padding_id=0, device='cuda'):
    mask = (sequence == padding_id).unsqueeze(1).unsqueeze(2)
    return mask.to(device)

In [29]:
embedding = torch.rand(batch_size, n_position, d_model)

In [30]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_head == 0, "The dimension of model must be divisible by the number of heads"
        
        self.d_model = d_model
        self.n_head = n_head
        self.d_head = d_model // n_head

        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.proj = nn.Linear(d_model, d_model)

    def split_into_heads(self, x, batch_size):
        x = x.contiguous().view(batch_size, -1, self.n_head, self.d_head)
        return x.permute(0,2,1,3)

    def forward(self, v, k, q, mask=None):
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)
        batch_size = q.size(0)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        attention = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
        if mask is not None:
            attention = attention.masked_fill(mask, float('-inf'))
        attention = F.softmax(attention, dim=-1)

        output = torch.matmul(attention, v)
        output = output.transpose(1, 2)
        output = output.contiguous().view(batch_size, -1, self.d_model)
        
        output = self.proj(output)
        return output

In [31]:
mla = MultiHeadAttention(d_model, n_head)

In [32]:
mla(embedding, embedding, embedding).size()

torch.Size([64, 64, 512])

In [33]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dropout):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, 2*d_model)
        self.linear2 = nn.Linear(2*d_model, d_model)
        self.relu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [34]:
ff = FeedForward(d_model, dropout)

In [35]:
ff(embedding).size()

torch.Size([64, 64, 512])

In [36]:
class GPTBlock(nn.Module):
    def __init__(self, d_model, n_head, dropout):
        super(GPTBlock, self).__init__()
        self.mla = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ff = FeedForward(d_model, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.mla(x, x, x, mask)
        x = x + self.dropout1(attn_out)
        x = self.norm1(x)

        ff_out = self.ff(x)
        x = x + self.dropout2(ff_out)
        x = self.norm2(x)
        return x

In [37]:
bptb = GPTBlock(d_model, n_head, dropout)

In [38]:
bptb(embedding).size()

torch.Size([64, 64, 512])

In [39]:
class GPTBase(nn.Module):
    def __init__(self, d_model, n_head, vocab_size, n_position, n_layer, dropout=0.1, initializer_range=0.02):
        super(GPTBase, self).__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.vocab_size = vocab_size
        self.n_position = n_position
        self.n_layer = n_layer
        self.dropout = dropout
        self.initializer_range = initializer_range

        self.eb = nn.Embedding(vocab_size, d_model)
        self.pe = nn.Parameter(torch.zeros(1, n_position, d_model))
        self.gptbs = nn.ModuleList([GPTBlock(d_model, n_head, dropout) for _ in range(n_layer)])
        self.norm = nn.LayerNorm(d_model)

        self.linear = nn.Linear(d_model, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.initializer_range, a=-2*self.initializer_range, b=2*self.initializer_range)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def generate_embeddings(self, x, mask=None):
        x = self.eb(x)
        x = x + self.pe
        for gptb in self.gptbs:
            x = gptb(x, mask)
        return self.norm(x)

    def forward(self, x, mask=None):
        x = self.generate_embeddings(x, mask=mask)
        return self.linear(x)

    def generate_text(self, init_prompts, tokenizer, max_generation_length=50):
        device = next(self.parameters()).device
        batch_size = len(init_prompts)
        current_length = 0
        self.eval()

        # This tokenizer should be left padding
        generation_tokens = tokenizer(init_prompts, return_tensors='pt', max_length=self.n_position, padding='max_length', truncation=True, add_special_tokens=False).input_ids
        generation_tokens = generation_tokens.to(device)

        for idx in range(max_generation_length):
            window_tokens = generation_tokens[:, -self.n_position:]
            padding_mask = create_token_mask(window_tokens, tokenizer.pad_token_id, device)
            causal_mask = create_causal_mask(self.n_position, device)
            combined_mask = padding_mask | causal_mask
            
            with torch.no_grad():
                outputs = gpt_model(window_tokens, mask=combined_mask)
                next_token_logits = outputs[:,-1,:]
                next_token_logits[:, tokenizer.eos_token_id] = float('-inf')
                next_tokens = next_token_logits.argmax(dim=-1, keepdim=True)

            if (next_tokens == tokenizer.eos_token_id).all():
                break
            generation_tokens = torch.cat([generation_tokens, next_tokens], dim=1)
            current_length = idx + 1

        return ['']*batch_size if current_length==0 else tokenizer.batch_decode(generation_tokens[:, -current_length:], skip_special_tokens=True)

In [40]:
gpt_model = GPTBase(d_model, n_head, vocab_size, n_position, n_layer, dropout)

In [41]:
tk = torch.randint(0, vocab_size, (batch_size, n_position))

In [42]:
gpt_model(tk).size()

torch.Size([64, 64, 40483])

In [43]:
gpt_model = nn.DataParallel(gpt_model, device_ids=device_ids)
gpt_model = gpt_model.to('cuda')

# Pretrain GPT for TextCompletion

In [44]:
yelp_review_ds_train = yelp_review_ds['train']

In [45]:
yelp_review_dataloader_train = DataLoader(yelp_review_ds_train, batch_size=batch_size, shuffle=True)

In [46]:
yelp_review_ds_test = yelp_review_ds['test']

In [47]:
yelp_review_dataloader_test = DataLoader(yelp_review_ds_train, batch_size=batch_size, shuffle=True)

In [48]:
len(next(iter(yelp_review_dataloader_test))['text'])

64

In [49]:
lr = 5e-5

In [50]:
optimizer = optim.AdamW(gpt_model.parameters(), lr=lr)

In [51]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [77]:
def train_epoch(model, data_loader, optimizer, criterion, device='cuda'):
    model.train()
    total_loss = 0

    for batch in data_loader:
        texts = batch['text']
        batch_size = len(texts)

        tokenized_texts = [tokenizer(text, truncation=True, padding=False, max_length=n_position, add_special_tokens=False).input_ids for text in texts]
        tokenized_texts = [text + [tokenizer.eos_token_id] for text in tokenized_texts]

        encodings = tokenizer.pad(
            {"input_ids": tokenized_texts},
            padding=True,
            max_length=n_position+1,
            return_tensors="pt"
        ).input_ids
        inputs = encodings[:, :-1].to(device)
        targets = encodings[:, 1:].to(device)

        padding_mask = create_token_mask(inputs, tokenizer.pad_token_id, device)
        causal_mask = create_causal_mask(inputs.size(1), device)
        combined_mask = padding_mask | causal_mask

        optimizer.zero_grad()
        pred = model(inputs, mask=combined_mask)
        pred = pred.view(-1, pred.size(-1))

        loss = criterion(pred, targets.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [78]:
def train_model(model, train_loader, optimizer, criterion, epochs, device='cuda'):
    with trange(epochs, desc="Training", leave=False) as pbar:
        for epoch in pbar:
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            pbar.set_postfix({'Train Loss': f"{train_loss:.4f}"})

In [79]:
train_model(gpt_model, yelp_review_dataloader_train, optimizer, criterion, 10, device=device)

Training:   0%|          | 0/10 [00:00<?, ?it/s]

# Test GPT for TextCompletion

In [181]:
def generate_text(model, init_prompts, tokenizer, max_generation_length=50):
    device = next(model.parameters()).device
    batch_size = len(init_prompts)
    current_length = 0
    model.eval()

    # This tokenizer should be left padding
    generation_tokens = tokenizer(init_prompts, return_tensors='pt', max_length=n_position, padding='max_length', truncation=True, add_special_tokens=False).input_ids
    generation_tokens = generation_tokens.to(device)

    for idx in range(max_generation_length):
        window_tokens = generation_tokens[:, -n_position:]
        padding_mask = create_token_mask(window_tokens, tokenizer.pad_token_id, device)
        causal_mask = create_causal_mask(n_position, device)
        combined_mask = padding_mask | causal_mask
        
        with torch.no_grad():
            outputs = gpt_model(window_tokens, mask=combined_mask)
            next_token_logits = outputs[:,-1,:]
            next_token_logits[:, tokenizer.eos_token_id] = float('-inf')
            next_tokens = next_token_logits.argmax(dim=-1, keepdim=True)

        if (next_tokens == tokenizer.eos_token_id).all():
            break
        generation_tokens = torch.cat([generation_tokens, next_tokens], dim=1)
        current_length = idx + 1

    return ['']*batch_size if current_length==0 else tokenizer.batch_decode(generation_tokens[:, -current_length:], skip_special_tokens=True)

In [186]:
prompts = [
    "I must admit",
    "I like",
    "I hate this"
]

In [187]:
generate_text(gpt_model, prompts, tokenizer, max_generation_length)

['the art.', 'the music', 'place..']

In [110]:
generated_text = gpt_model.module.generate_text(prompts, test_tokenizer, max_generation_length)

In [111]:
generated_text

['', '', '']

In [188]:
def test_bleu(dataloader, model, tokenizer, max_len, device='cuda'):
    all_pred_texts = []
    all_ref_texts = []

    for idx, test_batch in enumerate(tqdm(dataloader, total=100, desc="Test Bleu Score")):
        if idx > 99:
            break
        texts = test_batch['text']
        init_prompts = [text[:min(len(text)//2, max_len//2)] for text in texts]
        ref_texts = [text[min(len(text)//2, max_len//2):] for text in texts]
        
        # Translate the current minibatch
        pred_texts = generate_text(gpt_model, init_prompts, tokenizer, max_generation_length=max_len//2)
        
        # Store predictions and references
        all_pred_texts.extend(pred_texts)
        all_ref_texts.extend(ref_texts)
        
    bleu = sacrebleu.corpus_bleu(all_pred_texts, [all_ref_texts])
    return bleu

In [189]:
with torch.no_grad():
    bleu = test_bleu(yelp_review_dataloader_test, gpt_model.module, tokenizer, n_position, device)
print(f"BLEU score: {bleu.score}")

Test Bleu Score:   0%|          | 0/100 [00:00<?, ?it/s]

BLEU score: 5.48677236018528e-08
