<a href="https://colab.research.google.com/github/doudi25/GPT2-custom-smallest-one-/blob/main/Minimal_implementation_of_GPT2(small).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [145]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the model
model_original = GPT2LMHeadModel.from_pretrained('gpt2')


In [236]:
# Input text
input_text = "can you tell me if you are a language model?"

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt')  # Return PyTorch tensors


In [147]:
input_ids


tensor([[5171,  345, 1560,  502,  611,  345,  389,  257, 3303, 2746,   30]])

In [148]:
# Generate text
output = model_original.generate(input_ids, max_length=20, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


can you tell me if you are a language model?

I'm not a language model.


In [240]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import tiktoken
import time
import inspect

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
class CausalSelfAttention(nn.Module):
    def __init__(self,config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias",torch.tril(torch.ones(config.block_size,config.block_size)).view(1,1,config.block_size,config.block_size))
    def forward(self,x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        query , key , value = qkv.split(self.n_embd,dim=2)

        query = query.view(B,T,self.n_head,C // self.n_head).transpose(1,2)

        key = key.view(B,T,self.n_head,C // self.n_head).transpose(1,2)

        value = value.view(B,T,self.n_head,C // self.n_head).transpose(1,2)

        attention_score = (query @ key.transpose(-2,-1)) * (1.0/ math.sqrt(key.size(-1)))
        attention_score  = attention_score.masked_fill(self.bias[:,:,:T,:T]==0,float('-inf'))

        attention_score = F.softmax(attention_score,dim = -1)

        y = torch.matmul(attention_score,value)

        y = y.transpose(1,2).contiguous().view(B,T,C)
        y = self.c_proj(y)
        return
class MLP(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)

        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
    def forward(self,x):
      x = self.c_fc(x)
      x = F.gelu(x)
      x = self.c_proj(x)
      return x
class Block(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)
    def forward(self,x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
class GPT(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.vocab_size,config.n_embd)
        self.wpe = nn.Embedding(config.block_size,config.n_embd)
        self.layers = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size,bias=False)
        self.wte.weight = self.lm_head.weight
    def forward(self,idx,targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size , f"Cannot forward sequence of length {T}"
        pos = torch.arange(0,T, dtype=torch.long,device= idx.device)
        pos_emb = self.wpe(pos)
        tok_emb = self.wte(idx)
        x = tok_emb + pos_emb
        for block in self.layers:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits

In [241]:
config = GPTConfig()

In [242]:
model = GPT(config)

In [126]:
!pip install tiktoken



In [243]:
save = model.state_dict()
save2 = model_original.state_dict()

In [244]:
keys = save.keys()
keys_original = save2.keys()

In [155]:
len(keys)

161

In [156]:
len(keys_original)

149

In [246]:
sd_keys_hf = [k for k in keys_original if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
keys = [k for k in keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
assert len(sd_keys_hf) == len(keys), f"mismatched keys: {len(sd_keys_hf)} != {len(keys)}"
for k in zip(keys,sd_keys_hf):
      if any(k[1].endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
          assert save2[k[1]].shape[::-1] == save[k[0]].shape
          with torch.no_grad():
                save[k[0]].copy_(save2[k[1]].t())
      else:
                # vanilla copy over the other parameters
          assert save2[k[1]].shape == save[k[0]].shape
          with torch.no_grad():
                save[k[0]].copy_(save2[k[1]])



In [None]:
# naive way of next token prediciton
tokens = input_ids
for i in range(30):  # Generate 30 new tokens
    out = model(tokens)  # Forward pass through the model
    out = out[:, -1, :]  # Get the logits for the last token
    out = torch.softmax(out/0.4, dim=-1)  # Convert logits to probabilities
    out = torch.multinomial(out, 1)  # Sample a token from the distribution
    tokens = torch.cat([tokens, out], dim=1)  # Append the new token to the sequence and repeat the process

In [None]:
generated_text = tokenizer.decode(tokens[0],add_special_tokens=True) #decoding the tokens into string
generated_text