In [1]:
import torch
from transformers import GPT2LMHeadModel
import math
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
!wc input.txt

   40000  202651 1115394 input.txt


python(45929) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
model_hf = GPT2LMHeadModel.from_pretrained("gpt2") # 124
sd_hf = model_hf.state_dict()

for k,v in sd_hf.items():
    print(k,v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [3]:
sd_hf['transformer.wte.weight'].view(-1)[:20]

tensor([-0.1101, -0.0393,  0.0331,  0.1338, -0.0485, -0.0789, -0.2398, -0.0895,
         0.0253, -0.1074, -0.1811, -0.0672,  0.0739, -0.0161,  0.0117,  0.1245,
        -0.0020, -0.0815,  0.0338,  0.2365])

In [4]:
num_params = sum(p.numel() for p in model_hf.parameters())

In [5]:
millnames = ['',' Thousand',' Million',' Billion',' Trillion']

def millify(n):
    n = float(n)
    millidx = max(0,min(len(millnames)-1,
                        int(math.floor(0 if n == 0 else math.log10(abs(n))/3))))

    return '{:.0f}{}'.format(n / 10**(3 * millidx), millnames[millidx])

In [6]:
print(f'The total number of params in the model is {millify(num_params)}!')

The total number of params in the model is 124 Million!


In [7]:
from transformers import pipeline,set_seed
generator = pipeline('text-generation',model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,",max_length=30,num_return_sequences=5)

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': "Hello, I'm a language model, and my project is based on the idea of a language model.\n\nI want to have a language that's both expressive and readable.\n\nLet's look at the following code:\n\nimport Data.ByteString\n\ndef get ( self ):\n\nself.data = Data.ByteString(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data.get())\n\nself.data.append(data"},
 {'generated_text': "Hello, I'm a language model, not a language model, and if I don't understand it, I won't learn to write it. So I was writing a system for programming in JavaScript and a s

In [8]:
x = torch.zeros(768)
n = 100

for i in range(n):
    x += n**-0.5 * torch.randn_like(x)

print(x.std())   

tensor(0.9850)


In [9]:
0.1+0.7

0.7999999999999999

In [10]:
torch.__version__

'2.7.1'

In [11]:
import torch.nn as nn
from dataclasses import dataclass

In [12]:
@dataclass
class GPTconfig:
    block_size: int = 256
    vocab_size: int = 65
    n_layer: int = 6
    n_head: int = 2
    n_embd: int = 384
    droput = 0.1

In [13]:
size = [10,5,10]
x = torch.randn(size)
x.shape

torch.Size([10, 5, 10])

In [None]:
class CausalSelfAttention(nn.Module):
    def __init__(self,config:GPTconfig) -> None:
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.config = config
        self.c_attn = nn.Linear(config.n_embd,3*config.n_embd,bias=False)
        self.proj = nn.Linear(config.n_embd,config.n_embd,bias=False)
        self.dropout = nn.Dropout(config.droput)
        self.register_buffer('tril',torch.tril(torch.ones(config.block_size,config.block_size)).view(1,1,config.block_size,config.block_size))

    def forward(self,x:torch.Tensor):
        B,T,C = x.shape
        q,k,v = self.c_attn(x).split(self.config.n_embd,dim=-1)

        q = q.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2) # (10,5,2,5) -> # (10,2,5,5)
        k = k.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2)
        v = v.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2)
            # (B,n_head,T,T)
            # (10,2,5,5) @ (10,2,5,5) --> (10,2,5,5)
        att = (q@k.transpose(-2,-1)) * (1.0 / math.sqrt(k.size(-1)))  # scaling attention product by 1 / sqroot(Dk) to prevent the variance from getting too large
        att = att.masked_fill(self.tril[:,:,T,T]==0,float('-inf')) # In general our tril matrix is of size context length, but we never know of what size T are we pulling in now
        att  = F.softmax(att,dim=-1)
        # att = self.dropout(att)

        # (10,2,5,5) @ (10,2,5,5) = (10,2,5,5)
        out = att @ v

        # out = out.transpose(1,2).contiguous().view(B,T,C)
        # out = self.proj(out)
        return att

In [33]:
config = GPTconfig(block_size=10,n_embd=10)
attention = CausalSelfAttention(config)

In [34]:
outs = attention(x)
outs.shape

torch.Size([10, 2, 5, 5])

In [39]:
model_type = {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}

In [72]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken 
# device = 'cpu'
device = 'mps' if torch.mps.is_available() else 'cpu'
print(f"Using device: {device}\n")


# -----------------------------------------------------------------------------

@dataclass
class GPTconfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout:float = 0.0


class MLP(nn.Module):
    def __init__(self, config:GPTconfig) -> None:
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self,x:torch.Tensor):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x
    
class CausalSelfAttention(nn.Module):
    def __init__(self,config:GPTconfig) -> None:
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.config = config
        self.c_attn = nn.Linear(config.n_embd,3*config.n_embd)
        self.c_proj = nn.Linear(config.n_embd,config.n_embd)
        self.dropout = nn.Dropout(config.dropout)
        self.register_buffer('tril',torch.tril(torch.ones(config.block_size,config.block_size)).view(1,1,config.block_size,config.block_size)) # C,C -> 1,1,C,C (Batched)

    def forward(self,x:torch.Tensor):
        B,T,C = x.shape

        # intialising our k,q,v
        q,k,v = self.c_attn(x).split(self.config.n_embd,dim=-1)
        
        # We're doing some tensor gymnastics to view our last channel as n_head * head_size, and make the n_head as a Batch Dimension, so that all heads are processed in parallel 
        # split C into head_size,n_head
        q = q.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2) # B,n_head,T,head_size
        k = k.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2) # B,n_head,T,head_size
        v = v.view(B,T,self.config.n_head,C//self.config.n_head).transpose(1,2) # B,n_head,T,head_size
        
    # heart of cauusal-self-attention mechanism 🧐

            # (B,T,n_head,head_size) @ (B,T,head_size,n_head) -> (B,T,n_head,n_head)
        att = (q@k.transpose(-2,-1)) * (1.0 / math.sqrt(k.size(-1)))  # scaling attention product by 1 / sqroot(Dk) to prevent the variance from getting too large
        # after matmul we get (B,n_heads,num_key_positions,num_query_positions)

        # you can comment out this line to get self-attention ⬇️
        att = att.masked_fill(self.tril[:,:,:T,:T]==0,float('-inf'))  
        att  = F.softmax(att,dim=-1)
        att = self.dropout(att)
            # (B,T,n_head,n_head) @ (B,T,n_head,head_size) ->   (B,T,n_head,head_size)
        y = att @ v 
            # B,T,n_head,head_size -> B,T,C
        y = y.transpose(1,2).contiguous().view(B,T,C)

        # This is our blender, wherein we mix the outputs of all heads before feeding into an MLP
        out = self.c_proj(y) 

        return out

class Block(nn.Module):
    
    def __init__(self,config:GPTconfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self,x):
        # Since we do a ln and skip connection before an MLP, its essential for the outputs to be blended beforehand, else it'll be hard to backprop through
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
    
class GPT(nn.Module):

    def __init__(self, config:GPTconfig):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size,config.n_embd),
            wpe = nn.Embedding(config.block_size,config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))
        self. lm_head = nn.Linear(config.n_embd,config.vocab_size,bias=False)

    def forward(self,idx,targets=None):
        B,T = idx.shape
        assert T <= self.config.block_size, f"Cannot forward sequence of lenght {T} when block size is {self.config.block_size}"
        
        token_embd = self.transformer.wte(idx) # pyright: ignore[reportCallIssue]

        pos = torch.arange(0,T,dtype=torch.long,device=idx.device)
        position_embd = self.transformer.wpe(pos) # pyright: ignore[reportCallIssue]

        x = token_embd + position_embd
        x = self.transformer.drop(x) # pyright: ignore[reportCallIssue]
        
        for block in self.transformer.h: # pyright: ignore[reportGeneralTypeIssues, reportCallIssue]
            x = block(x)
        
        x = self.transformer.ln_f(x) # pyright: ignore[reportCallIssue]

        logits = self.lm_head(x) # (B,T,Vocab_size)

        loss = None
    
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss


    @classmethod
    def from_pretrained(cls,model_type):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print(f"Loading weights from pretrained GPT: {model_type}")

         # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]

        config_args['vocab_size'] = 50257
        config_args['block_size'] = 1024

        config = GPTconfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.tril')]

         # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model


# -----------------------------------------------------------------------------

num_return_sequences = 5
max_length = 30

enc = tiktoken.get_encoding('gpt2')


# -----------------------------------------------------------------------------


model = GPT.from_pretrained('gpt2')
model = model.to(device)
# model = GPT(GPTconfig())
model.eval()

tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens,dtype=torch.long) #(8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences,1) 


print(x[0])

x = tokens.to(device)

torch.manual_seed(42)
torch.mps.manual_seed(42)

t0= time.perf_counter()


with torch.no_grad():
    logits,_ = model(x)
    logits = logits[:,-1,:]
    probs = F.softmax(logits,-1)

    topkprobs, topkindices = torch.topk(probs,50)

    ix = torch.multinomial(topkprobs,1)

    xcol = torch.gather(topkindices,-1,ix)

    # x = torch.cat((x,xcol),dim=1)

print(x[0])

# for i in range(num_return_sequences):
#     tokens = x[i,:max_length].tolist()
#     decoded = enc.decode(tokens)
#     print(">",decoded)


print('')
print(f"Its taking {(time.perf_counter()-t0):.2f} seconds for inference to run when device is set to {device}")


Using device: mps

Loading weights from pretrained GPT: gpt2
tensor([15496,    11,   314,  1101,   257,  3303,  2746,    11,   543,  1724],
       device='mps:0')
tensor([15496,    11,   314,  1101,   257,  3303,  2746,    11],
       device='mps:0')

Its taking 0.04 seconds for inference to run when device is set to mps
