<a href="https://colab.research.google.com/github/cyFou/testColab/blob/main/2025_02_26_TutoGpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implementation of the tutorial at https://devshahs.medium.com/build-gpt-with-me-implementing-gpt-from-scratch-step-by-step-b2efe4e2f7e0


In [1]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM


import os
import urllib.request  # `import urllib` alone does not load the `request` submodule

# Download the Tiny Shakespeare corpus once; later runs reuse the local copy.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filename = './tinyshakespeare.txt'
if not os.path.isfile(filename):
    urllib.request.urlretrieve(url, filename)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Fall back to the CPU when no CUDA device is present instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class ShakespeareDataset(Dataset):
    """Sliding-window next-token dataset over one tokenized text file.

    The whole file is tokenized once in ``__init__``. Sample ``idx`` is the
    pair ``(tokens[idx:idx + block_size], tokens[idx + 1:idx + block_size + 1])``,
    i.e. the target is the input shifted by one token.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=False, return_tensors='pt')``; the
            result must expose ``input_ids``.
        block_size: Number of tokens in each input/target window.
    """

    def __init__(self, file_path, tokenizer, block_size=128):
        self.tokenizer = tokenizer
        self.block_size = block_size

        # Explicit encoding so reading does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = f.read()

        # Tokenize the whole corpus in one call. No padding is requested: the
        # corpus is a single sequence that is windowed in __getitem__, so pad
        # tokens would only end up inside training samples.
        encoded = self.tokenizer(self.data, truncation=False, return_tensors='pt')
        self.token = encoded.input_ids[0].numpy()

    def __len__(self):
        """Return the number of full windows; 0 when the corpus is shorter than one window."""
        return max(0, len(self.token) - self.block_size)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors of length ``block_size`` on the module-level ``device``."""
        x = self.token[idx : idx + self.block_size]
        # Target window: same span shifted one token ahead.
        y = self.token[idx + 1 : idx + self.block_size + 1]
        return torch.tensor(x, dtype=torch.long).to(device), torch.tensor(y, dtype=torch.long).to(device)

def collate_fn(batch):
    """Combine a sequence of ``(x, y)`` tensor pairs into two stacked tensors.

    Args:
        batch: Iterable of 2-tuples of equally shaped tensors.

    Returns:
        ``(x, y)`` where each is the pairs' corresponding tensors stacked
        along a new leading dimension.
    """
    inputs, targets = zip(*batch)
    return torch.stack(inputs), torch.stack(targets)

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters

# -- model shape --
C1 = 384          # embedding width used by the embeddings, attention and output layers
n_head = 6        # attention heads per transformer block
n_layer = 6       # number of stacked transformer blocks
block_size = 256  # maximum context length (size of the position table and causal mask)
dropout = 0.2     # probability passed to every nn.Dropout

# -- optimisation --
batch_size = 64        # presumably sequences per training batch; not read in this file's visible code
learning_rate = 3e-4
max_iters = 5000       # number of training-loop iterations

# -- evaluation (names only; no evaluation code is visible in this file) --
eval_interval = 500
eval_iters = 200

# -- runtime: use the GPU when one is available --
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  Projects the input to keys, queries and values of width ``head_size``,
  masks out future positions and returns the attention-weighted values.
  Reads the module-level ``C1`` (input width), ``block_size`` (mask size)
  and ``dropout`` (attention dropout probability).

  Args:
    head_size: Output width of the key/query/value projections.
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(C1, head_size, bias=False)
    self.query = nn.Linear(C1, head_size, bias=False)
    self.value = nn.Linear(C1, head_size, bias=False)
    # Lower-triangular ones matrix; a buffer so it follows the module across
    # devices without being a trainable parameter.
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the attended values for ``x`` of shape (B, T, C1) as (B, T, head_size)."""
    _, T, _ = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    # Scaled dot-product attention: divide the scores by sqrt(key width).
    # Multiplying by sqrt(C) instead inflates the scores and saturates the softmax.
    wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
    # Causal mask: position t may only attend to positions <= t.
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    return wei @ v

In [None]:
class MultiHeadAttention(nn.Module):
  """Several ``Head`` modules run in parallel, concatenated and projected to ``C1``.

  Reads the module-level ``C1`` (output width) and ``dropout`` (probability
  applied to the projected output).

  Args:
    num_heads: Number of attention heads.
    head_size: Output width of each head.
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated heads are num_heads * head_size wide; sizing the
    # projection from that keeps it valid even when the product differs from C1.
    self.proj = nn.Linear(num_heads * head_size, C1)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the projected concatenation of all head outputs for ``x``."""
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    # Apply the dropout layer created in __init__ (it was previously unused).
    return self.dropout(self.proj(out))

class FeedForward(nn.Module):
  """Two-layer MLP applied to the last dimension of its input.

  Layout: Linear(n_embd -> 4*n_embd), ReLU, Linear(4*n_embd -> n_embd),
  Dropout. The dropout probability is the module-level ``dropout``.

  Args:
    n_embd: Width of the input and of the output.
  """

  def __init__(self, n_embd):
    super().__init__()
    hidden = 4 * n_embd  # the inner layer is four times wider than the input
    expand = nn.Linear(n_embd, hidden)
    activation = nn.ReLU()  # non-linearity between the two linear maps
    project = nn.Linear(hidden, n_embd)
    regularize = nn.Dropout(dropout)
    self.net = nn.Sequential(expand, activation, project, regularize)

  def forward(self, x):
    """Run ``x`` through the MLP and return the result."""
    return self.net(x)
    
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each with a residual add.

  LayerNorm is applied to the input of each sub-layer (before it), and the
  sub-layer output is added back onto its un-normalised input.

  Args:
    n_embd: Embedding dimension.
    n_head: Number of attention heads; each head gets ``n_embd // n_head`` channels.
  """

  def __init__(self, n_embd, n_head):
    super().__init__()
    self.sa = MultiHeadAttention(n_head, n_embd // n_head)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    """Return ``x`` after the attention and feed-forward residual updates."""
    attended = x + self.sa(self.ln1(x))
    return attended + self.ffwd(self.ln2(attended))

In [None]:
class BigramLanguageModel(nn.Module):
  """Decoder-only transformer language model over token ids.

  Token and position embeddings are summed, passed through ``n_layer``
  ``Block`` modules and a final LayerNorm, then projected to vocabulary
  logits. Reads the module-level ``vocab_size``, ``C1``, ``block_size``,
  ``n_head`` and ``n_layer``.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, C1)
    self.position_embedding_table = nn.Embedding(block_size, C1)
    # Blocks are sized with C1, the same width as every other layer here.
    self.blocks = nn.Sequential(*[Block(C1, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(C1)
    self.lm_head = nn.Linear(C1, vocab_size)

  def forward(self, idx, targets=None):
    """Compute logits, and the cross-entropy loss when ``targets`` is given.

    Args:
      idx: (B, T) tensor of token ids, with T at most ``block_size``.
      targets: Optional (B, T) tensor of next-token ids.

    Returns:
      ``(logits, loss)``. Without targets, logits are (B, T, vocab_size) and
      loss is None. With targets, logits are flattened to (B*T, vocab_size).
    """
    B, T = idx.shape
    tok_emb = self.token_embedding_table(idx)  # (B, T, C1)
    # Build the positions on the input's device so the model does not depend
    # on a global `device` matching where it actually lives.
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C1)
    x = tok_emb + pos_emb  # broadcast over the batch: token identity plus position
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      # F.cross_entropy wants (N, classes) logits and (N,) targets.
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Sample ``max_new_tokens`` tokens one at a time and append them to ``idx``.

    Args:
      idx: (B, T) tensor of token ids used as the prompt.
      max_new_tokens: Number of tokens to sample.

    Returns:
      (B, T + max_new_tokens) tensor: the prompt followed by the samples.
    """
    for _ in range(max_new_tokens):
      # The position table only covers block_size positions: crop the context.
      idx_cond = idx[:, -block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :]  # keep only the last time step
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1) sampled ids
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [None]:
# Train with AdamW, taking the learning rate from the hyperparameter cell
# rather than repeating the literal.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# The loop variable is `step`, not `iter`, so the builtin iter() stays usable.
for step in range(max_iters):

    print(step)

    # TODO: periodic evaluation of the train/val loss is not implemented here.

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()