## Installing dependencies

### All dependencies

In [None]:
%%bash
# Install the NLP stack used by this notebook: spaCy (plus its English and French
# transformer pipelines), NLTK and pandas.
# -y answers conda's confirmation prompt: this cell has no interactive terminal to do so.
conda install -y -c conda-forge spacy
python -m spacy download en_core_web_trf
python -m spacy download fr_dep_news_trf
conda install -y -c conda-forge nltk
pip install pandas

### Linux / Windows with cuda

In [None]:
# CUDA setup: PyTorch packages with pytorch-cuda pinned to 11.8, then CuPy from conda-forge.
%conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
%conda install -c conda-forge cupy

### Linux / Windows without cuda

In [4]:
# %conda install pytorch torchvision torchaudio cpuonly -c pytorch

### Mac

In [5]:
# %conda install pytorch::pytorch torchvision torchaudio -c pytorch

## Development

### Imports

In [155]:
import torch
import math
from torch.nn import functional as F
import torch.nn as nn
import os
from nltk.tokenize import RegexpTokenizer
import spacy
from spacy import displacy
import pandas as pd

### Parameters

In [None]:
torch.set_float32_matmul_precision('high')
nlp = spacy.load("fr_dep_news_trf")  # spaCy pipeline used later to analyse the generated text

batch_size = 2 # how many independent sequences will we process in parallel
block_size = 250 # what is the maximum context length for predictions

# True: load the weights stored in model_file_name and skip the training and saving cells.
# False: train the model from scratch and save a new checkpoint.
load_model = False

learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model size: embedding width, attention heads per block, number of blocks.
n_embd = 512
n_head = 16
n_layer = 16

# Training schedule: total optimisation steps, how often the losses are reported,
# and how many batches are averaged per split for each report.
max_iters = 1000
eval_interval = 100
eval_iters = 100

tokens_gen = 200 # number of tokens to generate after training

torch.manual_seed(445)

dataset_file_name = os.path.join("data/", 'grimm_fr.txt')
model_file_name = os.path.join("saved/", 'ckpt_grimm_512_fr.pt')

print(f'device : {device}')

### Loading dataset

In [159]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Read the whole corpus at dataset_file_name into memory as one string.
with open(dataset_file_name, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

### Transform dataset

In [160]:
# Token pattern, alternatives tried left to right: an elided particle ("d'", "n'", "l'"),
# a run of word characters, a dollar amount, then any other run of non-space characters.
# The "$" has to be escaped: unescaped it is an end-of-string anchor and that
# alternative could never match.
pattern = r"[dnl]['´`]|\w+|\$[\d.]+|\S+"
tokenizer = RegexpTokenizer(pattern)

tokens = tokenizer.tokenize(text)

# The vocabulary is the set of DISTINCT tokens (sorted so ids are reproducible).
# Enumerating every occurrence instead would give one id per corpus position and
# make the embedding table and output layer as large as the corpus itself.
vocab = sorted(set(tokens))
stoi = {token: i for i, token in enumerate(vocab)}  # token string -> integer id
itos = {i: token for i, token in enumerate(vocab)}  # integer id -> token string

vocab_size = len(vocab)


def encode(s):
    """Tokenize ``s`` and return the list of ids of its tokens.

    Tokens that are not present in ``stoi`` are dropped silently.
    """
    ids = []
    for token in tokenizer.tokenize(s):
        if token in stoi:
            ids.append(stoi[token])
    return ids


def decode(l):
    """Turn a sequence of token ids back into text, joining tokens with single spaces.

    Ids that are not present in ``itos`` are skipped.
    """
    known_tokens = (itos[i] for i in l if i in itos)
    return ' '.join(known_tokens)

### Test encode / decode functions

In [None]:
# Round-trip a short sample: show the token ids, then the text rebuilt from them.
txt = """« Je sais pourquoi tu es ici !  » ?"""
encoded_text = encode(txt)
print(encoded_text, decode(encoded_text), sep='\n')

### Breakdown for training and test data

In [162]:
# Encode the whole corpus once, then hold out the tail of the sequence for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # split index: first 90% is train, the remainder is val
train_data, val_data = data[:n], data[n:]

### Data loading

In [163]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A tuple ``(X, Y)`` of ``torch.long`` tensors of shape
        ``(batch_size, block_size)`` placed on ``device``. Row ``i`` of ``Y`` is
        row ``i`` of ``X`` shifted by one position, i.e. the next-token targets.

    Raises:
        ValueError: if the selected split holds ``block_size`` tokens or fewer,
            so that no window plus its shifted target fits.
    """
    data = train_data if split == 'train' else val_data

    if len(data) <= block_size:
        raise ValueError("Data length is too short for the model's block size.")

    # One random start offset per sequence; the upper bound leaves room for the
    # extra token needed by the shifted targets.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()

    X = torch.stack([data[start:start + block_size] for start in starts])
    Y = torch.stack([data[start + 1:start + block_size + 1] for start in starts])

    # Move the assembled batch to the target device once, not once per row.
    return X.to(device), Y.to(device)

### Activation function

In [164]:
def new_gelu(x):
    """Tanh-based approximation of the GELU activation, applied element-wise to ``x``."""
    inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
    return 0.5 * x * (1.0 + torch.tanh(inner))

### Model functions

In [None]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd) and return (B, T, head_size)."""
        _, T, _ = x.shape

        query = self.query(x)  # (B, T, head_size)
        key = self.key(x)      # (B, T, head_size)
        value = self.value(x)  # (B, T, head_size)

        # Scale by the key/query width (the head size), not by the embedding width
        # of the input: the dot products are sums over head_size terms.
        scale = key.shape[-1] ** -0.5
        wei = query @ key.transpose(-2, -1) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # hide future positions
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        out = wei @ value  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected to ``n_embd``.

    Args:
        num_heads: number of ``Head`` modules to create.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size; that only equals n_embd when n_embd is divisible
        # by the number of heads.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Apply every head to ``x``, concatenate on the last axis, project, apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out


class FeedFoward(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embd``, apply ``new_gelu``, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        self.fc_1 = nn.Linear(n_embd, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, n_embd)
        self.droupout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the MLP output for ``x``; the last dimension keeps size ``n_embd``."""
        hidden = new_gelu(self.fc_1(x))
        return self.droupout(self.fc_2(hidden))


class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned per-feature gain.

    Args:
        dim: size of the last axis (length of the gain vector).
        eps: constant added to the mean square before taking the reciprocal square root.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def _norm(self, x):
        """Divide ``x`` by the root mean square of its last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalise in float32, cast back to the dtype of ``x``, then apply the gain."""
        normed = self._norm(x.float())
        return self.weight * normed.type_as(x)


class Block(nn.Module):
    """Transformer block: pre-norm self-attention and pre-norm feed-forward, each with a residual.

    Args:
        n_embd: embedding width.
        n_head: number of attention heads; each head gets ``n_embd // n_head`` features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.rms_norm1 = RMSNorm(dim=n_embd)
        self.rms_norm2 = RMSNorm(dim=n_embd)

    def forward(self, x):
        """Add the attention output to ``x``, then add the feed-forward output to that sum."""
        attended = x + self.sa(self.rms_norm1(x))
        return attended + self.ffwd(self.rms_norm2(attended))


class GPT(nn.Module):
    """Decoder-only transformer language model over the token vocabulary.

    Sizes are read from the module-level hyperparameters ``vocab_size``, ``n_embd``,
    ``block_size``, ``n_head`` and ``n_layer`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.rms_norm = RMSNorm(n_embd)
        self.linear_head = nn.Linear(n_embd, vocab_size)

    def forward(self, tokens, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            tokens: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of expected next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets, ``logits`` is
            flattened to (B*T, vocab_size) and ``loss`` is the cross-entropy against
            the flattened targets.
        """
        B, T = tokens.shape

        tok_emb = self.token_embedding(tokens)  # (B, T, C)
        # Build the position indices on the same device as the input so the model
        # does not depend on a module-level device variable.
        positions = torch.arange(T, device=tokens.device)
        pos_emb = self.position_embedding(positions)  # (T, C)

        x = tok_emb + pos_emb  # (B, T, C), positions broadcast over the batch
        x = self.blocks(x)
        x = self.rms_norm(x)  # (B, T, C)
        logits = self.linear_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively, starting from ``idx``.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append.
            temperature: divisor applied to the last-step logits before sampling.
            top_k: if given, only the ``top_k`` highest logits stay eligible.

        Returns:
            (B, T + max_new_tokens) tensor: ``idx`` followed by the sampled tokens.
        """
        # Longest context the positional table can index (block_size at construction).
        max_context = self.position_embedding.num_embeddings

        # Sample in eval mode so dropout is disabled, then restore the previous mode.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the running sequence to the last max_context tokens if needed
                idx_cond = idx if idx.size(1) <= max_context else idx[:, -max_context:]
                logits, _ = self(idx_cond)
                # keep only the final position and apply the temperature
                logits = logits[:, -1, :] / temperature
                if top_k is not None:
                    # mask every logit below the k-th largest one
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx
    
# Build the model; GPT() reads its sizes from the module-level hyperparameters.
model = GPT()


@torch.no_grad()
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches of each split.

    The model is put in eval mode for the measurement and switched to train mode
    before returning.

    Returns:
        Dict with keys ``'train'`` and ``'val'``, each holding the mean loss as a
        0-dim tensor.
    """
    model.eval()
    averages = {}

    for split in ('train', 'val'):
        split_losses = torch.zeros(eval_iters)

        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            split_losses[step] = batch_loss.item()

        averages[split] = split_losses.mean()

    model.train()
    return averages

if load_model:
    # Map the stored tensors onto the configured device; a hard-coded CUDA target
    # would make the checkpoint unloadable on a CPU-only machine.
    checkpoint = torch.load(model_file_name, map_location=device)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'

    # Keys saved from a torch.compile-wrapped model carry this prefix; strip it so
    # they match the parameter names of the plain GPT instance.
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

    model.load_state_dict(state_dict)

model.to(device)
model = torch.compile(model)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

### Training model

In [None]:
# Training only runs when no checkpoint was loaded.
# `iter` and `losses` stay defined after the loop and are read by the checkpoint-saving cell.
if not load_model:
    for iter in range(max_iters):
        # Report the averaged train/val losses every eval_interval steps and on the last step.
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss()
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # One optimisation step on a freshly sampled training batch.
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        # NOTE(review): loss comes from F.cross_entropy in GPT.forward and looks scalar
        # already, so .sum() appears to be a no-op — confirm before removing.
        loss.sum().backward()
        optimizer.step()

### Save model

In [None]:
# Persist the freshly trained model; skipped when the weights were loaded from disk.
if not load_model:
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'iter_num': iter,
        'val_loss': losses['val'],
    }
    # torch.save does not create missing directories, so make sure the target exists.
    checkpoint_dir = os.path.dirname(model_file_name)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    print("saving checkpoint")
    torch.save(checkpoint, model_file_name)
    print("saved checkpoint")

### Generate text

In [None]:
# Start from a single token with id 0 and let the model extend it by tokens_gen tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = model.generate(context, max_new_tokens=tokens_gen)[0].tolist()
gen_text = decode(generated_ids)
print(gen_text)

### Generate the text structure diagram

In [None]:
# Run the spaCy pipeline on the generated text and render it with displaCy's "dep" style.
doc = nlp(gen_text)
displacy.render(doc, style="dep")

### Generate the table of type of words

In [None]:
# Column header -> spaCy token attribute shown in that column.
columns = {
    "Texte": "text",
    "Lemme": "lemma_",
    "POS": "pos_",
    "Tag": "tag_",
    "Dépendance": "dep_",
    "Forme": "shape_",
    "Alphabétique": "is_alpha",
    "Stop": "is_stop",
}

# One list per column, each holding that attribute for every token of the parsed text.
data = {
    header: [getattr(token, attribute) for token in doc]
    for header, attribute in columns.items()
}

df = pd.DataFrame(data)

display(df)