Commit 962db66 (1 parent: f3d0873)

* microGPT example
  - removing a wasted line, thanks @SeanNaren
  - getting there, fixing the initial garbage problem
* adding a HOWTO link

Showing 3 changed files with 331 additions and 1 deletion.
.isort.cfg: pytorch_lightning is added to isort's known_third_party list so the new example's imports are sorted consistently.

@@ -1,2 +1,2 @@
 [settings]
-known_third_party =fvcore,input_pipeline,matplotlib,numpy,pandas,pyre_extensions,pytest,recommonmark,seaborn,setuptools,sklearn,submitit,tensorflow,timm,torch,tqdm,triton,typing_extensions
+known_third_party =fvcore,input_pipeline,matplotlib,numpy,pandas,pyre_extensions,pytest,pytorch_lightning,recommonmark,seaborn,setuptools,sklearn,submitit,tensorflow,timm,torch,tqdm,triton,typing_extensions
examples/microGPT.py (new file)

@@ -0,0 +1,323 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

# A minGPT + Lightning + xFormers example, code from Sean Naren (@seannaren).
# This is an homage to https://github.com/karpathy/minGPT

import math
import os

import pytorch_lightning as pl
import torch
import torch.nn as nn
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.utilities import rank_zero_info
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

from xformers.factory.model_factory import xFormer, xFormerConfig


class GPT(pl.LightningModule):
    """ the full GPT language model, with a context size of block_size """

    def __init__(
        self,
        vocab_size,
        weight_decay=0.1,
        betas=(0.9, 0.95),
        learning_rate=6e-4,
        n_embd=512,
        block_size=128,
        n_layer=4,
        n_head=4,
        resid_pdrop=0.1,
        attn_pdrop=0.1,
        mlp_pdrop=0.1,
        attention="scaled_dot_product",
        hidden_layer_multiplier=4,
        warmup_tokens=20,
        final_tokens=1000,
    ):
        super().__init__()

        # auto creates self.hparams from the method signature
        self.save_hyperparameters()

        # A list of the encoder or decoder blocks which constitute the Transformer.
        xformer_config = [
            {
                "block_config": {
                    "block_type": "encoder",
                    "num_layers": self.hparams.n_layer,
                    "dim_model": self.hparams.n_embd,
                    "layer_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab",
                        "seq_len": self.hparams.block_size,
                        "vocab_size": self.hparams.vocab_size,
                    },
                    "multi_head_config": {
                        "num_heads": self.hparams.n_head,
                        "residual_dropout": self.hparams.resid_pdrop,
                        "use_rotary_embeddings": True,
                        "attention": {
                            "name": self.hparams.attention,
                            "dropout": self.hparams.attn_pdrop,
                            "causal": True,
                            "seq_len": self.hparams.block_size,
                        },
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "dropout": self.hparams.mlp_pdrop,
                        "activation": "gelu",
                        "hidden_layer_multiplier": self.hparams.hidden_layer_multiplier,
                    },
                }
            }
        ]

        config = xFormerConfig(xformer_config)
        self.model = xFormer.from_config(config)
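
        # The stack above is entirely config-driven: trying another xFormers
        # attention only means passing a different `attention` name to this
        # constructor, the rest of the wiring is unchanged. Which names are
        # available depends on the installed xFormers version.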

        # decoder head
        self.ln_f = nn.LayerNorm(self.hparams.n_embd)
        self.head = nn.Linear(self.hparams.n_embd, self.hparams.vocab_size, bias=False)

        self.block_size = self.hparams.block_size
        self.apply(self._init_weights)

        self._tokens_seen = 0

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Reset the token counter
        self._tokens_seen = 0

    def get_block_size(self):
        return self.block_size

    def configure_optimizers(self):
        # Create the optimizer and the training schedule:
        # - Handle the per-param weight decay
        no_decay = ["bias", "LayerNorm.weight"]
        params_decay = [
            p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)
        ]
        params_nodecay = [
            p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)
        ]
        optim_groups = [
            {"params": params_decay, "weight_decay": self.hparams.weight_decay},
            {"params": params_nodecay, "weight_decay": 0.0},
        ]

        # - Start with a warm up, ramp up then cosine
        optimizer = torch.optim.AdamW(
            optim_groups, lr=self.hparams.learning_rate, betas=self.hparams.betas
        )

        def update_lr(*_):
            config = self.hparams

            if self._tokens_seen < config.warmup_tokens:
                # linear warmup
                lr_mult = float(self._tokens_seen) / float(max(1, config.warmup_tokens))
                lr_mult = max(lr_mult, 1e-2)  # could be that we've not seen any yet
            else:
                # cosine learning rate decay
                progress = float(self._tokens_seen - config.warmup_tokens) / float(
                    max(1, config.final_tokens - config.warmup_tokens)
                )
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

            return lr_mult

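        # For instance, with the default warmup_tokens=20 and final_tokens=1000,
        # the multiplier ramps linearly (floored at 1e-2) up to 1.0 over the
        # first 20 tokens seen, then follows a half cosine from 1.0 down to its
        # 0.1 floor as the counter approaches 1000 tokens.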
        lr_scheduler = {
            "scheduler": torch.optim.lr_scheduler.LambdaLR(
                optimizer,
                lr_lambda=[update_lr, update_lr],
            ),
            "name": "learning_rate",
            "interval": "step",  # The unit of the scheduler's step size
            "frequency": 1,  # The frequency of the scheduler
        }
        return [optimizer], [lr_scheduler]

    def forward(self, src):
        # predict the next tokens (in latent space)
        prediction = self.model(src)

        # translate the predictions into tokens
        prediction = self.ln_f(prediction)
        logits = self.head(prediction)

        return logits

    def training_step(self, batch, _):
        src, targets = batch

        # Update the tokens we've seen (tracked for LR scheduling)
        self._tokens_seen += (src >= 0).numel()
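        # Token ids are non-negative here, so (src >= 0) is true everywhere and
        # this adds batch_size * block_size tokens per step.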

        # same action as inference
        logits = self(src)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        self.logger.log_metrics(
            {
                "train_loss": loss.mean(),
                "learning_rate": self.lr_schedulers().get_last_lr()[0],
            },
            step=self.global_step,
        )

        return loss


class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = list(set(data))
        data_size, vocab_size = len(data), len(chars)
        rank_zero_info("data has %d characters, %d unique." % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, i):
        chunk = self.data[i : i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]

        # src and target are off by one, we want the model to predict the next word
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

    def to_tokens(self, message, device):
        return torch.tensor([self.stoi[s] for s in message], dtype=torch.long)[
            None, ...
        ].to(device)

    def from_tokens(self, tokens):
        return "".join([self.itos[int(i)] for i in tokens])


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time. Clearly the sampling
    has quadratic complexity unlike an RNN that is only linear, and has a finite context window
    of block_size, unlike an RNN that has an infinite context window.
    """
    block_size = model.get_block_size()
    model.eval()

    # CREDITS: https://github.com/karpathy/minGPT/blob/master/mingpt/utils.py
    def top_k_logits(logits, k):
        v, _ = torch.topk(logits, k)
        out = logits.clone()
        out[out < v[:, [-1]]] = -float("Inf")
        return out
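
    # For instance, with k=2 and logits [[1.0, 3.0, 2.0]], v[:, [-1]] is 2.0,
    # so the result is [[-inf, 3.0, 2.0]]: only the two largest logits stay finite.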

    for _ in range(steps):
        x_cond = (
            x if x.size(1) <= block_size else x[:, -block_size:]
        )  # crop context if needed
        logits = model(x_cond)

        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature

        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)

        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)

        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)

        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)

    return x[0]  # escape the batch dimension


if __name__ == "__main__":
    seed_everything(42)
    REF_BATCH = 512
    BATCH = 256  # adjust depending on the available memory on your machine
    WORKERS = 8
    EPOCHS = 2
    BLOCK = 128
    WARMUP = 20

    if not os.path.exists("input.txt"):
        os.system(
            "wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        )

    text = open("input.txt", "r").read()
    train_dataset = CharDataset(
        text, BLOCK
    )  # one line of poem is roughly 50 characters
    random_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        sampler=random_sampler,
        batch_size=BATCH,
        num_workers=WORKERS,
        pin_memory=True,
    )

    model = GPT(
        vocab_size=train_dataset.vocab_size,
        block_size=train_dataset.block_size,
        attention="scaled_dot_product",
        warmup_tokens=REF_BATCH * WARMUP,
        final_tokens=EPOCHS * len(train_dataset) * BLOCK,
    )
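
    # warmup_tokens is REF_BATCH * WARMUP = 512 * 20 = 10240 tokens of linear
    # warmup, and final_tokens approximates the total tokens processed over the
    # run (each of the len(train_dataset) samples contributes BLOCK tokens per
    # epoch), which pins the end of the cosine decay.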

    trainer = Trainer(
        gpus=1,
        max_epochs=EPOCHS,
        precision=16,
        gradient_clip_val=1,
        log_every_n_steps=1,
        terminate_on_nan=True,
        accumulate_grad_batches=REF_BATCH // BATCH,
    )
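
    # REF_BATCH // BATCH = 2 here, so gradients are accumulated over two
    # batches of 256 to match the reference effective batch size of 512.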

    trainer.fit(model, train_loader)

    # sample from the model
    context = "Friends of my soul"  # Prime with something
    x = train_dataset.to_tokens(context, model.device)
    y = sample(model, x, steps=1000, temperature=1.0, sample=True, top_k=10)

    print(train_dataset.from_tokens(y))