<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/minGPT_play_char.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### setup

In [20]:
# # set up logging
# import logging
# logging.basicConfig(
#         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
#         datefmt="%m/%d/%Y %H:%M:%S",
#         level=logging.INFO,
# )

# char-transformer: chop up to individual characters and train,

# https://colab.research.google.com/github/karpathy/minGPT/blob/master/play_char.ipynb
# # make deterministic
# from mingpt.utils import set_seed
# set_seed(42)

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

!git clone https://github.com/karpathy/minGPT.git
%cd /content/minGPT


Cloning into 'minGPT'...
remote: Enumerating objects: 386, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 386 (delta 44), reused 76 (delta 28), pack-reused 291[K
Receiving objects: 100% (386/386), 1.42 MiB | 2.13 MiB/s, done.
Resolving deltas: 100% (199/199), done.
/content/minGPT


#### functions

In [10]:
# @title CharDataset
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        self.stoi = {ch:i for i,ch in enumerate(chars)}
        self.itos = {i:ch for i,ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        """arrange data and targets so that the first i elements of x will be asked to predict the i-th element of y.
        Notice that the eventual language model will actually make block_size individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward pass of the network.
        So for example if block_size is 4, then we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward passes.
        """
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


In [31]:
# @title Trainer
"""Simple training loop; Boilerplate that could apply to any arbitrary neural network,
so nothing in this file really has anything to do with GPT specifically."""

import math
import logging
from tqdm import tqdm
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader

logger = logging.getLogger(__name__)

class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    batch_size = 64
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # checkpoint settings
    ckpt_path = None
    num_workers = 0 # for DataLoader
    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            setattr(self, k, v)

class Trainer:
    def __init__(self, model, train_dataset, test_dataset, config):
        self.model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.config = config
        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)
        print("init device",self.device)

    def save_checkpoint(self):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)
        def run_epoch(loader, is_train):
            model.train(is_train)
            losses = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:
                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)
                # print("device",self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    # print("model",next(model.parameters()).is_cuda)
                    logits, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                if is_train:
                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate
                    # report progress
                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}")
            if not is_train:
                test_loss = float(np.mean(losses))
                logger.info("test loss: %f", test_loss)
                return test_loss

        best_loss = float('inf')
        self.tokens = 0 # counter used for learning rate decay

        train_loader = DataLoader(
            self.train_dataset,
            shuffle=True,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers
        )
        if self.test_dataset is not None:
            test_loader = DataLoader(
                self.test_dataset,
                shuffle=True,
                pin_memory=True,
                batch_size=config.batch_size,
                num_workers=config.num_workers
            )

        for epoch in range(config.max_epochs):
            run_epoch(train_loader, is_train=True)
            if self.test_dataset is not None:
                test_loss = run_epoch(test_loader, is_train=False)

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = self.test_dataset is None or test_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                best_loss = test_loss
                self.save_checkpoint()




In [38]:
# @title utils sample
# https://github.com/karpathy/minGPT/blob/master/mingpt/utils.py
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def top_k_logits(logits, k):
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out

@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in the sequence, feeding the predictions back into the model each time.
    Clearly the sampling has quadratic complexity unlike an RNN that is only linear, and has a finite context window of block_size, unlike an RNN that has an infinite context window."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        # print("x_cond",x_cond.shape) # [1, 13]
        logits, _ = model(x_cond)
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)
    return x



In [15]:

block_size = 128 # spatial extent of the model for its context
# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt

import urllib.request
url = 'https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt'
urllib.request.urlretrieve(url, "input.txt")

text = open('input.txt', 'r').read() # don't worry we won't run out of file handles
train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters

data has 132659 characters, 95 unique.


#### model

In [5]:
# og mingpt
from mingpt.model import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, n_layer=4, n_head=4, n_embd=128) # 8 8 512
model = GPT(mconf)


#### gpt class

In [6]:

class CausalSelfAttention(nn.Module):
    """A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here."""
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
           .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.resid_drop(self.proj(y))
        return y


class Block(nn.Module):
    """ an unassuming Transformer block """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """
    def __init__(self, config):
        super().__init__()
        # input embedding stem
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        # decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.block_size = config.block_size
        self.apply(self._init_weights)
        # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)
        elif isinstance(module, GPT):
            torch.nn.init.normal_(module.pos_emb, mean=0.0, std=0.02)

    def configure_optimizers(self, train_config):
        """This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object."""
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')
        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        print("in model",idx.shape)
        b, t = idx.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."
        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


#### perceiverio

In [None]:

!pip install perceiver-pytorch
import torch
from perceiver_pytorch import PerceiverIO
# https://github.com/lucidrains/perceiver-pytorch


In [43]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, n_layer=4, n_head=4, n_embd=128) # 8 8 512
# print(train_dataset.vocab_size, train_dataset.block_size) # 95 128
from torch.utils.data.dataloader import DataLoader
train_loader = DataLoader(train_dataset, #self.train_dataset
            shuffle=True, pin_memory=True,
            batch_size=64, #64 config.batch_size
            num_workers=0)

vocab_size = train_dataset.vocab_size
block_size = train_dataset.block_size
class perceiverioGPT(torch.nn.Module):
    def __init__(self, vocab_size, block_size, n_layer=4, n_head=4, n_embd=128):
        super(perceiverioGPT, self).__init__()
        self.model = PerceiverIO(
            # dim = n_embd,            # dimension of sequence to be encoded
            dim = n_embd*n_embd,            # dimension of sequence to be encoded
            queries_dim = block_size,    # dimension of decoder queries
            logits_dim = None,           # dimension of final logits
            depth = 1,                   # depth of net
            num_latents = 32,           # number of latents, or induced set points, or centroids. different papers giving it different names
            latent_dim = 32,            # latent dimension
            cross_heads = 1,             # number of heads for cross attention. paper said 1
            latent_heads = 4,            # number of heads for latent self attention, 8
            cross_dim_head = 8,         # number of dimensions per cross attention head
            latent_dim_head = 8,        # number of dimensions per latent self attention head
            weight_tie_layers = False    # whether to weight tie layers (optional, as indicated in the diagram)
        )
        self.tok_emb = nn.Embedding(vocab_size, n_embd) # vocab_size num diff words, embedded into n_embd size vector
        # self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
        # self.pos_emb = torch.zeros(1, block_size, n_embd)
        # self.drop = nn.Dropout(config.embd_pdrop) # randomly zeroes some of the elements of the input tensor with probability p using samples from a Bernoulli distribution. for regularization and preventing the co-adaptation of neurons
        # self.queries = torch.zeros(1, block_size)
        self.queries = nn.Parameter(torch.zeros(1, block_size))
        self.block_size = block_size
    def get_block_size(self): return self.block_size
    def preprocess(self, X):
        if X.dim()==1:
            # print("inn")
            X=X.unsqueeze(dim=0)
        # elif X.dim() not in [1,2]: print("erm ",X.dim())
        X=X.flatten(start_dim=1, end_dim=-1) #(start_dim=1)
        X=X.unsqueeze(dim=1)
        # print(X.shape)
        return X

    def configure_optimizers(self, train_config):
        """This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object."""
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules(): # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.named_modules
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')
        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        # print(param_dict.keys(),inter_params.shape,union_params.shape)
        # print(len(param_dict.keys()),len(inter_params),len(union_params)) # 36 0 36
        # print(param_dict.keys(),len(inter_params),len(union_params)) # 36 0 36
        # assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        # assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params), )
        # create the pytorch optimizer object
        optim_groups = [{"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        b, t = idx.size()
        print("b, t",b, t)
        # assert t <= self.block_size, "Cannot forward, model block size is exhausted."
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        # x = self.drop(token_embeddings + position_embeddings)
        # x = self.model(token_embeddings + position_embeddings)
        # print(token_embeddings.shape, position_embeddings.shape) # [64, 128, 128] + [1, 128, 128] = [64, 128, 128]
        seq = self.preprocess(token_embeddings + position_embeddings)
        print(seq.shape) #[64, 1, 16384]
        # print("forward",seq.device,self.queries.device)
        logits = self.model(seq, queries = self.queries)
        # print(logits.shape,targets.shape)
        # print(logits.view(-1, logits.size(-1)).shape, targets.shape) # [64, 128]
        # print(logits.view(-1, logits.size(-1)).dtype, targets.dtype) # float int
        if targets is not None:
            # loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.float())
        return logits, loss


model = perceiverioGPT(vocab_size, block_size, n_layer=4, n_head=4, n_embd=128).to(device)

(x, y) = next(iter(train_loader))
# print(x.shape, y.shape) #0-94? [64, 128],[64, 128]
logits, loss = model(x.to(device), y.to(device))
# print(model.device())
print(x.device)
print(next(model.parameters()).is_cuda)

# print(logits, loss) #[64, 128, 95] , [] singu
# dict_keys(['model.latents', 'model.cross_attend_blocks.0.fn.to_q.weight', 'model.cross_attend_blocks.0.fn.to_kv.weight', 'model.cross_attend_blocks.0.fn.to_out.weight', 'model.cross_attend_blocks.0.fn.to_out.bias', 'model.cross_attend_blocks.0.norm.weight', 'model.cross_attend_blocks.0.norm.bias', 'model.cross_attend_blocks.0.norm_context.weight', 'model.cross_attend_blocks.0.norm_context.bias', 'model.cross_attend_blocks.1.fn.net.0.weight', 'model.cross_attend_blocks.1.fn.net.0.bias', 'model.cross_attend_blocks.1.fn.net.2.weight', 'model.cross_attend_blocks.1.fn.net.2.bias', 'model.cross_attend_blocks.1.norm.weight', 'model.cross_attend_blocks.1.norm.bias', 'model.layers.0.0.fn.to_q.weight', 'model.layers.0.0.fn.to_kv.weight', 'model.layers.0.0.fn.to_out.weight', 'model.layers.0.0.fn.to_out.bias', 'model.layers.0.0.norm.weight', 'model.layers.0.0.norm.bias', 'model.layers.0.1.fn.net.0.weight', 'model.layers.0.1.fn.net.0.bias', 'model.layers.0.1.fn.net.2.weight', 'model.layers.0.1.fn.net.2.bias', 'model.layers.0.1.norm.weight', 'model.layers.0.1.norm.bias', 'model.decoder_cross_attn.fn.to_q.weight', 'model.decoder_cross_attn.fn.to_kv.weight', 'model.decoder_cross_attn.fn.to_out.weight', 'model.decoder_cross_attn.fn.to_out.bias', 'model.decoder_cross_attn.norm.weight', 'model.decoder_cross_attn.norm.bias', 'model.decoder_cross_attn.norm_context.weight', 'model.decoder_cross_attn.norm_context.bias', 'tok_emb.weight']) 0 36



b, t 64 128
torch.Size([64, 1, 16384])
cpu
True


#### train

In [32]:
# from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
# tconf = TrainerConfig(max_epochs=2, batch_size=512, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size, num_workers=4)
tconf = TrainerConfig(max_epochs=2, batch_size=512, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size, num_workers=0)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()


init device 0


epoch 1 iter 258: train loss 31182.26367. lr 3.001423e-04: 100%|██████████| 259/259 [00:12<00:00, 21.22it/s]
epoch 2 iter 258: train loss 31248.47852. lr 6.000000e-05: 100%|██████████| 259/259 [00:11<00:00, 21.98it/s]


#### eval

In [44]:

# alright, let's sample some character-level Shakespeare
# from mingpt.utils import sample

context = "O God, O God!"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)


x_cond torch.Size([1, 13])
b, t 1 13
torch.Size([1, 1, 1664])


RuntimeError: ignored