In [1]:
# %pip install sentencepiece==0.1.96  # uncomment to install into the kernel's environment

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 955 kB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


# Train tokenizer

In [2]:
import sentencepiece as spm

In [None]:
%%time
# Train a SentencePiece model on the raw SMILES file. Output files use the
# `example_sp` prefix (the next cell loads `example_sp.model`) and the
# vocabulary is capped at 300 pieces.
# NOTE(review): no --model_type flag is passed, so the library default is
# used -- confirm that is really BPE, since later cells name the processor
# `sp_bpe` and print "BPE".
spm.SentencePieceTrainer.train('--input=guacamol_v1_train.smiles --model_prefix=example_sp --vocab_size=300')

In [3]:
%%time
# Load the tokenizer trained above from disk and show how it splits one
# example SMILES string into pieces.
sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load('example_sp.model')
print('*** BPE ***')
print(sp_bpe.encode_as_pieces("Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O"))

*** BPE ***
['▁C', 'c', '1', 'cc', '2', 'c', '(', 'c', '3', 'oc', '(', 'CCCC', '#', 'N', ')', 'cc', '1', '3)', 'C', '(=', 'O', ')', 'c', '1', 'c', '(', 'O', ')', 'ccc', 'c', '1', 'C', '2', '=', 'O']
CPU times: user 2.86 ms, sys: 3.9 ms, total: 6.76 ms
Wall time: 101 ms


In [9]:
# Same SMILES string as above, encoded to integer piece ids instead of piece strings.
print(sp_bpe.encode("Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O"))

[24, 4, 7, 14, 9, 4, 6, 4, 12, 52, 6, 41, 37, 11, 5, 14, 7, 26, 3, 10, 8, 5, 4, 7, 4, 6, 8, 5, 13, 4, 7, 3, 9, 15, 8]


In [8]:
# decode() also accepts a batch (a list of id lists); each entry round-trips
# back to the original SMILES string.
print(sp_bpe.decode([[24, 4, 7, 14, 9, 4, 6, 4, 12, 52, 6, 41, 37, 11, 5, 14, 7, 26, 3, 10, 8, 5, 4, 7, 4, 6, 8, 5, 13, 4, 7, 3, 9, 15, 8],
                     [24, 4, 7, 14, 9, 4, 6, 4, 12, 52, 6, 41, 37, 11, 5, 14, 7, 26, 3, 10, 8, 5, 4, 7, 4, 6, 8, 5, 13, 4, 7, 3, 9, 15, 8]]))

['Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O', 'Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O']


In [12]:
# Number of pieces in the loaded vocabulary (trained with --vocab_size=300).
sp_bpe.get_piece_size()

300

In [14]:
# Look up the piece string for a single id.
sp_bpe.id_to_piece(2)

'</s>'

In [15]:
# id_to_piece also accepts a list of ids and returns a list of pieces.
sp_bpe.id_to_piece([2, 3, 4])

['</s>', 'C', 'c']

In [7]:
# Id of the unknown-piece token.
sp_bpe.piece_to_id('<unk>')

0

In [4]:
# Id of the '<s>' special token.
sp_bpe.piece_to_id('<s>')

1

In [6]:
# Id of the '</s>' special token.
sp_bpe.piece_to_id('</s>')

2

In [18]:
# Batch lookup. Per the recorded output, '\r' is not a vocabulary piece and
# maps to id 0, the same id as '<unk>'.
sp_bpe.piece_to_id(['</s>', '\r', '▁'])

[2, 0, 63]

In [19]:
# len() on the processor reports the same count as get_piece_size();
# CharDataset relies on this for its vocab_size.
len(sp_bpe)

300

# Train minGPT model

In [9]:
# make deterministic
# Seed through minGPT's helper with a fixed value so reruns are repeatable
# (which RNGs it seeds is defined in mingpt.utils, not shown here).
from mingpt.utils import set_seed
set_seed(42)

In [10]:
# Configure root logging once: timestamped records at INFO and above, which
# is what makes library log lines (e.g. from mingpt.model) show up in cell output.
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)

In [11]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [30]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Sliding-window next-token dataset over a tokenized text corpus.

    The whole corpus is encoded once with ``sp``. Item ``i`` is the pair
    ``(tokens[i : i + block_size], tokens[i + 1 : i + block_size + 1])``,
    i.e. the target sequence is the input sequence shifted by one token.

    Args:
        data: raw text to tokenize.
        block_size: number of tokens in each input/target sequence.
        sp: tokenizer exposing ``encode(text)`` (returns the token ids for
            ``text``) and ``len(sp)`` (vocabulary size).
    """

    def __init__(self, data, block_size, sp):
        self.sp = sp
        self.block_size = block_size
        self.vocab_size = len(sp)
        self.data = self.sp.encode(data)

        print('data has %d characters, %d unique pieces, %d after encoded.' % (len(data), self.vocab_size, len(self.data)))

    def __len__(self):
        """Return the number of full windows of ``block_size + 1`` tokens.

        Clamped at zero: a corpus shorter than one window would otherwise
        give a negative value, which makes ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` long tensors for window ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this check slicing silently truncates, so plain
                iteration over the dataset would never terminate.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError('CharDataset index out of range')

        # grab a chunk of (block_size + 1) tokens from the data
        window = self.data[idx:idx + self.block_size + 1]

        x = torch.tensor(window[:-1], dtype=torch.long)
        y = torch.tensor(window[1:], dtype=torch.long)
        return x, y

In [31]:
block_size = 128 # spatial extent of the model for its context

# Read the whole training corpus into memory; the context manager closes the
# file handle as soon as the read finishes.
with open('guacamol_v1_train.smiles', 'r') as f:
    text = f.read()
# Tokenize the corpus and expose it as sliding windows of `block_size` tokens.
train_dataset = CharDataset(text, block_size, sp_bpe)

data has 61841218 characters, 300 unique pieces, 49406762 after encoded.


In [32]:
# Peek at the first block_size + 1 raw characters of the corpus
# (characters of `text`, not token ids).
chunk = text[:block_size + 1]
chunk

'CCC(C)(C)Br\nCCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O\nOc1ccc(C2CC(c3ccccc3)=NN2C(=S)Nc2ccccc2)cc1\nCC1(C)OCC2OC3'

In [33]:
# First training example: an (x, y) pair of token-id tensors, where y is x
# shifted one position to the right.
train_dataset[0]

(tensor([ 28,   3,   6,   3,   5,   6,   3,   5,  44, 104,   6,   3,   3,   4,
           7,  13,   4,  23,   4,   9,  17,  19,   4,   7,   5,   3,  10,   8,
           5,   3,   7,   8,   3,   6,   3,  10,   8,   5,   8,   5,  15,   3,
           3,   6,  11,   5,   3,   7,  22,   6,   3,   5,  15,   8,  31,   4,
           7,  13,   6,   3,   9,   3,   3,   6,   4,  12,  17,  26,  15,  11,
          11,   9,   3,  10,  27,   5,  11,   4,   9,  17,  19,  14,   7,  28,
           7,   6,   3,   5,   8,   3,   3,   9,   8,   3,  12,   6,   3,  21,
           8,   3,   6,   3,   5,   6,   3,   5,   8,   3,  21,   3,   8,   5,
           8,   3,   6,   3,   5,   6,   3,   5,   8,   3,  12,   3,   9,   8,
           7,  32]),
 tensor([  3,   6,   3,   5,   6,   3,   5,  44, 104,   6,   3,   3,   4,   7,
          13,   4,  23,   4,   9,  17,  19,   4,   7,   5,   3,  10,   8,   5,
           3,   7,   8,   3,   6,   3,  10,   8,   5,   8,   5,  15,   3,   3,
           6,  11,   5,   3,   

In [34]:
from mingpt.model import GPT, GPTConfig

# Vocabulary and context sizes are taken from the dataset so the model and
# the data cannot disagree; the architecture is 8 layers, 8 heads, 512-dim embeddings.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=8,
    n_head=8,
    n_embd=512,
)
model = GPT(mconf)

09/27/2021 11:34:39 - INFO - mingpt.model -   number of parameters: 2.559283e+07


In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# Number of passes over the dataset. Named once because it also determines
# the length of the learning-rate decay schedule (final_tokens) below.
max_epochs = 2

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=max_epochs, batch_size=512, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20,
                      # total tokens processed over the whole run
                      final_tokens=max_epochs*len(train_dataset)*block_size,
                      num_workers=4)
# NOTE(review): the third positional argument is None -- presumably the
# evaluation-dataset slot; confirm against mingpt.trainer.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 25931: train loss 0.62981. lr 5.736664e-04:  27%|██▋       | 25932/96498 [4:10:56<11:16:51,  1.74it/s]

In [None]:
# Sample new SMILES text from the trained model.
from mingpt.utils import sample

context = "C"
# CharDataset is tokenized with SentencePiece and defines no `stoi`/`itos`
# tables, so the prompt is encoded and the output decoded with the same
# processor the training data went through (train_dataset.sp).
x = torch.tensor(train_dataset.sp.encode(context), dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
completion = train_dataset.sp.decode(y.tolist())
print(completion)