## Train a character-level GPT on SMILES strings

The inputs here are simple SMILES strings, which we chop up into individual characters and then train a GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it SMILES strings from the MOSES dataset and train it to predict the next character (i.e. the next token of the SMILES string).

In [1]:
#for colab only
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
#for colab only
# Upload the folder containing the package
# !cp -r '/content/drive/My Drive/Colab Notebooks/mingpt' /content/

In [3]:
#for colab only
# import sys
# sys.path.append('/content/mingpt')

In [4]:
#for colab only
# import mingpt

In [1]:
# set up logging
import logging
import pandas as pd
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [2]:
# make deterministic
# set_seed is a helper from the project-local mingpt package; presumably it
# seeds the Python / NumPy / PyTorch RNGs — confirm in mingpt/utils.py.
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level next-token dataset over a collection of SMILES strings.

    Item ``i`` is ``data[i]`` encoded as integer character ids and split into
    an input ``x`` (every character but the last) and a target ``y`` (every
    character but the first), so ``y[t]`` is the character following ``x[t]``.

    Args:
        data: indexable, sized collection of SMILES strings. The strings are
            not padded here; callers that batch items with the default
            collate function must pass strings of equal length.
        content: iterable of characters whose distinct elements form the
            vocabulary. Every character occurring in ``data`` must occur here,
            otherwise ``__getitem__`` raises ``KeyError``.
        block_size: context length exposed as ``self.block_size``. When
            omitted it defaults to the length of the longest string in
            ``data`` (0 for empty ``data``).

    Attributes:
        stoi: character -> integer id.
        itos: integer id -> character.
        block_size: see ``block_size`` above.
        vocab_size: number of distinct characters in ``content``.
        data: the ``data`` argument, stored as given.
    """

    def __init__(self, data, content, block_size=None):
        chars = sorted(set(content))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d smiles, %d unique characters.' % (data_size, vocab_size))

        # Derive the context length from the data rather than reading a
        # module-level variable that may not exist yet when the class is used.
        if block_size is None:
            block_size = max((len(s) for s in data), default=0)

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        # One sample per SMILES string: __getitem__ indexes self.data directly,
        # so the dataset length is the number of strings, not a block count.
        return len(self.data)

    def __getitem__(self, idx):
        smiles = self.data[idx]
        dix = [self.stoi[s] for s in smiles]
        # Shift by one position: the model sees x[:t+1] and predicts y[t].
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


In [5]:
# you can download this moses file here https://media.githubusercontent.com/media/molecularsets/moses/master/data/dataset_v1.csv
!wget https://media.githubusercontent.com/media/molecularsets/moses/master/data/dataset_v1.csv
smiles = pd.read_csv('dataset_v1.csv')['SMILES']
smiles.head()

--2023-03-05 09:34:04--  https://media.githubusercontent.com/media/molecularsets/moses/master/data/dataset_v1.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84482588 (81M) [text/plain]
Saving to: 'dataset_v1.csv.2'

     0K .......... .......... .......... .......... ..........  0% 1.63M 49s
    50K .......... .......... .......... .......... ..........  0% 4.38M 34s
   100K .......... .......... .......... .......... ..........  0% 3.30M 31s
   150K .......... .......... .......... .......... ..........  0% 7.26M 26s
   200K .......... .......... .......... .......... ..........  0% 8.22M 23s
   250K .......... .......... .......... .......... ..........  0% 6.44M 21s
   300K .......... .......... .......... .......... .........

0    CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1      CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2    CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
3       Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4          Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
Name: SMILES, dtype: object

In [6]:
# Some preprocessing: right-pad every SMILES string with "<" up to the length of
# the longest one (for us "<" is an end token).
lens = list(map(len, smiles))
max_len = max(lens)
smiles = [s.ljust(max_len, '<') for s in smiles]
print(len(smiles))

1936962


In [7]:
# All padded strings joined into one text. CharDataset builds its vocabulary from
# the set of characters in this text, so the ' ' separator becomes a vocabulary
# entry as well.
content = ' '.join(smiles)

In [8]:
# Context length: the common (padded) length of every SMILES string. It is also
# used further down when computing final_tokens for the trainer config.
block_size = max_len
print(block_size)

57


In [9]:
# Build the training set from the padded strings and the vocabulary text.
train_dataset = CharDataset(data=smiles, content=content)

data has 1936962 smiles, 28 unique characters.


In [10]:
from mingpt.model import GPT, GPTConfig

# Vocabulary size and context length come from the dataset; the remaining
# keyword arguments size the network.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=8,
    n_head=8,
    n_embd=256,
)
model = GPT(mconf)

03/05/2023 09:34:40 - INFO - mingpt.model -   number of parameters: 6.347520e+06


In [None]:
from mingpt.trainer import Trainer, TrainerConfig
import math
# initialize a trainer instance and kick off training
tconf = TrainerConfig(
    max_epochs=5,
    # NOTE(review): the original passed `h_size=128`, which looks like a truncated
    # `batch_size`. If TrainerConfig stores arbitrary keyword arguments (as upstream
    # minGPT does), the misspelt name was silently ignored and the default batch
    # size was used — confirm against mingpt/trainer.py.
    batch_size=128,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=32*20,
    # NOTE(review): the decay horizon is sized as 200 * len(train_dataset) * block_size
    # tokens while max_epochs is 5 — confirm this mismatch is intended.
    final_tokens=200*len(train_dataset)*block_size,
    num_workers=2,
)
# No test dataset is passed (third argument is None).
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

In [16]:
# alright, let's sample some molecules and draw them using rdkit
!pip install rdkit
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from IPython.core.display import HTML
from rdkit.Chem.QED import qed
from rdkit.Chem import PandasTools
from mingpt.utils import sample
import seaborn as sns



03/05/2023 09:17:29 - INFO - rdkit -   Enabling RDKit 2022.09.5 jupyter extensions


ImportError: cannot import name 'sample' from 'mingpt.utils' (C:\Users\jztra\OneDrive\Desktop\mol_GPT-master\mingpt\utils.py)

In [None]:
def show(df):
    """Return ``df`` rendered to an HTML table (notebook flavour) wrapped in ``HTML``."""
    markup = df.to_html(notebook=True)
    return HTML(markup)


# Switch on RDKit's image rendering for DataFrames (presumably so molecule columns
# display as drawings — confirm in the PandasTools documentation).
PandasTools.RenderImagesInAllDataFrames(images=True)

In [None]:
# NOTE(review): the value shown is the number of entries in `molecules`; it only
# reads as a percentage if exactly 100 strings were sampled — confirm.
f"Valid molecules % = {len(molecules)}"

In [17]:
# One record per entry of `molecules`: the molecule itself, its QED score and its
# SMILES string as produced by Chem.MolToSmiles.
mol_dict = [
    {'molecule': mol, 'qed': qed(mol), 'smiles': Chem.MolToSmiles(mol)}
    for mol in molecules
]

NameError: name 'molecules' is not defined

In [18]:
# Tabulate the per-molecule records: one row per record, one column per key
# ('molecule', 'qed', 'smiles').
results = pd.DataFrame(mol_dict)

In [19]:
# Kernel-density estimate of the QED scores, labelled so the figure can be read
# on its own; the trailing ';' suppresses the repr of the return value.
ax = sns.kdeplot(results['qed'].values)
ax.set(xlabel='QED', ylabel='Density', title='QED distribution of sampled molecules');

NameError: name 'sns' is not defined

In [20]:
# Display the results table through the `show` helper (HTML rendering of the frame).
show(results)

NameError: name 'show' is not defined

In [21]:
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import AllChem

In [22]:
# Morgan fingerprints (radius 2, 1024 bits), one per entry of `molecules`.
fp_list = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    for mol in molecules
]

# Tanimoto distance (1 - similarity) for every unordered pair of fingerprints.
n_fps = len(fp_list)
diversity = [
    1 - float(TanimotoSimilarity(fp_list[a], fp_list[b]))
    for a in range(n_fps)
    for b in range(a + 1, n_fps)
]

# Mean pairwise distance over all pairs.
"Diversity of molecules % = {}".format(np.mean(diversity))

NameError: name 'molecules' is not defined