In [1]:
import gentrl
import torch
import pickle
import pandas as pd
import numpy as np
from rdkit.Chem import Draw
from moses.metrics import mol_passes_filters, QED, SA, logP
from moses.metrics.utils import get_n_rings, get_mol
import matplotlib.pyplot as plt
import random
from utilities.config import Config
from rdkit import RDLogger
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import DataStructs, Chem
from rdkit.Chem.Fingerprints.FingerprintMols import FingerprintMol

# Make CUDA device 0 the current device for this kernel session.
torch.cuda.set_device(0)
# Turn off RDKit log messages for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Size shared by the encoder's latent output, the decoder's latent input and
# the number of latent descriptors handed to GENTRL; keep all three in sync.
LATENT_SIZE = 50
# Descriptor tuple reused for every latent entry and for the single feature
# entry (semantics are defined by gentrl -- see its documentation).
DESCRIPTOR = ('c', 20)

enc = gentrl.RNNEncoder(latent_size=LATENT_SIZE)
dec = gentrl.DilConvDecoder(latent_input_size=LATENT_SIZE)
model = gentrl.GENTRL(
    enc,
    dec,
    [DESCRIPTOR] * LATENT_SIZE,
    [DESCRIPTOR],
    beta=0.001,
)
model.cuda();

# Restore the saved weights, then move the model to the GPU again
# (same cuda -> load -> cuda order as before).
model.load('saved_gentrl_LA/')
model.cuda();

## Using the molecular skeleton to determine whether AL_37 is reproducible

Across five independent runs, a molecule sharing AL_37's Murcko scaffold first appeared after roughly 125,000, 27,000, 50,000, 25,000, and 100,000 molecules had been generated, respectively.

In [3]:
def sample_from_model(model, target_smiles="CN(C)c1cc(C)ccc1NCC(=O)OCc1cccs1",
                      batch_size=2500, max_molecules=None):
    """Sample batches until one contains the target's Murcko scaffold.

    NOTE: this notebook also defines a Tanimoto-based function with the same
    name; whichever definition cell was executed last is the one that runs.

    Args:
        model: object whose ``sample(n)`` returns an iterable of SMILES strings.
        target_smiles: SMILES of the reference molecule (defaults to AL_37).
        batch_size: number of molecules requested from ``model.sample`` per batch.
        max_molecules: optional cap on the cumulative number of valid molecules;
            ``None`` (default) keeps sampling until the scaffold is found.

    Returns:
        Tuple ``(scaf_list, sampled_valid, generated_num)`` where ``scaf_list``
        is the set of scaffold SMILES of the LAST batch, ``sampled_valid`` is
        the list of valid SMILES of the LAST batch, and ``generated_num`` is
        the number of valid molecules accumulated over all batches. When the
        cap stops the search early, the target scaffold is absent from
        ``scaf_list``.
    """
    target_scaffold = MurckoScaffold.MurckoScaffoldSmilesFromSmiles(target_smiles)
    generated_num = 0

    while True:
        sampled = model.sample(batch_size)
        # Keep only SMILES that get_mol accepts and that are longer than 10 characters.
        sampled_valid = [s for s in sampled if get_mol(s) and len(s) > 10]
        generated_num += len(sampled_valid)
        print(f"already generate: {generated_num} molecules")

        scaf_list = {
            MurckoScaffold.MurckoScaffoldSmilesFromSmiles(sm) for sm in sampled_valid
        }
        if target_scaffold in scaf_list:
            break
        # Optional safety net so an unlucky run cannot loop forever.
        if max_molecules is not None and generated_num >= max_molecules:
            break

    return scaf_list, sampled_valid, generated_num
    

In [5]:
# Run the search with the currently defined sample_from_model. The recorded
# output matches the scaffold-based variant, whose returned scaffold set and
# SMILES list describe only the final sampled batch.
scaf_set,sampled_valid, generated_num = sample_from_model(model)

already generate: 598 molecules
already generate: 1233 molecules
already generate: 1879 molecules
already generate: 2500 molecules
already generate: 3187 molecules
already generate: 3816 molecules
already generate: 4487 molecules
already generate: 5079 molecules
already generate: 5707 molecules
already generate: 6350 molecules
already generate: 6968 molecules
already generate: 7589 molecules
already generate: 8212 molecules
already generate: 8823 molecules
already generate: 9452 molecules
already generate: 10105 molecules
already generate: 10759 molecules
already generate: 11398 molecules
already generate: 12031 molecules
already generate: 12670 molecules
already generate: 13340 molecules
already generate: 13973 molecules
already generate: 14599 molecules
already generate: 15235 molecules
already generate: 15891 molecules
already generate: 16507 molecules
already generate: 17144 molecules
already generate: 17790 molecules
already generate: 18404 molecules
already generate: 19023 molecules

In [6]:
# Reference compound AL_37 and its Murcko scaffold, both as SMILES.
al_37 = "CN(C)c1cc(C)ccc1NCC(=O)OCc1cccs1"
scaf_37 = MurckoScaffold.MurckoScaffoldSmilesFromSmiles(al_37)

# Print every molecule in `sampled_valid` whose scaffold equals AL_37's.
for sm in sampled_valid:
    if MurckoScaffold.MurckoScaffoldSmilesFromSmiles(sm) != scaf_37:
        continue
    print(sm)

COC(=O)Nc1ccc(C)c(NCC(=O)OCc2cccs2)c1


In [8]:
# Display the Murcko scaffold SMILES of AL_37 (assigned in an earlier cell).
scaf_37

'O=C(CNc1ccccc1)OCc1cccs1'

## Using Tanimoto coefficient similarity to determine whether AL_37 is reproducible

In [4]:
def sample_from_model(model, threshold=0.9,
                      target_smiles="CN(C)c1cc(C)ccc1NCC(=O)OCc1cccs1",
                      batch_size=2500, max_molecules=None):
    """Sample batches until a molecule reaches a Tanimoto similarity threshold.

    NOTE: this notebook also defines a scaffold-based function with the same
    name; whichever definition cell was executed last is the one that runs.

    Args:
        model: object whose ``sample(n)`` returns an iterable of SMILES strings.
        threshold: minimum Tanimoto similarity to the target that counts as a hit.
        target_smiles: SMILES of the reference molecule (defaults to AL_37).
        batch_size: number of molecules requested from ``model.sample`` per batch.
        max_molecules: optional cap on the cumulative number of valid molecules;
            ``None`` (default) keeps sampling until a hit is found.

    Returns:
        Tuple ``(scaf_list, sampled_valid, generated_num)`` where ``scaf_list``
        is the set of Murcko scaffold SMILES of the LAST batch, ``sampled_valid``
        is the list of valid SMILES of the LAST batch, and ``generated_num`` is
        the number of valid molecules accumulated over all batches.
    """
    # One shared set of fingerprint settings so target and candidates are comparable.
    fp_params = dict(minPath=1, maxPath=7, fpSize=2048, bitsPerHash=2,
                     useHs=True, tgtDensity=0.0, minSize=128)
    fp_target = FingerprintMol(Chem.MolFromSmiles(target_smiles), **fp_params)

    generated_num = 0
    found = False
    while not found:
        sampled = model.sample(batch_size)
        # Keep only SMILES that get_mol accepts and that are longer than 10 characters.
        sampled_valid = [s for s in sampled if get_mol(s) and len(s) > 10]
        generated_num += len(sampled_valid)
        print("already generate: " + str(generated_num) + " molecules")

        maxsim = 0  # best similarity seen within the current batch
        for sm in sampled_valid:
            fp_cur = FingerprintMol(Chem.MolFromSmiles(sm), **fp_params)
            sim = DataStructs.BulkTanimotoSimilarity(fp_target, [fp_cur])[0]
            maxsim = max(maxsim, sim)
            if sim >= threshold:
                print(sm)
                # The flag ends the outer loop; a bare `break` only leaves this `for`.
                found = True
                break
        print("current max similarity: " + str(maxsim))

        # Optional safety net so an unreachable threshold cannot loop forever.
        if max_molecules is not None and generated_num >= max_molecules:
            break

    # The first return slot was an undefined name before; fill it with the
    # scaffold set of the final batch, matching the scaffold-based variant.
    scaf_list = {
        MurckoScaffold.MurckoScaffoldSmilesFromSmiles(sm) for sm in sampled_valid
    }
    return scaf_list, sampled_valid, generated_num
    

In [13]:
# Search for a generated molecule with Tanimoto similarity >= 0.45 to AL_37.
# NOTE(review): the TypeError recorded below means the scaffold-based
# `sample_from_model(model)` was the live definition when this cell ran;
# execute the Tanimoto definition cell first, then re-run this one.
scaf_set,sampled_valid, generated_num = sample_from_model(model,threshold=0.45)

TypeError: sample_from_model() got an unexpected keyword argument 'threshold'

In [9]:
# Fingerprint settings shared by both molecules being compared.
fp_kwargs = dict(minPath=1, maxPath=7, fpSize=2048, bitsPerHash=2,
                 useHs=True, tgtDensity=0.0, minSize=128)

# Reference compound AL_37.
al_37 = "CN(C)c1cc(C)ccc1NCC(=O)OCc1cccs1"
fp_37 = FingerprintMol(Chem.MolFromSmiles(al_37), **fp_kwargs)

# Same SMILES as the Murcko scaffold reported for AL_37 earlier in the notebook.
sm_cur = "O=C(CNc1ccccc1)OCc1cccs1"
fp_cur = FingerprintMol(Chem.MolFromSmiles(sm_cur), **fp_kwargs)

# One-element list in, one-element list of Tanimoto similarities out.
DataStructs.BulkTanimotoSimilarity(fp_37, [fp_cur])

[0.6533864541832669]