In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.rdMolDescriptors import CalcNumHBD    
from rdkit.Chem.rdMolDescriptors import CalcNumHBA
from rdkit.Chem.rdMolDescriptors import CalcTPSA
from rdkit import Chem
from rdkit.Chem.QED import qed
from utils import decode_smiles_from_indexes, load_dataset

from model import MolecularICVAE
from rdkit import RDLogger   
RDLogger.DisableLog('rdApp.*')

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MolecularICVAE().to(device)

In [3]:
model.load_state_dict(torch.load('./result/model/MW_model.pth'))

<All keys matched successfully>

### Warning
**please note because the input value should be the normalized property value ranging from 0 to 500. So you need to change the true property value to normalized property firstly. In the following, we give you the two value y.min() and y_range = (y.max()-y.min()) for esily change:**

    
MW: [150.025169192,          201.12958056818]

SAS: [1.0574495523703487,  6.351893861381157]

TPSA: [0.,                149.48999999999998]

logP: [-4.9017,           10.970920000000003]

QED: [0.2335469098782191, 0.7145762918458708]

HBA: [0,                                  10]

HBD: [0,                                   5]

In [4]:
 def reconstructed(autoencoder, charset, p1):
    valid_smile = []
    
    p1 = (p1 - 150.025169192)/201.12958056818*500
    ## the two value 150.025169192 and 201.12958056818 need to change if you use want to generate other property.

    nums = 1000
    x = np.linspace(0, 10, 33)

    prop_np  = np.zeros((nums, 128))
    prop_np[:,0:2] = np.ones((nums,2))*p1
    
    p_ar = np.array([p1])
    p = np.repeat(p_ar[:], nums,0)
    for i in range(1000):
        
        lat =  np.random.normal(0., 1., size=(nums, 128)).astype ('float32')    
        lat = lat + prop_np
                
        lat_torch =  torch.Tensor(lat).to(device)
        cond_torch =  torch.Tensor(p).to(device)
        
        output = autoencoder.decode(lat_torch, cond_torch)
        outp = output.cpu().detach().numpy()
        
        for j in range(nums):
            decode_smi = outp[j].reshape(1, 120, len(charset)).argmax(axis=2)[0]
            smi = decode_smiles_from_indexes(decode_smi, charset)
            m = Chem.MolFromSmiles(smi)
            #if (m != None) and (' ' not in smi) and (abs(ExactMolWt(m)-p1)<10*2)  and (abs(MolLogP(m)-p2)<0.548546*2)  and (abs(calculateScore(m)-p3)<0.3176*2) and (abs(qed(m)-p4)<0.0357*2) and (abs(CalcNumHBA(m)-p5)<1*2) and (abs(CalcNumHBD(m)-p6)<1*2)and (abs(CalcTPSA(m)-p7)<7.47*2):
            if (m != None) and (' ' not in smi):
                valid_smile.append(smi)
                
    valid_smile = list(set(valid_smile))
 
    return valid_smile

### Sampling by setting the lantent value (condition) at 210

In [5]:
lines = []
valid_smile_all = []

condition = 210
## set the condition value (MW value) 
X_train, X_test, charset = load_dataset('./data/processed.h5')
valid_smile = reconstructed(model, charset, condition)

In [6]:
valid_smile

['CCCCCCCCCCCCOOOCCCCCCCC',
 'CC(C)CCC1CcCCC1',
 'Cc1cccnnn1BBBC',
 'C[C@@H]CCCCCCCCCCCCCCCOOO',
 'C1cccccc1NBOOO',
 'CCOC(=O)c1ccccccc1BB',
 'C[C@@H]CCCOOCCCCCCCC',
 'CCCCCCCCCCCCCCCOOOO',
 'CC1CCCCCCC1CCCSS',
 'c1ccccccc1CCCCCCCCCOOOO',
 'CNNCCCCCCCCCCCCCCCCOOOOOO',
 'CC(CCCCCCCCCCCCCCCC)CCCCC',
 'c1ccccccc1NCCCCCCCC',
 'CC1CC=C1CCCCSSCC',
 'Cc1ccccc1c2ccccc2CCCCOOO',
 'C[C@@H]1cccccc1CNCCCO',
 'c1ccc(cc1SSS(SO))N',
 'CC(C)CCCCCOOCCCCCOO(OO)',
 'CCC=CCCCCCCCCCCCCCCCC',
 'CC(C(=O)CCCCCCCCCCC)C',
 'Cc1ccccccccccc1CC',
 'c1ccc(cc1S2ccccoo2)',
 'CC(C(CCCCCCCCCCOO))OCCCCCCC',
 'CCCCCCCCCCCCCCCCCCCCOO',
 'CC(CC)CCCCCCCCCCCCCOOO',
 'c1ccc(cc1)CCCCCCCCCCCCCCOOOOOO',
 'CCCCCCCCCCCCCCCCCCCCCCCO',
 'CCCCCCCCCCCCCCSS',
 'c1cc(ccc1SSS(S))NNN',
 'CCCCCCCCCC(C)CCCCCCCCCC',
 'c1ccccccc1COOOOOCCO',
 'CNC=CCCCCCCCCCC',
 'CCCCCC(CCCCCCCCCCCCCCC)',
 'c1ccc2cc1cccnnnnnn2S#CCCC',
 'CCOC(OO)CCCCCCCCCCC',
 'CCCCCCCCCCCCCCCCCCCC',
 'C1cccccc1CCCOOOOO',
 'CC(C)CCCCCOOCCCCC(OOO)O',
 'Cc1c(nno1)BBBNNNN',
 'c1cc

### filtering the molecular within the interval [200, 220] and save the result in local path

In [7]:
all_smile = []

all_smile = list(set(valid_smile))

mw = np.zeros((len(all_smile)))

for i in range(len(all_smile)):
    s = all_smile[i]
    m = Chem.MolFromSmiles(s)
    mw[i] = ExactMolWt(m)

lines = []
for i in range(len(mw)):
    if (mw[i]>=200) and (mw[i]<=220):
        lines.append(all_smile[i] + "     " + str(mw[i]))

with open("./result/sampling/MW_condition_210.txt", 'a+') as f:
    f.seek(0)
    data = f.read(100)

    if len(data)>0 :
        f.write("\n")
    for line in lines:
        f.write(line)
        f.write('\n')