In [1]:
import gentrl
import torch
import pickle
import pandas as pd
import numpy as np
from rdkit.Chem import Draw
from rdkit import RDLogger
from moses.metrics import mol_passes_filters, QED, SA, logP
from moses.metrics.utils import get_n_rings, get_mol
import matplotlib.pyplot as plt
import random
import os
from utilities.candiconfig import CandiConfig

# Make GPU 0 the current CUDA device for the .cuda() calls that follow.
torch.cuda.set_device(0)
# Turn off RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')


In [2]:
# Rebuild the GENTRL architecture, then restore the saved weights into it.
LATENT_SIZE = 50
# One descriptor entry as passed to gentrl.GENTRL.
# NOTE(review): presumably ('c', 20) marks a continuous dimension — confirm in gentrl.
DESCR_ENTRY = ('c', 20)

enc = gentrl.RNNEncoder(latent_size=LATENT_SIZE)
dec = gentrl.DilConvDecoder(latent_input_size=LATENT_SIZE)
model = gentrl.GENTRL(
    enc,
    dec,
    [DESCR_ENTRY] * LATENT_SIZE,
    [DESCR_ENTRY],
    beta=0.001,
)
model.cuda();

model.load('candi_saved_gentrl/')
# NOTE(review): .cuda() is called a second time after load(); presumably load()
# leaves part of the model on the CPU — confirm before removing either call.
model.cuda();

In [3]:
# def get_num_rings_6(mol):
#     r = mol.GetRingInfo()
#     return len([x for x in r.AtomRings() if len(x) > 6])


# def penalized_logP(mol_or_smiles, masked=False, default=-5):
#     mol = get_mol(mol_or_smiles)
#     if mol is None:
#         return default
#     reward = logP(mol) - SA(mol) - get_num_rings_6(mol)
#     if masked and not mol_passes_filters(mol):
#         return default
#     return reward

In [4]:
#  # old, 2021-07-26 -- original version: generates `size` candidate molecules; revised version: can generate `size` valid molecules

# def sample_from_model(model, fps_som, exist, size=1000, threshold = 0):
#     generated = []

#     while len(generated) < size:
#         sampled = model.sample(size//10)
#         valid_samples = []
#         for s in sampled:
#             if get_mol(s) and len(s) > 10 and s not in exist:
#                 valid_samples.append(s)
#                 exist.add(s)
        
#         generated += valid_samples
        
#     candi_smiles = []
#     grades = []
#     for s in generated:
#         if fps_som.som_reward(s) >= threshold and s not in fps_som.smiles_set:
# #         if fps_som.som_reward(s) >= threshold and s not in fps_som.smiles_set:
#             grades.append(fps_som.som_reward(s))
#             candi_smiles.append(s)
            
#     idxs = np.argsort(grades).tolist()
#     idxs = idxs[::-1]
    
#     smiles_list = [candi_smiles[i] for i in idxs]
#     grades_list = [grades[i] for i in idxs]
    
#     return smiles_list, grades_list

In [5]:
# new, 2021-07-26 Original version: Generate size candidate molecules, Revised version: Can generate size valid molecules
def sample_from_model(model, fps_som, exist, size=1000, threshold = 0, batch_size=500):
    """Sample new molecules from ``model`` until at least ``size`` are accepted.

    A sampled SMILES string is accepted when it is longer than 10 characters,
    is not already in ``exist``, is parsed successfully by ``get_mol`` and
    scores ``fps_som.som_reward(s) >= threshold``.

    Args:
        model: object exposing ``sample(n)`` that returns an iterable of
            SMILES strings.
        fps_som: object exposing ``som_reward(smiles)``.
        exist: set of SMILES to exclude. It is mutated in place: every
            accepted SMILES is added, so repeated calls sharing the same set
            never return the same molecule twice.
        size: minimum number of accepted molecules to collect.
        threshold: minimum ``som_reward`` a molecule needs to be accepted.
        batch_size: number of molecules requested per ``model.sample`` call
            (defaults to 500, the value that used to be hard-coded).

    Returns:
        ``(smiles_list, grades_list)``: the accepted SMILES and their rewards,
        both ordered by descending reward. Because whole batches are
        processed, the lists may hold more than ``size`` entries.

    Note:
        The loop only stops once ``size`` molecules were accepted; it does not
        terminate if the model can no longer produce acceptable new molecules.
    """
    generated = []
    grades = []

    progress_step = 0

    while len(generated) < size:
        sampled = model.sample(batch_size)
        # Report progress each time another size/50 molecules were collected.
        if len(generated) >= progress_step * size // 50:
            print("already have: " + str(len(generated)) + " molecules...")
            progress_step += 1
        for s in sampled:
            # Cheap length/set checks go first so that parsing and scoring
            # only run on candidates that are actually new.
            if len(s) <= 10 or s in exist or not get_mol(s):
                continue
            # Score once and reuse the value for both the filter and the result.
            grade = fps_som.som_reward(s)
            if grade >= threshold:
                generated.append(s)
                grades.append(grade)
                exist.add(s)

    # argsort is ascending; reverse it to put the best reward first.
    idxs = np.argsort(grades)[::-1].tolist()

    smiles_list = [generated[i] for i in idxs]
    grades_list = [grades[i] for i in idxs]

    return smiles_list, grades_list

In [6]:
# Build the run configuration, then restore the pickled SOM scorer it points to.
config = CandiConfig(
    smiles_format=2,
    topn_fp_features=5,
    mode='threshold',
    max_fp_features=2048,
    threshold=0.3,
    morgan_radius=2,
)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this project.
with open(config.FpsSOM_model, 'rb') as som_file:
    fps_som = pickle.load(som_file)

In [7]:
# count = len(open('./generated_smiles/generated_smiles.csv','rU').readlines())
# count

In [1]:
# 2022-09-25 Problem with deduplication, modify the file format, and make smiles in the second column

# filename = 'generated_smiles.csv'
# preText = '210501'
# lineList = []
# with open('./generated_smiles/'+filename, 'r', encoding='utf-8') as f:
#     for line in f:
#         lineList.append(line)
        
# for i in range(1, count+1):
#     pre = preText + '_' + str(i)
#     lineList[i-1] = pre+','+lineList[i-1] 
    
    
# writeFileName = 'candi_smiles_2021_05_01.csv'
# with open('./generated_smiles/'+writeFileName, 'w', encoding='utf-8') as f:
#     for line in lineList:
#         f.write(line)

In [9]:
def _read_csv_column(csv_path, column):
    """Return the set of non-empty values in ``column`` (0-based) of a comma-separated file.

    Blank lines and rows with too few fields are skipped, so a trailing empty
    line or a single-column file cannot raise IndexError.
    """
    values = set()
    with open(csv_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split(',')
            if len(fields) > column and fields[column]:
                values.add(fields[column])
    return values


# SMILES that must not be generated again. The training set keeps them in the
# first column, the previously generated candidate files in the second one.
exist_smiles = _read_csv_column('./dataset/train_dataset.csv', 0)

path = './generated_smiles/'
files = os.listdir(path)
csv_files = [path+file for file in files]

print(csv_files)

for file in csv_files:
    # listdir also returns directories (e.g. .ipynb_checkpoints); read regular files only.
    if os.path.isfile(file):
        exist_smiles |= _read_csv_column(file, 1)
                


['./generated_smiles/candi_smiles_2022_09_25_2.csv', './generated_smiles/candi_smiles_2021_09_03.csv', './generated_smiles/candi_smiles_2021_08_24.csv', './generated_smiles/.ipynb_checkpoints', './generated_smiles/generated_smiles.csv', './generated_smiles/candi_smiles_2022_09_25_1.csv', './generated_smiles/candi_smiles_2021_07_27.csv']


In [None]:
# Minimum number of accepted molecules to collect in this run.
size_num = 50000
# exist_smiles is the exclusion set; sample_from_model adds every accepted SMILES to it.
smiles_list, grades_list = sample_from_model(model, fps_som, exist_smiles, size = size_num)

already have: 0 molecules...
already have: 1005 molecules...
already have: 2000 molecules...
already have: 3009 molecules...
already have: 4000 molecules...
already have: 5007 molecules...
already have: 6010 molecules...
already have: 7007 molecules...
already have: 8006 molecules...
already have: 9001 molecules...
already have: 10010 molecules...
already have: 11003 molecules...
already have: 12002 molecules...
already have: 13005 molecules...
already have: 14009 molecules...
already have: 15003 molecules...
already have: 16010 molecules...
already have: 17002 molecules...
already have: 18001 molecules...
already have: 19004 molecules...
already have: 20004 molecules...
already have: 21004 molecules...
already have: 22004 molecules...
already have: 23002 molecules...
already have: 24003 molecules...
already have: 25008 molecules...
already have: 26016 molecules...
already have: 27010 molecules...
already have: 28000 molecules...
already have: 29005 molecules...
already have: 30000 mol

In [None]:
# Keep half of the requested sample size, capped at the number of molecules
# actually available; smiles_list is already ordered best reward first.
topn = min(size_num // 2, len(smiles_list))

selected_smiles_list = smiles_list[:topn]
selected_grades_list = grades_list[:topn]

# Persist the selection as "SMILES,SOM_REWARD" rows below a header line.
with open('candidate_smiles.txt', 'w', encoding='utf-8') as out_file:
    out_file.write("SMILES,SOM_REWARD\n")
    out_file.writelines(
        smi + ',' + str(grade) + '\n'
        for smi, grade in zip(selected_smiles_list, selected_grades_list)
    )

In [12]:
# Count the lines of the candidate file (the header line is included).
# Plain 'r' is used because the 'U' mode flag is deprecated and rejected by
# Python 3.11+; the with-block also guarantees the file handle is closed.
with open('candidate_smiles.txt', 'r', encoding='utf-8') as f:
    count = sum(1 for _ in f)
print(count)

25001


  """Entry point for launching an IPython kernel.


### The generated SMILES molecules are scored by SA, pLogP, etc.

In [13]:
import time

# Take a single timestamp so the file name and the row-id prefix can never
# disagree (two separate localtime() calls could straddle midnight).
now = time.localtime()
now_date = time.strftime("%Y_%m_%d", now)   # e.g. 2021_09_18, used in the file name
pre_date = time.strftime("%y%m%d", now)     # e.g. 210918, used as row-id prefix

# Read back the "SMILES,SOM_REWARD" rows; rewards are kept as strings because
# they are written out again unchanged.
smiles = []
som_reward = []
with open("candidate_smiles.txt", 'r', encoding='utf-8') as f:
    next(f, None)  # skip the header line
    for line in f:
        row = line.strip()
        if not row:
            continue  # tolerate blank lines such as a trailing newline
        sm, grade = row.split(',')
        smiles.append(sm)
        som_reward.append(grade)

SA_grades = [SA(get_mol(sm)) for sm in smiles]

# Rows are "<id>,<smiles>,<som_reward>,<sa_grade>" without a header line, so
# the SMILES sits in the second column.
# NOTE: mode 'w' overwrites a file already written earlier on the same day.
with open("./generated_smiles/candi_smiles_"+now_date+".csv",'w',encoding='utf-8') as f:
    for i, (sm, reward, sa_grade) in enumerate(zip(smiles, som_reward, SA_grades), start=1):
        f.write(pre_date+"_"+str(i)+','+ sm +','+ reward +','+ str(sa_grade)+'\n')
    

In [7]:
import time

In [16]:
# Current local date formatted as YYYY-MM-DD.
# NOTE(review): the saved output of `t` further down ('20210918') does not match
# this format, so it presumably predates this line — re-run the cells to refresh it.
t = time.strftime("%Y-%m-%d", time.localtime())

In [15]:
t

'20210918'