In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader
import math, random, sys
import numpy as np
import argparse
import os
from tqdm import tqdm
import pandas as pd

sys.path.append('/home/lnptest/SB_jin/git_test/hgraph2graph_wandb/')
from hgraph import *
from hgraph import vocab2
import rdkit

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "2"  # Set the GPU 2 to use

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

lg = rdkit.RDLogger.logger() 
lg.setLevel(rdkit.RDLogger.CRITICAL)

# argparse 대신 직접 args 설정
class Args:
    vocab = "recovered_vocab_2000.txt"
    atom_vocab = common_atom_vocab
    model = "ckpt/model_origin.ckpt"
    seed = 7
    nsample = 100
    rnn_type = 'LSTM'
    hidden_size = 250
    embed_size = 250
    batch_size = 50
    latent_size = 32
    depthT = 15
    depthG = 15
    diterT = 1
    diterG = 3
    dropout = 0.0

args = Args()


vocab = [x.strip("\r\n ").split() for x in open(args.vocab)] 
args.vocab = PairVocab(vocab)

model = HierVAE(args).to(device)
    
model.load_state_dict(torch.load(args.model)[0])
model.eval()

Device: cuda
Current cuda device: 0
Count of using GPUs: 4


HierVAE(
  (encoder): HierMPNEncoder(
    (E_c): Sequential(
      (0): Embedding(1578, 250)
      (1): Dropout(p=0.0, inplace=False)
    )
    (E_i): Sequential(
      (0): Embedding(5623, 250)
      (1): Dropout(p=0.0, inplace=False)
    )
    (W_c): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_i): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_root): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): Tanh()
    )
    (tree_encoder): MPNEncoder(
      (W_o): Sequential(
        (0): Linear(in_features=500, out_features=250, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.0, inplace=False)
      )
      (rnn): LSTM(
        (W_i): Sequential(
          (0): Linear(in_features=520, out_features=250, bias=True)
          (1): Sigmoid()
       

In [29]:
torch.manual_seed(args.seed)
random.seed(args.seed)


sm_list = []
smiles_list = []
with torch.no_grad():
    for _ in tqdm(range(args.nsample // args.batch_size)):
        smiles_list = model.sample(args.batch_size, greedy=True)
        for _,smiles in enumerate(smiles_list):
            sm_list.append(smiles)
            
# with torch.no_grad():
#     for _ in tqdm(range((args.nsample + args.batch_size - 1) // args.batch_size)):
#         batch_smiles_list = model.sample(args.batch_size, greedy=True)
#         valid_smiles_list = [smiles for smiles in batch_smiles_list if smiles is not None]  # None 값 필터링
#         smiles_list.extend(valid_smiles_list)

100%|██████████| 2/2 [00:02<00:00,  1.42s/it]


In [30]:
print(len(sm_list))
sm_list

100


['C=Cc1c(C2OC(=O)C(C)=C(C)C23OC(c2ccccc2)=NC(c2ccccc2)C3c2ccccc2)c(CCc2ccccc2)c(C2OC(=O)C(C)=C(C)C23OC(c2ccccc2)=NC(c2ccccc2)C3c2ccccc2)c(C(C)=O)c1C1OC(=O)C(C)=C(C)C12OC(c1ccccc1)=NC(c1ccccc1)C2c1ccccc1',
 'Cc1cn(S(=O)(=O)CS)nc1Br',
 'COCCC(C)=O',
 'C=O',
 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(O)C1CC(CCCCCCCCCCC)C1=O',
 'COCC(C)=O',
 'C=O',
 'CC(=O)c1c(C)c(C(C)=O)c(C(C)(C)C)c(C(C)=O)c1C(C)=O',
 'COC(C)=O',
 'CC(C)=O',
 'CC(C)=O',
 'COC1C=CC=CC12CCN(C1(N)CC(C)C1(C)C)C1CN(c3cn[nH]c3)CC12',
 'C=CCCCCCCCC',
 'COC(C)=O',
 'CCc1cccc(OC)c1',
 'CC(=O)CCC1C(=O)OC(C)=C1C1=C(C(C)=O)C(CCC(C)=O)C(=O)O1',
 'O=C1NNC(=O)C1=O',
 'CC(C)=O',
 'C=O',
 'CS(N)(=O)=O',
 'COC1(C(C)=O)OC2(C(=O)C(=O)N(C)C2=O)N1C',
 'Cc1nc2n(c1-c1cccc(F)c1)C(=O)C(=O)N(C1C=C(Cc3cccc(F)c3)C=CN1C)C(=O)C2=O',
 'COC(C)(C)C',
 'CC1CC(N2C(=O)C3(C4C2OC(=S)N4C2NC(=O)c4ccccc42)N(C2C(=O)N(C4CCNC4)C4(C2=O)C2(ON5C4(C)C(=O)C46C(=O)N(C78CC(=O)C7C(=O)N(CN=O)C8=O)C(=O)C54OC(=O)N6C45CNCC4C(=O)C(=O)N5)C(=O)C(=O)N(C45CC(=O)C4C(=O)N(CN=O)C5=O)C2

In [None]:
# DataFrame으로 변환
df = pd.DataFrame(sm_list, columns=['SMILES']) 
df.to_csv('result.csv')
df

In [None]:
smiles_list

In [None]:
df

In [None]:
print(dir(args.vocab))

In [None]:
print(args.vocab)

# we train

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader
import math, random, sys
import numpy as np
import argparse
import os
from tqdm import tqdm
import pandas as pd

sys.path.append('/home/lnptest/SB_jin/git_test/hgraph2graph_wandb/')
from hgraph import *
# from hgraph import vocab2
import rdkit

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "2"  # Set the GPU 2 to use

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

lg = rdkit.RDLogger.logger() 
lg.setLevel(rdkit.RDLogger.CRITICAL)

# argparse 대신 직접 args 설정
class Args:
    vocab = "vocab.txt"
    atom_vocab = common_atom_vocab
    model = "/home/lnptest/SB_jin/git_test/hgraph2graph_wandb/ckpt/chembl-240709-150/model_1604.ckpt"
    seed = 7
    nsample = 100000
    rnn_type = 'LSTM'
    hidden_size = 512
    embed_size = 512
    batch_size = 50
    latent_size = 32
    depthT = 15
    depthG = 15
    diterT = 1
    diterG = 3
    dropout = 0.0

args = Args()


vocab = [x.strip("\r\n ").split() for x in open(args.vocab)] 
args.vocab = PairVocab(vocab)

model = HierVAE(args).to(device)
    
model.load_state_dict(torch.load(args.model)[0])
model.eval()

Device: cuda
Current cuda device: 0
Count of using GPUs: 4




HierVAE(
  (encoder): HierMPNEncoder(
    (E_c): Sequential(
      (0): Embedding(1576, 512)
      (1): Dropout(p=0.0, inplace=False)
    )
    (E_i): Sequential(
      (0): Embedding(5625, 512)
      (1): Dropout(p=0.0, inplace=False)
    )
    (W_c): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_i): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_root): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): Tanh()
    )
    (tree_encoder): MPNEncoder(
      (W_o): Sequential(
        (0): Linear(in_features=1024, out_features=512, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.0, inplace=False)
      )
      (rnn): LSTM(
        (W_i): Sequential(
          (0): Linear(in_features=1044, out_features=512, bias=True)
          (1): Sigmoid()
  

In [9]:
torch.manual_seed(args.seed)
random.seed(args.seed)


sm_list = []
smiles_list = []
with torch.no_grad():
    for _ in tqdm(range(args.nsample // args.batch_size)):
        smiles_list = model.sample(args.batch_size, greedy=True)
        # print(type(smiles_list))
        # print(smiles_list[:5])
        for _,smiles in enumerate(smiles_list):
            sm_list.append(smiles)

 25%|██▌       | 5/20 [00:00<00:00, 21.19it/s]

<class 'list'>
['CC', 'CC', 'CCC', 'CC', 'CCC']
<class 'list'>
['CC', 'CC', 'CC', 'CC', 'CO']
<class 'list'>
['CCNC', 'CC', 'CC', 'CC', 'CCC']
<class 'list'>
['CC', 'CC', 'CC', 'CCC', 'CC']
<class 'list'>
['CC', 'CC', 'CCC', 'CC', 'CCC']


 40%|████      | 8/20 [00:00<00:00, 21.84it/s]

<class 'list'>
['CCC', 'CC', 'CC', 'CNC', 'CC']
<class 'list'>
['CCCC', 'CCC', 'CC', 'CC', 'CNC']
<class 'list'>
['CC', 'CCC', 'CC', 'CC', 'CO']
<class 'list'>
['CNC', 'CC', 'CC', 'CCC', 'CCC']
<class 'list'>
['CC', 'CC', 'CCC', 'CC', 'CC']
<class 'list'>
['CCC', 'CC', 'CC', 'CCC', 'CCC']


 70%|███████   | 14/20 [00:00<00:00, 23.56it/s]

<class 'list'>
['CCC', 'CC', 'CCC', 'CC', 'CNC']
<class 'list'>
['CC', 'CC', 'CC', 'CCC', 'CO']
<class 'list'>
['CC', 'CC', 'CC', 'CC', 'CNC']
<class 'list'>
['CNC', 'CC', 'CN', 'CC', 'CCC']
<class 'list'>
['CNC', 'CO', 'CO', 'CC', 'CCC']


100%|██████████| 20/20 [00:00<00:00, 22.93it/s]

<class 'list'>
['CNC', 'CC', 'CO', 'CC', 'CC=O']
<class 'list'>
['CC', 'CCC', 'CC', 'CCC', 'CNC']
<class 'list'>
['CCC', 'CCC', 'CCC', 'CNC', 'CC']
<class 'list'>
['CC', 'CNC', 'CC', 'CC', 'CNC']





In [11]:
torch.manual_seed(args.seed)
random.seed(args.seed)


sm_list = []
smiles_list = []
with torch.no_grad():
    for _ in tqdm(range(args.nsample // args.batch_size)):
        smiles_list = model.sample(args.batch_size, greedy=True)
        for _,smiles in enumerate(smiles_list):
            if len(smiles) > 20:
                sm_list.append(smiles)
            
# with torch.no_grad():
#     for _ in tqdm(range((args.nsample + args.batch_size - 1) // args.batch_size)):
#         batch_smiles_list = model.sample(args.batch_size, greedy=True)
#         valid_smiles_list = [smiles for smiles in batch_smiles_list if smiles is not None]  # None 값 필터링
#         smiles_list.extend(valid_smiles_list)

print(len(sm_list))
sm_list

100%|██████████| 2000/2000 [01:27<00:00, 22.87it/s]

0





[]

In [12]:
print(len(sm_list))
sm_list

0


[]

In [12]:
torch.manual_seed(args.seed)
random.seed(args.seed)


sm_list = []
smiles_list = []
with torch.no_grad():
    for _ in tqdm(range(args.nsample // args.batch_size)):
        smiles_list = model.sample(args.batch_size, greedy=True)
        for _,smiles in enumerate(smiles_list):
            if len(smiles) > 20:
                sm_list.append(smiles)
            
# with torch.no_grad():
#     for _ in tqdm(range((args.nsample + args.batch_size - 1) // args.batch_size)):
#         batch_smiles_list = model.sample(args.batch_size, greedy=True)
#         valid_smiles_list = [smiles for smiles in batch_smiles_list if smiles is not None]  # None 값 필터링
#         smiles_list.extend(valid_smiles_list)

print(len(sm_list))
sm_list

100%|██████████| 200000/200000 [2:08:48<00:00, 25.88it/s]  

30473





['CC1C[NH2+]C(C)C(C)C[NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CSc2ccccc2CSCC(=O)SS1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'O=C1CSc2ccccc2CSCCSS1',
 'CC1CCC(=O)[NH2+][NH2+]1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CCC(=O)[NH2+][NH2+]1',
 'CC1CSc2ccccc2CSCC(=O)SS1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CCC(=O)[NH2+][NH2+]1',
 'CC1CSc2ccccc2CSCC(=O)SS1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CSc2ccccc2CSCC(=O)SS1',
 'CC1=CCCCCOCC(=O)C(C)C1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1C[NH2+]C(C)C(C)C[NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CC(C)[NH+]([O-])[NH2+]1',
 'c1ccc2c(c1)CSCCSSCCS2',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',
 'CC1C[NH2+]C(C)C(C)C[NH2+]1',
 'CC1CC(C)C(C)[NH2+][NH2+]1',

In [1]:
print(len(sm_list))
sm_list

NameError: name 'sm_list' is not defined

In [16]:
len(sm_list[5])

25