In [None]:
import torch

# 예시 입력 데이터
a = torch.randn(2, 3).cuda()  # A matrix (2x3)
b = torch.randn(3, 4).cuda()  # B matrix (3x4)

print("Matrix A shape:", a.shape)
print("Matrix B shape:", b.shape)

# 매트릭스 곱셈
c = torch.matmul(a, b)
print("Matrix C shape:", c.shape)

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0


In [2]:
from rdkit import Chem

# SMILES 문자열을 로드하여 분자 객체 생성
smiles = 'CC(=O)Nc1ccc(Nc2ccc3c4c(cc(=O)n3C)-c3ccccc3C(=O)c24)cc1'
mol = Chem.MolFromSmiles(smiles)

# 각 원자의 인덱스 출력
for atom in mol.GetAtoms():
    print(f'Atom {atom.GetSymbol()} has index {atom.GetIdx()}')

Atom C has index 0
Atom C has index 1
Atom O has index 2
Atom N has index 3
Atom C has index 4
Atom C has index 5
Atom C has index 6
Atom C has index 7
Atom N has index 8
Atom C has index 9
Atom C has index 10
Atom C has index 11
Atom C has index 12
Atom C has index 13
Atom C has index 14
Atom C has index 15
Atom C has index 16
Atom O has index 17
Atom N has index 18
Atom C has index 19
Atom C has index 20
Atom C has index 21
Atom C has index 22
Atom C has index 23
Atom C has index 24
Atom C has index 25
Atom C has index 26
Atom O has index 27
Atom C has index 28
Atom C has index 29
Atom C has index 30


In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem

def kekulize_molecule(smiles):
    # SMILES 문자열을 통해 분자 객체 생성
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Error: Could not parse SMILES: {smiles}")
        return None
    
    try:
        # 분자를 정화(Sanitize)하여 구조적 오류를 정리
        Chem.SanitizeMol(mol)
        # Kekulization 시도
        Chem.Kekulize(mol, clearAromaticFlags=True)
        print(f"Successfully kekulized molecule: {Chem.MolToSmiles(mol)}")
    except Chem.rdchem.KekulizeException as e:
        print(f"Error kekulizing molecule: {smiles}, {e}")
        return None
    except Exception as e:
        print(f"Error processing molecule: {smiles}, {e}")
        return None
    
    return mol

# 테스트할 SMILES 문자열 목록
smiles_list = [
    "CCO",  # 에탄올
    "C1=CC=CC=C1",  # 벤젠
    "CSc1c2sc(C3=C(C(=O)[O-])N4C(=O)C(C(C)O)C4C3C)cn2c[n+]1C1CCNC1",  # 복잡한 분자
    "CC(=O)Nc1ccc(Nc2ccc3c4c(cc(=O)n3C)-c3ccccc3C(=O)c24)cc1"
]

for smiles in smiles_list:
    kekulize_molecule(smiles)

Successfully kekulized molecule: CCO
Successfully kekulized molecule: C1=CC=CC=C1
Successfully kekulized molecule: CSC1=C2SC(C3=C(C(=O)[O-])N4C(=O)C(C(C)O)C4C3C)=CN2C=[N+]1C1CCNC1
Successfully kekulized molecule: CC(=O)NC1=CC=C(NC2=CC=C3C4=C2C(=O)C2=CC=CC=C2C4=CC(=O)N3C)C=C1


In [17]:
from multiprocessing import Pool
import math, random, sys
import pickle
import argparse
from functools import partial
import torch
import numpy

from hgraph import MolGraph, common_atom_vocab, PairVocab
import rdkit

def to_numpy(tensors):
    convert = lambda x : x.numpy() if type(x) is torch.Tensor else x
    a,b,c = tensors
    b = [convert(x) for x in b[0]], [convert(x) for x in b[1]]
    return a, b, c

def tensorize(mol_batch, vocab):
    x = MolGraph.tensorize(mol_batch, vocab, common_atom_vocab)
    return to_numpy(x)

def tensorize_pair(mol_batch, vocab):
    x, y = zip(*mol_batch)
    x = MolGraph.tensorize(x, vocab, common_atom_vocab)
    y = MolGraph.tensorize(y, vocab, common_atom_vocab)
    return to_numpy(x)[:-1] + to_numpy(y) #no need of order for x

def tensorize_cond(mol_batch, vocab):
    x, y, cond = zip(*mol_batch)
    cond = [map(int, c.split(',')) for c in cond]
    cond = numpy.array(cond)
    x = MolGraph.tensorize(x, vocab, common_atom_vocab)
    y = MolGraph.tensorize(y, vocab, common_atom_vocab)
    return to_numpy(x)[:-1] + to_numpy(y) + (cond,) #no need of order for x

In [9]:
import pandas as pd

_all_data = pd.read_csv('data/chembl/all.txt')
all_data = _all_data.iloc[:1000]
all_data.to_csv('all_data.txt', index=False)

In [18]:
if __name__ == "__main__":
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    train = 'all_data.txt'
    vocab = 'vocab.txt'

    data = [line.strip() for line in open(train)]
    batch_size = len(data) // 16 + 1
    batches = [data[i: i + batch_size] for i in range(0, len(data), batch_size)]

    pool = Pool(16)
    all_data = pool.map(partial(tensorize, vocab=vocab), batches)
    pool.close()
    pool.join()

    all_data = [d for d in all_data if d is not None]

    random.shuffle(data)

    batches = [data[i: i + 32] for i in range(0, len(data), 32)]
    func = partial(tensorize, vocab=vocab)
    all_data = pool.map(func, batches)
    all_data = [d for d in all_data if d is not None]
    num_splits = len(all_data) // 1000

    le = (len(all_data) + num_splits - 1) // num_splits

    for split_id in range(num_splits):
        st = split_id * le
        sub_data = all_data[st: st + le]

        with open(f'tensors-{split_id}.pkl', 'wb') as f:
            pickle.dump(sub_data, f, pickle.HIGHEST_PROTOCOL)

KekulizeException: Can't kekulize mol.  Unkekulized atoms: 21


In [7]:
from rdkit import Chem
from rdkit.Chem import AllChem

def preprocess_molecule(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Error: Could not parse SMILES: {smiles}")
        return None
    try:
        Chem.SanitizeMol(mol)
        Chem.Kekulize(mol, clearAromaticFlags=True)
    except Chem.rdchem.KekulizeException as e:
        print(f"Error kekulizing molecule: {smiles}, {e}")
        return None
    except Exception as e:
        print(f"Error processing molecule: {smiles}, {e}")
        return None
    AllChem.Compute2DCoords(mol)
    return mol

# 테스트할 SMILES 문자열 목록
smiles_list = [
    "CCO",  # 에탄올
    "C1=CC=CC=C1",  # 벤젠
    "CSc1c2sc(C3=C(C(=O)[O-])N4C(=O)C(C(C)O)C4C3C)cn2c[n+]1C1CCNC1"  # 복잡한 분자
]

for smiles in smiles_list:
    print(preprocess_molecule(smiles))

<rdkit.Chem.rdchem.Mol object at 0x7fa06f5d9580>
<rdkit.Chem.rdchem.Mol object at 0x7fa06f5d9580>
<rdkit.Chem.rdchem.Mol object at 0x7fa06f5d9580>


In [None]:
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 환경 변수 설정
# # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # 사용할 GPU 번호로 설정

# # CUDA 메모리 초기화
# torch.cuda.empty_cache()

# print("PyTorch CUDA Version:", torch.version.cuda)
# print("CUDA Available:", torch.cuda.is_available())
# print("CUDA Device Count:", torch.cuda.device_count())
# # print("Current CUDA Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))


# import torch.nn as nn
# import torch.optim as optim
# import torch.optim.lr_scheduler as lr_scheduler
# from torch.utils.data import DataLoader, Dataset # Dataset 추가
# from torch.cuda.amp import GradScaler, autocast # 추가

# import random
# import pickle # 추가

# import rdkit
# import math, random, sys
# import numpy as np
# import argparse
# import os
# from tqdm.auto import tqdm
# import wandb
# import networkx as nx

# from hgraph import *

# # 환경 변수 설정
# # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # 사용할 GPU 번호로 설정

# # CUDA 메모리 초기화
# torch.cuda.empty_cache()

# lg = rdkit.RDLogger.logger() 
# lg.setLevel(rdkit.RDLogger.CRITICAL)

# # class TensorDataset(Dataset):
# #     def __init__(self, tensors):
# #         self.tensors = tensors

# #     def __len__(self):
# #         return len(self.tensors)

# #     def __getitem__(self, idx):
# #         return self.tensors[idx]

# # def load_tensors(file_path):
# #     with open(file_path, 'rb') as file:
# #         tensors = pickle.load(file)
# #     return tensors

# # 사용자 정의 collate_fn 작성
# # train_generator.py에서 DataLoader를 사용할 때 발생하는 TypeError는 DataLoader의 기본 collate_fn이 
# # networkx의 DiGraph 객체를 처리할 수 없어서 발생하는 문제입니다. 이를 해결하기 위해 사용자 정의 
# # collate_fn을 작성하여 DataLoader에 전달할 필요가 있습니다.
# # from torch.utils.data._utils.collate import default_collate

# # def custom_collate(batch):
# #     batch = list(filter(lambda x: x is not None, batch))
# #     if isinstance(batch[0], nx.DiGraph):
# #         return batch
# #     else:
# #         return default_collate(batch)



# parser = argparse.ArgumentParser()
# parser.add_argument('--train', required=True)
# parser.add_argument('--vocab', required=True)
# parser.add_argument('--atom_vocab', default=common_atom_vocab)
# parser.add_argument('--save_dir', required=True)
# parser.add_argument('--load_model', default=None)
# parser.add_argument('--seed', type=int, default=42)

# parser.add_argument('--rnn_type', type=str, default='LSTM')
# parser.add_argument('--hidden_size', type=int, default=1024) # 250
# parser.add_argument('--embed_size', type=int, default=1024) # 250
# parser.add_argument('--batch_size', type=int, default=10000)
# # parser.add_argument('--batch_size', type=int, default=50)
# parser.add_argument('--latent_size', type=int, default=32)
# parser.add_argument('--depthT', type=int, default=15)
# parser.add_argument('--depthG', type=int, default=15)
# parser.add_argument('--diterT', type=int, default=1)
# parser.add_argument('--diterG', type=int, default=3)
# parser.add_argument('--dropout', type=float, default=0.2) # 0

# parser.add_argument('--lr', type=float, default=1e-3)
# parser.add_argument('--clip_norm', type=float, default=5.0)
# parser.add_argument('--step_beta', type=float, default=0.001)
# parser.add_argument('--max_beta', type=float, default=1.0)
# parser.add_argument('--warmup', type=int, default=10000)
# parser.add_argument('--kl_anneal_iter', type=int, default=2000)

# parser.add_argument('--epoch', type=int, default=20)
# parser.add_argument('--anneal_rate', type=float, default=0.9)
# parser.add_argument('--anneal_iter', type=int, default=25000)
# parser.add_argument('--print_iter', type=int, default=10000) # 배치사이즈와 동일하게 
# parser.add_argument('--save_iter', type=int, default=100000)

# args = parser.parse_args()
# print(args)

# # wandb
# wandb.login()
# wandb.init(project="jtnn_view", entity="seungbeom_jin")

# torch.manual_seed(args.seed)
# random.seed(args.seed)

# # vocab.txt 로드 및 처리
# vocab = [x.strip("\r\n ").split() for x in open(args.vocab)] 
# args.vocab = PairVocab(vocab)

# # 학습 및 검증 데이터 로드
# # train_tensors = load_tensors('train_processed/data/train/train_tensors.pkl')
# # valid_tensors = load_tensors('train_processed/data/valid/valid_tensors.pkl')

# # # 데이터 로더 설정
# # train_dataset = TensorDataset(train_tensors)
# # train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=4, 
# #                                pin_memory=True, collate_fn=custom_collate)
# # print('train_dataset load!')

# # valid_dataset = TensorDataset(valid_tensors)
# # valid_data_loader = DataLoader(valid_dataset, batch_size=args.batch_size, num_workers=4, 
# #                                pin_memory=True, collate_fn=custom_collate)
# # print('valid_dataset load!')

# # 모델 정의
# # model = HierVAE(args).cuda()
# model = HierVAE(args).to(device)
# print("Model #Params: %dK" % (sum([x.nelement() for x in model.parameters()]) / 1000,))

# for param in model.parameters():
#     if param.dim() == 1:
#         nn.init.constant_(param, 0)
#     else:
#         nn.init.xavier_normal_(param)

# optimizer = optim.Adam(model.parameters(), lr=args.lr)
# scaler = GradScaler()
# scheduler = lr_scheduler.ExponentialLR(optimizer, args.anneal_rate)

# if args.load_model:
#     print('continuing from checkpoint ' + args.load_model)
#     model_state, optimizer_state, total_step, beta = torch.load(args.load_model)
#     model.load_state_dict(model_state)
#     optimizer.load_state_dict(optimizer_state)
# else:
#     total_step = beta = 0

# # 파라미터 및 그래디언트 노름 계산 함수
# param_norm = lambda m: math.sqrt(sum([p.norm().item() ** 2 for p in m.parameters()]))
# grad_norm = lambda m: math.sqrt(sum([p.grad.norm().item() ** 2 for p in m.parameters() if p.grad is not None]))


# meters = torch.zeros(6).to(device)
# total_step = 0
# beta = 0.0

# # WandB 설정
# wandb.config.update(args)
# wandb.watch(model, log="all")

# # 평가 함수
# # def evaluate(model, data_loader):
# #     model.eval()
# #     meters = np.zeros(6)
# #     with torch.no_grad():
# #         for batch in tqdm(data_loader):
# #             loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=1.0)
# #             meters += np.array([kl_div.item(), loss.item(), wacc.item() * 100, iacc.item() * 100, tacc.item() * 100, sacc.item() * 100])
# #     meters /= len(data_loader)
# #     return meters

# # index_scatter 함수 수정
# def index_scatter(all_data, index, sub_data):
#     buf = torch.zeros_like(all_data, dtype=sub_data.dtype)
#     buf = buf.scatter_(0, index.repeat(sub_data.size(1), 1).t(), sub_data)
#     return buf

# print('training start!')
# for epoch in range(args.epoch):
#     # dataset = DataFolder(args.train, args.batch_size)
#     dataset = DataFolder(args.train, args.batch_size)
#     model.train()
#     meters = torch.zeros(6).to(device)
#     beta = 0.0

#     # for batch in tqdm(dataset):
#     for batch in tqdm(dataset):
#         total_step += 1
#         model.zero_grad()
        
        
#         # PyTorch에서 Automatic Mixed Precision (AMP) 기능을 사용하는 방법입니다. 
#         # AMP는 모델 학습 시 16비트와 32비트 부동 소수점 연산을 혼합하여 사용함으로써 GPU 메모리 사용량을 줄이고 
#         # 연산 속도를 높일 수 있습니다.
#         with autocast():
#             loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=beta)

#         # 원래 코드
#         # loss.backward()
#         # nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
#         # optimizer.step()
#         scaler.scale(loss).backward()
#         scaler.unscale_(optimizer)
#         nn.utils.clip_grad_norm_(model.parameters(), 5.0)
#         scaler.step(optimizer)
#         scaler.update()
        

#         # Print kl_div to debug
#         # print(kl_div)
        
#         # Ensure all relevant tensors are moved to CPU and converted to Python scalars if needed
#         # kl_div_cpu = kl_div.cpu().item() if torch.is_tensor(kl_div) else kl_div
#         # loss_cpu = loss.cpu().item() if torch.is_tensor(loss) else loss
#         # wacc_cpu = wacc.cpu().item() if torch.is_tensor(wacc) else wacc * 100
#         # iacc_cpu = iacc.cpu().item() if torch.is_tensor(iacc) else iacc * 100
#         # tacc_cpu = tacc.cpu().item() if torch.is_tensor(tacc) else tacc * 100
#         # sacc_cpu = sacc.cpu().item() if torch.is_tensor(sacc) else sacc * 100

#         meters += torch.tensor([kl_div, loss, wacc * 100, iacc * 100, tacc * 100, sacc * 100]).to(device)

#         if total_step % args.print_iter == 0:
#             meters /= args.print_iter
#             print("[%d] Beta: %.3f, KL: %.2f, loss: %.3f, Word: %.2f, %.2f, Topo: %.2f, Assm: %.2f, PNorm: %.2f, GNorm: %.2f" % 
#                   (total_step, beta, meters[0], meters[1], meters[2], meters[3], meters[4], meters[5], param_norm(model), grad_norm(model)))
#             sys.stdout.flush()
            
#             # WandB 로그 기록
#             wandb.log({
#                 "batch": total_step,
#                 "epoch" : epoch,
#                 "beta": beta,
#                 "KL Divergence": meters[0],
#                 "Loss": meters[1],
#                 "Word Accuracy": meters[2],
#                 "Instance Accuracy": meters[3],
#                 "Topology Accuracy": meters[4],
#                 "Assembly Accuracy": meters[5],
#                 "Parameter Norm": param_norm(model),
#                 "Gradient Norm": grad_norm(model)
#             })
            
#             meters *= 0
        
#         if total_step % args.save_iter == 0:
#             ckpt = (model.state_dict(), optimizer.state_dict(), total_step, beta)
#             torch.save(ckpt, os.path.join(args.save_dir, f"model.ckpt.{total_step}"))

#         if total_step % args.anneal_iter == 0:
#             scheduler.step()
#             print("learning rate: %.6f" % scheduler.get_lr()[0])

#         if total_step >= args.warmup and total_step % args.kl_anneal_iter == 0:
#             beta = min(args.max_beta, beta + args.step_beta)
    
#     # 각 에포크가 끝날 때 검증 손실 계산
#     # valid_metrics = evaluate(model, valid_data_loader)
#     # print(f"Epoch {epoch}, Train Loss: {meters[1].item()}, Validation Loss: {valid_metrics[1].item()}, KL Divergence: {valid_metrics[0].item()}, Word Accuracy: {valid_metrics[2].item()}, Instance Accuracy: {valid_metrics[3].item()}, Topology Accuracy: {valid_metrics[4].item()}, Assembly Accuracy: {valid_metrics[5].item()}")
    
#     # # WandB 로그 기록
#     # wandb.log({
#     #     "epoch": epoch,
#     #     "train_loss": meters[1].item(),
#     #     "valid_loss": valid_metrics[1].item(),
#     #     "valid_kl_divergence": valid_metrics[0].item(),
#     #     "valid_word_accuracy": valid_metrics[2].item(),
#     #     "valid_instance_accuracy": valid_metrics[3].item(),
#     #     "valid_topology_accuracy": valid_metrics[4].item(),
#     #     "valid_assembly_accuracy": valid_metrics[5].item(),
#     #     "learning_rate": scheduler.get_last_lr()[0]
#     # })


# # for epoch in range(args.epoch):
# #     dataset = DataFolder(args.train, args.batch_size)

# #     for batch in tqdm(dataset):
# #         total_step += 1
# #         model.zero_grad()
        
# #         loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=beta)

# #         loss.backward()
# #         nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
# #         optimizer.step()

# #         # print(kl_div)
# #         kl_div_cpu = kl_div.cpu().item() if torch.is_tensor(kl_div) else kl_div
# #         meters = meters + np.array([kl_div, loss.item(), wacc * 100, iacc * 100, tacc * 100, sacc * 100])

# #         if total_step % args.print_iter == 0:
# #             meters /= args.print_iter
# #             print("[%d] Beta: %.3f, KL: %.2f, loss: %.3f, Word: %.2f, %.2f, Topo: %.2f, Assm: %.2f, PNorm: %.2f, GNorm: %.2f" % (total_step, beta, meters[0], meters[1], meters[2], meters[3], meters[4], meters[5], param_norm(model), grad_norm(model)))
# #             sys.stdout.flush()
# #             meters *= 0
        
# #         if total_step % args.save_iter == 0:
# #             ckpt = (model.state_dict(), optimizer.state_dict(), total_step, beta)
# #             torch.save(ckpt, os.path.join(args.save_dir, f"model.ckpt.{total_step}"))

# #         if total_step % args.anneal_iter == 0:
# #             scheduler.step()
# #             print("learning rate: %.6f" % scheduler.get_lr()[0])

# #         if total_step >= args.warmup and total_step % args.kl_anneal_iter == 0:
# #             beta = min(args.max_beta, beta + args.step_beta)


In [2]:
import pickle
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, Dataset

class TensorDataset(Dataset):
    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, idx):
        return self.tensors[idx]

def load_tensors(file_path):
    with open(file_path, 'rb') as file:
        tensors = pickle.load(file)
    return tensors



train_tensors = load_tensors('train_processed/data/train/train_tensors.pkl')
valid_tensors = load_tensors('train_processed/data/valid/valid_tensors.pkl')

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f230bec90a0>>
Traceback (most recent call last):
  File "/home/lnptest/anaconda3/envs/jtnn_test/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
