In [1]:
import os
import rdkit
import torch
import random
import numpy as np
import pandas as pd

from rdkit import Chem
from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
from rdkit.Chem import Draw, AllChem, Descriptors, rdDepictor, rdDistGeom, MACCSkeys, rdMolDescriptors

from torch.utils.data import Dataset
from torch_geometric import utils as pyg_utils
from torch_geometric.data import InMemoryDataset, download_url, extract_gz, Data, DataLoader, Batch

# # 작업을 위한 별도의 함수 불러오기
# from utils.download_preprocess import CustomMoleculeNet, atom_features, EDGE_FEATURES

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

print(rdkit.__version__)

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead
2025.03.3


In [2]:
CHARSMISET = {"(": 1, ".": 2, "0": 3, "2": 4, "4": 5, "6": 6, "8": 7, "@": 8,
                "B": 9, "D": 10, "F": 11, "H": 12, "L": 13, "N": 14, "P": 15, "R": 16,
                "T": 17, "V": 18, "Z": 19, "\\": 20, "b": 21, "d": 22, "f": 23, "h": 24,
                "l": 25, "n": 26, "r": 27, "t": 28, "#": 29, "%": 30, ")": 31, "+": 32,
                "-": 33, "/": 34, "1": 35, "3": 36, "5": 37, "7": 38, "9": 39, "=": 40,
                "A": 41, "C": 42, "E": 43, "G": 44, "I": 45, "K": 46, "M": 47, "O": 48,
                "S": 49, "U": 50, "W": 51, "Y": 52, "[": 53, "]": 54, "a": 55, "c": 56,
                "e": 57, "g": 58, "i": 59, "m": 60, "o": 61, "s": 62, "u": 63, "y": 64, '~': 65} # add ~: 65 

CHARISOSMILEN = 65

CHARPROTSET = {"A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
               "F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
               "O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
               "U": 19, "T": 20, "W": 21, "V": 22, "Y": 23, "X": 24, "Z": 25}

CHARPROTLEN = 25


########################################################################################################################
########## Function
########################################################################################################################


def integer_label_encoding(sequence, tp, max_length=100):
    """
    Integer encoding for string sequence.
    Args:
        sequence (str): Drug or Protein string sequence.
        max_length: Maximum encoding length of input string.
    """
    if tp == 'drug':
        charset = CHARSMISET
    elif tp == 'protein':
        charset = CHARPROTSET

    encoding = np.zeros(max_length)
    for idx, letter in enumerate(sequence[:max_length]):
        try:
            if tp == 'protein':
                letter = letter.upper()
            letter = str(letter)
            encoding[idx] = charset[letter]
        except KeyError:
            print(
                f"character {letter} does not exists in sequence category encoding, skip and treat as padding."
            )
    return Data(x=torch.from_numpy(encoding).to(torch.long).unsqueeze(dim=0))


In [3]:
import torch
import numpy as np
from rdkit import Chem
from rdkit import RDLogger
from pathlib import Path
from torch_geometric.data import Data
from torch_geometric.utils import add_self_loops

import logging
logger = logging.getLogger(__name__)
RDLogger.DisableLog('rdApp.*')  

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

def one_of_k_encoding(x, allowable_set):
    if x not in allowable_set:
        raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
    return list(map(lambda s: x == s, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    '''Maps inputs not in the allowable set to the last element.'''
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))

def atom_features_graphdta(atom):
    result = np.array(one_of_k_encoding_unk(atom.GetSymbol(),['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na','Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb','Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H','Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr','Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
                    one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
                    one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
                    one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
                    [atom.GetIsAromatic()])
    return torch.from_numpy(result).float()

def atom_features(atom,
                explicit_H=True,
                use_chirality=False):

    results = one_of_k_encoding_unk(
        atom.GetSymbol(),
        ['C','N','O', 'S','F','Si','P', 'Cl','Br','Mg','Na','Ca','Fe','As','Al','I','B','V','K','Tl',
            'Yb','Sb','Sn','Ag','Pd','Co','Se','Ti','Zn','H', 'Li','Ge','Cu','Au','Ni','Cd','In',
            'Mn','Zr','Cr','Pt','Hg','Pb','Unknown'
        ]) + [atom.GetDegree()/10, atom.GetImplicitValence(),
                atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
                one_of_k_encoding_unk(atom.GetHybridization(), [
                Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
                Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.
                                    SP3D, Chem.rdchem.HybridizationType.SP3D2
                ]) + [atom.GetIsAromatic()]
    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    if explicit_H:
        results = results + [atom.GetTotalNumHs()]

    if use_chirality:
        try:
            results = results + one_of_k_encoding_unk(
            atom.GetProp('_CIPCode'),
            ['R', 'S']) + [atom.HasProp('_ChiralityPossible')]
        except:
            results = results + [False, False
                            ] + [atom.HasProp('_ChiralityPossible')]

    results = np.array(results).astype(np.float32)

    return torch.from_numpy(results)


def get_mol_edge_list_and_feat_mtx(mol_graph, pad=False, graphdta=True):
    if graphdta:
        n_features = [(atom.GetIdx(), atom_features_graphdta(atom)) for atom in mol_graph.GetAtoms()]
    else:
        n_features = [(atom.GetIdx(), atom_features(atom)) for atom in mol_graph.GetAtoms()]
    n_features.sort() # to make sure that the feature matrix is aligned according to the idx of the atom
    _, n_features = zip(*n_features)
    n_features = torch.stack(n_features)

    edge_list = torch.LongTensor([(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol_graph.GetBonds()])
    undirected_edge_list = torch.cat([edge_list, edge_list[:, [1, 0]]], dim=0) if len(edge_list) else edge_list
    edge_index = undirected_edge_list.T
    if pad:
        max_drug_nodes = 290
        actual_node_shape = n_features.shape
        num_virtual_nodes = max_drug_nodes - actual_node_shape[0]
        virtual_node_feat = torch.zeros(num_virtual_nodes, actual_node_shape[1])
        n_features = torch.cat((n_features, virtual_node_feat), dim=0)  # 290, feats

        # add self-loops
        edge_index_with_self_loop, _ = add_self_loops(edge_index, num_nodes=max_drug_nodes)
        edge_index = edge_index_with_self_loop

    return edge_index, n_features


def drug_to_graph(smi, pad=False, graphdta=True):
    mol = Chem.MolFromSmiles(smi)
    if mol:
        edge_index, n_features = get_mol_edge_list_and_feat_mtx(mol, pad, graphdta)
        return Data(x=n_features, edge_index=edge_index)


In [4]:
# ... existing code ...

# Updated transform_mol function with progress logging
from torch_geometric.data import Data
from tqdm import tqdm

def transform_mol(molecule_smiles, labels, choice):
    mols = [Chem.MolFromSmiles(mol) for mol in molecule_smiles if mol]
    print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    
    # string tokenization
    if choice == 'string_tokenization': # vocab dictionary, encoded smiles를 출력
        print("Building vocabulary from SMILES tokens...")
        vocab = []
        max_len = 0
        tokenizer = BasicSmilesTokenizer()
        for smi in tqdm(molecule_smiles, desc="Tokenizing SMILES"):
            tokens = tokenizer.tokenize(smi)
            max_len = max(max_len, len(tokens))
            vocab += tokens
            
        uniq_vocab = sorted(set(vocab))
        smiles_vocab = {v: i for i, v in enumerate(uniq_vocab)}
        smiles_vocab['Unk'] = len(smiles_vocab)
        print(f"Vocabulary size: {len(smiles_vocab)}")
        
        print("Encoding SMILES sequences...")
        encoded_smiles = [[smiles_vocab.get(token, smiles_vocab['Unk']) for token in tokenizer.tokenize(smi)] for smi in tqdm(molecule_smiles, desc="Encoding SMILES")]
        smiles_vec = []
        for vec, l, smi in tqdm(zip(encoded_smiles, labels, molecule_smiles), desc="Creating Data objects", total=len(molecule_smiles)):
            pad_len = max_len - len(vec)
            vec = vec + ([0] * pad_len)
            smiles_vec.append(Data(x=torch.tensor(vec).view(1, -1), y=torch.tensor([l], dtype=torch.float).view(1, -1), smiles=smi))
        print(f"Completed string tokenization for {len(smiles_vec)} molecules")
        return smiles_vocab, smiles_vec

    # integer encoding (CNN)
    elif choice == 'integer_encoding':
        print("Converting SMILES to integer encoding...")
        integer_encoding_data = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Integer encoding", total=len(molecule_smiles)):
             drug = integer_label_encoding(smi, 'drug')
             drug.y = torch.tensor([l], dtype=torch.float).view(1, -1)
             integer_encoding_data.append(drug)
        print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
        return integer_encoding_data

    # 2D Graph
    elif choice == '2D_graph':
        print("Converting SMILES to 2D molecular graphs...")
        graph_data = [drug_to_graph(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")]

        graph_2d = []
        for g, l, smi in tqdm(zip(graph_data, labels, molecule_smiles), desc="Adding labels to graphs", total=len(molecule_smiles)):
            g.y = torch.tensor([l], dtype=torch.float).view(1, -1)
            g.smiles = smi
            graph_2d.append(g)
        print(f"Completed 2D graph conversion for {len(graph_2d)} molecules")
        return graph_2d

    # 3D Graph
    elif choice == '3D_graph':
        print("Converting SMILES to 3D molecular graphs...")
        graph_3d = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Creating 3D graphs", total=len(molecule_smiles)):
            graph_data = drug_to_graph(smi)
            
            mol = Chem.MolFromSmiles(smi)
            atom_info = [(atom.GetIdx(), atom.GetSymbol()) for atom in mol.GetAtoms()]
                     
            mol = AllChem.AddHs(mol, addCoords=True)
            rdDistGeom.EmbedMolecule(mol)
    
            conf = mol.GetConformer()
            pos = np.array([conf.GetAtomPosition(idx) for idx, symbol in atom_info])
            graph_data.pos = pos
            graph_data.y = torch.tensor([l], dtype=torch.float).view(1, -1)
            graph_data.smiles = smi
            graph_3d.append(graph_data)
        print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
        return graph_3d
    
    # Fingerprint
    elif 'fingerprint' in choice:
        print(f"Generating {choice} fingerprints...")
        if choice == 'rdkit_fingerprint':
            fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        
        elif choice == 'maccs_fingerprint':
            fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        
        elif choice == 'morgan_fingerprint':
            fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

        print("Converting fingerprints to Data objects...")
        fps = [Data(x=torch.tensor(f).view(1, -1), y=torch.tensor([l], dtype=torch.float).view(1, -1), smiles=smi) for f, l, smi in tqdm(zip(fp, labels, molecule_smiles), desc="Creating fingerprint Data objects", total=len(molecule_smiles))]
        print(f"Completed {choice} generation for {len(fps)} molecules")
        return fps

    # Descriptors
    elif choice == 'descriptors':
        print("Calculating molecular descriptors...")
        # 모델 학습을 위해서는 스케일링 작업이 별도로 필요하다는 것을 기억하자!
        desc = []
        for mol, l, smi in tqdm(zip(mols, labels, molecule_smiles), desc="Calculating descriptors", total=len(molecule_smiles)):
            x = torch.tensor(list(Descriptors.CalcMolDescriptors(mol).values()), dtype=torch.float).view(1, -1)
            y = torch.tensor([l], dtype=torch.float).view(1, -1)
            desc.append(Data(x=x, y=y, smiles=smi))
        print(f"Completed descriptor calculation for {len(desc)} molecules")
        return desc


In [5]:
import pandas as pd

dta_trn = pd.read_csv('data/davis/train.csv')
dta_val = pd.read_csv('data/davis/valid.csv')
dta_tst = pd.read_csv('data/davis/test.csv')

dta_trn['Set'] = 'TRN'
dta_val['Set'] = 'VAL'
dta_tst['Set'] = 'TST'

dta = pd.concat([dta_trn, dta_val, dta_tst]).reset_index(drop=True)

In [6]:
dta['CNN'] = transform_mol(dta['Drug'], dta['Y'], 'integer_encoding')
dta['2D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '2D_graph')
dta['FP-Morgan'] = transform_mol(dta['Drug'], dta['Y'], 'morgan_fingerprint') # 1024
dta['FP-MACCS'] = transform_mol(dta['Drug'], dta['Y'], 'maccs_fingerprint') # 167

Processing 25772 molecules with integer_encoding transformation...
Converting SMILES to integer encoding...


Integer encoding: 100%|██████████| 25772/25772 [00:00<00:00, 36195.86it/s]


Completed integer encoding for 25772 molecules
Processing 25772 molecules with 2D_graph transformation...
Converting SMILES to 2D molecular graphs...


Creating 2D graphs: 100%|██████████| 25772/25772 [00:19<00:00, 1309.92it/s]
Adding labels to graphs: 100%|██████████| 25772/25772 [00:00<00:00, 146693.63it/s]


Completed 2D graph conversion for 25772 molecules
Processing 25772 molecules with morgan_fingerprint transformation...
Generating morgan_fingerprint fingerprints...


Morgan fingerprints: 100%|██████████| 25772/25772 [00:00<00:00, 30424.94it/s]


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 100%|██████████| 25772/25772 [00:10<00:00, 2346.57it/s]


Completed morgan_fingerprint generation for 25772 molecules
Processing 25772 molecules with maccs_fingerprint transformation...
Generating maccs_fingerprint fingerprints...


MACCS fingerprints: 100%|██████████| 25772/25772 [00:29<00:00, 884.74it/s] 


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 100%|██████████| 25772/25772 [00:02<00:00, 9095.42it/s]


Completed maccs_fingerprint generation for 25772 molecules


In [9]:
dta['3D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '3D_graph')

Processing 25772 molecules with 3D_graph transformation...
Converting SMILES to 3D molecular graphs...


Creating 3D graphs: 100%|██████████| 25772/25772 [11:52<00:00, 36.18it/s]


Completed 3D graph conversion for 25772 molecules


In [8]:
dta['Target_Rep'] = dta['Target'].apply(lambda x: integer_label_encoding(x, 'protein', 1000))

In [10]:
import pickle
from pathlib import Path

fd = Path('data/davis/feature/')
fd.mkdir(parents=True, exist_ok=True)
for ft in ['CNN', '2D-GNN', '3D-GNN', 'FP-Morgan', 'FP-MACCS']:
    nfd = fd / ft
    nfd.mkdir(parents=True, exist_ok=True)
    
    trn_sub = dta[dta['Set'] == 'TRN'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    val_sub = dta[dta['Set'] == 'VAL'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    tst_sub = dta[dta['Set'] == 'TST'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    print(f'{ft} feature_dim', dta[ft].values[0].x.shape)
    
    with open(nfd / 'trn.pkl', 'wb') as f:
        pickle.dump(trn_sub, f)
    with open(nfd / 'val.pkl', 'wb') as f:
        pickle.dump(val_sub, f)
    with open(nfd / 'tst.pkl', 'wb') as f:
        pickle.dump(tst_sub, f)
    
    print('Saved', nfd)

CNN feature_dim torch.Size([1, 100])
Saved data\davis\feature\CNN
2D-GNN feature_dim torch.Size([27, 78])
Saved data\davis\feature\2D-GNN
3D-GNN feature_dim torch.Size([27, 78])
Saved data\davis\feature\3D-GNN
FP-Morgan feature_dim torch.Size([1, 1024])
Saved data\davis\feature\FP-Morgan
FP-MACCS feature_dim torch.Size([1, 167])
Saved data\davis\feature\FP-MACCS


In [15]:
torch.tensor(dta['3D-GNN'].values[0].pos).shape

torch.Size([27, 3])

In [22]:
zinc = pd.read_csv('data/zinc/zinc15_250K.csv')
zinc.head()

Unnamed: 0,smiles
0,CCN(CCSC)C(=O)N[C@@](C)(CC)C(F)(F)F
1,CC1(C)CN(C(=O)Nc2cc3ccccc3nn2)C[C@@]2(CCOC2)O1
2,CC[C@H](NC(C)=O)C(=O)NCC1(NC(=O)Cc2nonc2C)CC1
3,O=C(N[C@@H]1CC[C@H](F)C1)[C@H]1C[C@@H]1c1ccc2c...
4,COCC(=O)N(C)CC(=O)NCC1(Nc2nccn3nnnc23)CC1


In [32]:
def transform_mol_nolabel(molecule_smiles, choice):
    mols = [Chem.MolFromSmiles(mol) for mol in molecule_smiles if mol]
    print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    
     # integer encoding (CNN)
    if choice == 'integer_encoding':
        print("Converting SMILES to integer encoding...")
        integer_encoding_data = {}
        for smi in tqdm(molecule_smiles):
             drug = integer_label_encoding(smi, 'drug')
             integer_encoding_data[smi] = drug
        print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
        return integer_encoding_data

    # 2D Graph
    elif choice == '2D_graph':
        print("Converting SMILES to 2D molecular graphs...")
        graph_data = {smi: drug_to_graph(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")}
        print(f"Completed 2D graph conversion for {len(graph_data)} molecules")
        return graph_data

    # 3D Graph
    elif choice == '3D_graph':
        print("Converting SMILES to 3D molecular graphs...")
        graph_3d = {}
        for smi in tqdm(molecule_smiles):
            graph_data = drug_to_graph(smi)
            
            mol = Chem.MolFromSmiles(smi)
            atom_info = [(atom.GetIdx(), atom.GetSymbol()) for atom in mol.GetAtoms()]
                     
            mol = AllChem.AddHs(mol, addCoords=True)
            emb_mol = rdDistGeom.EmbedMolecule(mol)
            if emb_mol == -1:
                emb_mol =rdDistGeom.EmbedMolecule(mol, maxAttempts=5000)
            
            if emb_mol == -1:
                print(f"Failed to embed molecule {smi} (skip)")
                continue

            conf = mol.GetConformer()
            pos = np.array([conf.GetAtomPosition(idx) for idx, symbol in atom_info])
            graph_data.pos = pos
            graph_3d[smi] = graph_data
        print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
        return graph_3d
    
    # Fingerprint
    elif 'fingerprint' in choice:
        print(f"Generating {choice} fingerprints...")
        if choice == 'rdkit_fingerprint':
            fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        
        elif choice == 'maccs_fingerprint':
            fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        
        elif choice == 'morgan_fingerprint':
            fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

        print("Converting fingerprints to Data objects...")
        fps = {smi: Data(x=torch.tensor(f).view(1, -1)) for f, smi in tqdm(zip(fp, molecule_smiles), desc="Creating fingerprint Data objects")}
        print(f"Completed {choice} generation for {len(fps)} molecules")
        return fps

In [24]:
zinc_cnn = transform_mol_nolabel(zinc['smiles'], 'integer_encoding')
with open('data/zinc/feature/zinc_cnn.pkl', 'wb') as f:
    pickle.dump(zinc_cnn, f)

Processing 250000 molecules with integer_encoding transformation...
Converting SMILES to integer encoding...


100%|██████████| 250000/250000 [00:04<00:00, 56899.15it/s]


Completed integer encoding for 250000 molecules


In [25]:
zinc_2d_gnn = transform_mol_nolabel(zinc['smiles'], '2D_graph')
with open('data/zinc/feature/zinc_2d_gnn.pkl', 'wb') as f:
    pickle.dump(zinc_2d_gnn, f)

Processing 250000 molecules with 2D_graph transformation...
Converting SMILES to 2D molecular graphs...


Creating 2D graphs: 100%|██████████| 250000/250000 [02:23<00:00, 1747.18it/s]


Completed 2D graph conversion for 250000 molecules


In [28]:
zinc_fp_morgan = transform_mol_nolabel(zinc['smiles'], 'morgan_fingerprint')
with open('data/zinc/feature/zinc_fp_morgan.pkl', 'wb') as f:
    pickle.dump(zinc_fp_morgan, f)

Processing 250000 molecules with morgan_fingerprint transformation...
Generating morgan_fingerprint fingerprints...


Morgan fingerprints: 100%|██████████| 250000/250000 [00:06<00:00, 38082.33it/s]


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 250000it [01:49, 2287.91it/s]


Completed morgan_fingerprint generation for 250000 molecules


In [29]:
zinc_fp_maccs = transform_mol_nolabel(zinc['smiles'], 'maccs_fingerprint')
with open('data/zinc/feature/zinc_fp_maccs.pkl', 'wb') as f:
    pickle.dump(zinc_fp_maccs, f)

Processing 250000 molecules with maccs_fingerprint transformation...
Generating maccs_fingerprint fingerprints...


MACCS fingerprints: 100%|██████████| 250000/250000 [02:23<00:00, 1737.13it/s]


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 250000it [00:26, 9515.46it/s] 


Completed maccs_fingerprint generation for 250000 molecules


In [33]:
zinc_3d_gnn = transform_mol_nolabel(zinc['smiles'], '3D_graph')
with open('data/zinc/feature/zinc_3d_gnn.pkl', 'wb') as f:
    pickle.dump(zinc_3d_gnn, f)

Processing 250000 molecules with 3D_graph transformation...
Converting SMILES to 3D molecular graphs...


  0%|          | 4/250000 [00:00<1:53:19, 36.77it/s]

CCN(CCSC)C(=O)N[C@@](C)(CC)C(F)(F)F <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
CC1(C)CN(C(=O)Nc2cc3ccccc3nn2)C[C@@]2(CCOC2)O1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
CC[C@H](NC(C)=O)C(=O)NCC1(NC(=O)Cc2nonc2C)CC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
O=C(N[C@@H]1CC[C@H](F)C1)[C@H]1C[C@@H]1c1ccc2c(c1)OCCO2 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
COCC(=O)N(C)CC(=O)NCC1(Nc2nccn3nnnc23)CC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
C[C@H](C[C@H](C)N)NC(=O)C(=O)NCCCN1C(=O)CNC1=O <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
CC(C)(C(N)=O)C(=O)N[C@@H]1CN([C@H]2CCNC2=O)C[C@H]1O <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
Cc1ccc(C(=O)N2CC[C@@H]2CNC(=O)[C@H]2[C@@H]3COC[C@@H]32)cc1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>


  0%|          | 13/250000 [00:00<1:49:32, 38.03it/s]

COc1ccc(C(=O)NCC2(NCC=C(C)C)CCC2)cc1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
CCNC(=O)CC(=O)NC[C@@H](CO)N[C@@H](C)c1nncn1C <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
CCn1nncc1CN1C[C@@H](O)[C@H](NC(=O)C2=COCCC2)C1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
CN(C)C(=O)CN1C[C@@H]2COC[C@H](C1)N2C(=O)[C@@H]1CCCOC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
O=C(O)CC1(C(=O)NCCc2ccccc2O)CCOCC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
CCCC(=O)N1CC(N2C[C@H](NC(=O)[C@@H]3CC34CC4)CC2=O)C1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948510>
CC[C@@H]1CN(C(=O)N[C@@H]2CCCc3c2cnn3C)[C@@H](C)CO1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
O=C(Cc1nc2ncccn2n1)NC1CCN(Cc2ccon2)CC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19485F0>


  0%|          | 21/250000 [00:00<1:52:46, 36.95it/s]

O=C(O)c1cc(S(=O)(=O)NC2CC3(CCC3)C2)on1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
CCc1nnc(CN2C[C@H]3OCCN(C(=O)CNC(C)=O)[C@H]3C2)o1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19485F0>
CCCOc1ccc(C(=O)NCCc2n[nH]c(C)n2)cc1OCCC <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948660>
O=C(NCCN1CCNC(=O)C1)NCc1ccc2c(c1)CCO2 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19485F0>
C[C@H](NC(=O)N1CC[C@@H](c2ccccc2)C1)c1ccn(C)n1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
C[C@@H]1C[C@H](NC(=O)c2cn(C)nn2)CCN1CC(=O)N(C)C <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948510>
Cc1cnc(COCC(=O)N2CC(NC(=O)CCn3cncn3)C2)o1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
Cc1coc(C(=O)N2CCCC[C@H]2[C@H]2CCCN2CC(=O)N(C)C)c1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948510>


  0%|          | 30/250000 [00:00<1:55:31, 36.06it/s]

Cn1nc(C(=O)NCCCNCCO)cc1C(F)(F)F <rdkit.Chem.rdchem.Mol object at 0x000001BBB19485F0>
CN1C[C@@H]2C[C@H]1CN2Cc1cccc2c1CCN2C(=O)OC(C)(C)C <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
COCCCN(C)CCNC(=O)C(=O)N1CCNC(=O)C1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
C[C@@H](O)CCCN1C[C@@H]2CCN(C(=O)COCc3ccncc3)[C@@H]2C1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
CCN1CC[C@H](NC(=O)C(=O)NC[C@H]2NC(C)(C)O[C@@H]2C)C1=O <rdkit.Chem.rdchem.Mol object at 0x000001BBB19487B0>
CCN1C[C@@H]2CCN(C(=O)c3cnn4cc(C)cnc34)[C@@H]2C1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
N[C@@H]1NCN(C(=O)c2ccc(-n3cncn3)nc2)[C@H]2NCN[C@@H]21 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19487B0>
CNS(=O)(=O)CCNC(=O)N[C@@](C)(CO)C1CC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948580>
CCCC(=O)N[C@H]1CN(C(=O)c2cnc3n2CCC3)C[C@@H]1C <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>
N[C@H](Cc1cn(CCc2cccnc2)nn1)C(=O)O <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948510>


  0%|          | 34/250000 [00:10<22:09:16,  3.13it/s]

CN[C@H]1C[C@H]2CC[C@H]1N2C(=O)c1ccccc1CN1CCCCC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB19486D0>





ValueError: Bad Conformer Id

In [34]:
smi = 'CN[C@H]1C[C@H]2CC[C@H]1N2C(=O)c1ccccc1CN1CCCCC1'
graph_data = drug_to_graph(smi)

mol = Chem.MolFromSmiles(smi)
atom_info = [(atom.GetIdx(), atom.GetSymbol()) for atom in mol.GetAtoms()]
            
mol = AllChem.AddHs(mol, addCoords=True)
rdDistGeom.EmbedMolecule(mol)

print(smi, mol)
conf = mol.GetConformer()
pos = np.array([conf.GetAtomPosition(idx) for idx, symbol in atom_info])
graph_data.pos = pos


CN[C@H]1C[C@H]2CC[C@H]1N2C(=O)c1ccccc1CN1CCCCC1 <rdkit.Chem.rdchem.Mol object at 0x000001BBB1948890>


ValueError: Bad Conformer Id

In [37]:
rdDistGeom.EmbedMolecule(mol)
mol.GetConformer()

ValueError: Bad Conformer Id

In [38]:
rdDistGeom.EmbedMolecule(mol)

-1