In [1]:
import os
import rdkit
import torch
import random
import numpy as np
import pandas as pd

from rdkit import Chem
from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
from rdkit.Chem import Draw, AllChem, Descriptors, rdDepictor, rdDistGeom, MACCSkeys, rdMolDescriptors
from rdkit.Chem import rdDepictor

from torch.utils.data import Dataset
from torch_geometric import utils as pyg_utils
from torch_geometric.data import InMemoryDataset, download_url, extract_gz, Data, DataLoader, Batch

# # 작업을 위한 별도의 함수 불러오기
# from utils.download_preprocess import CustomMoleculeNet, atom_features, EDGE_FEATURES

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

print(rdkit.__version__)

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead
2024.03.1


In [2]:
CHARSMISET = {"(": 1, ".": 2, "0": 3, "2": 4, "4": 5, "6": 6, "8": 7, "@": 8,
                "B": 9, "D": 10, "F": 11, "H": 12, "L": 13, "N": 14, "P": 15, "R": 16,
                "T": 17, "V": 18, "Z": 19, "\\": 20, "b": 21, "d": 22, "f": 23, "h": 24,
                "l": 25, "n": 26, "r": 27, "t": 28, "#": 29, "%": 30, ")": 31, "+": 32,
                "-": 33, "/": 34, "1": 35, "3": 36, "5": 37, "7": 38, "9": 39, "=": 40,
                "A": 41, "C": 42, "E": 43, "G": 44, "I": 45, "K": 46, "M": 47, "O": 48,
                "S": 49, "U": 50, "W": 51, "Y": 52, "[": 53, "]": 54, "a": 55, "c": 56,
                "e": 57, "g": 58, "i": 59, "m": 60, "o": 61, "s": 62, "u": 63, "y": 64, '~': 65} # add ~: 65 

CHARISOSMILEN = 65

CHARPROTSET = {"A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
               "F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
               "O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
               "U": 19, "T": 20, "W": 21, "V": 22, "Y": 23, "X": 24, "Z": 25}

CHARPROTLEN = 25


########################################################################################################################
########## Function
########################################################################################################################


def integer_label_encoding(sequence, tp, max_length=100):
    """
    Integer encoding for string sequence.
    Args:
        sequence (str): Drug or Protein string sequence.
        max_length: Maximum encoding length of input string.
    """
    if tp == 'drug':
        charset = CHARSMISET
    elif tp == 'protein':
        charset = CHARPROTSET

    encoding = np.zeros(max_length)
    for idx, letter in enumerate(sequence[:max_length]):
        try:
            if tp == 'protein':
                letter = letter.upper()
            letter = str(letter)
            encoding[idx] = charset[letter]
        except KeyError:
            print(
                f"character {letter} does not exists in sequence category encoding, skip and treat as padding."
            )
    return Data(x=torch.from_numpy(encoding).to(torch.long).unsqueeze(dim=0))


In [3]:
def smiles_to_coord(smiles):
    try:
        # SMILES → Mol
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[Warning] MolFromSmiles failed for: {smiles}")
            return None
        
        mol = Chem.AddHs(mol)

        # 3D 좌표 생성
        status = AllChem.EmbedMolecule(mol, randomSeed=42)
        if status != 0:
            print(f"[Warning] EmbedMolecule failed for: {smiles}")
            return None
        
        # 에너지 최소화
        AllChem.UFFOptimizeMolecule(mol)

        # conformer 가져오기
        if mol.GetNumConformers() == 0:
            print(f"[Warning] No conformer generated for: {smiles}")
            return None
        conf = mol.GetConformer()

        # 원자 번호와 좌표 추출
        z = []
        pos = []
        for atom in mol.GetAtoms():
            z.append(atom.GetAtomicNum())
            p = conf.GetAtomPosition(atom.GetIdx())
            pos.append([p.x, p.y, p.z])
        
        z = torch.tensor(z, dtype=torch.long)
        pos = torch.tensor(pos, dtype=torch.float)
        return Data(z=z, pos=pos)

    except Exception as e:
        print(f"[Exception] Failed for {smiles}: {e}")
        return None


In [22]:
from transformers import AutoTokenizer, AutoModel
import torch

def smiles_to_chemberta_data(smiles, tokenizer, model):

    return data


In [5]:
import torch
import numpy as np
from rdkit import Chem
from rdkit import RDLogger
from pathlib import Path
from torch_geometric.data import Data
from torch_geometric.utils import add_self_loops

import logging
logger = logging.getLogger(__name__)
RDLogger.DisableLog('rdApp.*')  

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [23]:
# ... existing code ...

# Updated transform_mol function with progress logging
from torch_geometric.data import Data
from tqdm import tqdm
from utils.molecule_feature import *

def transform_mol(molecule_smiles, labels, choice):
    global 
    mols = [Chem.MolFromSmiles(mol) for mol in molecule_smiles if mol]
    print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    
    # string tokenization
    if choice == 'string_tokenization': # vocab dictionary, encoded smiles를 출력
        print("Building vocabulary from SMILES tokens...")
        vocab = []
        max_len = 0
        tokenizer = BasicSmilesTokenizer()
        for smi in tqdm(molecule_smiles, desc="Tokenizing SMILES"):
            tokens = tokenizer.tokenize(smi)
            max_len = max(max_len, len(tokens))
            vocab += tokens
            
        uniq_vocab = sorted(set(vocab))
        smiles_vocab = {v: i for i, v in enumerate(uniq_vocab)}
        smiles_vocab['Unk'] = len(smiles_vocab)
        print(f"Vocabulary size: {len(smiles_vocab)}")
        
        print("Encoding SMILES sequences...")
        encoded_smiles = [[smiles_vocab.get(token, smiles_vocab['Unk']) for token in tokenizer.tokenize(smi)] for smi in tqdm(molecule_smiles, desc="Encoding SMILES")]
        smiles_vec = []
        for vec, l, smi in tqdm(zip(encoded_smiles, labels, molecule_smiles), desc="Creating Data objects", total=len(molecule_smiles)):
            pad_len = max_len - len(vec)
            vec = vec + ([0] * pad_len)
            smiles_vec.append(Data(x=torch.tensor(vec).view(1, -1), y=torch.tensor([l], dtype=torch.float).view(1, -1), smiles=smi))
        print(f"Completed string tokenization for {len(smiles_vec)} molecules")
        return smiles_vocab, smiles_vec

    # integer encoding (CNN)
    elif choice == 'integer_encoding':
        print("Converting SMILES to integer encoding...")
        integer_encoding_data = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Integer encoding", total=len(molecule_smiles)):
             drug = integer_label_encoding(smi, 'drug')
             drug.y = torch.tensor([l], dtype=torch.float).view(1, -1)
             integer_encoding_data.append(drug)
        print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
        return integer_encoding_data

    # 2D Graph
    elif choice == '2D_graph':
        print("Converting SMILES to 2D molecular graphs...")
        graph_data = [smiles_to_feature(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")]

        graph_2d = []
        for g, l, smi in tqdm(zip(graph_data, labels, molecule_smiles), desc="Adding labels to graphs", total=len(molecule_smiles)):
            g.y = torch.tensor([l], dtype=torch.float).view(1, -1)
            g.smiles = smi
            graph_2d.append(g)
        print(f"Completed 2D graph conversion for {len(graph_2d)} molecules")
        return graph_2d

    # 3D Graph
    elif choice == '3D_graph':
        print("Converting SMILES to 3D molecular graphs...")
        graph_3d = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Creating 3D graphs", total=len(molecule_smiles)):
            graph_data = smiles_to_coord(smi)
            if graph_data is None:
                print(f"Failed to create 3D graph for {smi}")
                continue
            graph_data.y = torch.tensor([l], dtype=torch.float).view(1, -1)
            graph_3d.append(graph_data)
        print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
        return graph_3d

    # ChemBERTa
    elif choice == 'chemberta':
        tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
        model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
        print("Converting SMILES to ChemBERTa embeddings...")
        chemberta_data = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Creating ChemBERTa embeddings", total=len(molecule_smiles)):
            with torch.no_grad():
                inputs = tokenizer(smi, return_tensors='pt', padding=True, truncation=True)
                outputs = model(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)  # (hidden_size,)
            data = Data(x=embedding, smiles=smi)
            data.y = torch.tensor([l], dtype=torch.float).view(1, -1)
            chemberta_data.append(data)
        print(f"Completed ChemBERTa embedding conversion for {len(chemberta_data)} molecules")
        return chemberta_data
    
    # Fingerprint
    elif 'fingerprint' in choice:
        print(f"Generating {choice} fingerprints...")
        if choice == 'rdkit_fingerprint':
            fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        
        elif choice == 'maccs_fingerprint':
            fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        
        elif choice == 'morgan_fingerprint':
            fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

        print("Converting fingerprints to Data objects...")
        fps = [Data(x=torch.tensor(f).view(1, -1), y=torch.tensor([l], dtype=torch.float).view(1, -1), smiles=smi) for f, l, smi in tqdm(zip(fp, labels, molecule_smiles), desc="Creating fingerprint Data objects", total=len(molecule_smiles))]
        print(f"Completed {choice} generation for {len(fps)} molecules")
        return fps

    # Descriptors
    elif choice == 'descriptors':
        print("Calculating molecular descriptors...")
        # 모델 학습을 위해서는 스케일링 작업이 별도로 필요하다는 것을 기억하자!
        desc = []
        for mol, l, smi in tqdm(zip(mols, labels, molecule_smiles), desc="Calculating descriptors", total=len(molecule_smiles)):
            x = torch.tensor(list(Descriptors.CalcMolDescriptors(mol).values()), dtype=torch.float).view(1, -1)
            y = torch.tensor([l], dtype=torch.float).view(1, -1)
            desc.append(Data(x=x, y=y, smiles=smi))
        print(f"Completed descriptor calculation for {len(desc)} molecules")
        return desc


SyntaxError: invalid syntax (826572973.py, line 9)

In [9]:
import pandas as pd

dta_trn = pd.read_csv('dta_dataset/davis/train.csv')
dta_val = pd.read_csv('dta_dataset/davis/valid.csv')
dta_tst = pd.read_csv('dta_dataset/davis/test.csv')

dta_trn['Set'] = 'TRN'
dta_val['Set'] = 'VAL'
dta_tst['Set'] = 'TST'

dta = pd.concat([dta_trn, dta_val, dta_tst]).reset_index(drop=True)

In [10]:
dta['CNN'] = transform_mol(dta['Drug'], dta['Y'], 'integer_encoding')
dta['2D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '2D_graph')
dta['FP-Morgan'] = transform_mol(dta['Drug'], dta['Y'], 'morgan_fingerprint') # 1024
dta['FP-MACCS'] = transform_mol(dta['Drug'], dta['Y'], 'maccs_fingerprint') # 167

Processing 25772 molecules with integer_encoding transformation...
Converting SMILES to integer encoding...


Integer encoding: 100%|██████████| 25772/25772 [00:00<00:00, 29204.36it/s]


Completed integer encoding for 25772 molecules
Processing 25772 molecules with 2D_graph transformation...
Converting SMILES to 2D molecular graphs...


Creating 2D graphs: 100%|██████████| 25772/25772 [00:21<00:00, 1198.49it/s]
Adding labels to graphs: 100%|██████████| 25772/25772 [00:00<00:00, 146108.90it/s]


Completed 2D graph conversion for 25772 molecules
Processing 25772 molecules with morgan_fingerprint transformation...
Generating morgan_fingerprint fingerprints...


Morgan fingerprints: 100%|██████████| 25772/25772 [00:01<00:00, 21031.98it/s]


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 100%|██████████| 25772/25772 [00:11<00:00, 2163.78it/s]


Completed morgan_fingerprint generation for 25772 molecules
Processing 25772 molecules with maccs_fingerprint transformation...
Generating maccs_fingerprint fingerprints...


MACCS fingerprints: 100%|██████████| 25772/25772 [00:30<00:00, 838.31it/s] 


Converting fingerprints to Data objects...


Creating fingerprint Data objects: 100%|██████████| 25772/25772 [00:02<00:00, 8679.03it/s]


Completed maccs_fingerprint generation for 25772 molecules


In [11]:
dta['3D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '3D_graph')

Processing 25772 molecules with 3D_graph transformation...
Converting SMILES to 3D molecular graphs...


Creating 3D graphs: 100%|██████████| 25772/25772 [23:27<00:00, 18.31it/s]


Completed 3D graph conversion for 25772 molecules


In [24]:
dta['ChemBERTa'] = transform_mol(dta['Drug'], dta['Y'], 'chemberta')

Processing 25772 molecules with chemberta transformation...
Converting SMILES to ChemBERTa embeddings...


Creating ChemBERTa embeddings:   0%|          | 0/25772 [00:00<?, ?it/s]


UnboundLocalError: cannot access local variable 'tokenizer' where it is not associated with a value

In [15]:
dta['Target_Rep'] = dta['Target'].apply(lambda x: integer_label_encoding(x, 'protein', 1000))

In [None]:
import pickle
from pathlib import Path

fd = Path('dta_dataset/davis/feature/')
fd.mkdir(parents=True, exist_ok=True)
# for ft in ['CNN', '2D-GNN', 'FP-Morgan', 'FP-MACCS']:
# for ft in ['3D-GNN', 'ChemBERTa']:
for ft in ['ChemBERTa']:
    nfd = fd / ft
    nfd.mkdir(parents=True, exist_ok=True)
    
    trn_sub = dta[dta['Set'] == 'TRN'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    val_sub = dta[dta['Set'] == 'VAL'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    tst_sub = dta[dta['Set'] == 'TST'][[ft, 'Target_Rep']].reset_index(drop=True).rename(columns={ft: 'Drug_Rep'}).to_dict('records')
    # print(f'{ft} feature_dim', dta[ft].values[0].x.shape)
    
    with open(nfd / 'trn.pkl', 'wb') as f:
        pickle.dump(trn_sub, f)
    with open(nfd / 'val.pkl', 'wb') as f:
        pickle.dump(val_sub, f)
    with open(nfd / 'tst.pkl', 'wb') as f:
        pickle.dump(tst_sub, f)
    
    print('Saved', nfd)

Saved dta_dataset\davis\feature\3D-GNN


In [19]:
dta[ft].values[0]

Data(pos=[49, 3], z=[49], y=[1, 1])

In [None]:
torch.tensor(dta['3D-GNN'].values[0].pos).shape

In [None]:
zinc = pd.read_csv('data/zinc/zinc15_250K.csv')
zinc.head()

In [16]:
def transform_mol_nolabel(molecule_smiles, choice):
    mols = [Chem.MolFromSmiles(mol) for mol in molecule_smiles if mol]
    print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    
     # integer encoding (CNN)
    if choice == 'integer_encoding':
        print("Converting SMILES to integer encoding...")
        integer_encoding_data = {}
        for smi in tqdm(molecule_smiles):
             drug = integer_label_encoding(smi, 'drug')
             integer_encoding_data[smi] = drug
        print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
        return integer_encoding_data

    # 2D Graph
    elif choice == '2D_graph':
        print("Converting SMILES to 2D molecular graphs...")
        graph_data = {smi: drug_to_graph(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")}
        print(f"Completed 2D graph conversion for {len(graph_data)} molecules")
        return graph_data

    # 3D Graph
    elif choice == '3D_graph':
        print("Converting SMILES to 3D molecular graphs...")
        graph_3d = {}
        for smi in tqdm(molecule_smiles):
            graph_data = drug_to_graph(smi)
            
            mol = Chem.MolFromSmiles(smi)
            atom_info = [(atom.GetIdx(), atom.GetSymbol()) for atom in mol.GetAtoms()]
                     
            mol = AllChem.AddHs(mol, addCoords=True)
            emb_mol = rdDistGeom.EmbedMolecule(mol)
            if emb_mol == -1:
                rdDepictor.Compute2DCoords(mol)

            conf = mol.GetConformer()
            pos = np.array([conf.GetAtomPosition(idx) for idx, symbol in atom_info])
            graph_data.pos = pos
            graph_3d[smi] = graph_data
        print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
        return graph_3d
    
    # Fingerprint
    elif 'fingerprint' in choice:
        print(f"Generating {choice} fingerprints...")
        if choice == 'rdkit_fingerprint':
            fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        
        elif choice == 'maccs_fingerprint':
            fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        
        elif choice == 'morgan_fingerprint':
            fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

        print("Converting fingerprints to Data objects...")
        fps = {smi: Data(x=torch.tensor(f).view(1, -1)) for f, smi in tqdm(zip(fp, molecule_smiles), desc="Creating fingerprint Data objects")}
        print(f"Completed {choice} generation for {len(fps)} molecules")
        return fps

In [None]:
zinc_cnn = transform_mol_nolabel(zinc['smiles'], 'integer_encoding')
with open('data/zinc/feature/zinc_cnn.pkl', 'wb') as f:
    pickle.dump(zinc_cnn, f)

In [None]:
zinc_2d_gnn = transform_mol_nolabel(zinc['smiles'], '2D_graph')
with open('data/zinc/feature/zinc_2d_gnn.pkl', 'wb') as f:
    pickle.dump(zinc_2d_gnn, f)

In [None]:
zinc_fp_morgan = transform_mol_nolabel(zinc['smiles'], 'morgan_fingerprint')
with open('data/zinc/feature/zinc_fp_morgan.pkl', 'wb') as f:
    pickle.dump(zinc_fp_morgan, f)

In [None]:
zinc_fp_maccs = transform_mol_nolabel(zinc['smiles'], 'maccs_fingerprint')
with open('data/zinc/feature/zinc_fp_maccs.pkl', 'wb') as f:
    pickle.dump(zinc_fp_maccs, f)

In [None]:
zinc_3d_gnn = transform_mol_nolabel(zinc['smiles'], '3D_graph')
with open('data/zinc/feature/zinc_3d_gnn.pkl', 'wb') as f:
    pickle.dump(zinc_3d_gnn, f)

In [18]:
import pickle
with open('data/zinc/feature/zinc_3d_gnn.pkl', 'wb') as f:
    pickle.dump(zinc_3d_gnn, f)

In [None]:
name = '20250628'
for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
    for ft in ['3D-GNN']:
        cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new/{dt}_{ft}.txt"
        print(cmd)
        

In [None]:
name = '20250627'
for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
    for ft in ['FP-Morgan', 'FP-MACCS', 'CNN']:
        cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new/{dt}_{ft}.txt"
        print(cmd)
        

In [None]:
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state[:,0,:]  # (batch_size, hidden_size)

print(embeddings.shape)  # 예: torch.Size([3, 384])

In [None]:
mol = smiles_to_coord('CCO')
mol