In [None]:
import os
import rdkit
import torch
import random
import numpy as np
import pandas as pd

from rdkit import Chem, AllChem
from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
from rdkit.Chem import Draw, AllChem, Descriptors, rdDepictor, rdDistGeom, MACCSkeys, rdMolDescriptors
from rdkit.Chem import rdDepictor

from torch.utils.data import Dataset
from torch_geometric import utils as pyg_utils
from torch_geometric.data import InMemoryDataset, download_url, extract_gz, Data, DataLoader, Batch

# # 작업을 위한 별도의 함수 불러오기
# from utils.download_preprocess import CustomMoleculeNet, atom_features, EDGE_FEATURES

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

print(rdkit.__version__)

In [None]:

# Character -> integer lookup for SMILES strings. Indices start at 1; 0 is never
# assigned here, and integer_label_encoding leaves unmapped positions at 0 (padding).
CHARSMISET = {"(": 1, ".": 2, "0": 3, "2": 4, "4": 5, "6": 6, "8": 7, "@": 8,
                "B": 9, "D": 10, "F": 11, "H": 12, "L": 13, "N": 14, "P": 15, "R": 16,
                "T": 17, "V": 18, "Z": 19, "\\": 20, "b": 21, "d": 22, "f": 23, "h": 24,
                "l": 25, "n": 26, "r": 27, "t": 28, "#": 29, "%": 30, ")": 31, "+": 32,
                "-": 33, "/": 34, "1": 35, "3": 36, "5": 37, "7": 38, "9": 39, "=": 40,
                "A": 41, "C": 42, "E": 43, "G": 44, "I": 45, "K": 46, "M": 47, "O": 48,
                "S": 49, "U": 50, "W": 51, "Y": 52, "[": 53, "]": 54, "a": 55, "c": 56,
                "e": 57, "g": 58, "i": 59, "m": 60, "o": 61, "s": 62, "u": 63, "y": 64,
                'p': 65, '~': 66, '>': 67, '<': 68} # added entries: p, ~, >, <

# Largest index used in CHARSMISET (equals its number of entries).
CHARISOSMILEN = 68

# Character -> integer lookup for protein sequences (upper-case one-letter codes), 1-based.
CHARPROTSET = {"A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
               "F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
               "O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
               "U": 19, "T": 20, "W": 21, "V": 22, "Y": 23, "X": 24, "Z": 25}

# Largest index used in CHARPROTSET (equals its number of entries).
CHARPROTLEN = 25


########################################################################################################################
########## Function
########################################################################################################################


def integer_label_encoding(sequence, tp, max_length=100):
    """
    Integer-encode a drug or protein string into a fixed-length index vector.

    Args:
        sequence (str): Drug (SMILES) or protein string sequence.
        tp (str): Sequence category. 'drug' uses CHARSMISET, 'protein' uses
            CHARPROTSET (protein characters are upper-cased before lookup).
        max_length (int): Length of the encoding. Longer sequences are truncated;
            shorter ones are right-padded with 0.

    Returns:
        Data: object whose ``x`` is a ``torch.long`` tensor of shape (1, max_length).

    Raises:
        ValueError: If ``tp`` is neither 'drug' nor 'protein'.
    """
    # Validate the category first so an unsupported value fails with a clear
    # message instead of an UnboundLocalError on `charset` further down.
    if tp == 'drug':
        charset = CHARSMISET
    elif tp == 'protein':
        charset = CHARPROTSET
    else:
        raise ValueError(f"tp must be 'drug' or 'protein', got {tp!r}")

    is_protein = tp == 'protein'
    encoding = np.zeros(max_length, dtype=np.int64)
    for idx, letter in enumerate(sequence[:max_length]):
        if is_protein:
            letter = letter.upper()
        letter = str(letter)
        code = charset.get(letter)
        if code is None:
            # Unknown characters keep the padding value 0 at their position.
            print(
                f"character {letter} does not exists in sequence category encoding, skip and treat as padding."
            )
            continue
        encoding[idx] = code
    return Data(x=torch.from_numpy(encoding).to(torch.long).unsqueeze(dim=0))


In [None]:
import torch
import numpy as np
from rdkit import Chem
from rdkit import RDLogger
from pathlib import Path
from torch_geometric.data import Data
from torch_geometric.utils import add_self_loops

import logging
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)
RDLogger.DisableLog('rdApp.*')  # turn off every RDKit log channel matching 'rdApp.*'

import warnings
# Hide FutureWarning / UserWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [None]:

def smiles_to_coord(smiles):
    """
    Turn a SMILES string into atomic numbers plus per-atom coordinates.

    Hydrogens are added, then a conformer is embedded with a fixed random seed
    (42) and UFF-optimized. When embedding fails (status -1), 2D coordinates are
    computed instead and a warning is printed.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Data with ``z`` (atomic numbers, torch.long) and ``pos`` (one [x, y, z]
        row per atom, torch.float), or None when the SMILES cannot be parsed,
        no conformer is available, or any exception is raised along the way.
    """
    try:
        # SMILES -> Mol
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            print(f"[Warning] MolFromSmiles failed for: {smiles}")
            return None

        mol = Chem.AddHs(parsed)

        # Generate 3D coordinates; fall back to 2D when embedding fails.
        if AllChem.EmbedMolecule(mol, randomSeed=42) == -1:
            AllChem.Compute2DCoords(mol)
            print(f"[Warning] EmbedMolecule failed for: {smiles}")
        else:
            AllChem.UFFOptimizeMolecule(mol)

        # Fetch the conformer holding the coordinates.
        if mol.GetNumConformers() == 0:
            print(f"[Warning] No conformer generated for: {smiles}")
            return None
        conformer = mol.GetConformer()

        # Collect atomic numbers and positions in atom order.
        atoms = list(mol.GetAtoms())
        atomic_numbers = [atom.GetAtomicNum() for atom in atoms]
        points = [conformer.GetAtomPosition(atom.GetIdx()) for atom in atoms]
        coordinates = [[point.x, point.y, point.z] for point in points]

        return Data(
            z=torch.tensor(atomic_numbers, dtype=torch.long),
            pos=torch.tensor(coordinates, dtype=torch.float),
        )

    except Exception as e:
        print(f"[Exception] Failed for {smiles}: {e}")
        return None

In [None]:
# ... existing code ...

# Updated transform_mol function with progress logging
from torch_geometric.data import Data
from tqdm import tqdm
from utils.molecule_feature import *
from transformers import AutoTokenizer, AutoModel

def _parse_smiles_aligned(molecule_smiles):
    """Parse every SMILES into an RDKit Mol, keeping one Mol per input position."""
    mols = []
    for position, smi in enumerate(molecule_smiles):
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) and smi else None
        if mol is None:
            # Dropping the entry would silently shift every later molecule onto
            # the wrong label, so fail loudly instead.
            raise ValueError(f"Invalid SMILES at position {position}: {smi!r}")
        mols.append(mol)
    return mols


def transform_mol(molecule_smiles, labels, choice):
    """
    Convert SMILES strings and their labels into PyG ``Data`` objects.

    Args:
        molecule_smiles: Sized iterable of SMILES strings.
        labels: Iterable of labels aligned with ``molecule_smiles``.
        choice (str): Representation to build. One of 'string_tokenization',
            'integer_encoding', '2D_graph', '3D_graph', 'chemberta',
            'rdkit_fingerprint', 'maccs_fingerprint', 'morgan_fingerprint',
            'descriptors'.

    Returns:
        For 'string_tokenization': ``(smiles_vocab, list_of_Data)``.
        Otherwise: a list of ``Data``; each has ``y`` set to the label as a float
        tensor viewed as (1, -1). For '3D_graph' molecules for which
        ``smiles_to_coord`` returns None are skipped, so the list can be shorter
        than the input.

    Raises:
        ValueError: If ``choice`` is not supported, or (fingerprint / descriptor
            choices) if a SMILES cannot be parsed.
    """
    fingerprint_choices = ('rdkit_fingerprint', 'maccs_fingerprint', 'morgan_fingerprint')
    supported_choices = (
        'string_tokenization', 'integer_encoding', '2D_graph', '3D_graph',
        'chemberta', 'descriptors',
    ) + fingerprint_choices
    if choice not in supported_choices:
        # Previously an unknown choice returned None (or hit an unbound `fp`).
        raise ValueError(f"Unsupported choice {choice!r}; expected one of {supported_choices}")

    print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    n_molecules = len(molecule_smiles)

    def _label_tensor(value):
        return torch.tensor([value], dtype=torch.float).view(1, -1)

    # string tokenization: returns the vocab dictionary and the encoded SMILES
    if choice == 'string_tokenization':
        print("Building vocabulary from SMILES tokens...")
        tokenizer = BasicSmilesTokenizer()
        # Tokenize once and reuse the tokens for both vocabulary and encoding.
        tokenized = [tokenizer.tokenize(smi) for smi in tqdm(molecule_smiles, desc="Tokenizing SMILES")]
        max_len = max((len(tokens) for tokens in tokenized), default=0)

        uniq_vocab = sorted({token for tokens in tokenized for token in tokens})
        smiles_vocab = {v: i for i, v in enumerate(uniq_vocab)}
        smiles_vocab['Unk'] = len(smiles_vocab)
        print(f"Vocabulary size: {len(smiles_vocab)}")

        print("Encoding SMILES sequences...")
        unk_id = smiles_vocab['Unk']
        encoded_smiles = [
            [smiles_vocab.get(token, unk_id) for token in tokens]
            for tokens in tqdm(tokenized, desc="Encoding SMILES")
        ]
        smiles_vec = []
        for vec, l, smi in tqdm(zip(encoded_smiles, labels, molecule_smiles), desc="Creating Data objects", total=n_molecules):
            # NOTE(review): the pad value 0 is also the id of the first vocabulary
            # token; kept as-is to preserve existing vocab indices -- confirm
            # whether a dedicated padding id is wanted.
            vec = vec + ([0] * (max_len - len(vec)))
            smiles_vec.append(Data(x=torch.tensor(vec).view(1, -1), y=_label_tensor(l), smiles=smi))
        print(f"Completed string tokenization for {len(smiles_vec)} molecules")
        return smiles_vocab, smiles_vec

    # integer encoding (CNN)
    if choice == 'integer_encoding':
        print("Converting SMILES to integer encoding...")
        integer_encoding_data = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Integer encoding", total=n_molecules):
            drug = integer_label_encoding(smi, 'drug')
            drug.y = _label_tensor(l)
            integer_encoding_data.append(drug)
        print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
        return integer_encoding_data

    # 2D Graph
    if choice == '2D_graph':
        print("Converting SMILES to 2D molecular graphs...")
        graph_data = [smiles_to_feature(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")]

        graph_2d = []
        for g, l, smi in tqdm(zip(graph_data, labels, molecule_smiles), desc="Adding labels to graphs", total=n_molecules):
            g.y = _label_tensor(l)
            g.smiles = smi
            graph_2d.append(g)
        print(f"Completed 2D graph conversion for {len(graph_2d)} molecules")
        return graph_2d

    # 3D Graph
    if choice == '3D_graph':
        print("Converting SMILES to 3D molecular graphs...")
        graph_3d = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Creating 3D graphs", total=n_molecules):
            graph_data = smiles_to_coord(smi)
            if graph_data is None:
                # Failed molecules are dropped, so the output may be shorter than the input.
                print(f"Failed to create 3D graph for {smi}")
                continue
            graph_data.y = _label_tensor(l)
            graph_3d.append(graph_data)
        print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
        return graph_3d

    # ChemBERTa
    if choice == 'chemberta':
        tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
        model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
        print("Converting SMILES to ChemBERTa embeddings...")
        chemberta_data = []
        for smi, l in tqdm(zip(molecule_smiles, labels), desc="Creating ChemBERTa embeddings", total=n_molecules):
            with torch.no_grad():
                inputs = tokenizer(smi, return_tensors='pt', padding=True, truncation=True)
                outputs = model(**inputs)
                # Hidden state of the first token, squeezed to (hidden_size,).
                embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
            data = Data(x=embedding.unsqueeze(0), smiles=smi)
            data.y = _label_tensor(l)
            chemberta_data.append(data)
        print(f"Completed ChemBERTa embedding conversion for {len(chemberta_data)} molecules")
        return chemberta_data

    # The remaining choices need RDKit Mol objects; parse them only now so the
    # other branches do not pay for parsing they never use.
    mols = _parse_smiles_aligned(molecule_smiles)

    # Fingerprint
    if choice in fingerprint_choices:
        print(f"Generating {choice} fingerprints...")
        if choice == 'rdkit_fingerprint':
            fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        elif choice == 'maccs_fingerprint':
            fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        else:  # 'morgan_fingerprint'
            fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

        print("Converting fingerprints to Data objects...")
        fps = [
            Data(x=torch.tensor(f).view(1, -1), y=_label_tensor(l), smiles=smi)
            for f, l, smi in tqdm(zip(fp, labels, molecule_smiles), desc="Creating fingerprint Data objects", total=n_molecules)
        ]
        print(f"Completed {choice} generation for {len(fps)} molecules")
        return fps

    # Descriptors
    print("Calculating molecular descriptors...")
    # Reminder: these raw descriptor values still need scaling before model training.
    desc = []
    for mol, l, smi in tqdm(zip(mols, labels, molecule_smiles), desc="Calculating descriptors", total=n_molecules):
        x = torch.tensor(list(Descriptors.CalcMolDescriptors(mol).values()), dtype=torch.float).view(1, -1)
        desc.append(Data(x=x, y=_label_tensor(l), smiles=smi))
    print(f"Completed descriptor calculation for {len(desc)} molecules")
    return desc


In [None]:
import pandas as pd

# Load the three KIBA splits and tag each row with the split it came from.
dta_trn = pd.read_csv('dta_dataset/kiba/train.csv').assign(Set='TRN')
dta_val = pd.read_csv('dta_dataset/kiba/valid.csv').assign(Set='VAL')
dta_tst = pd.read_csv('dta_dataset/kiba/test.csv').assign(Set='TST')

# Stack the splits into one frame with a fresh 0..n-1 index.
dta = pd.concat([dta_trn, dta_val, dta_tst], ignore_index=True)

In [None]:
# Integer-encoded drug SMILES (one Data object per row), labelled with column 'Y'.
dta['CNN'] = transform_mol(dta['Drug'], dta['Y'], 'integer_encoding')

In [None]:
# 2D graph and fingerprint representations of every drug, labelled with column 'Y'.
dta['2D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '2D_graph')
dta['FP-Morgan'] = transform_mol(dta['Drug'], dta['Y'], 'morgan_fingerprint') # 1024 bits (nBits used by transform_mol)
dta['FP-MACCS'] = transform_mol(dta['Drug'], dta['Y'], 'maccs_fingerprint') # 167 (fingerprint length noted by the author)

In [None]:
# ChemBERTa embedding of every drug; transform_mol loads the pretrained model on each call.
dta['ChemBERTa'] = transform_mol(dta['Drug'], dta['Y'], 'chemberta')

In [None]:
# Integer-encode each protein sequence, truncated / zero-padded to length 1000.
dta['Target_Rep'] = dta['Target'].apply(lambda x: integer_label_encoding(x, 'protein', 1000))

In [None]:
# 3D coordinate representation of every drug.
# NOTE(review): the '3D_graph' branch of transform_mol skips molecules whose
# conversion fails, so the returned list can be shorter than `dta`; in that case
# this column assignment would fail on a length mismatch -- confirm.
dta['3D-GNN'] = transform_mol(dta['Drug'], dta['Y'], '3D_graph')

In [None]:
import pickle
from pathlib import Path

# Root folder that receives one sub-folder of pickles per drug representation.
fd = Path('dta_dataset/kiba/feature/')
fd.mkdir(parents=True, exist_ok=True)
for ft in ['CNN']:
# for ft in ['3D-GNN', 'ChemBERTa']:
# for ft in ['2D-GNN', 'FP-Morgan', 'FP-MACCS', 'CNN']:
    nfd = fd / ft
    nfd.mkdir(parents=True, exist_ok=True)

    def _split_records(split_tag):
        """Rows of one split as dicts with keys 'Drug_Rep' and 'Target_Rep'."""
        subset = dta.loc[dta['Set'] == split_tag, [ft, 'Target_Rep']]
        subset = subset.reset_index(drop=True).rename(columns={ft: 'Drug_Rep'})
        return subset.to_dict('records')

    trn_sub = _split_records('TRN')
    val_sub = _split_records('VAL')
    tst_sub = _split_records('TST')
    # print(f'{ft} feature_dim', dta[ft].values[0].x.shape)

    # One pickle per split, written in train / valid / test order.
    for file_name, records in (('trn.pkl', trn_sub), ('val.pkl', val_sub), ('tst.pkl', tst_sub)):
        with open(nfd / file_name, 'wb') as f:
            pickle.dump(records, f)

    print('Saved', nfd)

In [None]:
# Load the ZINC15 250K CSV (path relative to the notebook's working directory)
# and preview the first rows.
zinc = pd.read_csv('data/zinc/zinc15_250K.csv')
zinc.head()

In [None]:
# def transform_mol_nolabel(molecule_smiles, choice):
#     mols = [Chem.MolFromSmiles(mol) for mol in molecule_smiles if mol]
#     print(f"Processing {len(molecule_smiles)} molecules with {choice} transformation...")
    
#      # integer encoding (CNN)
#     if choice == 'integer_encoding':
#         print("Converting SMILES to integer encoding...")
#         integer_encoding_data = {}
#         for smi in tqdm(molecule_smiles):
#              drug = integer_label_encoding(smi, 'drug')
#              integer_encoding_data[smi] = drug
#         print(f"Completed integer encoding for {len(integer_encoding_data)} molecules")
#         return integer_encoding_data

#     # 2D Graph
#     elif choice == '2D_graph':
#         print("Converting SMILES to 2D molecular graphs...")
#         graph_data = {smi: drug_to_graph(smi) for smi in tqdm(molecule_smiles, desc="Creating 2D graphs")}
#         print(f"Completed 2D graph conversion for {len(graph_data)} molecules")
#         return graph_data

#     # 3D Graph
#     elif choice == '3D_graph':
#         print("Converting SMILES to 3D molecular graphs...")
#         graph_3d = {}
#         for smi in tqdm(molecule_smiles):
#             graph_data = drug_to_graph(smi)
            
#             mol = Chem.MolFromSmiles(smi)
#             atom_info = [(atom.GetIdx(), atom.GetSymbol()) for atom in mol.GetAtoms()]
                     
#             mol = AllChem.AddHs(mol, addCoords=True)
#             emb_mol = rdDistGeom.EmbedMolecule(mol)
#             if emb_mol == -1:
#                 rdDepictor.Compute2DCoords(mol)

#             conf = mol.GetConformer()
#             pos = np.array([conf.GetAtomPosition(idx) for idx, symbol in atom_info])
#             graph_data.pos = pos
#             graph_3d[smi] = graph_data
#         print(f"Completed 3D graph conversion for {len(graph_3d)} molecules")
#         return graph_3d
    
#     # Fingerprint
#     elif 'fingerprint' in choice:
#         print(f"Generating {choice} fingerprints...")
#         if choice == 'rdkit_fingerprint':
#             fp = [Chem.RDKFingerprint(mol) for mol in tqdm(mols, desc="RDKit fingerprints")]
        
#         elif choice == 'maccs_fingerprint':
#             fp = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(mols, desc="MACCS fingerprints")]
        
#         elif choice == 'morgan_fingerprint':
#             fp = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024) for mol in tqdm(mols, desc="Morgan fingerprints")]

#         print("Converting fingerprints to Data objects...")
#         fps = {smi: Data(x=torch.tensor(f).view(1, -1)) for f, smi in tqdm(zip(fp, molecule_smiles), desc="Creating fingerprint Data objects")}
#         print(f"Completed {choice} generation for {len(fps)} molecules")
#         return fps

In [None]:
# Print one training command per (dataset, feature) pair for run tag `name`.
name = '20250628'
# for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
jobs = [(dataset_name, feature_name)
        for dataset_name in ['tox21', 'hiv']
        for feature_name in ['ChemBERTa']]
for dt, ft in jobs:
    cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new/{dt}_{ft}.txt"
    print(cmd)
        

In [None]:
# Print one training command per (dataset, feature) pair for run tag `name`;
# every command redirects its output to ./logs/new2/.
name = '20250710'
jobs = [(dataset_name, feature_name)
        for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv', 'esol', 'freesolv', 'lipophilicity']
        for feature_name in ['2D-GNN', '2D-GNN-tuto', 'CNN', 'FP-MACCS', 'FP-Morgan', '3D-GNN', 'ChemBERTa']]
for dt, ft in jobs:
    cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new2/{dt}_{ft}.txt"
    print(cmd)
        

In [None]:
# Print one training command per (dataset, feature) pair for run tag `name`.
# The log file name does not include `name`, so runs with a different tag that
# also write to ./logs/new2/ share the same log paths.
name = '20250630'
jobs = [(dataset_name, feature_name)
        for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']
        for feature_name in ['2D-GNN', '2D-GNN-tuto', 'CNN', 'FP-MACCS', 'FP-Morgan', '3D-GNN', 'ChemBERTa']]
for dt, ft in jobs:
    cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new2/{dt}_{ft}.txt"
    print(cmd)
        

In [None]:
from data.loader import MoleculeDataset ##
data_root = "dataset/"
feature= '2D-GNN'

# Instantiate the dataset for each benchmark in turn; `dataset` ends up holding
# the last one ('hiv').
for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
    dataset = MoleculeDataset(data_root + dataset_name, dataset=dataset_name, feature=feature)

In [None]:
# For each benchmark, compare the SMILES held by the 2D and 3D datasets and stop
# at the first benchmark where the overlap does not match the 2D SMILES count.
for dt_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
    print(dt_name)
    dt1, dt2 = (
        MoleculeDataset(data_root + dt_name, dataset=dt_name, feature=feature_name)
        for feature_name in ('2D-GNN', '3D-GNN')
    )
    check = set(dt1.smiles) & set(dt2.smiles)
    n_2d, n_3d, n_shared = len(dt1.smiles), len(dt2.smiles), len(check)
    print(n_2d, n_3d, n_shared)
    if n_shared != n_2d:
        break

In [1]:
from data.loader import MoleculeDataset ##
data_root = "dataset/"
# Instantiate every classification benchmark for each requested feature type;
# `dataset` ends up holding the last one built.
for feature in ['DESC']:
    for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
        dataset = MoleculeDataset(data_root + dataset_name, dataset=dataset_name, feature=feature)

Processing...


root:  dataset\bace
feature:  DESC
base_smi length:  1513


: 

In [None]:
# Show the data held by the most recently constructed `dataset`.
dataset.data

In [None]:
from data.loader import MoleculeDataset ##
data_root = "dataset/"
# For each benchmark write two text files under processed/: every SMILES of the
# 2D dataset, and the SMILES shared by the 2D and 3D datasets. Each file starts
# with a "<label>: <count>" header line.
for dt_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv']:
    dataset_base = MoleculeDataset(data_root + dt_name, dataset=dt_name, feature='2D-GNN')
    dataset_3d = MoleculeDataset(data_root + dt_name, dataset=dt_name, feature='3D-GNN')

    base_smi = '\n'.join(map(str, dataset_base.smiles))
    with open(f'dataset/{dt_name}/processed/base_smi.txt', 'w') as f:
        f.writelines([f'base_smi: {len(dataset_base.smiles)}\n', base_smi])

    smi_3d = set(dataset_base.smiles) & set(dataset_3d.smiles)
    smi_3d_file = '\n'.join(map(str, smi_3d))
    with open(f'dataset/{dt_name}/processed/3d_smi.txt', 'w') as f:
        f.writelines([f'3d_smi: {len(smi_3d)}\n', smi_3d_file])

In [None]:
from data.loader import MoleculeDataset ##
data_root = "dataset/"
# for feature in ['3D-GNN']:
# Instantiate the three regression benchmarks for every feature type.
for feature in ['2D-GNN', '2D-GNN-tuto', 'CNN', '3D-GNN','FP-MACCS', 'FP-Morgan', 'ChemBERTa']:
    print(feature)
    for dataset_name in ['esol', 'freesolv', 'lipophilicity']:
        dataset = MoleculeDataset(data_root + dataset_name, dataset=dataset_name, feature=feature)


In [None]:
# Print one training command per (feature, dataset, model type) combination.
name = '20250708'
# for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv', 'freesolv', 'esol', 'lipophilicity']:
jobs = [(feature_name, dataset_name, model_type)
        for feature_name in ['2D-GNN-tuto']
        for dataset_name in ['hiv']
        for model_type in ['2L-GCN']]
for ft, dt, tp in jobs:
    cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name}-{tp} --project {dt.upper()}_{ft}_{name}_{tp}  > ./logs/comp_gcl_tuto_others/{dt}_{ft}_{tp}.txt"
    print(cmd)
        

In [None]:
# Print training commands per (feature, dataset, model type) combination.
# The cell previously failed to parse (inconsistent indentation) and used `tp`
# without a loop defining it. The structure below mirrors the sibling
# '2D-GNN-copy' cell -- NOTE(review): confirm this is the intended layout.
# The feature list is empty, so running the cell prints nothing.
name = '20250708'
for ft in []:
    for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'hiv', 'freesolv', 'esol', 'lipophilicity']:
        if ft in ['2D-GNN-tuto', '2D-GNN-copy2', '2D-GNN-copy3']:
            for tp in ['2L-GIN', '5L-GIN', '2L-GCN', '5L-GCN']:
                cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name}-{tp} --project {dt.upper()}_{ft}_{name}_{tp}  > ./logs/comp_gcl_tuto_others/{dt}_{ft}_{tp}.txt"
                print(cmd)
        if ft == '2D-GNN-copy':
            for tp in ['2L-GIN', '5L-GIN', '2L-GCN', '5L-GCN', '2L-GIN-emb', '5L-GIN-emb', '2L-GCN-emb', '5L-GCN-emb', '2L-GIN-emb-fit', '5L-GIN-emb-fit', '2L-GCN-emb-fit', '5L-GCN-emb-fit']:
                cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name}-{tp} --project {dt.upper()}_{ft}_{name}_{tp}  > ./logs/comp_gcl_tuto_others/{dt}_{ft}_{tp}.txt"
                print(cmd)
        

In [None]:
# Print one training command per (feature, dataset, model type) combination.
# Which model types are emitted depends on the feature name.
name = '20250708'
# for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv', 'freesolv', 'esol', 'lipophilicity']:
base_model_types = ['2L-GIN', '5L-GIN', '2L-GCN', '5L-GCN']
copy_model_types = ['2L-GIN', '5L-GIN', '2L-GCN', '5L-GCN', '2L-GIN-emb', '5L-GIN-emb', '2L-GCN-emb', '5L-GCN-emb', '2L-GIN-emb-fit', '5L-GIN-emb-fit', '2L-GCN-emb-fit', '5L-GCN-emb-fit']
for ft in ['2D-GNN-copy']:
    for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'hiv', 'freesolv', 'esol', 'lipophilicity']:
        model_types = []
        if ft in ['2D-GNN-tuto', '2D-GNN-copy2', '2D-GNN-copy3']:
            model_types = model_types + base_model_types
        if ft == '2D-GNN-copy':
            model_types = model_types + copy_model_types
        for tp in model_types:
            cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name}-{tp} --project {dt.upper()}_{ft}_{name}_{tp}  > ./logs/comp_gcl_tuto_others/{dt}_{ft}_{tp}.txt"
            print(cmd)
        

In [None]:

# Print one training command per dataset for the 'DESC' feature, logging to ./logs/new4/.
# NOTE(review): `name` is not assigned in this cell; it takes whatever value an
# earlier-executed cell left in the kernel. Confirm the intended run tag and set
# it here so the cell works on a fresh kernel.
for dt in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv', 'freesolv', 'esol', 'lipophilicity']:
    for ft in ['DESC']:
        
        cmd = f"python Train_Property.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/new4/{dt}_{ft}.txt"
        print(cmd)
        

In [None]:
from data.loader import MoleculeDataset ##
data_root = "dataset/"
# For each regression benchmark write two text files under processed/: every
# SMILES of the 2D dataset, and the SMILES shared by the 2D and 3D datasets.
# Each file starts with a "<label>: <count>" header line.
for dt_name in ['freesolv', 'esol', 'lipophilicity']:
    dataset_base = MoleculeDataset(data_root + dt_name, dataset=dt_name, feature='2D-GNN')
    dataset_3d = MoleculeDataset(data_root + dt_name, dataset=dt_name, feature='3D-GNN')

    base_smi = '\n'.join(map(str, dataset_base.smiles))
    with open(f'dataset/{dt_name}/processed/base_smi.txt', 'w') as f:
        f.writelines([f'base_smi: {len(dataset_base.smiles)}\n', base_smi])

    smi_3d = set(dataset_base.smiles) & set(dataset_3d.smiles)
    smi_3d_file = '\n'.join(map(str, smi_3d))
    with open(f'dataset/{dt_name}/processed/3d_smi.txt', 'w') as f:
        f.writelines([f'3d_smi: {len(smi_3d)}\n', smi_3d_file])

In [None]:
import pandas as pd
from pathlib import Path
# Collect the final "Test Score" line of every training log into one Excel sheet.
# Log files are expected to be named "<dataset>_<feature>_<model>.txt".
folder = Path('logs/comp_gcl_tuto_others')
result = []
# Priority number per feature name; '2D-GNN-copy' is refined by the model suffix below.
feature_priority = {'2D-GNN-copy2': 2, '2D-GNN-copy3': 1, '2D-GNN-tuto': 0}
copy_suffix_priority = {'fit': 5, 'emb': 4}
for file in sorted(folder.glob('*.txt')):  # sorted for a reproducible row order
    parts = file.stem.split('_')
    if len(parts) != 3:
        # A name with a different number of '_' separators cannot be unpacked.
        print(f"Skipping {file.name}: expected '<dataset>_<feature>_<model>.txt'")
        continue
    dn, ftn, mdn = parts

    if ftn == '2D-GNN-copy':
        pn = copy_suffix_priority.get(mdn.split('-')[-1], 3)
    else:
        # None for unrecognized features, instead of reusing the value left
        # over from the previous file (or raising NameError on the first one).
        pn = feature_priority.get(ftn)

    print(ftn, mdn, pn)

    # The logs are read as UTF-16.
    with open(file, 'r', encoding='utf-16') as f:
        lines = f.readlines()
    # Guard against empty logs before looking at the last line.
    if lines and 'Test Score' in lines[-1]:
        result.append({'dataset': dn, 'ft': ftn, 'md': mdn, 'pn': pn, 'score': lines[-1].split(':')[-1].strip()})

pd.DataFrame(result).to_excel('summary_results.xlsx')


In [None]:
# Print one training command per (feature, dataset) pair; logs go to ./logs/<name>/.
name = '20250712'
jobs = [(feature_name, dataset_name)
        for feature_name in ['FP-MACCS', 'FP-Morgan', 'CNN', 'ChemBERTa', '2D-GNN-tuto', '3D-GNN']
        for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'hiv', 'freesolv', 'esol', 'lipophilicity']]
for ft, dt in jobs:
    cmd = f"python Train_Property_ms.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/{name}/{dt}_{ft}.txt"
    print(cmd)

In [1]:
from rdkit import Chem
from tqdm import tqdm
from rdkit import RDLogger
from rdkit.Chem import Descriptors
RDLogger.DisableLog('rdApp.*')

tqdm.pandas()

def calc_desc(smi):
    """
    Compute a selected subset of RDKit descriptors for one SMILES string.

    Kept descriptors are those whose name starts with 'fr' or 'Num', ends with
    'Count', or is one of a fixed list of whole-molecule properties.

    Args:
        smi: SMILES string.

    Returns:
        dict mapping descriptor name to value, or None when the SMILES cannot be
        parsed, no descriptors are returned, or every descriptor value is missing.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Nothing can be computed for an unparsable SMILES; return right away
        # rather than running every descriptor on None.
        return None

    desc = Descriptors.CalcMolDescriptors(mol)
    if not desc:
        return None

    # A value is missing when it is None or NaN (NaN is the only value for which
    # v != v). Checked in plain Python so the function does not rely on pandas
    # having been imported by another cell.
    if all(v is None or v != v for v in desc.values()):
        return None

    named_descriptors = {'qed', 'SPS', 'ExactMolWt', 'MolWt', 'TPSA', 'HeavyAtomMolWt',
                         'Ipc', 'MolLogP', 'MolMR', 'HallKierAlpha', 'FractionCSP3'}
    return {
        k: v
        for k, v in desc.items()
        if k.startswith(('fr', 'Num')) or k.endswith('Count') or k in named_descriptors
    }

In [2]:
import torch
import pickle
import pandas as pd
from pathlib import Path
from rdkit.Chem import AllChem
from data.loader import MoleculeDataset
from data.splitters import scaffold_split

tqdm.pandas()

# For every dataset folder: compute descriptors per SMILES, assign each molecule
# to its scaffold split, min-max scale using the training split only, and store
# {smiles: tensor} in processed/desc.pkl (plus the unscaled table in desc_pre.csv).
from sklearn.preprocessing import MinMaxScaler

data_root = "dataset/"
# Only directories are datasets; stray files under dataset/ are ignored.
fds = sorted(fd for fd in Path('dataset').glob('*') if fd.is_dir())
for fd in fds:
    data = pd.read_csv(sorted((fd / 'raw').glob('*.csv'))[0])

    dataset = MoleculeDataset(data_root + fd.stem, dataset=fd.stem, feature='CNN')
    smiles_list = pd.read_csv(data_root + fd.stem + '/processed/smiles.csv', header=None)[0].tolist()
    train_dataset, valid_dataset, test_dataset = scaffold_split(dataset, smiles_list, null_value=0, frac_train=0.8,frac_valid=0.1, frac_test=0.1)

    # Flatten the per-item SMILES lists into sets: the loop below only needs
    # membership tests, and set lookups avoid a linear scan per molecule.
    trn_smiles = {s for group in train_dataset.smiles for s in group}
    val_smiles = {s for group in valid_dataset.smiles for s in group}
    tst_smiles = {s for group in test_dataset.smiles for s in group}

    # The raw CSV of 'bace' stores SMILES in column 'mol'; the others use 'smiles'.
    if fd.stem == 'bace':
        smi_col = 'mol'
    else:
        smi_col = 'smiles'

    box = {'trn': {}, 'val': {}, 'tst': {}}
    fail = []      # SMILES for which no descriptors could be computed
    exclude = []   # SMILES that belong to none of the three splits
    for smi in tqdm(data[smi_col]):

        desc = calc_desc(smi)
        if desc:
            # These datasets are matched on the RDKit round-tripped SMILES.
            # NOTE(review): presumably their processed SMILES are canonical -- confirm.
            if fd.stem in ['bbbp', 'toxcast', 'clintox']:
                mol = AllChem.MolFromSmiles(smi)
                smi = AllChem.MolToSmiles(mol)

            desc['smiles'] = smi
            if smi in trn_smiles:
                box['trn'][smi] = desc
            elif smi in val_smiles:
                box['val'][smi] = desc
            elif smi in tst_smiles:
                box['tst'][smi] = desc
            else:
                exclude.append(smi)
        else:
            fail.append(smi)

    trn = pd.DataFrame(box['trn'].values())
    val = pd.DataFrame(box['val'].values())
    tst = pd.DataFrame(box['tst'].values())

    pre_total = pd.concat([trn, val, tst]).reset_index(drop=True)
    pre_total.to_csv(fd / 'processed' / 'desc_pre.csv', index=False)

    # 'smiles' is the last column (added last to each dict); scale all others.
    # The scaler is fitted on the training split only and reused for val/test.
    feature_cols = trn.columns[:-1]
    scaler = MinMaxScaler()
    trn[feature_cols] = scaler.fit_transform(trn[feature_cols])
    val[feature_cols] = scaler.transform(val[feature_cols])
    tst[feature_cols] = scaler.transform(tst[feature_cols])

    total = pd.concat([trn, val, tst]).reset_index(drop=True)

    result = {}
    for _, row in total.iterrows():
        result[row['smiles']] = torch.tensor(list(row.values[:-1])).unsqueeze(0)

    with open(fd / 'processed' / 'desc.pkl', 'wb') as f:
        pickle.dump(result, f)

    print(fd.stem, len(result), len(fail), len(exclude))
    # Report the tensor shape of one stored entry (skipped when nothing was stored).
    if result:
        print(next(iter(result.values())).shape)
print('done')

100%|██████████| 1513/1513 [00:16<00:00, 92.41it/s] 


bace 1513 0 0
torch.Size([1, 115])


100%|██████████| 2050/2050 [00:16<00:00, 126.60it/s]


bbbp 1972 11 3
torch.Size([1, 115])


100%|██████████| 1483/1483 [00:13<00:00, 110.77it/s]


clintox 1435 4 25
torch.Size([1, 115])


100%|██████████| 1128/1128 [00:05<00:00, 195.88it/s]


esol 1121 0 7
torch.Size([1, 115])


100%|██████████| 642/642 [00:02<00:00, 241.98it/s]


freesolv 628 0 14
torch.Size([1, 115])


100%|██████████| 41127/41127 [05:56<00:00, 115.52it/s]


hiv 40759 0 368
torch.Size([1, 115])


100%|██████████| 4200/4200 [00:35<00:00, 118.84it/s]


lipophilicity 4200 0 0
torch.Size([1, 115])


100%|██████████| 1427/1427 [00:44<00:00, 32.33it/s] 


sider 1356 0 71
torch.Size([1, 115])


100%|██████████| 7831/7831 [00:51<00:00, 153.36it/s]


tox21 7774 0 57
torch.Size([1, 115])


100%|██████████| 8597/8597 [00:57<00:00, 148.46it/s]


toxcast 8536 20 41
torch.Size([1, 115])
done


In [3]:
from data.loader import MoleculeDataset
data_root = "dataset/"
# Instantiate all ten benchmarks for each requested feature type; `dataset` ends
# up holding the last one built ('lipophilicity').
for feature in ['DESC']:
    for dataset_name in ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv',
                         'freesolv', 'esol', 'lipophilicity']:
        dataset = MoleculeDataset(data_root + dataset_name, dataset=dataset_name, feature=feature)

root:  dataset\bace
feature:  DESC
base_smi length:  1513


Processing...


Created data:  1513
root:  dataset\bbbp
feature:  DESC
base_smi length:  1972


Done!
Processing...


Created data:  1972
root:  dataset\tox21
feature:  DESC
base_smi length:  7774


Done!
Processing...


Created data:  7774
root:  dataset\toxcast
feature:  DESC
base_smi length:  8536


Done!
Processing...


Created data:  8536
root:  dataset\sider
feature:  DESC
base_smi length:  1356


Done!
Processing...


Created data:  1356
root:  dataset\clintox
feature:  DESC
base_smi length:  1435


Done!
Processing...


Created data:  1435
root:  dataset\hiv
feature:  DESC
base_smi length:  40759


Done!
Processing...


Created data:  40759


Done!
Processing...
Done!
Processing...


root:  dataset\freesolv
feature:  DESC
base_smi length:  628
Created data:  628
root:  dataset\esol
feature:  DESC
base_smi length:  1121
Created data:  1121


Done!
Processing...


root:  dataset\lipophilicity
feature:  DESC
base_smi length:  4200
Created data:  4200


Done!


In [5]:
# Show the data held by the most recently constructed `dataset`.
dataset.data

Data(x=[4200, 115], smiles=[4200], id=[4200], y=[4200])

In [9]:
import pandas as pd
from data.loader import MoleculeDataset
from data.splitters import scaffold_split

data_root = "dataset/"
bbbp_root = data_root + 'bbbp'

# Load the descriptor-based BBBP dataset and split it 80/10/10 by scaffold.
dataset = MoleculeDataset(bbbp_root, dataset='bbbp', feature='DESC')
smiles_list = pd.read_csv(bbbp_root + '/processed/smiles.csv', header=None)[0].tolist()
train_dataset, valid_dataset, test_dataset = scaffold_split(
    dataset,
    smiles_list,
    null_value=0,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
)

In [11]:
# Shape of the feature tensor `x` of the training split.
train_dataset.x.shape

torch.Size([1577, 115])

In [12]:
# Print one descriptor-feature training command per dataset for each run tag,
# followed by a literal 'ls' line after each tag's batch.
ft = 'DESC'
dataset_names = ['bace', 'bbbp', 'tox21', 'toxcast', 'sider', 'clintox', 'hiv', 'freesolv', 'esol', 'lipophilicity']
for name in ['20250716']:
    for dt in dataset_names:
        cmd = f"python Train_Property_ms.py --dataset {dt} --feature {ft} --filename {name} --project {dt.upper()}_{ft}_{name}  > ./logs/desc/{dt}_{ft}_{name}.txt"
        print(cmd)
    print('ls')
        

python Train_Property_ms.py --dataset bace --feature DESC --filename 20250716 --project BACE_DESC_20250716  > ./logs/desc/bace_DESC_20250716.txt
python Train_Property_ms.py --dataset bbbp --feature DESC --filename 20250716 --project BBBP_DESC_20250716  > ./logs/desc/bbbp_DESC_20250716.txt
python Train_Property_ms.py --dataset tox21 --feature DESC --filename 20250716 --project TOX21_DESC_20250716  > ./logs/desc/tox21_DESC_20250716.txt
python Train_Property_ms.py --dataset toxcast --feature DESC --filename 20250716 --project TOXCAST_DESC_20250716  > ./logs/desc/toxcast_DESC_20250716.txt
python Train_Property_ms.py --dataset sider --feature DESC --filename 20250716 --project SIDER_DESC_20250716  > ./logs/desc/sider_DESC_20250716.txt
python Train_Property_ms.py --dataset clintox --feature DESC --filename 20250716 --project CLINTOX_DESC_20250716  > ./logs/desc/clintox_DESC_20250716.txt
python Train_Property_ms.py --dataset hiv --feature DESC --filename 20250716 --project HIV_DESC_20250716  