In [1]:
from tdc.single_pred import ADME
# Load the TDC ADME single-prediction dataset registered under the name 'VDss_Lombardo'.
data = ADME(name = 'VDss_Lombardo')
# get_split() is called with the library defaults; the next cell indexes the result
# with 'train', 'valid' and 'test'.
split = data.get_split()

Found local copy...
Loading...
Done!


In [2]:
# Sanity check: column names of the training split and the shape of every split.
split['train'].columns, split['train'].shape, split['valid'].shape, split['test'].shape

(Index(['Drug_ID', 'Drug', 'Y'], dtype='object'), (791, 3), (113, 3), (226, 3))

In [3]:
import os
import pandas as pd

# Define save path
# NOTE(review): absolute, machine-specific path; the same literal is repeated in later
# cells, so any change here has to be mirrored there.
save_path = "E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/TDC/ADMET_Feature_Extraction"
os.makedirs(save_path, exist_ok=True)

# Save each split to a CSV file
# NOTE(review): the files are named "Half_Life_Obach_*" although the dataset loaded in
# the first cell is 'VDss_Lombardo'. Later cells read these exact file names back, so
# the names must be changed everywhere at once if they are corrected.
train_path = os.path.join(save_path, "Half_Life_Obach_train.csv")
valid_path = os.path.join(save_path, "Half_Life_Obach_valid.csv")
test_path = os.path.join(save_path, "Half_Life_Obach_test.csv")

# index=False: the DataFrame index is not written as an extra CSV column.
split['train'].to_csv(train_path, index=False)
split['valid'].to_csv(valid_path, index=False)
split['test'].to_csv(test_path, index=False)

# NOTE(review): the recorded output of this cell shows a different directory
# ("...ADMET_Feature_Extraction01"), i.e. the output is stale relative to this code.
print(f"Files saved to: {os.path.abspath(save_path)}")

Files saved to: E:\K-MELLODDY-Project\data\ADMET_PK_Public_Dataset\TDC\ADMET_Feature_Extraction01


In [22]:
import pandas as pd
import selfies as sf
import numpy as np
from tqdm.auto import tqdm

# Register tqdm with pandas so that `Series.progress_apply` (used by
# build_shared_selfies_features below) is available.
tqdm.pandas()

def build_shared_selfies_features(datasets, max_len=None):
    """
    Convert several SMILES datasets into index-encoded SELFIES with one shared vocabulary.

    Parameters
    ----------
    datasets : dict
        Dictionary of name -> pd.Series of SMILES (e.g., {"train": train_smiles, ...}).
        Missing entries (None or NaN) are dropped, so the returned frames can have
        fewer rows than the input; the surviving rows keep their original index.
    max_len : int, optional
        Token length every sequence is padded or truncated to. If None, the longest
        tokenised sequence across all datasets is used (0 if there is no sequence).

    Returns
    -------
    features_dict : dict
        Dictionary of name -> pd.DataFrame with the columns
        "SELFIES" (the SELFIES string) and "Indices" (list of `max_len` ints).
    vocab : dict
        Shared token-to-index dictionary. "[PAD]" maps to 0; the remaining tokens are
        numbered from 1 in sorted order.
    max_len : int
        Sequence length that was actually used.
    """

    def smiles_to_selfies(smiles_str):
        # pd.isna covers both None and float NaN (what pandas uses for missing
        # strings); the missing marker is removed by dropna() below.
        return None if pd.isna(smiles_str) else sf.encoder(smiles_str)

    # Step 1: Convert SMILES -> SELFIES for all datasets
    selfies_dict = {
        name: smiles.progress_apply(smiles_to_selfies).dropna()
        for name, smiles in datasets.items()
    }

    # Step 2: Tokenize all datasets
    tokens_dict = {
        name: selfies.apply(lambda s: list(sf.split_selfies(s)))
        for name, selfies in selfies_dict.items()
    }

    # Step 3: Build a global vocabulary across all datasets
    all_tokens = set()
    for token_series in tokens_dict.values():
        for toks in token_series:
            all_tokens.update(toks)
    vocab = {tok: idx for idx, tok in enumerate(sorted(all_tokens), start=1)}
    vocab["[PAD]"] = 0

    # Step 4: Determine max_len
    if max_len is None:
        # default=0 keeps empty input (no datasets / no rows) from raising ValueError.
        max_len = max(
            (len(toks) for token_series in tokens_dict.values() for toks in token_series),
            default=0,
        )

    # Step 5: Convert tokens -> indices, truncating or right-padding with 0 ("[PAD]")
    def tokens_to_indices(toks):
        idxs = [vocab.get(tok, 0) for tok in toks[:max_len]]
        return idxs + [0] * (max_len - len(idxs))

    features_dict = {}
    for name, token_series in tokens_dict.items():
        indices = token_series.progress_apply(tokens_to_indices)
        features_dict[name] = pd.DataFrame({
            "SELFIES": selfies_dict[name],
            "Indices": indices
        })

    return features_dict, vocab, max_len


In [28]:
import os
import pandas as pd
import numpy as np

# Setup paths
save_path = "E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/TDC/ADMET_Feature_Extraction"
output_path = os.path.join(save_path, "selfies")
os.makedirs(output_path, exist_ok=True)

# Dataset splits
splits = ['train', 'valid', 'test']

# NOTE(review): `vocab` and `features_dict` are read below but no visible cell assigns
# them; presumably they come from a build_shared_selfies_features(...) call whose cell
# is missing. Add that call before this cell so Restart & Run All works.

# Save the vocabulary as CSV
vocab_df = pd.DataFrame(list(vocab.items()), columns=["Token", "Index"])
vocab_df.to_csv(os.path.join(output_path, "selfies_vocab.csv"), index=False)
print("Saved shared vocabulary to selfies_vocab.csv")

# Process each split
# The loop variable is deliberately not called `split`: that name holds the dict of
# DataFrames returned by data.get_split() in the first cell and must not be clobbered.
for split_name in splits:
    print(f"Processing {split_name}...")
    df = pd.read_csv(f"{save_path}/Half_Life_Obach_{split_name}.csv")
    labels = df["Y"].values  # Target labels

    # Extract features from features_dict
    feat_df = features_dict[split_name].copy()
    if len(feat_df) != len(labels):
        # Rows can be dropped during featurisation; fail with a readable message
        # instead of silently pairing features with the wrong labels.
        raise ValueError(
            f"{split_name}: {len(feat_df)} feature rows but {len(labels)} labels"
        )
    feat_df["Label"] = labels

    # Expand the "Indices" column into separate columns for CSV.
    # index=feat_df.index keeps the rows aligned in the concat below even when
    # feat_df does not carry a plain 0..n-1 index.
    n_tokens = feat_df["Indices"].str.len().max()
    indices_expanded = pd.DataFrame(
        feat_df["Indices"].tolist(),
        index=feat_df.index,
        columns=[f"Idx_{i+1}" for i in range(n_tokens)],
    )

    final_df = pd.concat([feat_df["SELFIES"], indices_expanded, feat_df["Label"]], axis=1)

    # Save to CSV
    final_df.to_csv(os.path.join(output_path, f"features_{split_name}.csv"), index=False)
    print(f"Saved {split_name} features to features_{split_name}.csv")


Saved shared vocabulary to selfies_vocab.csv
Processing train...
Saved train features to features_train.csv
Processing valid...
Saved valid features to features_valid.csv
Processing test...
Saved test features to features_test.csv


In [41]:
import numpy as np

from sklearn import preprocessing

from rdkit import Chem
from rdkit import RDLogger

from rdkit.Chem import DataStructs
from rdkit.Chem.rdMolDescriptors import GetHashedMorganFingerprint
from rdkit.Avalon.pyAvalonTools import GetAvalonCountFP
from rdkit.Chem import rdReducedGraphs
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

from molfeat.trans.pretrained import PretrainedDGLTransformer

class scaler:
    """Shift, optionally log10-compress, then standardise a 1-D target vector.

    Pipeline applied by `fit`/`transform`:
      1. subtract `offset` (= min(min(y_train), 0)) so the training values are >= 0,
      2. if `log` is True, apply log10(y + 1),
      3. standardise with sklearn's StandardScaler.
    `inverse_transform` undoes the three steps in reverse order.

    Inputs may be NumPy arrays, lists or pandas Series; they are converted to float
    arrays internally. Results are column vectors of shape (n, 1).
    """

    def __init__(self, log=False):
        self.log = log
        self.offset = None   # set by fit(); always <= 0
        self.scaler = None   # fitted StandardScaler, set by fit()

    def _check_fitted(self):
        """Raise a readable error when transform/inverse_transform precede fit."""
        if self.offset is None or self.scaler is None:
            raise RuntimeError("scaler must be fitted with fit(y) before use")

    def _shift_and_log(self, y):
        """Return y as an (n, 1) float array after the offset shift and optional log10."""
        y = np.asarray(y, dtype=float).reshape(-1, 1) - self.offset

        # NOTE: with log=True, values more than 1 below the training minimum make
        # y + 1.0 non-positive here and therefore come out as NaN / -inf.
        if self.log:
            y = np.log10(y + 1.0)

        return y

    def fit(self, y):
        """Learn the offset and the standardisation statistics from `y`; returns self."""
        y = np.asarray(y, dtype=float)

        # make the values non-negative
        self.offset = np.min([np.min(y), 0.0])

        self.scaler = preprocessing.StandardScaler().fit(self._shift_and_log(y))

        return self

    def transform(self, y):
        """Apply the fitted shift / log / standardisation; returns shape (n, 1)."""
        self._check_fitted()

        return self.scaler.transform(self._shift_and_log(y))

    def inverse_transform(self, y_scale):
        """Map scaled values back to the original target scale; returns shape (n, 1)."""
        self._check_fitted()

        y = self.scaler.inverse_transform(
            np.asarray(y_scale, dtype=float).reshape(-1, 1))

        if self.log:
            y = 10.0**y - 1.0

        return y + self.offset

# from https://github.com/rdkit/rdkit/discussions/3863
def count_to_array(fingerprint):
    """Return `fingerprint` as a NumPy int8 vector.

    A zero-length int8 array is handed to DataStructs.ConvertToNumpyArray together
    with the fingerprint and is returned afterwards.

    NOTE(review): int8 tops out at 127 - confirm the count fingerprints passed in
    here cannot hold larger counts.
    """
    dense = np.empty(0, dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, dense)
    return dense

def get_avalon_fingerprints(molecules, n_bits=1024):
    """Stack the Avalon count fingerprint of every molecule into one 2-D array.

    `molecules` must offer `.apply` (a pandas Series in this notebook); each element
    goes through GetAvalonCountFP(..., nBits=n_bits) and then count_to_array.
    """
    def featurize(mol):
        return count_to_array(GetAvalonCountFP(mol, nBits=n_bits))

    per_molecule = molecules.apply(featurize)
    return np.stack(per_molecule.to_numpy())

def get_morgan_fingerprints(molecules, n_bits=1024, radius=2):
    """Stack the hashed Morgan fingerprint of every molecule into one 2-D array.

    `molecules` must offer `.apply` (a pandas Series in this notebook); each element
    goes through GetHashedMorganFingerprint(..., nBits=n_bits, radius=radius) and
    then count_to_array.
    """
    def featurize(mol):
        hashed = GetHashedMorganFingerprint(mol, nBits=n_bits, radius=radius)
        return count_to_array(hashed)

    per_molecule = molecules.apply(featurize)
    return np.stack(per_molecule.to_numpy())

def get_erg_fingerprints(molecules):
    """Stack rdReducedGraphs.GetErGFingerprint of every molecule into one 2-D array.

    `molecules` must offer `.apply` (a pandas Series in this notebook).
    """
    erg_vectors = molecules.apply(rdReducedGraphs.GetErGFingerprint)
    return np.stack(erg_vectors.to_numpy())

# from https://www.blopig.com/blog/2022/06/how-to-turn-a-molecule-into-a-vector-of-physicochemical-descriptors-using-rdkit/
def get_chosen_descriptors():
    """Return a hard-coded list of descriptor-name strings.

    Within this notebook the function is only referenced from a commented-out line
    in get_rdkit_features, where the list would be passed to
    MolecularDescriptorCalculator instead of the full descriptor set.
    """
    # Names are kept exactly as spelled here (e.g. 'fr_piperdine', 'fr_hdrzine');
    # they are lookup keys, not prose, so do not "correct" them.
    chosen_descriptors = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 
        'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 
        'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 
        'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 
        'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 
        'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 
        'HeavyAtomMolWt', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 
        'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 
        'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 
        'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 
        'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 
        'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 
        'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 
        'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 
        'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 
        'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 
        'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 
        'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 
        'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 
        'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 
        'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 
        'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 
        'VSA_EState8', 'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 
        'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 
        'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 
        'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 
        'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 
        'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 
        'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 
        'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 
        'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 
        'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 
        'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 
        'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 
        'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 
        'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 
        'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 
        'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 
        'fr_unbrch_alkane', 'fr_urea', 'qed']
    
    return chosen_descriptors

def get_rdkit_features(molecules, descriptor_names=None):
    """Compute RDKit molecular descriptors for every molecule.

    Parameters
    ----------
    molecules : pd.Series
        Series of RDKit molecules (anything MolecularDescriptorCalculator accepts).
    descriptor_names : sequence of str, optional
        Descriptor names to compute, e.g. get_chosen_descriptors(). If None, every
        descriptor listed in rdkit.Chem.Descriptors._descList is used.

    Returns
    -------
    np.ndarray of shape (len(molecules), len(descriptor_names)), one row per molecule.
    """
    if descriptor_names is None:
        # The notebook previously read a global `all_descriptor_names` that no cell
        # defines, so this raised NameError on a fresh kernel. Build the full list
        # here instead. Imported locally so the cell's import block is untouched.
        from rdkit.Chem import Descriptors
        descriptor_names = [name for name, _ in Descriptors._descList]

    calculator = MolecularDescriptorCalculator(list(descriptor_names))

    X_rdkit = molecules.apply(
        lambda mol: np.array(calculator.CalcDescriptors(mol)))

    return np.vstack(X_rdkit.values)

def get_gin_supervised_masking(molecules):
    """Run `molecules` through a PretrainedDGLTransformer of kind 'gin_supervised_masking'.

    A new transformer (dtype=float) is constructed on every call and its output for
    `molecules` is returned unchanged.
    """
    return PretrainedDGLTransformer(kind='gin_supervised_masking', dtype=float)(molecules)


def get_fingerprints(smiles, include_selfies=False):
    """Featurise SMILES into one matrix of concatenated fingerprints/descriptors.

    Column blocks, in order: Morgan, Avalon, ErG, RDKit descriptors
    (the GIN embedding is currently disabled).

    Parameters
    ----------
    smiles : pd.Series
        SMILES strings, one molecule per entry.
    include_selfies : bool, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    np.ndarray with one row per input SMILES.

    Raises
    ------
    ValueError
        If any SMILES cannot be parsed into a molecule.
    """
    RDLogger.DisableLog('rdApp.*')
    molecules = smiles.apply(Chem.MolFromSmiles)

    # A failed parse yields a missing molecule. With RDKit logging silenced above,
    # that would otherwise only surface as an obscure error inside a featurizer,
    # so report the offending inputs here.
    unparsed = molecules.isna()
    if unparsed.any():
        bad = smiles[unparsed]
        raise ValueError(
            f"{len(bad)} SMILES could not be parsed into molecules "
            f"(first few: {bad.head(5).tolist()})"
        )

    # Standard featurizers; order defines the column layout of the result.
    featurizers = (
        get_morgan_fingerprints,
        get_avalon_fingerprints,
        get_erg_fingerprints,
        get_rdkit_features,
        # get_gin_supervised_masking,
    )

    return np.concatenate([featurize(molecules) for featurize in featurizers], axis=1)

In [42]:
def debug_featurizer_shapes(smiles, include_selfies=False, max_len=None):
    """Print the molecule count and the output shape of each enabled featurizer.

    `smiles` is parsed with Chem.MolFromSmiles and fed to the Morgan, Avalon, ErG
    and RDKit-descriptor featurizers in that order. `include_selfies` and `max_len`
    are accepted but not used. Nothing is returned.
    """
    RDLogger.DisableLog('rdApp.*')
    molecules = smiles.apply(Chem.MolFromSmiles)

    print("Number of molecules:", len(molecules))

    # (label, featurizer) pairs, reported in this order.
    # The GIN supervised-masking embedding is disabled and therefore not listed.
    report = (
        ("Morgan fingerprints:", get_morgan_fingerprints),
        ("Avalon fingerprints:", get_avalon_fingerprints),
        ("ErG fingerprints:", get_erg_fingerprints),
        ("RDKit descriptors:", get_rdkit_features),
    )
    for label, featurize in report:
        print(label, featurize(molecules).shape)


In [43]:
import os
import pandas as pd
import numpy as np

# Setup paths
save_path = "E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/TDC/ADMET_Feature_Extraction"
output_path = os.path.join(save_path, "fingerprints")
os.makedirs(output_path, exist_ok=True)

# Dataset splits
splits = ['train', 'valid', 'test']

# Process each
# The loop variable is deliberately not called `split`: that name holds the dict of
# DataFrames returned by data.get_split() in the first cell and must not be clobbered.
for split_name in splits:
    print(f"Processing {split_name}...")
    df = pd.read_csv(f"{save_path}/Half_Life_Obach_{split_name}.csv")
    smiles = df["Drug"]
    labels = df["Y"].values  # No scaling needed

    # Get fingerprints
    # NOTE: debug_featurizer_shapes runs every featurizer once just to print shapes,
    # so each split is featurised twice; drop the call once the shapes are trusted.
    debug_featurizer_shapes(smiles, include_selfies=True)
    X = get_fingerprints(smiles)

    # Save features and labels side by side as .npy files
    np.save(f"{output_path}/X_{split_name}.npy", X)
    np.save(f"{output_path}/y_{split_name}.npy", labels)
    print(f"Saved to: X_{split_name}.npy and y_{split_name}.npy")

Processing train...
Number of molecules: 791
Morgan fingerprints: (791, 1024)
Avalon fingerprints: (791, 1024)
ErG fingerprints: (791, 315)
RDKit descriptors: (791, 217)
GIN supervised masking: (791, 300)
Saved to: X_train.npy and y_train.npy
Processing valid...
Number of molecules: 113
Morgan fingerprints: (113, 1024)
Avalon fingerprints: (113, 1024)
ErG fingerprints: (113, 315)
RDKit descriptors: (113, 217)
GIN supervised masking: (113, 300)
Saved to: X_valid.npy and y_valid.npy
Processing test...
Number of molecules: 226
Morgan fingerprints: (226, 1024)
Avalon fingerprints: (226, 1024)
ErG fingerprints: (226, 315)
RDKit descriptors: (226, 217)
GIN supervised masking: (226, 300)
Saved to: X_test.npy and y_test.npy


In [44]:
# Reload the saved fingerprint matrices (X_*) and label vectors (y_*) for every split.
fp_dir = save_path + "/fingerprints"

X_train_fp, X_valid_fp, X_test_fp = (
    np.load(f"{fp_dir}/X_{part}.npy") for part in ("train", "valid", "test")
)
y_train_fp, y_valid_fp, y_test_fp = (
    np.load(f"{fp_dir}/y_{part}.npy") for part in ("train", "valid", "test")
)

# Shapes in the order X_train, y_train, X_valid, y_valid, X_test, y_test.
tuple(
    arr.shape
    for arr in (X_train_fp, y_train_fp, X_valid_fp, y_valid_fp, X_test_fp, y_test_fp)
)

((791, 2580), (791,), (113, 2580), (113,), (226, 2580), (226,))

In [45]:
# Report whether any NaN is present in each feature matrix / label vector.
# The labels now name the array that is actually checked: the valid/test feature
# matrices were previously reported as "y_valid" / "y_test".
# Note: np.isnan flags NaN only; infinite values are not detected by this check.
print("NaNs in X_train:", np.isnan(X_train_fp).any())
print("NaNs in y_train:", np.isnan(y_train_fp).any())

print("\nNaNs in X_valid:", np.isnan(X_valid_fp).any())
print("NaNs in y_valid:", np.isnan(y_valid_fp).any())

print("\nNaNs in X_test:", np.isnan(X_test_fp).any())
print("NaNs in y_test:", np.isnan(y_test_fp).any())


NaNs in X_train: False
NaNs in y_train: False

NaNs in y_valid: False
NaNs in y_valid: False

NaNs in y_test: False
NaNs in y_test: False


In [52]:
import torch

# Load the file
# NOTE(review): torch.load deserialises with pickle unless restricted (see its
# `weights_only` argument) - only load files from a trusted source.
# NOTE(review): `splits` is rebound here; earlier cells use the same name for the
# list ['train', 'valid', 'test'], so re-running those loops after this cell would
# iterate over this dict's keys instead.
splits = torch.load(r"E:\K-MELLODDY-Project\data\ADMET_PK_Public_Dataset\Data_LargeMix\pcqm4m_g25_n4\pcqm4m_g25_n4_random_splits.pt")

print(type(splits))      # should be dict
print(splits.keys())     # see available splits

# Example: access train split
# (the validation key in this file is "val", not "valid")
train_data = splits["train"]
val_data = splits["val"]
test_data = splits["test"]


<class 'dict'>
dict_keys(['train', 'val', 'test'])


In [58]:
import pandas as pd

# Read the PCBA parquet file into `df`
pcba_parquet = r"E:\K-MELLODDY-Project\data\ADMET_PK_Public_Dataset\Data_LargeMix\pcba_1328\PCBA_1328_1564k.parquet"
df = pd.read_parquet(pcba_parquet)

# Text preview: first rows, then the available columns
for preview in (df.head(), df.columns):
    print(preview)


   Unnamed: 0                                             SMILES  assayID-1  \
0           1                              B.CC(=O)OC1CN2CCC1CC2        0.0   
1           7                               B.c1ccc(N2CCOCC2)cc1        NaN   
2          11                                          BC(=O)O.N        NaN   
3          13                Br.Br.CCCC1CC(C)(c2csc(NCC)n2)OC1=O        NaN   
4          14  Br.Br.CCCC1CC(C)(c2csc(NCCNc3nc(C4(C)CC(CCC)C(...        NaN   

   assayID-101  assayID-103  assayID-105  assayID-107  assayID-109  \
0          0.0          0.0          NaN          0.0          0.0   
1          NaN          NaN          NaN          NaN          NaN   
2          NaN          NaN          NaN          NaN          NaN   
3          NaN          NaN          NaN          NaN          NaN   
4          NaN          NaN          NaN          NaN          NaN   

   assayID-11  assayID-113  ...  assayID-1645856  assayID-1645857  \
0         NaN          NaN  ...    

In [57]:
# NOTE(review): this cell was executed as In [57], i.e. before the In [58] cell that
# loads the parquet file above; its recorded shape differs from the In [59] result
# below, so it describes an earlier `df`. Re-run or remove the cell.
df.shape

(3810323, 31)

In [59]:
# (rows, columns) of the `df` loaded from the parquet file in cell In [58].
df.shape

(1563664, 1332)