In [1]:
!pip install rdkit
!pip install PaDEL-pywrapper

Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.5
Collecting PaDEL-pywrapper
  Downloading PaDEL_pywrapper-1.0.5-py3-none-any.whl.metadata (5.5 kB)
Collecting install-jdk==0.3.0 (from PaDEL-pywrapper)
  Downloading install-jdk-0.3.0.tar.gz (3.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bounded-pool-executor==0.0.3 (from PaDEL-pywrapper)
  Downloading bounded_pool_executor-0.0.3-py3-none-any.whl.metadata (2.7 kB)
Downloading PaDEL_pywrapper-1.0.5-py3-none-any.whl (37.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.5/37.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bounded_pool_executor-0.0.3-py3-no

In [77]:
import pickle
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper import descriptors

In [3]:
# Load models and data
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/robust_scaler.pickle
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/minmax_scaler.pickle
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/RDKit_select_descriptors.pickle
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/PaDEL_select_descriptors.pickle
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/selector_LGBM.pickle
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/lgbm_best_model.pickle

--2025-02-18 19:47:55--  https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/robust_scaler.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14447 (14K) [application/octet-stream]
Saving to: ‘robust_scaler.pickle’


2025-02-18 19:47:55 (32.0 MB/s) - ‘robust_scaler.pickle’ saved [14447/14447]

--2025-02-18 19:47:56--  https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/ML_implementation_resources/minmax_scaler.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting re

In [44]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): unpickling can execute arbitrary code. The files read
    with this helper are fetched over the network by this notebook, so only
    run it against a source you trust.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Load selected features
RDKit_select_descriptors = load_pickle("RDKit_select_descriptors.pickle")
PaDEL_select_descriptors = load_pickle("PaDEL_select_descriptors.pickle")

# Load the saved scalers
robust_scaler = load_pickle("robust_scaler.pickle")
minmax_scaler = load_pickle("minmax_scaler.pickle")

# Load RFE model
selector_lgbm = load_pickle("selector_LGBM.pickle")

# Load the trained model
lgbm_model = load_pickle("lgbm_best_model.pickle")

In [10]:
# RDKit selected descriptors function
def get_selected_RDKitdescriptors(smile, selected_descriptors, missingVal=None):
    """Calculate only the selected RDKit descriptors for one molecule.

    Parameters
    ----------
    smile : str
        SMILES string; parsed with ``Chem.MolFromSmiles``.
    selected_descriptors : iterable of str
        Descriptor names to compute, matched against the names listed in
        ``Descriptors._descList``.
    missingVal : object, optional
        Placeholder stored when a value cannot be computed (default ``None``).

    Returns
    -------
    dict
        Maps descriptor name to its value. If the SMILES cannot be parsed,
        every requested name maps to ``missingVal``. Otherwise the dict holds
        the requested names found in ``Descriptors._descList``, with
        ``missingVal`` for any descriptor whose function raised. Names found
        in ``Descriptors._descList`` always appear in that list's order.
    """
    import traceback  # local import: the notebook's import cell does not provide it

    selected = list(selected_descriptors)
    wanted = set(selected)  # hashed lookup instead of rescanning a list per descriptor

    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Emit known names in registry order, exactly like the parsed branch, so a
        # DataFrame built from a mix of valid and invalid rows keeps one column order.
        res = {nm: missingVal for nm, _ in Descriptors._descList if nm in wanted}
        # Requested names that the registry does not know are still reported as missing.
        for desc in selected:
            res.setdefault(desc, missingVal)
        return res

    res = {}
    for nm, fn in Descriptors._descList:
        if nm not in wanted:
            continue
        try:
            res[nm] = fn(mol)
        except Exception:
            # Best effort: report the failure and keep going with a placeholder.
            # KeyboardInterrupt / SystemExit are deliberately not caught here.
            traceback.print_exc()
            res[nm] = missingVal
    return res

In [119]:
# Example query molecule (benzene) wrapped in a one-row frame with a 'smiles' column
smile = 'C1=CC=CC=C1'
df = pd.DataFrame([smile], columns=['smiles'])

In [120]:
# Calculate selected RDKit descriptors: one dict per SMILES, stacked into a frame
RDKit_descriptors = list(
    map(
        lambda smiles_string: get_selected_RDKitdescriptors(smiles_string, RDKit_select_descriptors),
        df['smiles'],
    )
)
RDKit_df = pd.DataFrame(RDKit_descriptors)

# Calculate PaDEL descriptors from RDKit molecule objects, then keep only the selected columns
mols = list(map(Chem.MolFromSmiles, df['smiles'].tolist()))
padel = PaDEL(descriptors)
PaDEL_descriptors = padel.calculate(mols)
PaDEL_df = PaDEL_descriptors[PaDEL_select_descriptors]

PaDEL-Descriptor is a software for calculating molecular
descriptors and fingerprints. The software calculates
1875 descriptors (1444 1D and 2D descriptors, and 431
3D descriptors) and 12 types of fingerprints.

###################################

Should you publish results based on the PaDEL descriptors,
please cite:

Yap, C.W. (2011), PaDEL-descriptor: An open source software
to calculate molecular descriptors and fingerprints.
J. Comput. Chem., 32: 1466-1474. https://doi.org/10.1002/jcc.21707

###################################





In [121]:
# Concatenate RDKit and PaDEL dataframes side by side (RDKit columns first).
# pd.concat aligns rows on index labels, so both frames need the same index.
RDKit_PaDEL_df = pd.concat([RDKit_df, PaDEL_df], axis="columns")
RDKit_PaDEL_df_columns = RDKit_PaDEL_df.columns

In [122]:
# Scale data: apply the two loaded scalers in sequence (robust first, then min-max)
RDKit_PaDEL_scaled_ = robust_scaler.transform(RDKit_PaDEL_df)
RDKit_PaDEL_scaled = minmax_scaler.transform(RDKit_PaDEL_scaled_)

# Wrap the scaled values in a frame and put the descriptor names back on the columns
RDKit_PaDEL_scaled_df = pd.DataFrame(RDKit_PaDEL_scaled).set_axis(RDKit_PaDEL_df_columns, axis=1)

In [123]:
# Selected features: use the mask stored on the loaded selector to pick column names,
# then keep only those columns of the scaled frame
selected_features_mask = selector_lgbm.support_
Selected_features = RDKit_PaDEL_df_columns[selected_features_mask]
RDKit_PaDEL = RDKit_PaDEL_scaled_df.loc[:, Selected_features]

In [124]:
# Make predictions: feed the scaled, feature-selected descriptor frame to the loaded model.
# NOTE(review): the meaning and units of the predicted value are not stated in this notebook - document them.
predictions = lgbm_model.predict(RDKit_PaDEL)

In [125]:
# Display the array returned by lgbm_model.predict
predictions

array([-3.18661119])