In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [None]:
!pip install PaDEL-pywrapper

Collecting PaDEL-pywrapper
  Downloading PaDEL_pywrapper-1.0.5-py3-none-any.whl.metadata (5.5 kB)
Collecting install-jdk==0.3.0 (from PaDEL-pywrapper)
  Downloading install-jdk-0.3.0.tar.gz (3.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bounded-pool-executor==0.0.3 (from PaDEL-pywrapper)
  Downloading bounded_pool_executor-0.0.3-py3-none-any.whl.metadata (2.7 kB)
Downloading PaDEL_pywrapper-1.0.5-py3-none-any.whl (37.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.5/37.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bounded_pool_executor-0.0.3-py3-none-any.whl (3.4 kB)
Building wheels for collected packages: install-jdk
  Building wheel for install-jdk (setup.py) ... [?25l[?25hdone
  Created wheel for install-jdk: filename=install_jdk-0.3.0-py3-none-any.whl size=3725 sha256=ae2b96e2820d9379ec124fd6fb19e61f8a72004c4db883424f09a5d90407cb8b
  Stored in directory: /root/.cache/pip/wheels/79/7a/47/9a4619174f7ca0f1068edb7a5

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper.descriptor import ALOGP, Crippen, FMF
from PaDEL_pywrapper import descriptors

ModuleNotFoundError: No module named 'rdkit'

In [None]:
import pandas as pd
import numpy as np

## Loading collected data

In [None]:
!wget https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/docking_results/docking_scores_data.csv

--2024-12-09 03:18:40--  https://raw.githubusercontent.com/cpariona/biomedical-thesis/refs/heads/main/data/docking_results/docking_scores_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 338231 (330K) [text/plain]
Saving to: ‘docking_scores_data.csv’


2024-12-09 03:18:40 (5.53 MB/s) - ‘docking_scores_data.csv’ saved [338231/338231]



In [None]:
# Read the docking-score CSV from the working directory and show its column names.
df = pd.read_csv("docking_scores_data.csv")
df.columns

Index(['smiles', 'Delta_Gibbs'], dtype='object')

In [None]:
# (rows, columns) of the table as loaded, before any cleaning.
df.shape

(8530, 2)

In [None]:
# Cleaning: discard rows without a Delta_Gibbs value, keep only the first
# occurrence of each SMILES string, and renumber the index from 0.
df = (
    df.dropna(subset=['Delta_Gibbs'])
      .drop_duplicates(subset=['smiles'])
      .reset_index(drop=True)
)

In [None]:
# (rows, columns) after dropping rows with missing Delta_Gibbs and duplicate SMILES.
df.shape

(8515, 2)

## Compute Molecular Descriptors

### Compute the RDKit descriptors

In [None]:
def getRDKitDescriptors(smile, missingVal=None):
    ''' Calculate the full list of RDKit descriptors for one molecule.

        Parameters
        ----------
        smile : str
            SMILES string of the molecule.
        missingVal : object, optional
            Value stored for every descriptor that cannot be calculated
            (default: None).

        Returns
        -------
        dict
            Maps every descriptor name in ``Descriptors._descList`` to its
            computed value, or to ``missingVal`` when that descriptor failed or
            the SMILES string could not be turned into a molecule.
    '''
    import sys
    import traceback

    # Parse the SMILES once per molecule rather than once per descriptor.
    try:
        mol = Chem.MolFromSmiles(smile)
    except Exception:
        # e.g. a non-string input; report it and fall through to the
        # "no molecule" path so the caller still gets a full row.
        traceback.print_exc()
        mol = None

    if mol is None:
        # No molecule to work on: emit one short message instead of one
        # traceback per descriptor, and return a row of missing values.
        print(f"Could not build a molecule from SMILES {smile!r}; "
              "all descriptors set to missingVal", file=sys.stderr)
        return {nm: missingVal for nm, _ in Descriptors._descList}

    res = {}
    for nm, fn in Descriptors._descList:
        # Some descriptor functions raise on failure. Catch Exception only,
        # so KeyboardInterrupt/SystemExit can still stop a long run.
        try:
            val = fn(mol)
        except Exception:
            # Print the error message and fall back to missingVal.
            traceback.print_exc()
            val = missingVal
        res[nm] = val
    return res

In [None]:
# Build one descriptor dict per SMILES string, in the same order as the rows of df
RDKit_descriptors = list(map(getRDKitDescriptors, df['smiles'].tolist()))

In [None]:
# Create RDKit DataFrame from the list of per-molecule descriptor dicts:
# one row per molecule, one column per descriptor name.
RDKit_df = pd.DataFrame(RDKit_descriptors)

In [None]:
# (molecules, RDKit descriptors)
RDKit_df.shape

(8515, 210)

In [None]:
# Save RDKit descriptors to CSV in the working directory (row index not written)
RDKit_df.to_csv('RDKit_descriptors.csv', index=False)

### Compute the PaDEL descriptors

In [None]:
# Turn every SMILES string into an RDKit molecule, then run the PaDEL
# calculator (configured with the `descriptors` collection imported from
# PaDEL_pywrapper) over the whole batch.
mols = list(map(Chem.MolFromSmiles, df['smiles'].tolist()))
padel = PaDEL(descriptors)
PaDEL_descriptors = padel.calculate(mols)

In [None]:
# (molecules, PaDEL descriptors)
PaDEL_descriptors.shape

(8515, 1444)

In [None]:
# Save PaDEL descriptors to CSV in the working directory (row index not written)
PaDEL_descriptors.to_csv('PaDEL_descriptors.csv', index=False)