Calculate molecular descriptors using datamol #5

Merged
merged 3 commits on Feb 29, 2024
17 changes: 17 additions & 0 deletions data/01_raw/README.md
@@ -0,0 +1,17 @@
## Plasmodium falciparum 3D7 Data

The data (`plasmodium_falciparum_3d7_assays.csv`) contains assays related to Plasmodium falciparum 3D7 strains. It was retrieved from the ChEMBL database (version 33).

### Data Cleaning
The code used for data cleaning is in the notebook named `raw_data_cleaning.ipynb`.

The `standard_value` column was converted to micromolar (uM) units for consistency.
Only records with a non-null `pchembl_value` were retained.
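
For reference, a minimal pandas sketch of these two steps (the authoritative code is in `raw_data_cleaning.ipynb`; the `standard_units` column name and the conversion factors below are illustrative assumptions):

```python
import pandas as pd

df = pd.read_csv("plasmodium_falciparum_3d7_assays.csv")

# Keep only records with a non-null pchembl_value.
df = df[df["pchembl_value"].notna()]

# Convert standard_value to micromolar (uM); the factor table is illustrative.
to_uM = {"nM": 1e-3, "uM": 1.0, "mM": 1e3, "M": 1e6}
df["uM_value"] = df["standard_value"] * df["standard_units"].map(to_uM)
```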

### IC50 dataset
The final dataset (**2257 records**) consists of IC50 values (in uM) for various compounds tested against the parasite.
- **File:** `plasmodium_falciparum_3d7_ic50.csv`
- **Columns:**
- `canonical_smiles`: Canonical SMILES representation of the compound.
  - `uM_value`: Standard value measured in micromolar (uM) units, after conversion.
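
A quick sanity check on the cleaned file (a hedged sketch; the expected counts come from the numbers stated above):

```python
import pandas as pd

ic50 = pd.read_csv("plasmodium_falciparum_3d7_ic50.csv")
print(ic50.shape)             # expected: (2257, 2)
print(ic50.columns.tolist())  # ['canonical_smiles', 'uM_value']
```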

5,236 changes: 5,236 additions & 0 deletions data/01_raw/plasmodium_falciparum_3d7_assays.csv

Large diffs are not rendered by default.

2,258 changes: 2,258 additions & 0 deletions data/01_raw/plasmodium_falciparum_3d7_ic50.csv

Large diffs are not rendered by default.

Empty file removed data/__init__.py
Empty file.
557 changes: 557 additions & 0 deletions notebooks/raw_data_cleaning.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
 scikit-learn
+datamol
 rdkit
 shap
24 changes: 12 additions & 12 deletions setup.py
@@ -1,9 +1,12 @@
 from setuptools import setup, find_packages
 
 
-with open("README.md", "r", encoding="utf-8") as fh:
+with open("README.md", "r", encoding="utf8") as fh:
     long_description = fh.read()
 
+with open("requirements.txt") as f:
+    install_requires = f.read().splitlines()
+
 setup(
     name="xai4chem",
     author="Hellen Namulinda",
@@ -13,19 +16,16 @@
     long_description_content_type="text/markdown",
     url="https://github.com/ersilia-os/xai4chem",
     license="GPLv3",
-    python_requires=">=3.7",
+    python_requires=">=3.10",
+    install_requires=install_requires,
     packages=find_packages(exclude=("utilities")),
     classifiers=[
-        # 'Programming Language :: Python :: 3.7',
-        # 'Programming Language :: Python :: 3.8',
-        # 'Programming Language :: Python :: 3.9',
-        # 'Programming Language :: Python :: 3 :: Only',
-        # 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
-        # 'Operating System :: OS Independent',
-        # 'Topic :: Scientific/Engineering :: Artificial Intelligence',
-    ],
-    install_requires=[""],
-    keywords="explainable-ai, chemistry, xai, machine learning, drug-discovery",
+        "Programming Language :: Python :: 3.10",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Operating System :: OS Independent",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    keywords="xai, chemistry, machine-learning, drug-discovery",
     project_urls={
         "Documentation": "",
         "Source Code": "https://github.com/ersilia-os/xai4chem",
157 changes: 157 additions & 0 deletions xai4chem/datamol_desc.py
@@ -0,0 +1,157 @@
import datamol as dm
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
from sklearn.preprocessing import RobustScaler, KBinsDiscretizer
from sklearn.feature_selection import VarianceThreshold


# To filter features with a high percentage of missing values.
class NanFilter:
def __init__(self, max_na):
self._name = "nan_filter"
self.MAX_NA = max_na

    def fit(self, X):
        # Keep only columns whose NaN count stays within the allowed fraction (MAX_NA) of rows.
        max_allowed_na = int(self.MAX_NA * X.shape[0])
        self.col_idxs = [j for j in range(X.shape[1]) if np.sum(np.isnan(X[:, j])) <= max_allowed_na]

def transform(self, X):
return X[:, self.col_idxs]

def save(self, file_name):
joblib.dump(self, file_name)

@classmethod
def load(cls, file_name):
return joblib.load(file_name)

# To impute missing values
class Imputer:
def __init__(self):
self._name = "imputer"
self._fallback = 0

    def fit(self, X):
        # Column-wise median of the observed values; all-NaN columns fall back to 0.
        self.impute_values = [
            np.median(col) if col.size else self._fallback
            for col in (X[~np.isnan(X[:, j]), j] for j in range(X.shape[1]))
        ]

    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's array in place
        for j in range(X.shape[1]):
            X[np.isnan(X[:, j]), j] = self.impute_values[j]
        return X

def save(self, file_name):
joblib.dump(self, file_name)

@classmethod
def load(cls, file_name):
return joblib.load(file_name)

# To remove zero-variance (constant) features.
class VarianceFilter:
def __init__(self):
self._name = "variance_filter"

def fit(self, X):
self.sel = VarianceThreshold()
self.sel.fit(X)
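        # Recover the indices of the surviving columns by passing an index row through the selector.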
self.col_idxs = self.sel.transform(np.arange(X.shape[1]).reshape(1, -1)).ravel()

def transform(self, X):
return self.sel.transform(X)

def save(self, file_name):
joblib.dump(self, file_name)

@classmethod
def load(cls, file_name):
return joblib.load(file_name)


class Scaler:
def __init__(self):
self._name = "scaler"
self.abs_limit = 10

    def fit(self, X):
        self.scaler = RobustScaler()
        self.scaler.fit(X)

    def transform(self, X):
        X = self.scaler.transform(X)
        # Clip extreme scaled values to +/- abs_limit to curb outlier influence.
        return np.clip(X, -self.abs_limit, self.abs_limit)

def save(self, file_name):
joblib.dump(self, file_name)

@classmethod
def load(cls, file_name):
return joblib.load(file_name)

def datamol_featurizer(smiles_list):
    R = []
    for smiles in tqdm(smiles_list):
        mol = dm.to_mol(smiles)
        if mol is None:
            # Invalid SMILES: emit an empty record so the row becomes all-NaN.
            R.append({})
            continue
        descriptors = dm.descriptors.compute_many_descriptors(mol)
        R.append(descriptors)
    return pd.DataFrame(R)

class DatamolDescriptor:
    def __init__(self, max_na=0.1, use_scaling=False, discretize=True, n_bins=5, kbd_strategy='quantile'):
        """
        Parameters:
        - max_na: float, optional (default=0.1)
            Maximum allowed fraction of missing values per feature.
        - use_scaling: bool, optional (default=False)
            Whether to apply feature scaling.
        - discretize: bool, optional (default=True)
            Whether to discretize features.
        - n_bins: int, optional (default=5)
            Number of bins used for discretization.
        - kbd_strategy: str, optional (default='quantile')
            Strategy used for binning. Options: 'uniform', 'quantile', 'kmeans'.
        """
self.nan_filter = NanFilter(max_na=max_na)
self.imputer = Imputer()
self.variance_filter = VarianceFilter()
self.scaler = Scaler()
self.discretizer = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy=kbd_strategy)
self.discretize = discretize
self.use_scaling = use_scaling

def fit(self, smiles):
df = datamol_featurizer(smiles)
X = np.array(df, dtype=np.float32)
self.nan_filter.fit(X)
X = self.nan_filter.transform(X)
self.imputer.fit(X)
X = self.imputer.transform(X)
self.variance_filter.fit(X)
X = self.variance_filter.transform(X)
if self.discretize:
self.discretizer.fit(X)
if self.use_scaling:
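            # Note: the scaler is fit on the un-binned matrix, while transform() applies it
            # after discretization; the two options are best used as alternatives.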
self.scaler.fit(X)
X = self.scaler.transform(X)
col_idxs = self.variance_filter.col_idxs
feature_names = list(df.columns)
self.feature_names = [feature_names[i] for i in col_idxs]

    def transform(self, smiles):
        df = datamol_featurizer(smiles)
        X = np.array(df, dtype=np.float32)
        X = self.nan_filter.transform(X)
        X = self.imputer.transform(X)
        X = self.variance_filter.transform(X)
        if self.discretize:
            X = self.discretizer.transform(X)
        if self.use_scaling:
            X = self.scaler.transform(X)
        # Bin indices are integers; without discretization, keep continuous float values.
        return np.array(X, dtype=int) if self.discretize else np.array(X, dtype=np.float32)

def save(self, file_name):
joblib.dump(self, file_name)

@classmethod
def load(cls, file_name):
return joblib.load(file_name)
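
A minimal usage sketch for `DatamolDescriptor` (hedged: file paths are hypothetical, and it assumes the module is importable as `xai4chem.datamol_desc` per the file layout in this PR):

```python
import pandas as pd
from xai4chem.datamol_desc import DatamolDescriptor

# Load the cleaned IC50 dataset added in this PR.
df = pd.read_csv("data/01_raw/plasmodium_falciparum_3d7_ic50.csv")
smiles = df["canonical_smiles"].tolist()

# Fit the preprocessing pipeline: NaN filter -> impute -> variance filter -> binning.
descriptor = DatamolDescriptor(max_na=0.1, discretize=True, n_bins=5)
descriptor.fit(smiles)
X = descriptor.transform(smiles)  # one row per molecule, one column per retained descriptor

# Persist the fitted descriptor for reuse at inference time.
descriptor.save("datamol_descriptor.joblib")
```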