In [1]:
pip install -r requirements.txt

Collecting jupyter (from -r requirements.txt (line 2))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting scikit-fingerprints (from -r requirements.txt (line 6))
  Downloading scikit_fingerprints-1.11.0-py3-none-any.whl.metadata (9.9 kB)
Collecting jupyterlab (from jupyter->-r requirements.txt (line 2))
  Downloading jupyterlab-4.3.0-py3-none-any.whl.metadata (16 kB)
Collecting datasketch (from scikit-fingerprints->-r requirements.txt (line 6))
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Collecting descriptastorus (from scikit-fingerprints->-r requirements.txt (line 6))
  Downloading descriptastorus-2.8.0-py3-none-any.whl.metadata (364 bytes)
Collecting e3fp (from scikit-fingerprints->-r requirements.txt (line 6))
  Downloading e3fp-1.2.5-py3-none-any.whl.metadata (4.5 kB)
Collecting mordredcommunity (from scikit-fingerprints->-r requirements.txt (line 6))
  Downloading mordredcommunity-2.0.6-py3-none-any.whl.metadata (6.2 kB)
Collecting

In [2]:
from skfp.datasets.moleculenet import load_bace

# Load the BACE dataset as two parallel sequences: SMILES strings and labels.
smiles_list, y = load_bace()

# Show one molecule together with ITS OWN label: both lookups must use the
# same index, otherwise the printed class belongs to a different molecule.
example_idx = 0
print(f"Example molecule: {smiles_list[example_idx]}")
print(f"Example class: {y[example_idx]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Example molecule: O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C
Example class: 1


In [3]:
from skfp.preprocessing import MolFromSmilesTransformer

# Transformer that turns SMILES strings into molecule objects.
mol_from_smiles = MolFromSmilesTransformer()

# Convert every SMILES string loaded above; `mols` is the input to the
# train/test split in the next cell.
mols = mol_from_smiles.transform(smiles_list)

In [4]:
import numpy as np
from skfp.model_selection import scaffold_train_test_split


# Scaffold-based train/test split. Indices are requested (instead of the split
# data itself) so that molecules and labels can be partitioned consistently.
train_idxs, test_idxs = scaffold_train_test_split(
    mols,
    test_size=0.2,
    return_indices=True,
)

# Convert the molecules to a NumPy array once so it can be indexed by the
# index collections returned above, then slice molecules and labels alike.
mols_arr = np.array(mols)
mols_train, mols_test = mols_arr[train_idxs], mols_arr[test_idxs]
y_train, y_test = y[train_idxs], y[test_idxs]

print(f"Train set size: {len(mols_train)}")
print(f"Test set size: {len(mols_test)}")

Train set size: 1210
Test set size: 303


In [5]:
from skfp.preprocessing import MolStandardizer


# Standardizer applied to both splits before fingerprinting.
standardizer = MolStandardizer()

# NOTE(review): the inputs are overwritten with the standardized molecules, so
# re-running this cell feeds already-standardized molecules back in — confirm
# the standardizer is idempotent, or re-run the split cell first.
mols_train = standardizer.transform(mols_train)
mols_test = standardizer.transform(mols_test)

In [6]:
from skfp.fingerprints import ECFPFingerprint


# ECFP fingerprint transformer with default settings
ecfp_fp = ECFPFingerprint()

# featurize both splits with the same transformer (train first, then test)
X_train_ecfp, X_test_ecfp = (
    ecfp_fp.transform(split) for split in (mols_train, mols_test)
)

print(f"Fingerprint data shape: {X_train_ecfp.shape}")
print(f"Example vector: {X_train_ecfp[0]}")

Fingerprint data shape: (1210, 2048)
Example vector: [0 1 0 ... 0 0 0]


In [8]:
# kNN classifier comes from scikit-learn; the Tanimoto count distance and the
# AUROC metric come from scikit-fingerprints
from sklearn.neighbors import KNeighborsClassifier
from skfp.distances import tanimoto_count_distance
from skfp.metrics import multioutput_auroc_score

# create kNN model that measures neighbor distance with Tanimoto distance
# NOTE(review): the example fingerprint printed earlier shows only 0/1 values,
# while this is the *count* variant of Tanimoto distance — confirm that this is
# the intended metric for these fingerprints
clf = KNeighborsClassifier(n_jobs=-1, metric=tanimoto_count_distance)

# fit on the training fingerprints, then keep column 1 of predict_proba as the
# per-molecule score (presumably the positive class; verify via clf.classes_)
clf.fit(X_train_ecfp, y_train)
y_pred = clf.predict_proba(X_test_ecfp)[:, 1]

# calculate and print AUROC score, formatted as a percentage
print(f"ECFP AUROC for kNN: {multioutput_auroc_score(y_test, y_pred):.2%}")

ECFP AUROC for kNN: 77.95%
