# Demo Scikit-Fingerprints

## Format Data

### Imports

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

from skfp.datasets.moleculenet import load_hiv
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import scaffold_train_test_split
from skfp.preprocessing import MolFromSmilesTransformer

### Load Data

In [40]:
# load_hiv() is unpacked into the SMILES strings and their labels.
smiles_list, y = load_hiv()

# Preview: the first three SMILES, a blank separator line, then the first 1000 labels.
print("SMILES:")
print(smiles_list[:3], end="\n\n")
print("Labels:")
print(y[:1000])

SMILES:
['CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)=[O+]2', 'C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3)CC(c3ccccc3)=[O+]2)[O+]=C(c2ccccc2)C1', 'CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21']

Labels:
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0

### Convert SMILES to Mols and Split into Training/Test Sets

In [43]:
# Work on the first `dataset_size` entries only; the same slice is applied to the
# SMILES strings and to the labels so the two stay aligned.
dataset_size = 5000
mol_from_smiles = MolFromSmilesTransformer()
mols = mol_from_smiles.transform(smiles_list[:dataset_size])

# test_size=0.2 holds out 20% of the molecules as the test split.
mols_train, mols_test, y_train, y_test = scaffold_train_test_split(
    mols,
    y[:dataset_size],
    test_size=0.2,
)

print("Molecules:", mols_train[:3], sep="\n")

Molecules:
[<rdkit.Chem.rdchem.Mol object at 0x000002B772EA0820>, <rdkit.Chem.rdchem.Mol object at 0x000002B772EA0C10>, <rdkit.Chem.rdchem.Mol object at 0x000002B772EA0D60>]


### Convert to Fingerprints

In [46]:
# One fingerprinter instance transforms both splits (train first, then test);
# transform() is called directly, with no prior fit() call.
ecfp_fp = ECFPFingerprint()
X_train, X_test = (ecfp_fp.transform(split) for split in (mols_train, mols_test))

# Show the matrix shape and the first three fingerprint rows.
print("ECFP fingerprints:", X_train.shape, X_train[:3], sep="\n")

ECFP fingerprints:
(4000, 2048)
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Train and Predict

### Train Model and Evaluate AUROC

In [50]:
# random_state=0 pins the forest's randomness so re-runs give the same model.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Column 1 of predict_proba is used as the ranking score for AUROC
# (presumably the probability of label 1 - confirm against clf.classes_).
y_pred = clf.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred)

print(f"AUROC: {auroc:.2%}")

AUROC: 85.22%


### Test Predictions

In [52]:
def predict(index):
    """Print the model's predicted label next to the true label for one test sample.

    Parameters
    ----------
    index : int
        Position of the sample within ``X_test`` / ``y_test``.

    Reads the notebook-level ``clf``, ``X_test`` and ``y_test``; returns nothing.
    """
    # Classify only the requested row rather than re-predicting the entire test
    # set on every call. ``X_test[[index]]`` keeps the input 2-D (one row) and,
    # unlike ``X_test[index:index + 1]``, also works for negative indices.
    predicted = clf.predict(X_test[[index]])[0]
    print(f"prediction: {predicted}, actual: {y_test[index]}")

# Spot-check a handful of test-set positions, in this order.
for index in (5, 400, 852, 853, 849):
    predict(index)

prediction: 0, actual: 0
prediction: 0, actual: 0
prediction: 1, actual: 1
prediction: 1, actual: 1
prediction: 0, actual: 1


# Make it Useful

## Create a SMILES Pipeline

In [143]:
# Chain SMILES parsing, the ECFP fingerprinter and the classifier so predictions
# can be requested directly from SMILES strings. `make_pipeline` is imported in
# the imports cell at the top of the notebook.
# `ecfp_fp` and `clf` are the same objects used in the earlier cells (not fresh
# copies), so the pipeline is used for predict() without a further fit().
smiles_pipeline = make_pipeline(
    MolFromSmilesTransformer(),
    ecfp_fp,
    clf,
)

## Test Pipeline Predictions

In [171]:
from rdkit import Chem

# Run each SMILES string through the pipeline one at a time and print the result.
# Note: despite its name, `mol` holds a plain SMILES string, not an RDKit Mol.
for mol in (
    # 1 Test
    "O=C(Nc1ccc(C2=NCCN2)cc1)Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1",
    # 0 Test
    "Cc1cc(NCCCCCCNc2cc(C)nc3ccc(Cl)cc23)c2cc(Cl)ccc2n1",
    # Manual Entry
    "CC(C)OC(=S)SSC(=S)OC(C)C",
):
    print("Smiles String: " + mol)
    # predict() is given a one-element list, so the printed result is a length-1 array.
    print(smiles_pipeline.predict([mol]))

Smiles String: O=C(Nc1ccc(C2=NCCN2)cc1)Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1
[1]
Smiles String: Cc1cc(NCCCCCCNc2cc(C)nc3ccc(Cl)cc23)c2cc(Cl)ccc2n1
[0]
Smiles String: CC(C)OC(=S)SSC(=S)OC(C)C
[0]
