# Scikit-Learn/Fingerprints Demo
## Goals
* __Create and test a machine learning model__
    * Import and split a data set
    * Convert mols to fingerprints
    * Train the model using 80% of the data
    * Test the model's accuracy with the other 20% of the data
* __Create a pipeline to:__
    * Use Smiles as input
    * Predict molecules

## Format Data

### _Imports_

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from skfp.datasets.moleculenet import load_hiv
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import scaffold_train_test_split
from skfp.preprocessing import MolFromSmilesTransformer

### _Load Data_

In [10]:
smiles_list, y = load_hiv()
print("SMILES:")
print(smiles_list[:3])
print()
print("Labels:")
print(y[:1000])

SMILES:
['CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)=[O+]2', 'C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3)CC(c3ccccc3)=[O+]2)[O+]=C(c2ccccc2)C1', 'CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21']

Labels:
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0

### _Convert Smiles to Mols - Split Training/Testing Data Sets_

In [14]:
mol_from_smiles = MolFromSmilesTransformer()
dataset_size = 5000
mols = mol_from_smiles.transform(smiles_list[:dataset_size])
mols_train, mols_test, y_train, y_test = scaffold_train_test_split(
    mols, y[:dataset_size], test_size=0.2
)
print("Molecules:")
print(mols_train[:3])

Molecules:
[<rdkit.Chem.rdchem.Mol object at 0x00000150F95B7060>, <rdkit.Chem.rdchem.Mol object at 0x00000150F95B7450>, <rdkit.Chem.rdchem.Mol object at 0x00000150F95B75A0>]


### _Convert to Fingerprints_

In [17]:
ecfp_fp = ECFPFingerprint()
X_train = ecfp_fp.transform(mols_train)
X_test = ecfp_fp.transform(mols_test)
print("ECFP fingerprints:")
print(X_train.shape)
print(X_train[:3])
print(type(X_train))

ECFP fingerprints:
(4000, 2048)
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
<class 'numpy.ndarray'>


## Train and Predict

### _Train Model and Predict Accuracy_

In [49]:
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.96

### _Test Predictions_

In [24]:
def predict(index):
    print("prediction: " + str(clf.predict(X_test)[index]) + ", actual: " + str(y_test[index]))

predict(5)
predict(400)
predict(852)
predict(853)
predict(849)

prediction: 0, actual: 0
prediction: 0, actual: 0
prediction: 1, actual: 1
prediction: 1, actual: 1
prediction: 0, actual: 1


## Make it Useful

### _Create a Smiles Pipeline_

In [28]:
from sklearn.pipeline import make_pipeline
smiles_pipeline = make_pipeline(
    MolFromSmilesTransformer(),
    ecfp_fp,
    clf,
)

### _Test Pipeline Predictions_

In [31]:
from rdkit import Chem

# 1 Test
mol = "O=C(Nc1ccc(C2=NCCN2)cc1)Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1" # From HIV Test Dataset
print("Smiles String: " + mol)
print(smiles_pipeline.predict([mol])[0])

# 0 Test
mol = "CS(=O)(=O)NC(=O)c1cc(Oc2ccc(C(F)(F)F)cc2Cl)ccc1[N+](=O)[O-]" # From TOX21 Test Dataset
print("Smiles String: " + mol)
print(smiles_pipeline.predict([mol])[0])

# Manual Entry
mol = "O=C(Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1)c1ccc(C2=NCCN2)cc1" # Random
print("Smiles String: " + mol)
print(smiles_pipeline.predict([mol])[0])

Smiles String: O=C(Nc1ccc(C2=NCCN2)cc1)Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1
1
Smiles String: CS(=O)(=O)NC(=O)c1cc(Oc2ccc(C(F)(F)F)cc2Cl)ccc1[N+](=O)[O-]
0
Smiles String: O=C(Nc1cccc(C(=O)Nc2ccc(C3=NCCN3)cc2)c1)c1ccc(C2=NCCN2)cc1
1


In [33]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1

    return(TP, FP, TN, FN)

In [39]:
TP, FP, TN, FN = perf_measure(clf.predict(X_test), y_test)
print(f"True Positives: {TP}\nFalse Positives: {FP}\nTrue Negatives: {TN}\nFalse Negatives: {FN}\n")

True Positives: 11
False Positives: 37
True Negatives: 949
False Negatives: 3



In [45]:
# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [47]:
print(f"Sensitivity, hit rate, recall, or true positive rate: {TPR}\nSpecificity or true negative rate: {TNR}\nPrecision or positive predictive value: {PPV}\nNegative predictive value: {NPV}\nFall out or false positive rate: {FPR}\nFalse negative rate: {FNR}\nFalse discovery rate: {FDR}\nOverall accuracy: {ACC}")

Sensitivity, hit rate, recall, or true positive rate: 0.7857142857142857
Specificity or true negative rate: 0.962474645030426
Precision or positive predictive value: 0.22916666666666666
Negative predictive value: 0.9968487394957983
Fall out or false positive rate: 0.037525354969574036
False negative rate: 0.21428571428571427
False discovery rate: 0.7708333333333334
Overall accuracy: 0.96
