# Introduction Sequence
https://github.com/scikit-fingerprints/scikit-fingerprints/blob/master/examples/01_skfp_introduction.ipynb

## Import Modules

In [127]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from skfp.datasets.moleculenet import load_bace
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import scaffold_train_test_split
from skfp.preprocessing import MolFromSmilesTransformer

## Load data
_Using skfp_\
Data comes from: [beta-secretase 1 (BACE) dataset](https://pubs.acs.org/doi/10.1021/acs.jcim.6b00290)
- X - set of drugs
- y - attribute associated with beta-secretase 1 enzyme inhibition

In [130]:
smiles_list, y = load_bace() # load the beta-secretase 1 (BACE) dataset - https://doi.org/10.1021/acs.jcim.
print("SMILES:")
print(smiles_list[:3])
print()
print("Labels:")
print(y[:1000])

SMILES:
['O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C', 'Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(=O)C)(CC(C)C)C1=O)CCc1ccccc1)[C@H](O)[C@@H]1[NH2+]C[C@H](OCCC)C1', 'S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H]([C@H](O)C[NH2+]Cc1cc(OC)ccc1)Cc1ccccc1)C']

Labels:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

## Transform X (Smiles) to Mols, then Split Training and Test Data
_Using skfp_

In [149]:
mol_from_smiles = MolFromSmilesTransformer()
mols = mol_from_smiles.transform(smiles_list)
mols_train, mols_test, y_train, y_test = scaffold_train_test_split(
    mols, y, test_size=0.2
)
print("Molecules:")
print(mols[:3])

Molecules:
[<rdkit.Chem.rdchem.Mol object at 0x000001A33A1FD230>, <rdkit.Chem.rdchem.Mol object at 0x000001A33A1FCB30>, <rdkit.Chem.rdchem.Mol object at 0x000001A33A1FDF50>]


## Transform Training and Test Data (Mols) into Fingerprints
_Using skfp_

In [152]:
ecfp_fp = ECFPFingerprint()
X_train = ecfp_fp.transform(mols_train)
X_test = ecfp_fp.transform(mols_test)
print("ECFP fingerprints:")
print(X_train.shape)
print(X_train[:3])

ECFP fingerprints:
(1210, 2048)
[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Finally, Fit the Training Sets to a Random Forest Classifier, and Predict with the Test Set
_Using sklearn_

In [155]:
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred)

print(f"AUROC: {auroc:.2%}")

AUROC: 78.25%
