# 7: Active Learning and Enamine

**Authors: Mateusz K Bieniek, Ben Cree, Rachael Pirie, Joshua T. Horton, Natalie J. Tatum, Daniel J. Cole**

## Overview
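Configure and run Active Learning over a chemical space, then extend the chemical space with similar molecules from the Enamine database.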

In [None]:
import pandas as pd
import prody
from rdkit import Chem

import fegrow
from fegrow import ChemSpace

from fegrow.testing import core_5R83_path, smiles_5R83_path, rec_5R83_path

In [None]:
# Start from an empty chemical space.
cs = ChemSpace()

# The scaffold is the first record of the core SDF file. It is not grown here:
# bigger molecules are superimposed on it instead.
scaffold = Chem.SDMolSupplier(core_5R83_path)[0]
cs.add_scaffold(scaffold)

# Attach the protein structure to the chemical space.
cs.add_protein(rec_5R83_path)

In [None]:
# Optional step: turn on the caching in RAM.
# None of the later cells refer to the cache explicitly.
cs.set_dask_caching()

In [None]:
# Read the "Smiles" column of the CSV (about 50k entries) and, for testing,
# keep only the 200 shortest strings (string length is the size criterion).
smiles = sorted(
    pd.read_csv(smiles_5R83_path).Smiles.to_list(),
    key=len,
)[:200]

# The Smiles added here should already have been matched to the scaffold
# (cf. RDKit Mol.HasSubstructMatch).
cs.add_smiles(smiles)

In [None]:
# show the chemical space object as this cell's output
cs

# Active Learning

In [None]:
# There is nothing to train the model on yet, so the first selections are random.
# "first_random" is used by default; it is passed explicitly here for clarity.
# The first argument (3) is presumably the number of cases to select - TODO confirm.
random1 = cs.active_learning(3, first_random=True)
random2 = cs.active_learning(3, first_random=True)

# note the different indices selected (unless you're lucky!)
print(random1.index.to_list(), random2.index.to_list())

In [None]:
# now evaluate the first random selection and keep the returned results
# NOTE(review): ani=False presumably switches off the ANI step during
# evaluation - confirm against the fegrow documentation
random1_results = cs.evaluate(random1, ani=False)

In [None]:
# check the scores of the evaluated selection;
# note that they were updated in the master dataframe (cs.df) too
random1_results

In [None]:
# by default Gaussian Process with Greedy approach is used
# note that this time nothing is evaluated between the two calls, so both
# selections are presumably identical (compare the printed indices) - TODO confirm
greedy1 = cs.active_learning(3)
greedy2 = cs.active_learning(3)
print(greedy1.index.to_list(), greedy2.index.to_list())

In [None]:
# learn in cycles: each cycle selects 3 cases, evaluates them and saves the results
for cycle in range(2):
    greedy = cs.active_learning(3)
    # NOTE(review): unlike the first evaluation above, ani=False is not passed
    # here, so evaluate() uses its default - confirm this is intended
    greedy_results = cs.evaluate(greedy)
    
    # save the new results, one CSV file per cycle
    # NOTE(review): the file names say "notebook6" although this notebook is
    # titled "7" - confirm which number is intended
    greedy_results.to_csv(f'notebook6_iteration{cycle}_results.csv')

# save the entire chemical space with all the results
cs.to_sdf('notebook6_chemspace.sdf')

In [None]:
# Keep only the rows of the master dataframe that already have a score.
has_score = cs.df.score.notna()
computed = cs.df[has_score]
print('Computed cases in total: ', len(computed))

In [None]:
from fegrow.al import Model, Query

In [None]:
# This is the default configuration, set explicitly:
# a Gaussian Process model combined with the Greedy query
cs.model = Model.gaussian_process()
cs.query = Query.Greedy()

# select 3 cases with this configuration (the selection is shown as the cell output)
cs.active_learning(3)

In [None]:
# switch the query to the upper confidence bound (UCB) with beta=10;
# the model set earlier is left unchanged
cs.query = Query.UCB(beta=10)
cs.active_learning(3)

In [None]:
# The query methods available in modAL.acquisition are made available, these include
# Query.Greedy(),
# Query.PI(tradeoff=0) - highest probability of improvement
# Query.EI(tradeoff=0) - highest expected improvement
# Query.UCB(beta=1) - highest upper confidence bound (employs modAL.models.BayesianOptimizer)

# Models include the scikit-learn ones:
# Model.linear()
# Model.elastic_net()
# Model.random_forest()
# Model.gradient_boosting_regressor()
# Model.mlp_regressor()

# Model.gaussian_process()  # uses a TanimotoKernel by default, meaning that it
#                           # compares the fingerprints of all the training dataset
#                           # with the cases not yet studied, which can be expensive
#                           # computationally

# Example: a linear model combined with the Greedy query.
# No count is passed to active_learning() here, so its default is used
# NOTE(review): the earlier cells pass 3 explicitly - confirm the default is intended
cs.model = Model.linear()
cs.query = Query.Greedy()
cs.active_learning()

### Search the Enamine database using sw.docking.org (check if online)
Please note that you should check whether you have the permission to use this interface. 
Furthermore, you are going to need the pip package `pydockingorg`

In [None]:
# search only for molecules similar to the best molecule score-wise (n_best=1)
# and request 10 results per search (results_per_search=10)
new_enamines = cs.add_enamine_molecules(n_best=1, results_per_search=10)

In [None]:
# show what the Enamine search returned
new_enamines

In [None]:
# The molecules that were used for the search are marked in the column
# "enamine_searched" to avoid searching for them again.
# Show the rows where that flag is True.
cs.df[cs.df.enamine_searched == True]