In [1]:
%config Completer.use_jedi = True

In [2]:
# https://medium.com/analytics-vidhya/feature-selection-using-scikit-learn-5b4362e0c19b
# https://towardsdatascience.com/rank-the-features-now-rank-again-4dafd8cde3c8

In [3]:
ROOT_DIR = ".."
DATASET_DIR = "{}/datasets".format(ROOT_DIR)
DATASET_DIR

'../datasets'

In [4]:
### Use LightGBM

# ### Using ML/DL libraries
# 1. OpenChem
# 2. ChemProp
# 3. DeepChem

In [5]:
import os
import sys


from matplotlib import pyplot
import numpy as np
import pandas as pd
from pprint import pprint
import re

from scipy import stats
import seaborn as sns
import shap
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer, scale
from sklearn.svm import SVC

from statsmodels.stats.outliers_influence import variance_inflation_factor


ROOT_DIR = os.pardir
sys.path.insert(0, os.path.abspath(ROOT_DIR))

# Display pipelines and other objects
from sklearn import set_config
set_config(display="diagram")

In [6]:
def detect_outlier_z_scores(df):
  """
  To perform outlier detection, we are going to employ the Z-Score method because it is the simplest one.
  This s a slight modification of the code from the following link
  https://www.kaggle.com/alexandrehsd/binary-multiclass-classification-factor-analysis/notebookSS
  """
  flag_outlier = False

  for feature in df:
    #print(feature)
    column = df[feature]
    mean = np.mean(column)
    std = np.std(column)
    z_scores = (column - mean) / std
    outliers = np.abs(z_scores) > 3
    
    n_outliers = sum(outliers)
    
    if n_outliers > 0:
      print("{} has {} outliers".format(feature, n_outliers))
      flag_outlier = True

  if not flag_outlier:
    print("\nThe dataset has no outliers.")
    
    return None

def remove_outliers_by_z_score(df:pd.DataFrame, threshold:int = 3):
    ## Find outliers for all features
    z = np.abs(stats.zscore(df))
    outliers = np.where(z > threshold)
    columns = df.columns.tolist()
    cols_with_outliers = [columns[i] for i in 
                         set(outliers[1].tolist())]
    
    print("Features with outliers ({}) : {}".format(len(cols_with_outliers), cols_with_outliers))
    print(outliers[0].size)
    
    ## Remove outliers
    print("\nRemoving {} rows...".format(  len(set(outliers[0].tolist()))   ))
    print(np.where(z <= threshold)[0].size)
    new_df = df[(z <= threshold).all(axis=1)]
    print(new_df.shape)
    return new_df





In [7]:
# #https://stackoverflow.com/questions/37685412/avoid-scaling-binary-columns-in-sci-kit-learn-standsardscaler

# transformer_pipeline = Pipeline(steps=[
#     ('feature_processing', FeatureUnion(transformer_list = [
#          ( 'no_transformation',
#               Pipeline(steps = [
#                  ('bcut_maccs_pubchem', FunctionTransformer(lambda data: data.loc[:, cols_bcut_maccs_pubchem + [target]]))
#               ])),
              
        
#         #numeric to transform
#         ('numeric', Pipeline(steps = [
#             ('select', FunctionTransformer(lambda data: data.loc[:, cols_to_transform])),
#             ('scale', StandardScaler())
#                     ]))        
#     ])  
#     )
# ])

# transformer_pipeline

In [8]:
dataset = pd.read_csv("{}/csv/nr-ahr.csv".format(DATASET_DIR))
features = dataset.columns.tolist()
target = "Activity"
test_ratio = 0.3
random_state = 233233

pattern = re.compile("MACCS")
cols_bcut_maccs_pubchem = [x for x in dataset.columns.tolist() if not pattern.match(x) is None]
cols_to_transform = [y for y in features if not y in cols_bcut_maccs_pubchem]
cols_to_transform.remove(target)

dataset.dropna(inplace=True)
dataset = dataset.reset_index()
dataset.drop(["index"], axis=1, inplace=True)

# print("index" in dataset.columns.tolist())

dataset[cols_to_transform] = scale(dataset[cols_to_transform])
dataset[cols_bcut_maccs_pubchem]



# dataset.iloc[:,1400:1500].info()

Unnamed: 0,MACCS_1,MACCS_10,MACCS_100,MACCS_101,MACCS_102,MACCS_103,MACCS_104,MACCS_105,MACCS_106,MACCS_107,...,MACCS_90,MACCS_91,MACCS_92,MACCS_93,MACCS_94,MACCS_95,MACCS_96,MACCS_97,MACCS_98,MACCS_99
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1878,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1879,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1880,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split Data

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

In [10]:
strat_train_set, strat_test_set = None, None
splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state)

for train_index, test_index in splitter.split(dataset, dataset[target]):
    strat_train_set = dataset.loc[train_index]
    strat_test_set  = dataset.loc[test_index]
    print(len(train_index))
    print(len(test_index))

# strat_train_set.head()
X_train = strat_train_set[strat_train_set.columns.difference([target])]
y_train = strat_train_set[target]
X_test, y_test   = strat_test_set[strat_test_set.columns.difference([target])], strat_test_set[target]

# print("strat_train_set : \n{}".format(strat_train_set[target].value_counts()/len(strat_train_set)))
# print("strat_test_set  : \n{}".format(strat_test_set[target].value_counts()/len(strat_test_set)))

1317
565


## Training with Random Forest
### Grid search 

In [11]:
kfold = 5
# scoring = {'f1': 'f1_weighted', 'jaccard':'jaccard'} #, 'accuracy': 'accuracy'
scoring = ['f1_weighted']
params_grid_rf = {  'bootstrap': [True]
                  , 'max_depth': [5, 10, 30, None]
                  , 'criterion': ['gini'] # , 'entropy'
                  , 'max_features': ['auto']
                  , 'min_samples_split': [5, 10]
                  , 'n_estimators': [100, 150, 300]
#                   , 'min_impurity_decrease': [0.0, 0.1]
                 }

In [12]:
rfc = RandomForestClassifier(random_state=random_state)
grid_searcher = GridSearchCV(estimator = rfc, param_grid = params_grid_rf, cv = kfold, n_jobs = 1, verbose = 0, scoring = 'f1_weighted', return_train_score=True)
grid_searcher.fit(X_train, y_train)

In [13]:
best_rf_grid = grid_searcher.best_estimator_
best_rf_grid_train_score = f1_score(best_rf_grid.predict(X_train), y_train , average='weighted' ) 
best_rf_grid_test_score = f1_score(best_rf_grid.predict(X_test), y_test , average='weighted' )
print("Best RF pamateters: {}".format(grid_searcher.best_params_))
print("Best RF score: {}".format(grid_searcher.best_score_))
print("Best RF train score (F1-weigthed): {}".format(best_rf_grid_train_score))
print("Best RF test score (F1-weigthed): {}".format(best_rf_grid_test_score))

Best RF pamateters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 30, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 150}
Best RF score: 0.8121788366260467
Best RF train score (F1-weigthed): 0.9924069855732726
Best RF test score (F1-weigthed): 0.8424798500119917


### Randomized search 

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 400, num = 5)]
max_depth    = [int(x) for x in np.linspace(10, 110, num = 11)]

params_random_rf = {  'bootstrap': [True]
                  , 'max_depth': max_depth + [None]
                  , 'criterion': ['gini'] # , 'entropy'
                  , 'max_features': ['auto']
                  , 'min_samples_split': [5]
                  , 'n_estimators': n_estimators
                  , 'min_impurity_decrease': [0.0]
                  , 
                 }

In [None]:
random_searcher = rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = params_random_rf, n_iter = 16
                                                 , scoring= 'f1_weighted' , cv = kfold, verbose=2, random_state=random_state, n_jobs = -1)
random_searcher.fit(X_train, y_train)

In [None]:
best_rf_random = random_searcher.best_estimator_
print("Best RF pamateters: {}".format(random_searcher.best_params_))
print("Best RF score: {}".format(random_searcher.best_score_))
print("Best RF train score (F1-weigthed): {}".format(f1_score(best_rf_random.predict(X_train), y_train , average='weighted' )))
print("Best RF test score (F1-weigthed): {}".format(f1_score(best_rf_random.predict(X_test), y_test , average='weighted' )))


## Model explanation with SHAP

* Reference(s):
>- https://www.kaggle.com/code/prashant111/explain-your-model-predictions-with-shapley-values/notebook \
>- https://onezero.blog/machine-learning-model-explanation-using-shapley-values/ \
>- https://www.datatrigger.org/post/interpretable_machine_learning_shap/ \
>- https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html \
>- https://medium.com/analytics-vidhya/interpretability-of-machine-learning-models-9787cf8a3789 \
>- https://shap.readthedocs.io/en/latest/index.html \
>- https://www.kaggle.com/code/dansbecker/shap-values/tutorial \
>- https://towardsdatascience.com/explainable-ai-xai-with-shap-multi-class-classification-problem-64dd30f97cea

In [None]:
# import shap library
import shap

print(X_train.shape)
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(best_rf_random, feature_perturbation="interventional", model_output="raw")
shap_values = explainer.shap_values(X_train)

# visualize the first prediction's explanation 
shap.initjs()


In [None]:
# The mean prediction of your model on the data (for each categorical outcome).
explainer.expected_value

In [None]:
# Summary plot on the train set
shap.summary_plot(shap_values, X_train, plot_type='bar');

In [None]:
# Summary plot on the test set
shap_values_test = explainer.shap_values(X_test)
shap.summary_plot(shap_values_test, X_test, plot_type='bar');

In [None]:
## Explain the contribution of the first 20 features on the prediction of the whole data set
shap.force_plot(
    explainer.expected_value[0],  
    shap_values[0],
    X_train.iloc[:, :20]
)

In [None]:
# Shap values all all features for the first instance/row of the dataset
shap.force_plot(
    explainer.expected_value[0],  
    shap_values[0][0],
    X_train.iloc[:1, :]
)

### SHAP Dependence Plots
SHAP dependence plots show the effect of a single feature across the whole dataset. They plot a feature’s value vs. the SHAP value of that feature across many samples. SHAP dependence plots are similar to partial dependence plots, but account for the interaction effects present in the features, and are only defined in regions of the input space supported by data. The vertical dispersion of SHAP values at a single feature value is driven by interaction effects, and another feature is chosen for coloring to highlight possible interactions.

* Reference(s):
>- https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Census%20income%20classification%20with%20XGBoost.html#Explain-predictions

In [None]:
# Here we display the dependency plots of the featuees "ALogP (#1)", "PubChem_504", and "Aromatic Bonds Count" on the whole dataset

for name in ["ALogP (#1)", "PubChem_504", "Aromatic Bonds Count"]:
    shap.dependence_plot(name, shap_values[0], X_train, display_features=X_train)

## Training a voting classifier
The Voting Classifier class is not yet supported by SHAP

In [None]:
scoring = ['f1_weighted']
params_grid_rfc =  {  'bootstrap': [True]
                  , 'max_depth': [5, 10, 30, None]
                  , 'criterion': ['gini'] # , 'entropy'
                  , 'max_features': ['auto']
                  , 'min_samples_split': [5, 10]
                  , 'n_estimators': [100, 150, 300]
                  , 'min_impurity_decrease': [0.0]
                 }

params_grid_svc = {
    'kernel': ['linear', 'sigmoid']
    , 'class_weight': ['balanced']
}

params_grid_gbc = {
              'n_estimators' : [100, 400]
              , 'learning_rate': [0.005 ,0.05]
              , 'max_depth': [30, None]
              , 'max_features': ['auto']
              , 'min_impurity_decrease': [0.0]
              }

In [None]:
rfc = RandomForestClassifier(random_state=random_state)
gbc = GradientBoostingClassifier(random_state=random_state)

# Given that we will use soft voting, which uses predicted probabilities for each class instead of 
# predicted labels, we must set the probability =True
svc = SVC(probability=True, random_state=random_state)


params = {}
params.update({"rfc__" + k: v for k, v in params_grid_rfc.items()})
params.update({"gbc__" + k: v for k, v in params_grid_gbc.items()})
params.update({"svc__" + k: v for k, v in params_grid_svc.items()})

ensemble_classifier = VotingClassifier(estimators=[("rfc", rfc),
                                    ("gbc", gbc),
                                    ("svc", svc)],
                        voting="soft")

ensemble_grid_searcher = GridSearchCV(estimator = ensemble_classifier , param_grid = params, cv = kfold, n_jobs = 1
                                      , verbose = 2, scoring = 'f1_weighted', return_train_score=True)

ensemble_grid_searcher.fit(X_train, y_train);

In [None]:
best_eclf_grid = ensemble_grid_searcher.best_estimator_

best_eclf_grid_train_score = f1_score(best_eclf_grid.predict(X_train), y_train , average='weighted' )
best_eclf_grid_test_score  = f1_score(best_eclf_grid.predict(X_test), y_test , average='weighted' )


print("Best pamateters: {}".format(ensemble_grid_searcher.best_params_))
print("Best score: {}".format(ensemble_grid_searcher.best_score_))
print("Best train score (F1-weigthed): {}".format(best_eclf_grid_train_score))
print("Best test score (F1-weigthed): {}".format(best_eclf_grid_test_score))

# Combining models trained with differently stratified splits.

## Create several splits based on selected features.
The splits can be based on different columns (activity, and some selected properties). For instance: 
* One mode trained on stratified split for 'Activity'
* Create bins for certain properties (e.g.: the 3-5 of the most important features based on SHAP), and build stratified train/test and train a model on it.

## Create train/test splits, and K-fold splits using Fingerprints through RDKit
* Reference(s):
>- **Picking Diverse Molecules Using Fingerprints (rdkit.SimDivFilters):** https://www.rdkit.org/docs/GettingStartedInPython.html
>- **Squonk: RDKit MaxMin Picker:** https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/
>- **Revisting the MaxMinPicker (2017)** http://rdkit.blogspot.com/2017/11/revisting-maxminpicker.html
>- **RDKit Blog - MaxMinPicker**: https://github.com/greglandrum/rdkit_blog/blob/master/notebooks/MaxMinPickerRevisited.ipynb

In [None]:
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint, GetAtomPairFingerprint, GetTopologicalTorsionFingerprint
from rdkit.Chem import PandasTools, MolFromSmiles
from rdkit import DataStructs
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

In [None]:
compounds_fname = "{}/compounds/nr-ahr.tab".format(ROOT_DIR)
compounds_df   = pd.read_csv(compounds_fname, sep='\t')
compounds_df.head()

In [None]:
# smiles = smiles_df['SMILES']
# mols = [mol for mol in suppl if x is not None]
PandasTools.AddMoleculeColumnToFrame(compounds_df,'SMILES','Molecule',includeFingerprints=True)
compounds_df.iloc[:1,:]

### Interesting RDKit capabilitiy for subtrucutre search
A substructure filter can be applied on the dataframe using the RDKit molecule column, because the “>=” operator has been modified to work as a substructure check. Such the antibiotics containing the tributylamine group ("CCCCN(CCCC)CCCC") can be obtained by the call below:

In [None]:
tributylamine = MolFromSmiles("CCCCN(CCCC)CCCC")
mols_with_tributylamine = compounds_df[smiles_df['Molecule'] >=tributylamine]

In [None]:
mols_with_tributylamine

### Doing the MixMax Picking
Pick() uses hierarchical clustering to pick compounds, while LazyPick() uses a user-defined function.

In [None]:
mols = [mol for mol in compounds_df['Molecule'] if not mol is None]
n_compounds_to_pick = round(nfps*test_ratio)
picker = MaxMinPicker()

fp_types = { "morgan": "GetMorganFingerprint", "atom_pair": "GetAtomPairFingerprint", "top_torso": "GetTopologicalTorsionFingerprint"} 

#### MinMax Picking with Morgan Fingperprints

In [None]:
fps_morgan = [GetMorganFingerprint(x,3) for x in mols]

## Calculate the Dice dissimilarity between compounds
def distij(i,j,fps=fps_morgan):
    return 1-DataStructs.DiceSimilarity(fps_morgan[i],fps_morgan[j])

nfps_morgan = len(fps_morgan)
pickTestIndices_morgan = picker.LazyPick(distij, nfps_morgan, n_compounds_to_pick ,seed=random_state)
# list(pickTestIndices_morgan)
indices_in_dataset_morgan = [i for i in list(pickTestIndices_morgan) if i in dataset.index]
test_minmax_morgan = dataset.iloc[indices_in_dataset_morgan]
train_minmax_morgan = dataset[~dataset.index.isin(indices_in_dataset_morgan)]

X_train_minmax_morgan, y_train_minmax_morgan = train_minmax_morgan[train_minmax_morgan.columns.difference([target])], train_minmax_morgan[target]
X_test_minmax_morgan, y_test_minmax_morgan = test_minmax_morgan[test_minmax_morgan.columns.difference([target])], test_minmax_morgan[target]

In [None]:
print("Morgan:\n\tTrain: {}\n\tTest: {}".format(train_minmax_morgan.shape, test_minmax_morgan.shape))

#### MinMax Picking with Atom Pair Fingperprints

In [None]:
fps_atom_pair = [GetAtomPairFingerprint(x) for x in mols]

## Calculate the Dice dissimilarity between compounds
def distij(i,j,fps=fps_atom_pair):
    return 1-DataStructs.DiceSimilarity(fps_atom_pair[i],fps_atom_pair[j])

nfps_atom_pair = len(fps_atom_pair)
pickTestIndices_atom_pair = picker.LazyPick(distij, nfps_atom_pair, n_compounds_to_pick ,seed=random_state)
# list(pickTestIndices_morgan)
indices_in_dataset_atom_pair = [i for i in list(pickTestIndices_atom_pair) if i in dataset.index]
test_minmax_atom_pair = dataset.iloc[indices_in_dataset_atom_pair]
train_minmax_atom_pair = dataset[~dataset.index.isin(indices_in_dataset_atom_pair)]

X_train_minmax_atom_pair, y_train_minmax_atom_pair = train_minmax_atom_pair[train_minmax_atom_pair.columns.difference([target])], train_minmax_atom_pair[target]
X_test_minmax_atom_pair, y_test_minmax_atom_pair = test_minmax_atom_pair[test_minmax_atom_pair.columns.difference([target])], test_minmax_atom_pair[target]

In [None]:
print("Atom Pair:\n\tTrain: {}\n\tTest: {}".format(train_minmax_atom_pair.shape, test_minmax_atom_pair.shape))

#### MinMax Picking with Topological Torsional Fingperprints

In [None]:
fps_top_torso = [GetTopologicalTorsionFingerprint(x) for x in mols]

## Calculate the Dice dissimilarity between compounds
def distij(i,j,fps=fps_top_torso):
    return 1-DataStructs.DiceSimilarity(fps_top_torso[i],fps_top_torso[j])

nfps_top_torso = len(fps_top_torso)
pickTestIndices_top_torso = picker.LazyPick(distij, nfps_top_torso, n_compounds_to_pick ,seed=random_state)
# list(pickTestIndices_morgan)
indices_in_dataset_top_torso = [i for i in list(pickTestIndices_top_torso) if i in dataset.index]
test_minmax_top_torso = dataset.iloc[indices_in_dataset_top_torso]
train_minmax_top_torso = dataset[~dataset.index.isin(indices_in_dataset_top_torso)]

X_train_minmax_top_torso, y_train_minmax_top_torso = train_minmax_top_torso[train_minmax_top_torso.columns.difference([target])], train_minmax_top_torso[target]
X_test_minmax_top_torso, y_test_minmax_top_torso = test_minmax_top_torso[test_minmax_top_torso.columns.difference([target])], test_minmax_top_torso[target]

In [None]:
print("Topological Torsional:\n\tTrain: {}\n\tTest: {}".format(train_minmax_atom_pair.shape, test_minmax_atom_pair.shape))

In [None]:
intersecton_morgan_atom_pair = [p for p in indices_in_dataset_morgan if p in indices_in_dataset_atom_pair]
intersecton_morgan_top_torso = [p for p in indices_in_dataset_morgan if p in indices_in_dataset_top_torso]
intersecton_atom_pair_top_torso = [p for p in indices_in_dataset_atom_pair if p in indices_in_dataset_top_torso]
print(len(intersecton_morgan_atom_pair))
print("Morgan/AtomPair: {}".format(len(intersecton_morgan_atom_pair)/ len(set(indices_in_dataset_atom_pair+indices_in_dataset_morgan)) ))
print("Morgan/TopologicalTorsional: {}".format( len(intersecton_morgan_top_torso)/ len(set(indices_in_dataset_top_torso+indices_in_dataset_morgan))))
print("TopologicalTorsional/AtomPair: {}".format( len(intersecton_atom_pair_top_torso)/ len(set(indices_in_dataset_atom_pair+indices_in_dataset_top_torso))))

### Training Models
#### After MinMax Picking with Morgan Fingperprints

In [None]:
rfc_morgan = RandomForestClassifier(random_state=random_state)
grid_searcher_morgan = GridSearchCV(estimator = rfc_morgan, param_grid = params_grid_rf, cv = kfold, n_jobs = 1, verbose = 0, scoring = 'f1_weighted', return_train_score=True)
grid_searcher_morgan.fit(X_train_minmax_morgan, y_train_minmax_morgan)

In [None]:
best_rf_grid_morgan = grid_searcher_morgan.best_estimator_
best_rf_grid_morgan_train_score = f1_score(best_rf_grid_morgan.predict(X_train_minmax_morgan), y_train_minmax_morgan , average='weighted' )
best_rf_grid_morgan_test_score  = f1_score(best_rf_grid_morgan.predict(X_test_minmax_morgan), y_test_minmax_morgan , average='weighted' )
print("Best RF pamateters: {}".format(grid_searcher_morgan.best_params_))
print("Best RF score: {}".format(grid_searcher_morgan.best_score_))
print("Best RF train score (F1-weigthed): {}".format(best_rf_grid_morgan_train_score))
print("Best RF test score (F1-weigthed): {}".format(best_rf_grid_morgan_test_score))

In [None]:
############ SHAP
# explain the model's predictions using SHAP
explainer_rfc_morgan = shap.TreeExplainer(best_rf_grid_morgan, feature_perturbation="interventional", model_output="raw")
shap_values_train_morgan = explainer_rfc_morgan.shap_values(X_train_minmax_morgan)

In [None]:
# Summary plot on the train set
print("Expected values: {}".format(explainer_rfc_morgan.expected_value))
shap.summary_plot(shap_values_train_morgan, X_train_minmax_morgan, plot_type='bar');

#### After MinMax Picking with Atom Pair Fingperprints

In [None]:
rfc_atom_pair = RandomForestClassifier(random_state=random_state)
grid_searcher_atom_pair = GridSearchCV(estimator = rfc_atom_pair, param_grid = params_grid_rf, cv = kfold, n_jobs = 1, verbose = 0, scoring = 'f1_weighted', return_train_score=True)
grid_searcher_atom_pair.fit(X_train_minmax_atom_pair, y_train_minmax_atom_pair)

In [None]:
best_rf_grid_atom_pair = grid_searcher_atom_pair.best_estimator_
best_rf_grid_atom_pair_train_score = f1_score(best_rf_grid_atom_pair.predict(X_train_minmax_atom_pair), y_train_minmax_atom_pair , average='weighted' )
best_rf_grid_atom_pair_test_score  = f1_score(best_rf_grid_atom_pair.predict(X_test_minmax_atom_pair), y_test_minmax_atom_pair , average='weighted' )

print("Best RF pamateters: {}".format(grid_searcher_atom_pair.best_params_))
print("Best RF score: {}".format(grid_searcher_atom_pair.best_score_))
print("Best RF train score (F1-weigthed): {}".format(best_rf_grid_atom_pair_train_score))
print("Best RF test score (F1-weigthed): {}".format(best_rf_grid_atom_pair_test_score))

In [None]:
############ SHAP
# explain the model's predictions using SHAP
explainer_rfc_atom_pair = shap.TreeExplainer(best_rf_grid_atom_pair, feature_perturbation="interventional", model_output="raw")
shap_values_train_atom_pair = explainer_rfc_atom_pair.shap_values(X_train_minmax_atom_pair)

In [None]:
# Summary plot on the train set
print("Expected values: {}".format(explainer_rfc_atom_pair.expected_value))
shap.summary_plot(shap_values_train_atom_pair, X_train_minmax_atom_pair, plot_type='bar');

#### After MinMax Picking with Topological Torsional Fingperprints

In [None]:
rfc_top_torso = RandomForestClassifier(random_state=random_state)
grid_searcher_top_torso = GridSearchCV(estimator = rfc_top_torso, param_grid = params_grid_rf, cv = kfold, n_jobs = 1, verbose = 0, scoring = 'f1_weighted', return_train_score=True)
grid_searcher_top_torso.fit(X_train_minmax_top_torso, y_train_minmax_top_torso)

In [None]:
best_rf_grid_top_torso = grid_searcher_top_torso.best_estimator_
best_rf_grid_top_torso_train_score = f1_score(best_rf_grid_top_torso.predict(X_train_minmax_top_torso), y_train_minmax_top_torso , average='weighted' )
best_rf_grid_top_torso_test_score  = f1_score(best_rf_grid_top_torso.predict(X_test_minmax_top_torso), y_test_minmax_top_torso , average='weighted' )
print("Best RF pamateters: {}".format(grid_searcher_top_torso.best_params_))
print("Best RF score: {}".format(grid_searcher_top_torso.best_score_))
print("Best RF train score (F1-weigthed): {}".format(best_rf_grid_top_torso_train_score))
print("Best RF test score (F1-weigthed): {}".format(best_rf_grid_top_torso_test_score))

In [None]:
############ SHAP
# explain the model's predictions using SHAP
explainer_rfc_top_torso = shap.TreeExplainer(best_rf_grid_top_torso, feature_perturbation="interventional", model_output="raw")
shap_values_train_top_torso = explainer_rfc_top_torso.shap_values(X_train_minmax_top_torso)

In [None]:
# Summary plot on the train set
print("Expected values: {}".format(explainer_rfc_top_torso.expected_value))
shap.summary_plot(shap_values_train_top_torso, X_train_minmax_top_torso, plot_type='bar');

#### Combining the best estimators
Training and evaluation will be made on a random train/test split.

In [None]:
models = list()
models.append(('best_rf_grid_morgan', best_rf_grid_morgan))
models.append(('best_rf_grid_atom_pair', best_rf_grid_atom_pair))
models.append(('best_rf_grid_top_torso', best_rf_grid_top_torso))


<p style="color:green; font-size:12"><b>Hard Voting</b></p>

In [None]:
ensemble_hard = VotingClassifier(models, voting='hard')
ensemble_hard.fit(X_train, y_train)

In [None]:
print("Hard Voter Train Score (F1-weigthed): {}".format(f1_score(ensemble_hard.predict(X_train), y_train , average='weighted' )))
print("Hard Voter Test Score (F1-weigthed): {}".format(f1_score(ensemble_hard.predict(X_test), y_test , average='weighted' )))

<p style="color:green; font-size:12"><b>Soft Voting</b></p>

In [None]:
ensemble_soft = VotingClassifier(models, voting='soft')
ensemble_soft.fit(X_train, y_train)

In [None]:
ensemble_soft_train_score = f1_score(ensemble_soft.predict(X_train), y_train , average='weighted' )
ensemble_soft_test_score  = f1_score(ensemble_soft.predict(X_test), y_test , average='weighted' )
print("Soft Voter Train Score (F1-weigthed): {}".format(ensemble_soft_train_score))
print("Soft Voter Test Score (F1-weigthed): {}".format(ensemble_soft_test_score))

In [None]:
print("This soft voting ensemble classifier was built using three random forest classifiers trained on train/test \
split obtained using different compound selection method. Its weighted **F1 score is {}**, which is higher than the \
**F1 score ({})** of the previous soft voting ensemble classifier that combibed a random forest classifier, a \
support vector classifier, and a gradient boosting classifier.".format(ensemble_soft_test_score, best_eclf_grid_test_score))

# Conclusion

In [None]:
print("1) Diversification w.r.t. splitting technique could provide better results compared to diversification w.r.t classification algortihms.")
print("2) Splitting train/test sets with MinMax diversity selection could improve model performance over random splitting.")
print("3) The best random forest models achieved a weighted F1-score of {} (RF random split),\
{} (Morgan FP Pick), {} (Atom Pair FP Pick), {} (Topol. Torsional Pick)".format(best_rf_grid_test_score, best_rf_grid_morgan_test_score
                                                                                , best_rf_grid_atom_pair_test_score, best_rf_grid_top_torso_test_score))