##### Prediction of the external leaderboard set with the old models (trained on the old training set)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import _pickle as cPickle
import gzip

from stats import *

from collections import Counter

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Silence all warnings for the rest of the session: first through the regular
# filter, then by swapping warnings.warn itself for the no-op above so that
# calls to warnings.warn(...) do nothing regardless of the active filters.
warnings.filterwarnings("ignore")
warnings.warn = warn

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

### Import leaderboard set

In [2]:
# Location of the curated leaderboard set (SDF)
file = 'leaderboard_set_curated.sdf'

# Keyword arguments forwarded to LoadSDF: molecule objects go into the 'ROMol' column
sdfInfo = dict(molColName='ROMol')

# Load the file and rename the molecule column to 'Mol' in one step
moldf = PandasTools.LoadSDF(file, **sdfInfo).rename(columns={'ROMol': 'Mol'})

# Keep only the rows that actually hold a molecule object
has_mol = moldf['Mol'].notnull()
moldf = moldf[has_mol]

# Discard the 'StandardizerResult' column when the file provides one
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')

##### Calculate Morgan Fingerprints

In [3]:
def calcfp(mol, funcFPInfo=None):
    """Return the Morgan bit-vector fingerprint of ``mol`` as a pandas Series.

    Parameters
    ----------
    mol
        Molecule handed to ``AllChem.GetMorganFingerprintAsBitVect``.
    funcFPInfo : dict, optional
        Keyword arguments forwarded unchanged to the fingerprint function.
        When omitted, ``radius=2, nBits=2048, useFeatures=False,
        useChirality=False`` is used.

    Returns
    -------
    pandas.Series
        One entry per fingerprint position, labelled ``Bit_0``, ``Bit_1``, ...
    """
    # Build the default settings per call instead of sharing one dict object
    # created at definition time (mutable default argument).
    if funcFPInfo is None:
        funcFPInfo = dict(radius=2, nBits=2048, useFeatures=False, useChirality=False)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
    return pd.Series(np.asarray(bit_vector)).add_prefix('Bit_')

##### Calculate MACCS keys

In [4]:
def maccs(mol):
    """Return the MACCS keys of ``mol`` as a pandas Series.

    The result of ``MACCSkeys.GenMACCSKeys`` is converted to a NumPy array and
    wrapped in a Series whose entries are labelled ``Bit_0``, ``Bit_1``, ...
    """
    key_values = np.asarray(MACCSkeys.GenMACCSKeys(mol))
    return pd.Series(key_values).add_prefix('Bit_')

In [5]:
# Featurise every leaderboard molecule: each call returns a Series, so the
# result of apply() has one row per molecule and one column per bit.
X_vs_morgan = moldf['Mol'].apply(calcfp)
X_vs_maccs = moldf['Mol'].apply(maccs)



In [6]:
# Sanity check: (rows, columns) of each feature matrix. Both values are echoed
# because ast_node_interactivity is set to "all" in the setup cell.
X_vs_morgan.shape
X_vs_maccs.shape

(183, 2048)

(183, 167)

In [7]:
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
with gzip.open('model_train_set_morgan-rf.pgz', mode='rb') as model_file:
    morgan_model = cPickle.load(model_file)

In [8]:
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
with gzip.open('model_train_set_maccs-rf.pgz', mode='rb') as model_file:
    maccs_model = cPickle.load(model_file)

In [9]:
# Observed activities of the leaderboard compounds.
# NOTE(review): properties read from an SD file are presumably loaded as text,
# so cast to float explicitly; a non-numeric entry then fails here instead of
# surfacing later inside the metric computation.
y_ext = moldf['activity'].astype(float).to_numpy()

# Predict with the two loaded models, each on its own fingerprint matrix
morgan_pred = morgan_model.predict(X_vs_morgan)
maccs_pred = maccs_model.predict(X_vs_maccs)

# The three arrays are paired by position, so they must have the same length
assert len(y_ext) == len(morgan_pred) == len(maccs_pred), \
    'observed activities and predictions differ in length'

# Collect predictions and observed values side by side (fresh 0..n-1 index)
predictions = pd.DataFrame({
    'morgan_prediction': morgan_pred,
    'maccs_prediction': maccs_pred,
    'y_ext': y_ext,
})

In [10]:
# Regression statistics of each fingerprint model on the external set
ext_set_morgan_stats = pd.DataFrame(regression_stats(predictions['y_ext'], predictions['morgan_prediction']))
ext_set_maccs_stats = pd.DataFrame(regression_stats(predictions['y_ext'], predictions['maccs_prediction']))

# Consensus prediction: unweighted mean of the two model predictions
consensus_pred = (morgan_pred + maccs_pred) / 2
consensus_stats = pd.DataFrame(regression_stats(predictions['y_ext'], consensus_pred))

# Stack the three result frames; ignore_index gives every row its own label
# instead of repeating the label of each input frame.
all_stats = pd.concat(
    [ext_set_morgan_stats, ext_set_maccs_stats, consensus_stats],
    axis=0,
    ignore_index=True,
)
all_stats['model'] = ['Morgan-RF', 'Maccs-RF', 'Consensus']

# Print a bold title (ANSI escape codes), then display the summary table
print('\033[1m' + 'Statistical Characteristics of the Predictions for the External Leaderboard Set' + '\n' + '\033[0m')
all_stats

[1mStatistical Characteristics of the Predictions for the External Leaderboard Set
[0m


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),model
0,20.29,677.3,26.03,0.4,Morgan-RF
0,19.74,651.27,25.52,0.42,Maccs-RF
0,19.71,633.86,25.18,0.44,Consensus
