##### Consensus prediction (average of the Morgan-RF and MACCS-RF models) for the blind set, using the new models trained with the leaderboard set included in the training set

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import _pickle as cPickle
import gzip

from stats import *

from collections import Counter

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``; accepts and discards any arguments."""
    return None


# Silence warnings twice over: install an "ignore" filter, then replace
# ``warnings.warn`` itself with the no-op above.
warnings.filterwarnings("ignore")
warnings.warn = warn

# Notebook-wide display configuration.
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# (Re)load the autoreload extension and reload all modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Font face and size used for atom labels in RDKit molecule drawings.
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

# Import `display` from the public IPython.display module: importing it from
# IPython.core.display has been deprecated since IPython 7.14 and breaks on
# recent IPython versions.
from IPython.display import display, HTML

# Widen the notebook's main container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))

### Import Blind set


In [2]:
# Blind set to score, stored as an SDF file
file = 'test_set_all_kept_curated.sdf'

# Keyword arguments forwarded to the SDF reader
sdfInfo = {'molColName': 'ROMol'}

# Read the SDF and expose the RDKit molecule column under the name 'Mol'
moldf = (
    PandasTools.LoadSDF(file, **sdfInfo)
    .rename(columns={'ROMol': 'Mol'})
)

# Keep only the rows that carry an RDKit molecule
moldf = moldf.loc[moldf['Mol'].notna()]

# Drop the 'StandardizerResult' column when the file provides it
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns='StandardizerResult')

##### Calculate Morgan Fingerprints


In [3]:
def calcfp(mol,funcFPInfo=dict(radius=2, nBits=2048, useFeatures=False, useChirality=False)):
    """Return the Morgan fingerprint of ``mol`` as a labelled pandas Series.

    Parameters
    ----------
    mol
        Molecule handed to ``AllChem.GetMorganFingerprintAsBitVect``
        (the notebook passes the entries of ``moldf.Mol``).
    funcFPInfo : dict
        Keyword arguments forwarded unchanged to the fingerprint function.
        The default requests radius 2 and 2048 bits, with features and
        chirality turned off.

    Returns
    -------
    pandas.Series
        One entry per fingerprint position, labelled ``Bit_0``, ``Bit_1``, ...
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
    bit_values = np.asarray(bit_vector)
    return pd.Series(bit_values).add_prefix('Bit_')

##### Calculate MACCS keys

In [4]:
def maccs(mol):
    """Return the MACCS keys fingerprint of ``mol`` as a labelled pandas Series.

    Parameters
    ----------
    mol
        Molecule handed to ``MACCSkeys.GenMACCSKeys``
        (the notebook passes the entries of ``moldf.Mol``).

    Returns
    -------
    pandas.Series
        One entry per fingerprint position, labelled ``Bit_0``, ``Bit_1``, ...
    """
    key_vector = MACCSkeys.GenMACCSKeys(mol)
    key_values = np.asarray(key_vector)
    return pd.Series(key_values).add_prefix('Bit_')

In [5]:
# Build one fingerprint matrix per descriptor type: each molecule becomes a row,
# each fingerprint position a 'Bit_<n>' column.
mol_column = moldf['Mol']
X_vs_morgan = mol_column.apply(calcfp)
X_vs_maccs = mol_column.apply(maccs)



In [6]:
# Sanity check: (rows, columns) of each fingerprint matrix. Both values are
# shown because the notebook is configured to display every top-level expression.
X_vs_morgan.shape
X_vs_maccs.shape

(500, 2048)

(500, 167)

#### Load Models

In [7]:
# Restore the Morgan-fingerprint model from its gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
morgan_model_path = 'model_new_train_set_morgan-rf.pgz'
with gzip.open(morgan_model_path, 'rb') as model_file:
    morgan_model = cPickle.load(model_file)

In [8]:
# Restore the MACCS-keys model from its gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
maccs_model_path = 'model_new_train_set_maccs-rf.pgz'
with gzip.open(maccs_model_path, 'rb') as model_file:
    maccs_model = cPickle.load(model_file)

##### Make predictions

In [9]:
# Score the blind set with each loaded model, using its matching fingerprints
morgan_pred = morgan_model.predict(X_vs_morgan)
maccs_pred = maccs_model.predict(X_vs_maccs)

# Put the two models' predictions side by side, ordered by row index
predictions = pd.DataFrame(
    {'morgan_prediction': morgan_pred, 'maccs_prediction': maccs_pred}
).sort_index()

In [10]:
# Consensus = unweighted mean of the two models' predictions
prediction_sum = morgan_pred + maccs_pred
consensus_pred = prediction_sum / 2
consensus = pd.DataFrame(data={'prediction': consensus_pred})

In [11]:
# Order the consensus table by its row index
consensus = consensus.sort_index()

In [12]:
# Display the consensus prediction table
consensus

Unnamed: 0,prediction
0,18.054887
1,72.577799
2,64.272950
3,23.228350
4,50.316790
...,...
495,47.237222
496,35.298842
497,55.260708
498,48.551837


In [13]:
# Write the consensus predictions to disk as CSV, without the index column.
# NOTE(review): the file holds only the 'prediction' column, so rows can be
# matched to molecules by position only; confirm this is the expected format.
consensus_csv_path = "morgan_maccs_rf_consensus_blind-set_predictions.csv"
consensus.to_csv(consensus_csv_path, index=False)