In [None]:
#import libraries
from mordred import Calculator, descriptors, is_missing
# from sklearn.externals.joblib import load
from joblib import load
import pandas as pd
import math
from sklearn.linear_model import LinearRegression
from pandas import DataFrame
from rdkit.Chem.inchi import *
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import inchi
import numpy as np
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
import rdkit
# from tqdm import tqdm
from tqdm.notebook import tqdm
import xgboost as xgb
try:
    # pandas.Panel no longer exists in pandas >= 1.0. Nothing visible in this
    # notebook uses it, so tolerate its absence instead of failing the cell.
    from pandas import Panel
except ImportError:
    pass
# Register Series.progress_apply / DataFrame.progress_apply.
tqdm.pandas()
# Render molecule columns as images when DataFrames are displayed.
PandasTools.RenderImagesInAllDataFrames(images=True)

In [None]:
# Shared Mordred calculator built from the `descriptors` module, with 3D
# descriptors switched off (ignore_3D=True).
calc = Calculator(
    descriptors,
    ignore_3D=True,
)
def mord(mol, nBits=1613, errors_as_zeros=True):
    """Compute the Mordred descriptor vector for one molecule.

    Args:
        mol: Molecule handed to the module-level ``calc`` calculator.
        nBits: Length of the all-zero vector returned when the calculation
            fails. NOTE(review): presumably the number of descriptors ``calc``
            produces - confirm it matches the calculator configuration.
        errors_as_zeros: If True, a failed calculation yields a zero vector;
            otherwise it yields ``np.nan``.

    Returns:
        A NumPy array of descriptor values in which every value flagged by
        ``is_missing`` is replaced with 0. On failure, either a float32 zero
        vector of length ``nBits`` or ``np.nan`` (see ``errors_as_zeros``).
    """
    try:
        result = calc(mol)
        desc_list = [0 if is_missing(r) else r for r in result]
        return np.array(desc_list)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop a long-running progress_apply.
        if errors_as_zeros:
            return np.zeros((nBits,), dtype=np.float32)
        # np.nan is the canonical spelling; the np.NaN alias is gone in NumPy 2.0.
        return np.nan

In [None]:
# Restore the fitted scaler and both pre-trained XGBoost boosters from ../models.
# NOTE(review): joblib's load unpickles the file, so std_scaler.bin must come
# from a trusted source.
s = load('../models/std_scaler.bin')

mord_bin = xgb.Booster()
mord_bin.load_model('../models/mordred_bin.model')

mord_reg = xgb.Booster()
mord_reg.load_model('../models/mordred_reg.model')

In [None]:
# Empty annotation table; the loop below fills in one row per molecule.
data = pd.DataFrame(
    columns=[
        'Name', 'InChI', 'Class', 'Direct Parent',
        'Subclass', 'Superclass', 'Polar', 'ID',
    ]
)

# Input structures. Alternative data set, kept for reference:
# sdf = open('../resources/biopurify_annotated-714cpds_PreADMET_PP_Canvas.sdf', 'rb')
sdf = open('../resources/bmdms_annotated_polarity.sdf', 'rb')
# Forward-only supplier reading molecules from the open binary handle.
fsuppl = Chem.ForwardSDMolSupplier(sdf)

def get_prop(mol, property_name):
    """Return the value of ``property_name`` on ``mol``, or '' when unavailable.

    Args:
        mol: Object exposing ``GetProp(name)``.
        property_name: Name of the property to read.

    Returns:
        The property value, or '' when the property is absent or its value
        is empty.
    """
    try:
        return_value = mol.GetProp(property_name)
    except KeyError:
        # GetProp raises KeyError for an absent property; without this guard
        # the '' fallback could never apply to a missing one.
        return ''
    return return_value or ''

def _optional_prop(mol, tag):
    """Return SD tag ``tag`` from ``mol``, or '' when the record lacks it."""
    try:
        return mol.GetProp(tag)
    except KeyError:  # RDKit raises KeyError for an absent property
        return ''


# Output column -> SD tag, for annotations that may be absent from a record.
optional_tags = {
    'Class': 'Class',
    'Direct Parent': 'Direct Parent',
    'Subclass': 'Subclass',
    'Superclass': 'Superclass',
    'ID': 'BMDRC_ID',
}

# Collect plain dicts and build the frame once: appending to a DataFrame per
# row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
records = []
try:
    for mol in fsuppl:
        if not mol:
            # Skip falsy entries (RDKit suppliers yield None for records
            # they cannot parse).
            continue
        record = {
            'Name': get_prop(mol, '_Name'),
            'InChI': inchi.MolToInchi(mol),
            # 'Polar' is required: a record without it raises KeyError.
            'Polar': mol.GetProp('Polar'),
        }
        for column, tag in optional_tags.items():
            record[column] = _optional_prop(mol, tag)
        records.append(record)
finally:
    # The supplier is single-pass, so the SDF handle can be released here.
    sdf.close()

data = pd.concat(
    [data, pd.DataFrame(records, columns=data.columns)],
    ignore_index=True,
)
    

In [None]:
# Write the regression booster to a second file and immediately load it back.
# NOTE(review): presumably this re-serialises the model with the installed
# XGBoost version - confirm whether the round-trip is still needed.
resaved_model_path = '../models/mord_reg.model'
mord_reg.save_model(resaved_model_path)
mord_reg.load_model(resaved_model_path)

In [None]:
# data

In [None]:
# Re-parse every InChI into a molecule, then derive the per-molecule columns.
data['mol'] = data['InChI'].progress_apply(MolFromInchi)

for column, featurizer in (
    ('MolFormula', CalcMolFormula),
    ('MolExactWt', CalcExactMolWt),
    ('mordred', mord),
):
    data[column] = data['mol'].progress_apply(featurizer)

# Stack the per-molecule descriptor vectors into one 2-D matrix for XGBoost.
descriptor_matrix = np.vstack(data['mordred'].values)
sample = xgb.DMatrix(descriptor_matrix)


In [None]:
# Score every molecule with both boosters.
pred_bin = mord_bin.predict(sample)
raw_reg = mord_reg.predict(sample)

# Undo the scaler's transform on the regression output, then shape it as a
# single column.
pred_reg = s.inverse_transform(raw_reg).reshape(-1, 1)

In [None]:
# pred_reg

In [None]:
# Attach both model outputs to the annotated table.
data['Binary_retained'] = pred_bin
data['Predicted_RT'] = pred_reg

# Columns of the export frame, in output order.
export_columns = [
    'Name', 'mol', 'MolFormula', 'MolExactWt', 'InChI', 'Class', 'Direct Parent',
    'Subclass', 'Superclass', 'Polar', 'ID', 'Binary_retained', 'Predicted_RT',
]

# Rebuild the frame row by row from the selected columns.
export_rows = list(zip(*(data[column] for column in export_columns)))
out = pd.DataFrame(export_rows, columns=export_columns)

In [None]:
# Export the result table to Excel, drawing the 'mol' column as 200x200 images.
# Alternative target, kept for reference:
# PandasTools.SaveXlsxFromFrame(out, "../out/output_biopurify.xlsx", molCol="mol", size=(200,200))
PandasTools.SaveXlsxFromFrame(
    out,
    "../out/output_bmdms.xlsx",
    molCol="mol",
    size=(200, 200),
)

In [None]:
# Display the annotated table, which by now also carries the Binary_retained
# and Predicted_RT prediction columns.
data