In [1]:
import os
import pandas as pd
import rdkit
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit import DataStructs
from rdkit.Chem import PandasTools
from rdkit.Chem import Lipinski
from matplotlib import pyplot as plt

In [2]:
# Disable log messages from RDKit: the 'rdApp.*' spec is passed to
# RDLogger.DisableLog so the molecule parsing and substructure calls made
# later in this notebook do not flood the cell output.
RDLogger.DisableLog('rdApp.*')

In [3]:
# Table with RMSD values, SMILES and MCS SMARTS.
# The featurisation loop further down reads the columns 'template', 'docked',
# 'smiles_template', 'smiles_docked' and 'mcs_smartsString' from this frame.
# The path is relative, so it is resolved against the kernel's working directory.
# Alternative input location, kept for reference:
#rmsd_df = pd.read_csv('../data/rmsd_values_base.csv')
rmsd_df = pd.read_csv('../../../Analysis_of_Docking/data/rmsd_values_plus_smiles.csv')

In [4]:
def HDonors_in_substructure(mol, subs):
    """Count H-bond donor atoms of ``mol`` that belong to a substructure.

    Donor atoms are the matches of ``Lipinski.HDonorSmarts`` in ``mol``. A donor
    is counted when its (first) atom index is part of the *first* match of
    ``subs`` in ``mol``; further matches of ``subs`` are ignored.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``
    subs : query molecule describing the substructure (e.g. an MCS)

    Returns
    -------
    int
        Number of donor atoms inside the substructure; 0 when ``mol`` has no
        donor atoms at all.

    Raises
    ------
    IndexError
        If ``mol`` has donor atoms but ``subs`` does not match ``mol``.
    """
    donor_hits = mol.GetSubstructMatches(Lipinski.HDonorSmarts)
    if not donor_hits:
        # No donors at all: answer is 0 without looking at the substructure.
        return 0

    # Only the first substructure match is used; [0] raises when there is none.
    core_atom_ids = mol.GetSubstructMatches(subs)[0]
    return sum(1 for hit in donor_hits if hit[0] in core_atom_ids)

def HAcceptors_in_substructure(mol, subs):
    """Count H-bond acceptor atoms of ``mol`` that belong to a substructure.

    Acceptor atoms are the matches of ``Lipinski.HAcceptorSmarts`` in ``mol``.
    An acceptor is counted when its (first) atom index is part of the *first*
    match of ``subs`` in ``mol``; further matches of ``subs`` are ignored.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``
    subs : query molecule describing the substructure (e.g. an MCS)

    Returns
    -------
    int
        Number of acceptor atoms inside the substructure; 0 when ``mol`` has
        no acceptor atoms at all.

    Raises
    ------
    IndexError
        If ``mol`` has acceptor atoms but ``subs`` does not match ``mol``.
    """
    acceptor_hits = mol.GetSubstructMatches(Lipinski.HAcceptorSmarts)
    if not acceptor_hits:
        # Nothing to count, so the substructure is never queried.
        return 0

    # First substructure match only; [0] raises when ``subs`` is absent.
    core_atom_ids = mol.GetSubstructMatches(subs)[0]
    inside = [hit for hit in acceptor_hits if hit[0] in core_atom_ids]
    return len(inside)

def NHOH_in_substructure(mol, subs):
    """Count NH/OH atoms of ``mol`` that belong to a substructure.

    NH/OH atoms are the matches of ``Lipinski.NHOHSmarts`` in ``mol``. A match
    is counted when its (first) atom index is part of the *first* match of
    ``subs`` in ``mol``; further matches of ``subs`` are ignored.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``
    subs : query molecule describing the substructure (e.g. an MCS)

    Returns
    -------
    int
        Number of NH/OH atoms inside the substructure; 0 when ``mol`` has no
        NH/OH match at all.

    Raises
    ------
    IndexError
        If ``mol`` has NH/OH matches but ``subs`` does not match ``mol``.
    """
    nhoh_hits = mol.GetSubstructMatches(Lipinski.NHOHSmarts)
    if not nhoh_hits:
        # Short-circuit before touching the substructure query.
        return 0

    # Membership is tested against the first substructure match only.
    core_atom_ids = mol.GetSubstructMatches(subs)[0]
    total = 0
    for hit in nhoh_hits:
        total += 1 if hit[0] in core_atom_ids else 0
    return total

def NumRotableBonds_NotStrict_in_substructure(mol, subs):
    """Count rotatable bonds of ``mol`` that lie entirely inside a substructure.

    Rotatable bonds are the matches of ``Lipinski.RotatableBondSmarts`` in
    ``mol``; each match is a pair of atom indices. A bond is counted when both
    of its atoms are part of the *first* match of ``subs`` in ``mol``.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``
    subs : query molecule describing the substructure (e.g. an MCS)

    Returns
    -------
    int
        Number of rotatable bonds with both ends inside the substructure;
        0 when ``mol`` has no rotatable-bond match at all.

    Raises
    ------
    IndexError
        If ``mol`` has rotatable bonds but ``subs`` does not match ``mol``.
    """
    bond_hits = mol.GetSubstructMatches(Lipinski.RotatableBondSmarts)
    if not bond_hits:
        return 0

    # Only the first substructure match is used; [0] raises when there is none.
    core_atom_ids = mol.GetSubstructMatches(subs)[0]

    def _inside(bond):
        # A bond belongs to the substructure only if both end atoms do.
        return bond[0] in core_atom_ids and bond[1] in core_atom_ids

    return len([bond for bond in bond_hits if _inside(bond)])

def NumRotableBonds_minusAmides_in_substructure(mol, subs):
    """Count rotatable bonds of ``mol`` inside a substructure, using amide-aware SMARTS.

    Two two-atom SMARTS patterns are matched against ``mol`` and their matches
    are merged. Both describe a single, non-ring bond between atoms that are
    not terminal (``!D1``) and not part of a triple bond; ``patt1`` additionally
    rejects atoms matching ``[NH]!@C(=O)`` and ``patt2`` rejects atoms matching
    ``C(=O)!@[NH]``. A bond found by either pattern is counted once, and only
    if both of its atoms belong to the *first* match of ``subs`` in ``mol``.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``
    subs : query molecule describing the substructure (e.g. an MCS)

    Returns
    -------
    int
        Number of distinct matched bonds with both ends inside the
        substructure; 0 when neither pattern matches.

    Raises
    ------
    IndexError
        If at least one bond matched but ``subs`` does not match ``mol``.
    """
    patt1 = Chem.MolFromSmarts("[!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]")
    patt2 = Chem.MolFromSmarts("[!$(C(=O)!@[NH])&!D1&!$(*#*)]-&!@[!$(C(=O)!@[NH])&!D1&!$(*#*)]")
    RotableBonds_pairs = mol.GetSubstructMatches(patt1) + mol.GetSubstructMatches(patt2)
    if not RotableBonds_pairs:
        return 0

    # De-duplicate on the unordered atom pair: the same bond may be reported as
    # (a, b) by one pattern and (b, a) by the other, and a plain set of the raw
    # tuples would then count it twice.
    unique_bonds = {tuple(sorted(pair)) for pair in RotableBonds_pairs}

    # Only the first substructure match is used; [0] raises when there is none.
    atoms_in_substructure = set(mol.GetSubstructMatches(subs)[0])
    return sum(
        1
        for first_atom, second_atom in unique_bonds
        if first_atom in atoms_in_substructure and second_atom in atoms_in_substructure
    )

def NumRotableBonds_minusAmides(mol):
    """Count rotatable bonds of ``mol`` using two amide-aware SMARTS patterns.

    Both patterns describe a single, non-ring bond between atoms that are not
    terminal (``!D1``) and not part of a triple bond; ``patt1`` additionally
    rejects atoms matching ``[NH]!@C(=O)`` and ``patt2`` rejects atoms matching
    ``C(=O)!@[NH]``. The result is the number of distinct bonds matched by
    either pattern.

    Parameters
    ----------
    mol : molecule object exposing ``GetSubstructMatches``

    Returns
    -------
    int
        Number of distinct matched bonds; 0 when neither pattern matches.
    """
    patt1 = Chem.MolFromSmarts("[!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]")
    patt2 = Chem.MolFromSmarts("[!$(C(=O)!@[NH])&!D1&!$(*#*)]-&!@[!$(C(=O)!@[NH])&!D1&!$(*#*)]")
    RotableBonds_pairs = mol.GetSubstructMatches(patt1) + mol.GetSubstructMatches(patt2)

    # De-duplicate on the unordered atom pair: the same bond may be reported as
    # (a, b) by one pattern and (b, a) by the other, and a plain set of the raw
    # tuples would then count it twice. An empty match list yields 0.
    unique_bonds = {tuple(sorted(pair)) for pair in RotableBonds_pairs}
    return len(unique_bonds)

In [5]:
# Attempt 1: use Lipinski parameters as features.

# Whole-molecule descriptors: column suffix -> callable taking one molecule.
# Insertion order is kept deliberately; it fixes the order of the new columns.
lipinski_parameters = {
    'HeavyAtomCount': Lipinski.HeavyAtomCount,
    'NHOHCount': Lipinski.NHOHCount,
    'NOCount': Lipinski.NOCount,
    'RingCount': Lipinski.RingCount,
    'NumHAcceptors': Lipinski.NumHAcceptors,
    'NumHDonors': Lipinski.NumHDonors,
    'NumHeteroAtoms': Lipinski.NumHeteroatoms,
    'NumRotatableBonds': NumRotableBonds_minusAmides,
}

# MCS descriptors: full column name -> callable.
# Keys without 'template'/'docked' map to callables taking the MCS molecule alone;
# keys containing 'template' or 'docked' map to the *_in_substructure helpers,
# which take (molecule, mcs).
lipinski_parameters_mcs = {
    # computed on the MCS itself
    'mcs_HeavyAtomCount': Lipinski.HeavyAtomCount,
    'mcs_NOCount': Lipinski.NOCount,
    'mcs_RingCount': Lipinski.RingCount,
    'mcs_NumHeteroAtoms': Lipinski.NumHeteroatoms,
    # computed on the template / docked molecule, restricted to the MCS atoms
    'mcs_template_NHOHCount': NHOH_in_substructure,
    'mcs_docked_NHOHCount': NHOH_in_substructure,
    'mcs_template_NumHAcceptors': HAcceptors_in_substructure,
    'mcs_docked_NumHAcceptors': HAcceptors_in_substructure,
    'mcs_template_NumHDonors': HDonors_in_substructure,
    'mcs_docked_NumHDonors': HDonors_in_substructure,
    'mcs_template_NumRotatableBonds': NumRotableBonds_minusAmides_in_substructure,
    'mcs_docked_NumRotatableBonds': NumRotableBonds_minusAmides_in_substructure,
}

In [6]:
# Pre-create every feature column (filled with None) so the featurisation loop
# can assign cell by cell. The loop variable is deliberately not called `type`:
# at notebook scope that would shadow the builtin for every later cell.
for mol_type in ['template', 'docked']:
    for lipinski_parameter in lipinski_parameters:
        rmsd_df[mol_type + '_' + lipinski_parameter] = None

# MCS feature names already carry their full 'mcs_...' prefix.
for lipinski_parameter in lipinski_parameters_mcs:
    rmsd_df[lipinski_parameter] = None

In [7]:
# Featurise every row: Morgan-fingerprint Tanimoto similarities between the
# template and docked molecule, descriptors of each molecule, and descriptors
# of (or restricted to) their maximum common substructure (MCS).
# A descriptor that fails for a row is recorded as -1 instead of aborting the run.
for index, template, docked, smiles_template, smiles_docked, mcs_smarts in \
        rmsd_df[['template', 'docked', 'smiles_template', 'smiles_docked', 'mcs_smartsString']].itertuples():

    template_mol = Chem.MolFromSmiles(smiles_template)
    docked_mol = Chem.MolFromSmiles(smiles_docked)
    mcs_mol = Chem.MolFromSmarts(mcs_smarts)
    # The MCS is built from SMARTS; refresh its property cache (non-strict) and
    # run ring perception before the descriptor functions are applied to it.
    mcs_mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mcs_mol)

    # Similarity on Morgan fingerprints (radius 3) with default atom invariants.
    template_fp = AllChem.GetMorganFingerprint(template_mol, 3)
    docked_fp = AllChem.GetMorganFingerprint(docked_mol, 3)
    rmsd_df.at[index, 'ec_tanimoto_similarity'] = DataStructs.TanimotoSimilarity(template_fp, docked_fp)

    # Same, with feature-based invariants (useFeatures=True).
    template_fp = AllChem.GetMorganFingerprint(template_mol, 3, useFeatures=True)
    docked_fp = AllChem.GetMorganFingerprint(docked_mol, 3, useFeatures=True)
    rmsd_df.at[index, 'fc_tanimoto_similarity'] = DataStructs.TanimotoSimilarity(template_fp, docked_fp)

    # Whole-molecule descriptors for the template and the docked molecule.
    for mol_type, current_mol in (('template', template_mol), ('docked', docked_mol)):
        for lipinski_parameter, descriptor in lipinski_parameters.items():
            feature_column = mol_type + '_' + lipinski_parameter
            try:
                rmsd_df.at[index, feature_column] = descriptor(current_mol)
            except Exception:
                # `Exception` rather than a bare except, so KeyboardInterrupt /
                # SystemExit still stop the loop.
                rmsd_df.at[index, feature_column] = -1

    # MCS descriptors: the key decides which molecule(s) the callable receives.
    for lipinski_parameter, descriptor in lipinski_parameters_mcs.items():
        try:
            if 'template' in lipinski_parameter:
                feature_value = descriptor(template_mol, mcs_mol)
            elif 'docked' in lipinski_parameter:
                feature_value = descriptor(docked_mol, mcs_mol)
            else:
                feature_value = descriptor(mcs_mol)
            rmsd_df.at[index, lipinski_parameter] = feature_value
        except Exception:
            # Includes the IndexError raised by the *_in_substructure helpers
            # when the MCS does not match the molecule.
            rmsd_df.at[index, lipinski_parameter] = -1

print('Done!')

Done!


In [11]:
# Write the featurised table to CSV; index=False leaves the row index out of the file.
# Alternative output location, kept for reference:
#rmsd_df.to_csv('../data/rmsd_values_featurized.csv', index=False)
rmsd_df.to_csv('../../../Analysis_of_Docking/data/rmsd_values_featurized.csv', index=False)
