In [6]:
#!/usr/bin/env python3

# This script takes a sdf  file for a set molecules with a SMILES column as input. OpenEye 
# functionalities are used to convert SMILES format to .mae format of Schrodinger. Schrodinger 
# Epik is used to generate protomers, tautomers and pKas.

from openmoltools import openeye as omtoe, schrodinger
import pandas as pd
import os
import numpy as np
from ast import literal_eval
from openeye.oechem import *
import pickle

In [62]:
##### IMPORT STARTING SET OF MOLECULES AS SMILES #####

# Import list of available molecules which were exported from eMolecules website.
df = pd.read_csv("./df_eMol_sim_unique_molecules_smiles.smi")
df.columns = ["index", "canonical isomeric SMILES", "eMolecules SKU"]
initial_number_of_molecules = df.shape[0]
print("Starting from isomeric SMILES of {} molecules.".format(initial_number_of_molecules))

# Get SMILES and eMolecules SKU as a dictionary. 

print("Extracting SMILES and eMolecules SKU from input file...")

eMolSKU_smiles_dict = {}

#for i in range(initial_number_of_molecules):
for i in range(10):
    smiles = df.loc[i,"canonical isomeric SMILES"]
    emol_sku = df.loc[i,"eMolecules SKU"]

    eMolSKU_smiles_dict[emol_sku] = smiles

print(eMolSKU_smiles_dict)

# Save "eMolecules SKU: canonical isomeric SMILES" dictionary as a pickle file
pickle.dump(eMolSKU_smiles_dict, open("eMolSKU_can_iso_smiles_dict.pickle", "wb"))


Starting from isomeric SMILES of 623 molecules.
Extracting SMILES and eMolecules SKU from input file...
{140622184: 'c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3ccc(cc3)F)F', 103600837: 'COc1ccc(cc1OC)C(=O)Nc2c(c3c(s2)CCCC3)C(=O)N', 146582116: 'CC(C)c1ccc(cc1)/C=C\\2/c3ccccc3NC2=O', 146653637: 'CS(=O)(=O)c1ccc(nn1)c2cccc(c2)NC(=O)c3cccc(c3)C(F)(F)F', 151098039: 'c1ccc(cc1)c2c(n3ccsc3n2)C=CC=C(C#N)C#N', 103864712: 'c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N', 166195289: 'c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2', 129249626: 'CC1(Cc2c(cn(c(=O)c2C(=O)Nc3ccc(c(c3)Cl)OC)c4ccc(cc4)F)C(=O)C1)C', 154972989: 'Cc1ccccc1c2nnc(o2)SCc3ccccc3', 177271519: 'c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4)CCO'}


In [63]:
##### CONVERT SMILES TO OEMOL #####

print("Converting SMILES to OEMol...")

eMolSKU_oemol_dict = {} 

for key, value in eMolSKU_smiles_dict.items():
    # Create a OEMolBuilder from a smiles string.
    oemol_molecule = omtoe.smiles_to_oemol(smiles=value)
    eMolSKU_oemol_dict[key] = oemol_molecule
    
print(eMolSKU_oemol_dict)

Converting SMILES to OEMol...
{103864712: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d3c0> >, 146582116: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x1144b0a50> >, 103600837: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x1144b0f00> >, 151098039: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d5a0> >, 140622184: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11651e8d0> >, 166195289: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d510> >, 129249626: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d270> >, 177271519: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d1e0> >, 154972989: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x11af3d660> >, 146653637: <openeye.oechem.OEMol; proxy of <Swig Object of type 'OEMo

In [64]:
##### GENERATE CHARGED CONFORMERS AND SAVE AS MOL2 FILE #####

mol2_directory_path = "./mol2_files"
if not os.path.exists(mol2_directory_path):
    os.makedirs(mol2_directory_path)
    print("{} directory created.".format(mol2_directory_path))

print("Generating charged OEMol molecules...")

# Dictionary to keep track of failed molecules
failed_molecules_dict = {}

# Generate charges for an OpenEye OEMol molecule. It will return  molecule with OpenEye's recommended AM1BCC
# charge selection scheme.

for key, value in eMolSKU_oemol_dict.items():
    print("Generating conformer for ", key, "...")
    try:
        oe_molecule = omtoe.get_charges(value, keep_confs=1)
    except RuntimeError:
        print("Conformation generation failed for {}.".format(key))
        # Save failed molecule to failed_molecules_dict
        failed_molecules_dict[key] = value
    
    mol2_filename = mol2_directory_path + "/" + str(key) + ".mol2"
    omtoe.molecule_to_mol2(oe_molecule, tripos_mol2_filename=mol2_filename)
    print("Mol2 file {} generated.".format(mol2_filename))

print("")
print("Conformer generation for {} molecules failed.".format(len(failed_molecules_dict)))

# Remove failed molecules from oMolID_oemol_dict dictionary
for key, value in failed_molecules_dict.items():
    eMolSKU_oemol_dict.pop(key, None)
    
print("{} molecules removed from the list.".format(len(failed_molecules_dict)))

# Save dictionary of successful conformers as spickle file
pickle.dump(eMolSKU_oemol_dict, open("eMolSKU_oemol_dict.pickle", "wb"))
# Save dictionary of failed molecules as confromer generation as a pickle file
pickle.dump(failed_molecules_dict, open("failed_molecules_dict.pickle", "wb"))

./mol2_files directory created.
Generating charged OEMol molecules...
Generating conformer for  103864712 ...
Mol2 file ./mol2_files/103864712.mol2 generated.
Generating conformer for  146582116 ...
Mol2 file ./mol2_files/146582116.mol2 generated.
Generating conformer for  103600837 ...
Mol2 file ./mol2_files/103600837.mol2 generated.
Generating conformer for  151098039 ...
Conformation generation failed for 151098039.
Mol2 file ./mol2_files/151098039.mol2 generated.
Generating conformer for  140622184 ...
Mol2 file ./mol2_files/140622184.mol2 generated.
Generating conformer for  166195289 ...
Mol2 file ./mol2_files/166195289.mol2 generated.
Generating conformer for  129249626 ...
Mol2 file ./mol2_files/129249626.mol2 generated.
Generating conformer for  177271519 ...
Mol2 file ./mol2_files/177271519.mol2 generated.
Generating conformer for  154972989 ...
Mol2 file ./mol2_files/154972989.mol2 generated.
Generating conformer for  146653637 ...
Mol2 file ./mol2_files/146653637.mol2 gener

In [65]:
##### RUN EPIK #####

print("Running Epik with sequencial pKa prediction method...")

mae_directory_path = "./mae_files"
if not os.path.exists(mae_directory_path):
    os.makedirs(mae_directory_path)
    print("{} directory created.".format(mae_directory_path))

# Sequencial pKa calculation method is used starting form pH 7.0.
count=0
for key in eMolSKU_oemol_dict.keys():
    print("Running Epik for molecule {} ...".format(key))
    mol2_file_path = mol2_directory_path + "/" + str(key) + ".mol2"
    mae_file_path = mae_directory_path + "/" + str(key) + ".mae"
    schrodinger.run_epik(mol2_file_path, mae_file_path, max_structures=100, ph=7.0, ph_tolerance=None,
                         tautomerize=True, extract_range=None, max_atoms=150, scan=True)
    count=count+1
  
    print("Epik calculation for %s out of %s molecules finished."%(count,len(eMolSKU_oemol_dict)))

Running Epik with sequencial pKa prediction method...
./mae_files directory created.
Running Epik for molecule 103864712 ...
Epik calculation for 1 out of 9 molecules finished.
Running Epik for molecule 146582116 ...
Epik calculation for 2 out of 9 molecules finished.
Running Epik for molecule 103600837 ...
Epik calculation for 3 out of 9 molecules finished.
Running Epik for molecule 140622184 ...
Epik calculation for 4 out of 9 molecules finished.
Running Epik for molecule 166195289 ...
Epik calculation for 5 out of 9 molecules finished.
Running Epik for molecule 129249626 ...
Epik calculation for 6 out of 9 molecules finished.
Running Epik for molecule 177271519 ...
Epik calculation for 7 out of 9 molecules finished.
Running Epik for molecule 154972989 ...
Epik calculation for 8 out of 9 molecules finished.
Running Epik for molecule 146653637 ...
Epik calculation for 9 out of 9 molecules finished.


In [66]:
##### CONVERT EPIK OUTPUT (.MAE FILE) TO SDF #####

#sdf_directory_path = "./sdf_files"
#if not os.path.exists(sdf_directory_path):
#    os.makedirs(sdf_directory_path)
#    print("{} directory created.".format(sdf_directory_path))

#for key in eMolID_oemol_dict.keys():
#    mae_file_path = mae_directory_path + "/" + str(key) + ".mae"
#    sdf_file_path = sdf_directory_path + "/" + str(key) + ".sdf"
#    # Run Schrodinger's structconvert command line utility to convert mae file to sdf
#    print("Converting Epik output to SDF for molecule {} ...".format(key))
#    schrodinger.run_structconvert(input_file_path = mae_file_path, output_file_path = sdf_file_path)

In [67]:
##### RUN PROPLISTER TO EXTRACT PKAS #####

print("Running proplister to extract pKas from Epik output.")

# Create a dictionary to store predicted pKas
predicted_pKa_dict = {}

# Iterate over molecules
for key in eMolSKU_oemol_dict.keys():
    mae_file_path = mae_directory_path + "/" + str(key) + ".mae"
    proplister = schrodinger.run_proplister(input_file_path=mae_file_path)

    # Iterate over properties of each molecule
    # Record predicted pKa values in a list
    pKa_list = []
    for propkey, value in proplister[0].items():
        if propkey.startswith("r_epik_pKa"):
            pKa = float(value)
            pKa_list.append(pKa)

    pKa_list = sorted(pKa_list, key=float)
    predicted_pKa_dict[key] = pKa_list
    
print("Predicted pKa dictionary: {eMolecules SKU : pKas}")
print(predicted_pKa_dict)

Running proplister to extract pKas from Epik output.
Predicted pKa dictionary: {eMolecules SKU : pKas}
{140622184: [-0.417, -0.375, 6.881], 146653637: [-2.169, -0.906, 1.186, 12.184], 146582116: [3.158, 13.733], 103600837: [-1.464, 0.334, 0.438, 12.435, 14.553], 103864712: [1.152, 4.231, 11.29, 11.964, 13.609], 166195289: [0.192, 2.134, 9.045], 129249626: [-2.101, -1.102, 1.893, 11.952], 154972989: [], 177271519: [1.555, 6.069, 8.134, 14.754]}


In [68]:
##### ANALYZE PKA PREDICTIONS TO COUNT 3 <= PKAS <= 11 #####

# Create a pandas dataframe to store pKa information
df_pKa = pd.DataFrame(list(predicted_pKa_dict.items()), columns=["eMolecules SKU", "predicted pKas"])
df_pKa["pKas in [3,11]"]=np.NaN
df_pKa["pKa count in [3,11]"]=np.NaN

for i, row in df_pKa.iterrows():
    
    # Count pKas that are within 3-11 interval
    pKa_in_interval_count = 0
    pKas_in_interval = []
    
    pKas = row["predicted pKas"]
    for pKa in pKas:    
        if (3<= pKa) and (pKa <= 11):
            pKa_in_interval_count = int(pKa_in_interval_count + 1)
            pKas_in_interval.append(pKa)
    
    df_pKa.loc[i,"pKa count in [3,11]"] = pKa_in_interval_count
    #print(pKas_in_interval)
    df_pKa.loc[i,"pKas in [3,11]"] = str(pKas_in_interval)
    

# Flag molecules with pKas that are closer than 1 log unit
df_pKa["pKas closer than 1 unit"]=False

for index, row in df_pKa.iterrows():
    # print(row["pKas in [3,11]"])
    pKas = literal_eval(row["pKas in [3,11]"])
    
    if len(pKas)> 1:
        # The difference between consecutive pKas must be >= 1. If not, we will mark True.
        for i, pKa in enumerate(pKas[0:(len(pKas)-1)]):
            pKa_difference = float(pKas[i+1]) - float(pKas[i])
            
            if pKa_difference < 1:
                df_pKa.loc[index, "pKas closer than 1 unit"]=True
            else:
                continue

# Add Canonical Isomeric SMILES to dataframe
df_pKa["canonical isomeric SMILES"] = np.NAN
for i, row in df_pKa.iterrows():
    key = row["eMolecules SKU"]
    smiles = eMolSKU_smiles_dict[key] 
    df_pKa.loc[i,"canonical isomeric SMILES"] = smiles
                
df_pKa.to_csv("df_pKa.csv")               
#print(df_pKa)
df_pKa

Unnamed: 0,eMolecules SKU,predicted pKas,"pKas in [3,11]","pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES
0,140622184,"[-0.417, -0.375, 6.881]",[6.881],1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3ccc(cc3)F)F
1,146653637,"[-2.169, -0.906, 1.186, 12.184]",[],0.0,False,CS(=O)(=O)c1ccc(nn1)c2cccc(c2)NC(=O)c3cccc(c3)...
2,146582116,"[3.158, 13.733]",[3.158],1.0,False,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O
3,103600837,"[-1.464, 0.334, 0.438, 12.435, 14.553]",[],0.0,False,COc1ccc(cc1OC)C(=O)Nc2c(c3c(s2)CCCC3)C(=O)N
4,103864712,"[1.152, 4.231, 11.29, 11.964, 13.609]",[4.231],1.0,False,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N
5,166195289,"[0.192, 2.134, 9.045]",[9.045],1.0,False,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2
6,129249626,"[-2.101, -1.102, 1.893, 11.952]",[],0.0,False,CC1(Cc2c(cn(c(=O)c2C(=O)Nc3ccc(c(c3)Cl)OC)c4cc...
7,154972989,[],[],0.0,False,Cc1ccccc1c2nnc(o2)SCc3ccccc3
8,177271519,"[1.555, 6.069, 8.134, 14.754]","[6.069, 8.134]",2.0,False,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...


In [69]:
##### REMOVE COMPOUNDS THAT DON'T HAVE PKAS WITHIN 3-11 INTERVAL #####
df_pKa_interval = df_pKa.loc[df_pKa["pKa count in [3,11]"] >= 1.0].reset_index(drop=True)

df_pKa_interval.to_csv("df_pKa_interval_3-11.csv")
print("Number of molecules with at least 1 pKa in 3-11 interval: ", df_pKa_interval.shape[0])

##### REMOVE COMPOUNDS THAT HAVE MORE THAN 4 PKAS WITHIN 3-11 INTERVAL #####
df_pKa_interval = df_pKa_interval.loc[df_pKa_interval["pKa count in [3,11]"] <= 4.0].reset_index(drop=True)

df_pKa_interval.to_csv("df_pKa_interval_3-11.csv")
print("Number of molecules with at most 4 pKa in 3-11 interval: ", df_pKa_interval.shape[0])

#print(df_pKa_interval)
df_pKa_interval

Number of molecules with at least 1 pKa in 3-11 interval:  5
Number of molecules with at most 4 pKa in 3-11 interval:  5


Unnamed: 0,eMolecules SKU,predicted pKas,"pKas in [3,11]","pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES
0,140622184,"[-0.417, -0.375, 6.881]",[6.881],1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3ccc(cc3)F)F
1,146582116,"[3.158, 13.733]",[3.158],1.0,False,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O
2,103864712,"[1.152, 4.231, 11.29, 11.964, 13.609]",[4.231],1.0,False,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N
3,166195289,"[0.192, 2.134, 9.045]",[9.045],1.0,False,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2
4,177271519,"[1.555, 6.069, 8.134, 14.754]","[6.069, 8.134]",2.0,False,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...


In [70]:
#####  REMOVE COMPOUNDS WITH PKA CLOSER THAN 1 LOG UNIT #####
df_pKa_interval_spread = df_pKa_interval.loc[df_pKa_interval["pKas closer than 1 unit"]==False].reset_index(drop=True)

df_pKa_interval_spread.to_csv("df_pKa_interval_3-11_spread.csv")
print("Number of molecules with pKa in 3-11 interval and spread*: ", df_pKa_interval_spread.shape[0])
print("* pKa values of each molecule are not closer than 1 log unit.")
#print(df_pKa_interval_spread)
df_pKa_interval_spread

Number of molecules with pKa in 3-11 interval and spread*:  5
* pKa values of each molecule are not closer than 1 log unit.


Unnamed: 0,eMolecules SKU,predicted pKas,"pKas in [3,11]","pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES
0,140622184,"[-0.417, -0.375, 6.881]",[6.881],1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3ccc(cc3)F)F
1,146582116,"[3.158, 13.733]",[3.158],1.0,False,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O
2,103864712,"[1.152, 4.231, 11.29, 11.964, 13.609]",[4.231],1.0,False,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N
3,166195289,"[0.192, 2.134, 9.045]",[9.045],1.0,False,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2
4,177271519,"[1.555, 6.069, 8.134, 14.754]","[6.069, 8.134]",2.0,False,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...


In [71]:
print("Done.")

Done.


In [13]:
# Example to read pickle files
# import pickle
# dictionary = pickle.load(open("eMolID_can_iso_smiles_dict.pickle", "rb"))
