# Selection of Extra Compounds for logD Challenge

Some of the compounds in pKa challenge compounds were not soluble enough for logD experiments. I will select 15 more compounds prioritizing ones likely to have high solubility.

Criteria:
- 150 <= mw <= 350
- rotatable bonds <= 3
- 4 < predicted pKa < 10
- only one titratable group betweeb pH 4 and 10

- -1 <= predicted logP <= 4
- -3 <= logS 



In [39]:
import pandas as pd
import numpy as np
import re
from openeye import oechem, oedepict, oemolprop
import oenotebook as oenb
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
compound_list_file = "df_pKa_interval_3-11_spread.csv"

df_pKa = pd.read_csv(compound_list_file )
df_pKa.head()

Unnamed: 0.1,Unnamed: 0,level_0,index,eMolecules ID,predicted pKas,"pKas in [3,11]","pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES,eMolecules SMILES
0,0,0,0,7616001,"[-2.982, 0.414, 3.454, 10.685, 13.943]","[3.454, 10.685]",2.0,False,c1cc2c(c(sc2nc1)C(=O)N)N,NC(=O)c1sc2c(c1N)cccn2
1,1,1,3,1570822,"[0.636, 3.95, 11.576]",[3.95],1.0,False,c1cc2cccnc2c(c1)NC(=O)c3ccc(cc3)Cl,Clc1ccc(cc1)C(=O)Nc1cccc2c1nccc2
2,2,2,4,44750813,"[0.435, 3.718, 6.011, 13.583]","[3.718, 6.011]",2.0,False,Cc1cnc(s1)Nc2ccccn2,Cc1cnc(s1)Nc1ccccn1
3,3,3,5,859659,"[-1.464, 1.397, 9.967, 14.623]",[9.967],1.0,False,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1
4,4,4,6,1155596,"[5.1, 13.807]",[5.1],1.0,False,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1


In [7]:
df_pKa_values = pd.DataFrame()
df_pKa_values["eMolecules ID"] = df_pKa["eMolecules ID"]
df_pKa_values["predicted pKas"] = df_pKa["predicted pKas"]
df_pKa_values["pKa count in [3,11]"] = df_pKa["pKa count in [3,11]"]
df_pKa_values["pKas closer than 1 unit"] = df_pKa["pKas closer than 1 unit"]
df_pKa_values["canonical isomeric SMILES"] = df_pKa["canonical isomeric SMILES"]
df_pKa_values["eMolecules SMILES"] = df_pKa["eMolecules SMILES"]

print(df_pKa_values.shape[0])
df_pKa_values.head()

180


Unnamed: 0,eMolecules ID,predicted pKas,"pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES,eMolecules SMILES
0,7616001,"[-2.982, 0.414, 3.454, 10.685, 13.943]",2.0,False,c1cc2c(c(sc2nc1)C(=O)N)N,NC(=O)c1sc2c(c1N)cccn2
1,1570822,"[0.636, 3.95, 11.576]",1.0,False,c1cc2cccnc2c(c1)NC(=O)c3ccc(cc3)Cl,Clc1ccc(cc1)C(=O)Nc1cccc2c1nccc2
2,44750813,"[0.435, 3.718, 6.011, 13.583]",2.0,False,Cc1cnc(s1)Nc2ccccn2,Cc1cnc(s1)Nc1ccccn1
3,859659,"[-1.464, 1.397, 9.967, 14.623]",1.0,False,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1
4,1155596,"[5.1, 13.807]",1.0,False,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1


## 1. pKa filter
### 1.a. One pKa in [3, 11]

In [6]:
df_one_pKa = df_pKa_values[df_pKa_values["pKa count in [3,11]"]==1.0]
df_one_pKa.shape[0]

140

In [None]:
df_one_pKa

### 1.b. pKa in [4,10]

In [16]:
df_one_pKa.head()

Unnamed: 0,eMolecules ID,predicted pKas,"pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES,eMolecules SMILES,"pKa count in [4,10]"
1,1570822,"[0.636, 3.95, 11.576]",1.0,False,c1cc2cccnc2c(c1)NC(=O)c3ccc(cc3)Cl,Clc1ccc(cc1)C(=O)Nc1cccc2c1nccc2,
3,859659,"[-1.464, 1.397, 9.967, 14.623]",1.0,False,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,
4,1155596,"[5.1, 13.807]",1.0,False,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,
5,4038159,"[-1.464, 0.867, 4.663, 13.619, 14.227]",1.0,False,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,
6,2259473,"[-0.417, -0.375, 6.525]",1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3cccc(c3)F)Cl,Fc1cccc(c1)CSc1nnc(s1)NC(=O)c1ccccc1Cl,


In [30]:
df_one_pKa["pKa count in [4,10]"]=np.NaN

for i, row in df_one_pKa.iterrows():
    
    # Count pKas that are within 4-10 interval
    pKa_in_interval_count = 0
    pKas_in_interval = []
    
    # parse pKas string as a list
    pKas = row["predicted pKas"]
    # remove brackets
    pKas = pKas[1:]
    pKas = pKas[:-1]
    pKa_list = re.split(', | [ | ]',pKas)
    
    for pKa in pKa_list:   
        pKa = float(pKa)
        if (4<= pKa) and (pKa <= 10):
            pKa_in_interval_count = int(pKa_in_interval_count + 1)
            pKas_in_interval.append(pKa)
    
    df_one_pKa.loc[i,"pKa count in [4,10]"] = pKa_in_interval_count
    #print(pKas_in_interval)
    df_one_pKa.loc[i,"pKas in [4,10]"] = str(pKas_in_interval)
    
print(df_one_pKa.shape[0])
df_one_pKa.head()

140


Unnamed: 0,eMolecules ID,predicted pKas,"pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES,eMolecules SMILES,"pKa count in [4,10]","pKas in [4,10]"
1,1570822,"[0.636, 3.95, 11.576]",1.0,False,c1cc2cccnc2c(c1)NC(=O)c3ccc(cc3)Cl,Clc1ccc(cc1)C(=O)Nc1cccc2c1nccc2,0.0,[]
3,859659,"[-1.464, 1.397, 9.967, 14.623]",1.0,False,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,1.0,[9.967]
4,1155596,"[5.1, 13.807]",1.0,False,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,1.0,[5.1]
5,4038159,"[-1.464, 0.867, 4.663, 13.619, 14.227]",1.0,False,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,1.0,[4.663]
6,2259473,"[-0.417, -0.375, 6.525]",1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3cccc(c3)F)Cl,Fc1cccc(c1)CSc1nnc(s1)NC(=O)c1ccccc1Cl,1.0,[6.525]


In [33]:
##### REMOVE COMPOUNDS THAT DON'T HAVE PKAS WITHIN 3-11 INTERVAL #####
df_pKa_interval = df_one_pKa.loc[df_one_pKa["pKa count in [4,10]"] >= 1.0].reset_index()

df_pKa_interval.to_csv("df_pKa_interval_4-10.csv")
print("Number of molecules with one pKa in 4-10 interval: ", df_pKa_interval.shape[0])
#print(df_pKa_interval)
df_pKa_interval.head()

Number of molecules with one pKa in 4-10 interval:  117


Unnamed: 0,index,eMolecules ID,predicted pKas,"pKa count in [3,11]",pKas closer than 1 unit,canonical isomeric SMILES,eMolecules SMILES,"pKa count in [4,10]","pKas in [4,10]"
0,3,859659,"[-1.464, 1.397, 9.967, 14.623]",1.0,False,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,1.0,[9.967]
1,4,1155596,"[5.1, 13.807]",1.0,False,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,1.0,[5.1]
2,5,4038159,"[-1.464, 0.867, 4.663, 13.619, 14.227]",1.0,False,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,1.0,[4.663]
3,6,2259473,"[-0.417, -0.375, 6.525]",1.0,False,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3cccc(c3)F)Cl,Fc1cccc(c1)CSc1nnc(s1)NC(=O)c1ccccc1Cl,1.0,[6.525]
4,7,6766101,"[-0.417, -0.375, 7.202]",1.0,False,c1ccc(cc1)CSc2nnc(s2)NC(=O)c3ccc(cc3)F,Fc1ccc(cc1)C(=O)Nc1nnc(s1)SCc1ccccc1,1.0,[7.202]


## 2. logP filter
-1 <= predicted logP <= 4

In [37]:
df_XlogP = df_pKa_interval.loc[:, ("eMolecules ID", "canonical isomeric SMILES", "eMolecules SMILES", "pKas in [4,10]")]

print("Initial number of molecules: ", df_XlogP.shape[0])

# make new column for XlogP data
df_XlogP["XlogP"] = np.NaN

for i, row in enumerate(df_XlogP.iterrows()):
    smiles = df_XlogP.loc[i,"canonical isomeric SMILES"]
    mol = oechem.OEGraphMol()
    oechem.OESmilesToMol(mol, str(smiles))

    try:
        logP=oemolprop.OEGetXLogP(mol)
    except RuntimeError:
            print("Error: i=",i," ", df_molecules.ix[i, "NAME"])
            logP=float("NaN")

    df_XlogP.loc[i,"XlogP"] = float(logP)
    del mol

##### ELIMINATE MOLECULES OUTSIDE ACCEPTABLE LOGP INTEVAL: -1 <= logP <= 4  #####

# Select molecules within desired logP interval
df_XlogP_interval = df_XlogP.loc[(-1 <= df_XlogP["XlogP"])]
df_XlogP_interval = df_XlogP_interval.loc[(df_XlogP_interval["XlogP"] <= 4)].reindex()

print("Number of molecules in logP interval: ", df_XlogP_interval.shape[0])

# Save dataframe of molecules that are withing XlogP interval criteria.
df_XlogP_interval.to_csv("df_XlogP_interval.csv")
print("df_XlogP_interval.csv file generated.")

Initial number of molecules:  117
Number of molecules in logP interval:  85
df_XlogP_interval.csv file generated.


In [38]:
df_XlogP_interval.head()

Unnamed: 0,eMolecules ID,canonical isomeric SMILES,eMolecules SMILES,"pKas in [4,10]",XlogP
0,859659,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,[9.967],2.157
1,1155596,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,[5.1],3.48
2,4038159,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,[4.663],2.314
5,31653344,c1ccc(cc1)n2cnc3c2ccc(c3)N,Nc1ccc2c(c1)ncn2c1ccccc1,[6.348],2.333
7,1421743,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc3ccc(cc3)OC,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc1ccc(cc1)OC,[9.167],3.795


## 3. filter by molecular weight

In [40]:
df_eMol = pd.read_csv("isosmiles-list-search-tier1-100mg.csv")
df_eMol_mw = df_eMol.loc[:,("eMolecules ID","MolWt","Availability (mg)", "Price")]

# Merge dataframes based on eMolecules ID
df_mw = df_XlogP_interval.merge(df_eMol_mw, on="eMolecules ID")
df_mw.head()

Unnamed: 0,eMolecules ID,canonical isomeric SMILES,eMolecules SMILES,"pKas in [4,10]",XlogP,MolWt,Availability (mg),Price
0,859659,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,[9.967],2.157,218.275,19471.5,223.0
1,1155596,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,[5.1],3.48,253.296,100.0,168.0
2,4038159,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,[4.663],2.314,360.427,100.0,168.0
3,31653344,c1ccc(cc1)n2cnc3c2ccc(c3)N,Nc1ccc2c(c1)ncn2c1ccccc1,[6.348],2.333,209.247,50213.0,148.0
4,1421743,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc3ccc(cc3)OC,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc1ccc(cc1)OC,[9.167],3.795,342.412,724.0,168.0


In [41]:
# I will group compounds according to molecular weight into 2 groups.
# Fragment-like: 150<= mw <350
# Drug-like: 350< mw <=500

df_mw["group"] = None
for i, row in enumerate(df_mw.iterrows()):
    mw =df_mw.loc[i,"MolWt"]
    if (150 <= mw) and (mw < 350):
        df_mw.loc[i,"group"]= "fragment-like"
    elif (380 < mw) and (mw <= 500):
        df_mw.loc[i,"group"]= "drug-like"
    else:
        continue

df_mw.to_csv("df_mw.csv", index=False)        
df_mw.head()

Unnamed: 0,eMolecules ID,canonical isomeric SMILES,eMolecules SMILES,"pKas in [4,10]",XlogP,MolWt,Availability (mg),Price,group
0,859659,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,[9.967],2.157,218.275,19471.5,223.0,fragment-like
1,1155596,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,[5.1],3.48,253.296,100.0,168.0,fragment-like
2,4038159,Cc1ccc(cc1C)c2csc(c2C(=O)N)NC(=O)CCCC(=O)O,OC(=O)CCCC(=O)Nc1scc(c1C(=O)N)c1ccc(c(c1)C)C,[4.663],2.314,360.427,100.0,168.0,
3,31653344,c1ccc(cc1)n2cnc3c2ccc(c3)N,Nc1ccc2c(c1)ncn2c1ccccc1,[6.348],2.333,209.247,50213.0,148.0,fragment-like
4,1421743,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc3ccc(cc3)OC,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc1ccc(cc1)OC,[9.167],3.795,342.412,724.0,168.0,fragment-like


In [42]:
df_frag = df_mw.loc[df_mw["group"] == "fragment-like"]
print("Number of compounds suitable for fragment-like group: ", df_frag.shape[0])

df_drug = df_mw.loc[df_mw["group"] == "drug-like"]
print("Number of compounds suitable for drug-like group: ", df_drug.shape[0])

Number of compounds suitable for fragment-like group:  74
Number of compounds suitable for drug-like group:  6


In [45]:
df_frag.head()

Unnamed: 0,eMolecules ID,canonical isomeric SMILES,eMolecules SMILES,"pKas in [4,10]",XlogP,MolWt,Availability (mg),Price,group,N_Rot
0,859659,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,[9.967],2.157,218.275,19471.5,223.0,fragment-like,2.0
1,1155596,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,[5.1],3.48,253.296,100.0,168.0,fragment-like,5.0
3,31653344,c1ccc(cc1)n2cnc3c2ccc(c3)N,Nc1ccc2c(c1)ncn2c1ccccc1,[6.348],2.333,209.247,50213.0,148.0,fragment-like,
4,1421743,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc3ccc(cc3)OC,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc1ccc(cc1)OC,[9.167],3.795,342.412,724.0,168.0,fragment-like,
5,5616156,c1cc(cc(c1)F)CSc2nnc(o2)c3ccncc3,Fc1cccc(c1)CSc1nnc(o1)c1ccncc1,[4.902],3.441,287.312,428.5,148.0,fragment-like,


## 4. filtering based on number of rotatable bonds

In [48]:
# Number of non-terminal rotatable bonds (excluding -OH)
df_mw = df_frag.reset_index(drop=True)
df_mw["N_Rot"] = np.NaN

for i, row in enumerate(df_mw.iterrows()):
    smiles = df_mw.loc[i,"canonical isomeric SMILES"]
    
    mol = oechem.OEGraphMol()
    oechem.OESmilesToMol(mol, smiles)
    
    rotcounts = []
    nrots = oechem.OECount(mol, oechem.OEIsRotor())
    while nrots >= len(rotcounts):
        rotcounts.append(0)
    rotcounts[nrots] += 1
    
    del mol
    number_of_rotatable_bonds = len(rotcounts) - 1
    #print ("Max rotors:", number_of_rotatable_bonds)
    #print(df_mw.loc[i,"eMolecules ID"])
    
    df_mw.loc[i,"N_Rot"]= number_of_rotatable_bonds 

df_mw.head()

Unnamed: 0,eMolecules ID,canonical isomeric SMILES,eMolecules SMILES,"pKas in [4,10]",XlogP,MolWt,Availability (mg),Price,group,N_Rot
0,859659,c1ccc(cc1)c2cc(c(s2)N)C(=O)N,NC(=O)c1cc(sc1N)c1ccccc1,[9.967],2.157,218.275,19471.5,223.0,fragment-like,2.0
1,1155596,COc1ccc(cc1)NC(=O)/C=C/c2ccccc2,COc1ccc(cc1)NC(=O)/C=C/c1ccccc1,[5.1],3.48,253.296,100.0,168.0,fragment-like,5.0
2,31653344,c1ccc(cc1)n2cnc3c2ccc(c3)N,Nc1ccc2c(c1)ncn2c1ccccc1,[6.348],2.333,209.247,50213.0,148.0,fragment-like,1.0
3,1421743,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc3ccc(cc3)OC,CCOc1ccc2c(c1)sc(n2)NC(=O)Cc1ccc(cc1)OC,[9.167],3.795,342.412,724.0,168.0,fragment-like,7.0
4,5616156,c1cc(cc(c1)F)CSc2nnc(o2)c3ccncc3,Fc1cccc(c1)CSc1nnc(o1)c1ccncc1,[4.902],3.441,287.312,428.5,148.0,fragment-like,4.0


In [50]:
# Number of rotatable bonds in fragment-like group shoud be <= 4
df_frag = df_mw.loc[df_mw["group"] == "fragment-like"]
df_frag_nrot_interval = df_frag.loc[df_frag["N_Rot"] <= 4]

print("Number of compounds suitable for fragment-like group: ", df_frag_nrot_interval.shape[0])

Number of compounds suitable for fragment-like group:  56


## 5. Eliminate compounds with NaN price or availability

In [51]:
# Eliminate entries without price in fragment-like set
df_frag = df_frag[np.isfinite(df_frag["Price"])].reset_index(drop=True)

# Eliminate entries with availability less than 100 mg
df_frag = df_frag[df_frag["Availability (mg)"] >= 100.0].reset_index(drop=True)
print("Number of fragment-like molecules with price and 100 mg availability:",df_frag.shape[0])

Number of fragment-like molecules with price and 100 mg availability: 66


## 6. Sort molecules by their price

In [52]:
df_frag = df_frag.sort_values(by = "Price", ascending = True, inplace=False).reset_index(drop=True)

## 7. Eliminate compounds determined to have experimental logP reported

I run cycles of selecting 25 compounds for fragment-like group and checking which ones have experimental logPs reported. eMolecules IDs of compounds with reported logPs were saved in a list, so that we can avoid them.

In [53]:
# List of molecules with experimental logPs reported in Chemspider
list_of_paths_to_exp_logP_file =  [
    "../20170808_zinc15_eMolecules_subset_exp_logP_check/fragments_with_exp_logP_round1.pickle",
    "../20170808_zinc15_eMolecules_subset_exp_logP_check/fragments_with_exp_logP_round2.pickle",
    "../20170808_zinc15_eMolecules_subset_exp_logP_check/fragments_with_exp_logP_round3.pickle",
    "../20170808_zinc15_eMolecules_subset_exp_logP_check/fragments_with_exp_logP_round4.pickle"]

print("Number of molecules before filtering by experimental logP:, ",df_frag.shape[0])

for file in list_of_paths_to_exp_logP_file:
    exp_logP_list = pickle.load(open(file, "rb"))

    print("eMolecules IDs of compounds eliminated:")
    for eMol_id in exp_logP_list:
        print(eMol_id)
        df_frag = df_frag[df_frag["eMolecules ID"] != eMol_id].reset_index(drop=True)

print("Number of molecules after eliminating compounds with experimental logP: ",df_frag.shape[0])

Number of molecules before filtering by experimental logP:,  66
eMolecules IDs of compounds eliminated:
490595
532754
4934119
2867544
2727697
493608
508324
2042282
1698122
703997
eMolecules IDs of compounds eliminated:
45490344
27013168
1503405
17004732
2266357
1202048
3374919
eMolecules IDs of compounds eliminated:
3605943
2546448
1716329
eMolecules IDs of compounds eliminated:
1532460
Number of molecules after eliminating compounds with experimental logP:  60


## 8. Eliminate compounds previously have been selected for pKa challenge

## 9. Include molecules with desired groups
Murcko fragments of FDA approved Protein Kinase Inhibitors were created. I analyzed the frequency of appearance of ring fragments. The following rings were seen multiple times:
1. pyridine
2. piperazine
3. quinazoline
4. quinoline
5. pyrimidine
6. indazole
7. pyrazole
8. imidazole

Three FDA approved kinase inhibitors have sulfonamide group: 
9. sulfonamide

I aim to include at least one molecule of each of these frequent rings in the SAMPL6 logP fragment-like set.

In [58]:
smarts = {'pyridine':"c1cccc[nX2]1",
               'piperazine': "[NX3]1CC[NX3]CC1",
               'quinazoline': "c:1:c:2:c(:c:c:c:1):n:c:n:c:2",
               'quinoline': "c:1:c:2:c(:c:c:c:1):n:c:c:c:2",
               'pyrimidine': "c1[nX2]ccc[nX2]1",
               'indazole': "c1c2c(ccc1)nnc2",
               'pyrazole': "c1cnnc1",
               'imidazole': "c1nccn1",
                'sulfanomide': "S(=O)(=O)N"}

#How to record selected compounds
picked_ids=[]
df_frag["Selection"] = None
df_frag["Bin index"] = None
df_frag["Priority"] = None

# Find the cheapest compound that matches each ring 
for key, value in smarts.items():
    queried_substructure = value
    
    # Iterate over compounds in the increasing price order
    for i, row in df_frag.iterrows():
        smiles = row["canonical isomeric SMILES"]
        
        # Substructure search
        mol = oechem.OEGraphMol()
        oechem.OESmilesToMol(mol, str(smiles))
        ss = oechem.OESubSearch(queried_substructure)
        oechem.OEPrepareSearch(mol, ss)

        # loop over matches to count
        matched_ss_list=[]
        count = 0
        for index, match in enumerate(ss.Match(mol)):
            if ss.SingleMatch(mol) == True:
                matched_ss_list.append((index, match))
                count = len(matched_ss_list)
        
        # If substructure matches and molecule already not selected
        if (count >= 1) and (df_frag.loc[i, "Selection"] != "picked"):
            df_frag.loc[i,"Selection"] = "picked"
            df_frag.loc[i,"Priority"] = 1
            print("Molecule that has {}: {}".format(key, smiles))
            break
 
    # If no match found
    if count == 0:
        print("Could not found a molecule that has {}!".format(key))
 

Molecule that has quinazoline: c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl
Molecule that has sulfanomide: Cc1ccc(cc1)S(=O)(=O)Nc2cccc(c2)C(F)(F)F
Molecule that has imidazole: c1ccc(cc1)n2cnc3c2ccc(c3)N
Molecule that has quinoline: Cc1ccc2c(c1)c(c(c(=O)[nH]2)CC(=O)O)c3ccccc3
Molecule that has indazole: c1ccc(cc1)NC(=O)c2c3ccccc3[nH]n2
Could not found a molecule that has pyrazole!
Molecule that has pyrimidine: COc1cccc(c1)Nc2c3ccccc3ncn2.Cl
Could not found a molecule that has piperazine!
Molecule that has pyridine: c1cc(cc(c1)F)CSc2nnc(o2)c3ccncc3
