In [1]:
#!/usr/bin/env python3

# This script takes an SDF file for a set of molecules with a SMILES column as input and filters out repeating entries,
# molecules without prices, entries from suppliers outside Tier 1, and molecules with lower availability than 100 mg.

import pandas as pd
import numpy as np
from openeye import oechem, oedepict, oemolprop
import oenotebook as oenb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Convert the SDF file of the full similarity list to CSV for easier manipulation,
# then load the CSV into a pandas DataFrame.

sdf_path = "eMol_similarity_set_2017_07.sdf"
csv_path = "eMol_similarity_set_2017_07.csv"

ifs = oechem.oemolistream()
ofs = oechem.oemolostream()

ifs.SetFormat(oechem.OEFormat_SDF)
ofs.SetFormat(oechem.OEFormat_CSV)

if ifs.open(sdf_path):
    if ofs.open(csv_path):
        for mol in ifs.GetOEGraphMols():
            oechem.OEWriteMolecule(ofs, mol)
        # Close the output stream so every record is flushed to disk before
        # pandas reads the same file below.
        ofs.close()
    else:
        # OEThrow lives in the oechem namespace; the bare name is not imported.
        oechem.OEThrow.Fatal("Unable to create '%s'" % csv_path)
    ifs.close()
else:
    oechem.OEThrow.Fatal("Unable to open '%s'" % sdf_path)

print("SDF file converted to CSV: " + csv_path)

df_eMol_sim = pd.read_csv(csv_path) # This file contains full starting set of compounds
print("Number of molecules: ", df_eMol_sim.shape[0])
df_eMol_sim

SDF file converted to CSV: eMol_similarity_set_2017_07.csv
Number of molecules:  2332


Unnamed: 0,SMILES,TITLE,TANIMOTO,AVG_TANIMOTO,Version_ID,Parent_ID,Supplier_ID,Compound_ID,CAS_number,Chemical_name,Catalog_ID,Supplier_status,Price_code,Salt_comment,Availability_amount,Availability_units,Supplier_name,Supplier_tier,Price_USD,Amount_mg
0,c1cc(cnc1)C(=O)Nc2cc(cc3c2ccnc3)F,181419324,0.544118,0.133022,181419323,181419322,64.0,6474375,,,1818.0,standard,Tangible SC,,,,Otava,3.0,440.0,50.0
1,c1ccc2c(c1)ncn2C3C(C(C(O3)CO)O)O,190171015,0.680000,0.096437,33399894,33399893,1380.0,STT-00189660,,,1758.0,favorites,,,,,InnovaPharm - Screening Compounds,1.0,,
2,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N,103864712,1.000000,0.075597,13594970,13594969,48.0,STOCK6S-42112,,"N6-(3-chlorophenyl)-9H-purine-2,6-diamine",83.0,favorites,SC,,300.0,mg,InterBioScreen,1.0,593.0,100.0
3,Cc1ccccc1c2nnc(o2)SCc3ccccc3,154972989,1.000000,0.107519,2210636,2210635,104.0,7407604,,"2-(benzylthio)-5-(2-methylphenyl)-1,3,4-oxadia...",994.0,favorites,2,,50.0,mg,ChemBridge,1.0,304.0,100.0
4,c1ccc(cc1)c2c(n3ccsc3n2)C=CC=C(C#N)C#N,151098039,0.531250,0.081640,26161271,26161270,40.0,Z56768097,,,1018.0,favorites,HTS_Collection,,60.9,mg,Enamine Screening Compounds,1.0,272.0,100.0
5,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...,133190549,1.000000,0.116009,732810,538377,61.0,OSSK_579035,,,163.0,standard,,2HCl,380.0,mg,Princeton BioMolecular Research,3.0,,
6,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...,177271519,1.000000,0.116009,2724817,538377,40.0,Z2784093501,,,1018.0,favorites,HTS_Collection,,,,Enamine Screening Compounds,1.0,272.0,100.0
7,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,138902653,1.000000,0.140443,1171893,1171892,580.0,PB30277195,,,1095.0,standard,,,397.5,mg,UORSY,3.0,,
8,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,143204456,1.000000,0.140443,1171893,1171892,61.0,OSSL_178402,,N-Benzothiazol-2-yl-2-phenyl-acetamide,163.0,standard,,,624.0,mg,Princeton BioMolecular Research,3.0,,
9,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,153698825,1.000000,0.140443,1171893,1171892,332.0,IVK/8147169,,,778.0,standard,,,,,Alinda Chemical,3.0,,


In [3]:
# Filter the similarity set step by step, printing the number of rows that
# remain after each filter, then save and display the final frame.

# Eliminate repeating entries. A new frame is built instead of using
# inplace=True, so the object displayed by the previous cell is not mutated.
df_eMol_sim = df_eMol_sim.drop_duplicates().reset_index(drop=True)
print("Number of unique enteries :", df_eMol_sim.shape[0])

# Eliminate entries without price (Price_USD must be a finite number)
has_price = np.isfinite(df_eMol_sim["Price_USD"])
df_eMol_sim_price = df_eMol_sim[has_price].reset_index(drop=True)
print("Number of unique enteries with price :", df_eMol_sim_price.shape[0])

# Eliminate entries not in Tier1
is_tier1 = df_eMol_sim_price["Supplier_tier"] == 1.0
df_eMol_sim_price_tier1 = df_eMol_sim_price[is_tier1].reset_index(drop=True)
print("Number of unique enteries with price from Tier1 :",df_eMol_sim_price_tier1.shape[0])

# Eliminate entries with an amount less than 100 mg.
# NOTE(review): this filters on "Amount_mg", not on "Availability_amount";
# rows whose Availability_amount is below 100 (or missing) are kept.
# Confirm that "Amount_mg" is the intended availability criterion.
has_100mg = df_eMol_sim_price_tier1["Amount_mg"] >= 100.0
df_eMol_sim_price_tier1_100mg = df_eMol_sim_price_tier1[has_100mg].reset_index(drop=True)
print("Number of unique enteries with price from Tier1, 100 mg availability:",df_eMol_sim_price_tier1_100mg.shape[0])

df_eMol_sim_price_tier1_100mg.to_csv("df_eMol_sim_price_tier1_100mg.csv")
df_eMol_sim_price_tier1_100mg

Number of unique enteries : 2233
Number of unique enteries with price : 1395
Number of unique enteries with price from Tier1 : 1093
Number of unique enteries with price from Tier1, 100 mg availability: 990


Unnamed: 0,SMILES,TITLE,TANIMOTO,AVG_TANIMOTO,Version_ID,Parent_ID,Supplier_ID,Compound_ID,CAS_number,Chemical_name,Catalog_ID,Supplier_status,Price_code,Salt_comment,Availability_amount,Availability_units,Supplier_name,Supplier_tier,Price_USD,Amount_mg
0,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N,103864712,1.000000,0.075597,13594970,13594969,48.0,STOCK6S-42112,,"N6-(3-chlorophenyl)-9H-purine-2,6-diamine",83.0,favorites,SC,,300.0,mg,InterBioScreen,1.0,593.0,100.0
1,Cc1ccccc1c2nnc(o2)SCc3ccccc3,154972989,1.000000,0.107519,2210636,2210635,104.0,7407604,,"2-(benzylthio)-5-(2-methylphenyl)-1,3,4-oxadia...",994.0,favorites,2,,50.0,mg,ChemBridge,1.0,304.0,100.0
2,c1ccc(cc1)c2c(n3ccsc3n2)C=CC=C(C#N)C#N,151098039,0.531250,0.081640,26161271,26161270,40.0,Z56768097,,,1018.0,favorites,HTS_Collection,,60.9,mg,Enamine Screening Compounds,1.0,272.0,100.0
3,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...,177271519,1.000000,0.116009,2724817,538377,40.0,Z2784093501,,,1018.0,favorites,HTS_Collection,,,,Enamine Screening Compounds,1.0,272.0,100.0
4,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,166195289,1.000000,0.140443,1171893,1171892,119.0,STK414477,,,1529.0,favorites,Historical,,574.0,mg,Vitas M Labs,1.0,233.0,100.0
5,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,168623294,1.000000,0.140443,1171893,1171892,7.0,Y030-1142,,,9.0,favorites,DC,,0.0,mg,ChemDiv,1.0,1205.0,1000.0
6,CS(=O)(=O)c1ccc(nn1)c2cccc(c2)NC(=O)c3cccc(c3)...,146653637,0.541176,0.120637,5552124,5552123,42.0,F2098-0025,,,652.0,favorites,General,,82.5,mg,Life Chemicals,1.0,400.0,100.0
7,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O,146582116,1.000000,0.132163,45809595,45809594,42.0,F0807-0563,,,652.0,favorites,General,,283.7,mg,Life Chemicals,1.0,400.0,100.0
8,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O,165973703,1.000000,0.132163,45809595,45809594,119.0,STL282299,,,1530.0,favorites,Historical,,55.0,mg,Vitas M Labs,1.0,233.0,100.0
9,COc1ccc(cc1OC)C(=O)Nc2c(c3c(s2)CCCC3)C(=O)N,103600837,1.000000,0.138121,1452944,1452943,48.0,STOCK2S-64515,,"2-(3,4-dimethoxybenzamido)-4,5,6,7-tetrahydrob...",83.0,favorites,RSC,,118.0,mg,InterBioScreen,1.0,740.0,100.0


In [5]:
# Reduce the filtered set to one row per molecule, drop one problematic
# compound, and export both the full table and a SMILES/TITLE-only table.

# Keep the first row for each distinct SMILES string
# (the SMILES column is taken to be canonical isomeric SMILES, per the original note).
df_eMol_sim_unique_molecules = df_eMol_sim_price_tier1_100mg.drop_duplicates(subset=["SMILES"])
print("Number of unique molecules: ", len(df_eMol_sim_unique_molecules))

# Eliminate compound with eMolecules SKU 112319653, because it takes too long for conformer generation.
keep_rows = df_eMol_sim_unique_molecules["TITLE"].ne(112319653)
df_eMol_sim_unique_molecules = df_eMol_sim_unique_molecules[keep_rows]
print("Number of unique molecules: ", len(df_eMol_sim_unique_molecules))

df_eMol_sim_unique_molecules.to_csv("df_eMol_sim_unique_molecules.csv")

# NOTE(review): to_csv writes a comma-separated table with a header row and the
# index column, so this ".smi" file is not a plain "SMILES title" listing.
# Confirm that downstream readers expect this layout.
df_eMol_sim_unique_molecules_smiles = df_eMol_sim_unique_molecules.loc[:, ["SMILES", "TITLE"]]
df_eMol_sim_unique_molecules_smiles.to_csv("df_eMol_sim_unique_molecules_smiles.smi")
df_eMol_sim_unique_molecules_smiles

Number of unique molecules:  623
Number of unique molecules:  622


Unnamed: 0,SMILES,TITLE
0,c1cc(cc(c1)Cl)Nc2c3c([nH]cn3)nc(n2)N,103864712
1,Cc1ccccc1c2nnc(o2)SCc3ccccc3,154972989
2,c1ccc(cc1)c2c(n3ccsc3n2)C=CC=C(C#N)C#N,151098039
3,c1ccc2c(c1)N(c3cc(ccc3S2)C(F)(F)F)CCCN4CCN(CC4...,177271519
4,c1ccc(cc1)CC(=O)Nc2nc3ccccc3s2,166195289
6,CS(=O)(=O)c1ccc(nn1)c2cccc(c2)NC(=O)c3cccc(c3)...,146653637
7,CC(C)c1ccc(cc1)/C=C\2/c3ccccc3NC2=O,146582116
9,COc1ccc(cc1OC)C(=O)Nc2c(c3c(s2)CCCC3)C(=O)N,103600837
15,CC1(Cc2c(cn(c(=O)c2C(=O)Nc3ccc(c(c3)Cl)OC)c4cc...,129249626
16,c1ccc(c(c1)C(=O)Nc2nnc(s2)SCc3ccc(cc3)F)F,140622184
