### Convert .sdf file from enamine to csv file with costs for sparrow

In [5]:
from rdkit import Chem 
from tqdm import tqdm 
import pandas as pd
import re 

In [6]:
def update_inventory(smi, cost, amt_in_g, inventory): 
    if not smi: 
        return inventory
    if smi not in inventory: 
        inventory[smi] = {}
        inventory[smi][amt_in_g] = cost
    else: 
        inventory[smi][amt_in_g] = cost
    return inventory 

In [7]:
supplfile = 'Enamine_FullCatalogue_USD_122024.sdf'
suppl = Chem.SDMolSupplier(supplfile)
inventory = {}
for i, mol in tqdm(enumerate(suppl), total=len(suppl)):
    if mol: 
        smi = Chem.MolToSmiles(mol)
        price_props = [prop for prop in mol.GetPropNames() if prop.startswith('Price')]
        for prop in price_props:
            price_str = prop.split('Price_USD_')[1]
            amt, unit, _ = re.split('([a-z]+)', price_str)
            amt = float(amt)
            if unit == 'mg': 
                amt = amt/1000
            elif unit != 'g': 
                print(f'unknown unit: {unit}')
                continue
            price = mol.GetPropsAsDict()[prop]
            inventory = update_inventory(smi, price, amt, inventory)

 12%|█▏        | 38159/319341 [00:26<03:13, 1452.38it/s][10:55:58] Explicit valence for atom # 11 C, 5, is greater than permitted
[10:55:58] ERROR: Could not sanitize molecule ending on line 2894599
[10:55:58] ERROR: Explicit valence for atom # 11 C, 5, is greater than permitted
 36%|███▌      | 115376/319341 [01:16<02:09, 1580.27it/s][10:56:49] Explicit valence for atom # 35 N greater than permitted
[10:56:49] ERROR: Could not sanitize molecule ending on line 8644626
[10:56:49] ERROR: Explicit valence for atom # 35 N greater than permitted
[10:56:49] Explicit valence for atom # 14 C, 6, is greater than permitted
[10:56:49] ERROR: Could not sanitize molecule ending on line 8647116
[10:56:49] ERROR: Explicit valence for atom # 14 C, 6, is greater than permitted
 38%|███▊      | 120548/319341 [01:20<02:14, 1480.31it/s][10:56:52] Explicit valence for atom # 2 O, 4, is greater than permitted
[10:56:52] ERROR: Could not sanitize molecule ending on line 9025862
[10:56:52] ERROR: Explicit val

In [8]:
print(f'failure rate: {1-len(inventory)/len(suppl): 0.3f}')
print(f'total inventory size: {len(inventory)}')

failure rate:  0.004
total inventory size: 317934


In [9]:
df = pd.DataFrame(inventory).transpose()
df.head()

Unnamed: 0,0.10,0.25,0.50,1.00,2.50,5.00,10.00
CCOc1cc(C=O)ccc1OC(C)C(=O)N(C)C,73.0,103.0,196.0,284.0,558.0,825.0,1224.0
C=CCn1c(S)nnc1-c1cc(-c2ccc(C)cc2)nc2ccccc12,66.0,92.0,175.0,256.0,503.0,743.0,1101.0
CCCCn1c(S)nc2cc(C(=O)OC)ccc2c1=O,83.0,116.0,218.0,314.0,614.0,908.0,1346.0
COc1cc(C=O)ccc1OCC(=O)c1ccc(Cl)c(Cl)c1,73.0,103.0,196.0,284.0,558.0,825.0,1224.0
Cn1c(NCCO)nc2ccccc21,32.0,45.0,70.0,90.0,155.0,265.0,395.0


In [10]:
# df.to_csv('complete_inventory_g.csv')

In [20]:
min_per_g = [
    min([
        df[col][i]/col for col in df.columns
    ]) 
    for i in tqdm(range(len(df)))
]

  df[col][i]/col for col in df.columns
100%|██████████| 317934/317934 [00:16<00:00, 19756.84it/s]


In [None]:
df_min_per_g = pd.DataFrame({'SMILES': df.index, 'Cost per g': min_per_g})
df_min_per_g.head()

Unnamed: 0,SMILES,Cost
0,CCOc1cc(C=O)ccc1OC(C)C(=O)N(C)C,122.4
1,C=CCn1c(S)nnc1-c1cc(-c2ccc(C)cc2)nc2ccccc12,110.1
2,CCCCn1c(S)nc2cc(C(=O)OC)ccc2c1=O,134.6
3,COc1cc(C=O)ccc1OCC(=O)c1ccc(Cl)c(Cl)c1,122.4
4,Cn1c(NCCO)nc2ccccc21,39.5


In [24]:
df_min_per_g.to_csv('enamine_per_g_122024.csv', index=False)