In [25]:
import sys
import time
import chembl_downloader
from chembl_downloader.contrib import get_target_smi_df
import pandas as pd
from tqdm.auto import tqdm
import pystow
from rdkit import Chem
import zipfile

In [26]:
# Record the Python interpreter version this notebook was executed with.
print(sys.version)

3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]


In [27]:
# Record the local wall-clock time at which this notebook was executed.
print(time.asctime())

Sun Oct 30 21:05:33 2022


Table 1 from Cortés-Ciriano and Bender's [*Deep Confidence: A Computationally Efficient Framework for Calculating Reliable Prediction Errors for Deep Neural Networks*](https://pubs.acs.org/doi/10.1021/acs.jcim.8b00542)

![Table 1 from Cortés-Ciriano and Bender](cortes-cirano-table-1.png)

In [16]:
# One entry per target, as a 3-tuple:
#   1. dataset name; the loading cell uses it as the file stem in
#      ``Datasets/{stem}.sdf`` and the summary cell as the abbreviation,
#   2. ChEMBL target ID; key of the per-target data frames,
#   3. previously reported data point count, used as ``old_count`` in the
#      summary (presumably transcribed from Table 1 of the paper; verify).
mappings = [
    ("A2a", "CHEMBL1867", 203),
    ("ABL1", "CHEMBL1862", 773),
    ("Acetylcholinesterase", "CHEMBL220", 3159),
    ("Aurora-A", "CHEMBL4722", 2125),
    ("B-raf", "CHEMBL5145", 1730),
    ("Cannabinoid", "CHEMBL218", 1116),
    ("Carbonic", "CHEMBL205", 603),
    ("Caspase", "CHEMBL2334", 1606),
    ("Coagulation", "CHEMBL204", 1700),
    ("COX-1", "CHEMBL221", 1343),
    ("COX-2", "CHEMBL230", 2855),
    ("Dihydrofolate", "CHEMBL202", 584),
    ("Dopamine", "CHEMBL217", 479),
    ("Ephrin", "CHEMBL222", 1740),
    ("erbB1", "CHEMBL203", 4868),
    ("Estrogen", "CHEMBL206", 1705),
    ("Glucocorticoid", "CHEMBL2034", 1447),
    ("Glycogen", "CHEMBL262", 1757),
    ("HERG", "CHEMBL240", 5207),
    ("JAK2", "CHEMBL2971", 2655),
    ("LCK", "CHEMBL258", 1352),
    ("Monoamine", "CHEMBL1951", 1379),
    ("opioid", "CHEMBL233", 840),
    ("Vanilloid", "CHEMBL4794", 1923),
]

## Load Old Data

In [6]:
# Location of the supplementary archive for the paper linked above
# (same DOI). Kept for provenance; in the cells visible here it is only
# referenced by the disabled download line below.
url = "https://pubs.acs.org/doi/suppl/10.1021/acs.jcim.8b00542/suppl_file/ci8b00542_si_001.zip"
# Automatic download through pystow is disabled; a local copy of the archive
# is read instead, so it must already exist at ``path``.
# path = pystow.ensure("cheminf", url=url)
path = "../data/ci8b00542_si_001.zip"

In [24]:
# Parse every per-target SDF in the supplementary archive into a data frame
# with columns (smiles, chembl_id, pchembl), keyed by ChEMBL target ID.
old_dfs = {}
with zipfile.ZipFile(path) as zip_file:
    for stem, target_chembl_id, _ in tqdm(mappings, unit="file"):
        with zip_file.open(f"Datasets/{stem}.sdf") as file:
            rows = [
                (
                    Chem.MolToSmiles(molecule),
                    molecule.GetProp("ChEMBL_ID"),
                    # GetProp returns the SDF tag as text; store the activity
                    # as a number so the column is numeric rather than object.
                    float(molecule.GetProp("pIC50")),
                )
                for molecule in tqdm(
                    Chem.ForwardSDMolSupplier(file),
                    desc=f"{target_chembl_id}",
                    leave=False,
                    unit="molecule",
                    unit_scale=True,
                )
                # The supplier yields None for records RDKit cannot parse;
                # skip them instead of crashing on MolToSmiles(None).
                if molecule is not None
            ]
        # ``old_df`` is still bound (to the most recent frame) in case other
        # cells rely on it; ``old_dfs`` is the complete per-target mapping.
        old_df = old_dfs[target_chembl_id] = pd.DataFrame(
            rows,
            columns=[
                "smiles",
                "chembl_id",
                "pchembl",
            ],
        )

  0%|          | 0/24 [00:00<?, ?file/s]

CHEMBL1867: 0.00molecule [00:00, ?molecule/s]

CHEMBL1862: 0.00molecule [00:00, ?molecule/s]

CHEMBL220: 0.00molecule [00:00, ?molecule/s]

CHEMBL4722: 0.00molecule [00:00, ?molecule/s]

CHEMBL5145: 0.00molecule [00:00, ?molecule/s]

CHEMBL218: 0.00molecule [00:00, ?molecule/s]

CHEMBL205: 0.00molecule [00:00, ?molecule/s]

CHEMBL2334: 0.00molecule [00:00, ?molecule/s]

CHEMBL204: 0.00molecule [00:00, ?molecule/s]

CHEMBL221: 0.00molecule [00:00, ?molecule/s]

CHEMBL230: 0.00molecule [00:00, ?molecule/s]

CHEMBL202: 0.00molecule [00:00, ?molecule/s]

CHEMBL217: 0.00molecule [00:00, ?molecule/s]

CHEMBL222: 0.00molecule [00:00, ?molecule/s]

CHEMBL203: 0.00molecule [00:00, ?molecule/s]

CHEMBL206: 0.00molecule [00:00, ?molecule/s]

CHEMBL2034: 0.00molecule [00:00, ?molecule/s]

CHEMBL262: 0.00molecule [00:00, ?molecule/s]

CHEMBL240: 0.00molecule [00:00, ?molecule/s]

CHEMBL2971: 0.00molecule [00:00, ?molecule/s]

CHEMBL258: 0.00molecule [00:00, ?molecule/s]

CHEMBL1951: 0.00molecule [00:00, ?molecule/s]

CHEMBL233: 0.00molecule [00:00, ?molecule/s]

CHEMBL4794: 0.00molecule [00:00, ?molecule/s]

## Refresh Data

In [4]:
# Resolve the ChEMBL version once and pass the same value to every query.
version = chembl_downloader.latest()

# new_dfs: target ChEMBL ID -> freshly queried data frame.
# rows: one summary record per target (name, ID, old count, new count, delta).
new_dfs, rows = {}, []
for abbr, target_chembl_id, old in tqdm(mappings):
    df = get_target_smi_df(target_chembl_id, version=version, aggregate="mean")
    new_dfs[target_chembl_id] = df
    new = df.shape[0]
    rows.append((abbr, target_chembl_id, old, new, new - old))

# NOTE(review): "abbrevation" is misspelled but kept unchanged, since other
# cells may select the column by this exact name.
summary_df = pd.DataFrame(
    data=rows,
    columns=[
        "abbrevation",
        "target_chembl_id",
        "old_count",
        "new_count",
        "delta",
    ],
)
summary_df

  0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,abbrevation,target_chembl_id,old_count,new_count,delta
0,A2a,CHEMBL1867,203,917,714
1,ABL1,CHEMBL1862,773,2425,1652
2,Acetylcholinestease,CHEMBL220,3159,5067,1908
3,Aurora-A,CHEMBL4722,2125,3202,1077
4,B-raf,CHEMBL5145,1730,4039,2309
5,Cannabinoid,CHEMBL218,1116,5007,3891
6,Carbonic,CHEMBL205,603,7297,6694
7,Caspase,CHEMBL2334,1606,1913,307
8,Coagulation,CHEMBL204,1700,5162,3462
9,COX-1,CHEMBL221,1343,1551,208
