# Map IDC Compounds to DrugBank

[MicroSource International Drug Collection](http://www.msdiscovery.com/intdrug.html)

In [1]:
import re
import logging

import pandas
import requests

## Read IDC compounds from tamai screen spreadsheet

In [2]:
# Load the IDC sheet, prefix every column with "idc_", then tidy the messy
# column names (trim whitespace, lowercase, spaces -> underscores).
idc_df = (
    pandas.read_excel(
        io="data/tamai-screen/IDC_Period_24-120.xlsx",
        sheet_name="MicroSource Discovery社311品目",
    )
    .add_prefix("idc_")
    .rename(columns=lambda column: column.strip().lower().replace(" ", "_"))
    # Build the zero-padded Pharmakon identifier from the numeric ID.
    # http://www.msdiscovery.com/pharmakon.html
    .assign(pharmakon_id=lambda df: df["idc_idnumber"].map("Pharmakon1600-{:08d}".format))
)
idc_df.head(2)


Unnamed: 0,idc_idnumber,idc_plate,idc_position,idc_name,idc_structure,idc_formula,idc_moecularl_weight,idc_saltdata,idc_medical_use,idc_reference,idc_cas,idc_status,idc_etc,pharmakon_id
0,1506107,151126-01,A02,SULFACARBAMIDE,,C7H9N3O3S,215.23208,,antibacterial,Antimicrob Agents Chemother 19:82 (1981),547-44-4,synthetic,"INN, BAN",Pharmakon1600-01506107
1,210369,151126-01,A03,GALLIC ACID,,C7H6O5,170.12287,,"antineoplastic, astringent, antibacterial",J Chem Soc 1961:1854; Agric Biol Chem 30:617 (...,149-91-7,insect galls,"INN, NF-VII",Pharmakon1600-00210369


## Map IDC compounds to PubChem and DrugBank

Uses the `Pharmakon1600-` style IDs for the [MicroSource Pharmakon Collection](http://www.msdiscovery.com/pharmakon.html), which includes the IDC. If the lookup by Pharmakon ID fails, fall back to a plain name search.

[PubChem](https://pubchempy.readthedocs.io/en/latest/guide/substance.html#retrieving-substances) has the Pharmakon identifiers deposited as substances. For example, [search PubChem](https://pubchem.ncbi.nlm.nih.gov/#query=Pharmakon1600-00300607) for `Pharmakon1600-00300607`.

In [3]:
def query_pubchem(compound_name, timeout=30):
    """
    Look up SBURL cross-references for a compound name via PubChem PUG REST.

    Example request:
    https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/Pharmakon1600-01500621/xrefs/SBURL/JSON

    Parameters
    ----------
    compound_name : str
        Name to search for (e.g. a ``Pharmakon1600-`` ID or a plain drug name).
    timeout : float
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict or None
        The parsed JSON response, or None when the HTTP status is not OK
        (the status code and URL are logged at INFO level).
    """
    # Local import: keeps this cell runnable without touching the import cell.
    from urllib.parse import quote

    # Percent-encode the name so characters such as "/", "?" or "#" stay part
    # of the name instead of being read as URL path/query/fragment delimiters.
    encoded_name = quote(str(compound_name), safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/xrefs/SBURL/JSON"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        logging.info(f"query for {compound_name} returned status code {response.status_code}\n{response.url}")
        return None
    return response.json()


# Compiled once at definition time. Dots are escaped so that "." only matches a
# literal dot; both the drugbank.ca and drugbank.com hosts are accepted, with an
# optional "www." or "go." subdomain.
_DRUGBANK_URL_PATTERN = re.compile(
    r"https?://(?:(?:www|go)\.)?drugbank\.(?:ca|com)/drugs/(?P<drugbank_id>DB[0-9]+)"
)


def url_to_drugbank_id(url):
    """
    Extract the DrugBank accession (e.g. ``DB00316``) from a DrugBank drug URL.

    Parameters
    ----------
    url : str
        URL to inspect; it is matched from the start of the string.

    Returns
    -------
    str or None
        The ``DB…`` identifier, or None when the URL is not a DrugBank drug page.
    """
    match = _DRUGBANK_URL_PATTERN.match(url)
    return match.group("drugbank_id") if match else None


def query_pubchem_for_drugbank(compound_name):
    """
    Map a compound name to a PubChem CID and, when available, a DrugBank ID.

    Parameters
    ----------
    compound_name : str
        Name passed on to ``query_pubchem``.

    Returns
    -------
    dict or None
        None when ``query_pubchem`` returns nothing. Otherwise a dict with keys
        ``query_compound``, ``pubchem_cid`` and ``drugbank_id``; ``drugbank_id``
        is None when no cross-reference URL parses as a DrugBank drug URL.
        Only the first record of the PubChem response is used.
    """
    results = query_pubchem(compound_name)
    if not results:
        return None
    first_record = results["InformationList"]["Information"][0]
    # Tolerate a record that carries no "SBURL" key: treat it as having no
    # cross-reference URLs rather than failing with a KeyError.
    source_urls = first_record.get("SBURL", [])
    drugbank_ids = sorted(set(filter(None, map(url_to_drugbank_id, source_urls))))
    if len(drugbank_ids) > 1:
        logging.info(f"{compound_name} maps to multiple DrugBank IDs: {drugbank_ids}")
    row = dict(
        query_compound=compound_name,
        pubchem_cid=first_record["CID"],
        # With several candidates, the lexicographically smallest ID is kept.
        drugbank_id=drugbank_ids[0] if drugbank_ids else None,
    )
    return row

In [4]:
# %%time
rows = []
for idc_compound in idc_df.itertuples():
    # Try the Pharmakon ID first, then the plain name. The generator is lazy,
    # so the name query is only issued when the Pharmakon ID lookup fails.
    candidate_names = (idc_compound.pharmakon_id, idc_compound.idc_name)
    lookups = (query_pubchem_for_drugbank(candidate) for candidate in candidate_names)
    row = next(filter(None, lookups), None)
    if row is None:
        logging.warning(f"pharmakon_id and idc_name lookup failed for {idc_compound.idc_name}")
        continue
    # Record which IDC compound this hit belongs to, for the later merge.
    row.update(idc_name=idc_compound.idc_name, pharmakon_id=idc_compound.pharmakon_id)
    rows.append(row)
len(rows)



310

In [5]:
# Left-join the PubChem hits onto the plate layout so that every IDC compound
# is kept, including those without a hit.
idc_map_df = pandas.merge(
    idc_df.loc[:, ["pharmakon_id", "idc_name", "idc_plate", "idc_position"]],
    pandas.DataFrame(rows),
    how="left",
).convert_dtypes()
idc_map_df.head()

Unnamed: 0,pharmakon_id,idc_name,idc_plate,idc_position,query_compound,pubchem_cid,drugbank_id
0,Pharmakon1600-01506107,SULFACARBAMIDE,151126-01,A02,Pharmakon1600-01506107,11033,
1,Pharmakon1600-00210369,GALLIC ACID,151126-01,A03,Pharmakon1600-00210369,370,
2,Pharmakon1600-00210866,KHELLIN,151126-01,A04,Pharmakon1600-00210866,3828,
3,Pharmakon1600-01503127,DEQUALINIUM CHLORIDE,151126-01,A05,Pharmakon1600-01503127,5351247,
4,Pharmakon1600-00212064,HYDROXYTOLUIC ACID,151126-01,A06,Pharmakon1600-00212064,6738,


In [7]:
# Preview only the records that carry a DrugBank ID.
idc_map_df.loc[idc_map_df["drugbank_id"].notna()].head()

Unnamed: 0,pharmakon_id,idc_name,idc_plate,idc_position,query_compound,pubchem_cid,drugbank_id
5,Pharmakon1600-00212151,CHLORQUINALDOL,151126-01,A07,Pharmakon1600-00212151,6301,DB13306
7,Pharmakon1600-00300546,BERGAPTEN,151126-01,A09,Pharmakon1600-00300546,2355,DB12216
11,Pharmakon1600-01502024,PIPEMIDIC ACID,151126-01,B03,Pharmakon1600-01502024,4831,DB13823
14,Pharmakon1600-00330012,COUMOPHOS,151126-01,B06,Pharmakon1600-00330012,2871,DB11390
15,Pharmakon1600-01506157,NIKETHAMIDE,151126-01,B07,Pharmakon1600-01506157,5497,DB13655


In [8]:
# IDC compounds for which no PubChem CID was found.
idc_map_df.loc[idc_map_df["pubchem_cid"].isna()]

Unnamed: 0,pharmakon_id,idc_name,idc_plate,idc_position,query_compound,pubchem_cid,drugbank_id
172,Pharmakon1600-01503740,BUTYLATED HYDROXYANISOLE,151126-03,B05,,,


In [17]:
# Tally how each compound was resolved, then print a short report.
n_via_pharmakon_id = (idc_map_df.query_compound == idc_map_df.pharmakon_id).sum()
n_via_idc_name = (idc_map_df.query_compound == idc_map_df.idc_name).sum()
n_without_pubchem = sum(idc_map_df.pubchem_cid.isna())
n_with_drugbank = sum(idc_map_df.drugbank_id.notna())
summary_lines = [
    f"{n_via_pharmakon_id:,} compounds mapped to pubchem via pharmakon_id",
    f"{n_via_idc_name:,} compounds mapped to pubchem via idc_name",
    f"{n_without_pubchem:,} compounds failed to map to pubchem",
    "",
    f"{n_with_drugbank:,} compounds mapped to DrugBank",
]
print("\n".join(summary_lines))

273 compounds mapped to pubchem via pharmakon_id
37 compounds mapped to pubchem via idc_name
1 compounds failed to map to pubchem

120 compounds mapped to DrugBank


In [18]:
# Persist the IDC -> PubChem/DrugBank mapping as a tab-separated file (no index column).
idc_map_df.to_csv(
    path_or_buf="data/compounds/idc-to-pubchem.tsv",
    sep="\t",
    index=False,
)

## Map compounds to DrugBank using MyChem.info

Uses https://mychem.info/

Alternative to the PubChem method.

In [3]:
# Reserved query-syntax tokens, in the order they are tried by the pattern.
# A raw string is used so the lone backslash is a real backslash rather than an
# invalid "\ " escape sequence. Multi-character operators ("&&", "||") are
# matched, and therefore escaped, as a single unit.
_SOLR_RESERVED_TOKENS = r'+ - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /'.split()
_SOLR_RESERVED_PATTERN = re.compile("|".join(map(re.escape, _SOLR_RESERVED_TOKENS)))


def solr_escape(text):
    """
    Escape reserved characters for pysolr queries.
    https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html#TheStandardQueryParser-EscapingSpecialCharacters
    https://docs.mychem.info/en/latest/doc/chem_query_service.html#escaping-reserved-characters

    Parameters
    ----------
    text : str
        Raw text to embed in a query.

    Returns
    -------
    str
        ``text`` with a backslash inserted before every reserved token.
    """
    return _SOLR_RESERVED_PATTERN.sub(lambda match: "\\" + match.group(), text)


def get_mychem_query(name):
    """
    Build the MyChem.info query string for a compound name.

    The escaped name is searched as a quoted phrase in the DrugBank name and
    synonym fields, restricted to documents that have a ``drugbank`` section.
    https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
    """
    phrase = solr_escape(name)
    clauses = []
    for field in ("drugbank.name", "drugbank.synonyms"):
        clauses.append(f'{field}:"{phrase}"')
    any_field = " OR ".join(clauses)
    return f"_exists_:drugbank AND ({any_field})"


def query_mychem(name, timeout=30):
    """
    Query MyChem.info for the top DrugBank hit matching a compound name.

    Note: solr doesn't have a whole target string match mode.
    For example, "Gallic acid" will match "gallic acid bismuth basic salt",
    so the returned hit may be a different compound than the one queried.

    Parameters
    ----------
    name : str
        Compound name; turned into a query by ``get_mychem_query``.
    timeout : float
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw response; the caller is responsible for decoding the JSON.
    """
    url = "https://mychem.info/v1/query"
    params = dict(
        q=get_mychem_query(name),
        fields="drugbank.name,drugbank.id,drugbank.synonyms,drugbank.formula,drugbank.weight",
        # Only the single best-scoring hit is requested.
        size=1,
    )
    response = requests.get(url, params=params, timeout=timeout)
    return response



In [4]:
# Preview the compound names that will be queried. The column is "idc_name":
# the loading cell prefixes and lowercases every column, so "Name" no longer exists.
idc_df["idc_name"].head()

0          SULFACARBAMIDE
1             GALLIC ACID
2                 KHELLIN
3    DEQUALINIUM CHLORIDE
4      HYDROXYTOLUIC ACID
Name: Name, dtype: object

In [5]:
# Query MyChem.info once per IDC compound name and collect the returned hits.
rows = []
# Use the "idc_name" column produced by the loading cell ("Name" no longer exists).
for name in idc_df["idc_name"]:
    response = query_mychem(name)
    # Fail with a clear HTTP error instead of a KeyError on "hits" below.
    response.raise_for_status()
    for hit in response.json()["hits"]:
        # Remember which query produced this hit, for the later merge.
        hit["query_compound"] = name
        rows.append(hit)
# Flatten nested fields (e.g. drugbank.id, drugbank.weight.average) into columns.
query_df = pandas.json_normalize(rows)
query_df.head(2)

Unnamed: 0,_id,_score,query_compound,drugbank._license,drugbank.formula,drugbank.id,drugbank.name,drugbank.synonyms,drugbank.weight.average,drugbank.weight.monoisotopic
0,JAONZGLTYYUPCT-UHFFFAOYSA-K,8.772476,GALLIC ACID,http://bit.ly/2PSfZTD,C7H5BiO6,DB13909,Bismuth subgallate,"[basic bismuth 3,4,5-trihydroxybenzoate, basis...",394.091,393.98901
1,GPTXWRGISTZRIO-UHFFFAOYSA-N,14.978941,CHLORQUINALDOL,http://bit.ly/2PSfZTD,C10H7Cl2NO,DB13306,Chlorquinaldol,"[2-methyl-5,7-dichloro-8-hydroxyquinoline, 5,7...",228.07,226.990469


In [6]:
# Keep only the columns needed for the comparison with the IDC sheet.
query_df = query_df.reindex(
    [
        "query_compound",
        "drugbank.id",
        "drugbank.name",
        "drugbank.formula",
        "drugbank.weight.average",
    ],
    axis="columns",
)

In [7]:
# Join the IDC sheet with the MyChem.info hits (inner join on the queried name).
# The columns are selected by the names the loading cell actually produces;
# strict selection raises on a missing column instead of silently creating
# all-NaN columns. They are then renamed to the labels the downstream cells
# and the exported TSV use.
merged_df = (
    idc_df
    [["idc_idnumber", "idc_name", "idc_formula", "idc_moecularl_weight"]]
    .rename(columns={
        "idc_idnumber": "idc_IDNUMBER",
        "idc_name": "idc_Name",
        "idc_formula": "idc_Formula",
        "idc_moecularl_weight": "idc_moecularl weight",
    })
    .merge(query_df, left_on="idc_Name", right_on="query_compound")
)
merged_df.head(3)

Unnamed: 0,idc_IDNUMBER,idc_Name,idc_Formula,idc_moecularl weight,query_compound,drugbank.id,drugbank.name,drugbank.formula,drugbank.weight.average
0,210369,GALLIC ACID,C7H6O5,170.12287,GALLIC ACID,DB13909,Bismuth subgallate,C7H5BiO6,394.091
1,212151,CHLORQUINALDOL,C10H7Cl2NO,228.07939,CHLORQUINALDOL,DB13306,Chlorquinaldol,C10H7Cl2NO,228.07
2,300546,BERGAPTEN,C12H8O4,216.19516,BERGAPTEN,DB12216,Bergapten,C12H8O4,216.192


In [8]:
# Persist the IDC -> DrugBank (MyChem.info) mapping as a tab-separated file (no index column).
merged_df.to_csv(
    path_or_buf="data/compounds/idc-to-drugbank.tsv",
    sep="\t",
    index=False,
)

In [11]:
# Rows where the IDC name and the DrugBank name differ, ignoring case.
idc_names_lower = merged_df["idc_Name"].str.lower()
drugbank_names_lower = merged_df["drugbank.name"].str.lower()
name_mismatch_df = merged_df.loc[idc_names_lower.ne(drugbank_names_lower)]
name_mismatch_df.head(2)

Unnamed: 0,idc_IDNUMBER,idc_Name,idc_Formula,idc_moecularl weight,query_compound,drugbank.id,drugbank.name,drugbank.formula,drugbank.weight.average
0,210369,GALLIC ACID,C7H6O5,170.12287,GALLIC ACID,DB13909,Bismuth subgallate,C7H5BiO6,394.091
11,1500621,AMINOPYRINE,C13H17N3O,231.29994,AMINOPYRINE,DB01424,Aminophenazone,C13H17N3O,231.2936


In [13]:
# mismatched names with different weights
name_mismatch_df[(name_mismatch_df["idc_moecularl weight"] - name_mismatch_df["drugbank.weight.average"]).abs() > 1.0]

Unnamed: 0,idc_IDNUMBER,idc_Name,idc_Formula,idc_moecularl weight,query_compound,drugbank.id,drugbank.name,drugbank.formula,drugbank.weight.average
0,210369,GALLIC ACID,C7H6O5,170.12287,GALLIC ACID,DB13909,Bismuth subgallate,C7H5BiO6,394.091
26,1501111,PROTOPORPHYRIN IX,C34H34N4O4,562.67448,PROTOPORPHYRIN IX,DB02949,2-Acetyl-Protoporphyrin Ix,C34H34FeN4O5,634.503
32,1501173,ACETANILIDE,C8H9NO,135.16703,ACETANILIDE,DB01297,Practolol,C14H22N2O3,266.3361
55,1506163,SELENOMETHIONINE,C5H11NO2Se,196.10892,SELENOMETHIONINE,DB09400,Selenomethionine Se-75,C5H11NO2Se,192.071
84,1503921,CYPROTERONE,C22H27ClO3,374.91169,CYPROTERONE,DB04839,Cyproterone acetate,C24H29ClO4,416.938
85,1503941,THIOCTIC ACID,C8H14O2S2,206.32758,THIOCTIC ACID,DB06253,Thioctic acid tromethamine,C12H25NO5S2,327.45
100,1502248,GLUTATHIONE,C10H17N3O6S,307.32749,GLUTATHIONE,DB03310,Glutathione disulfide,C20H32N6O12S2,612.631
131,1701028,CORTISONE,C21H28O5,360.45431,CORTISONE,DB01380,Cortisone acetate,C23H30O6,402.4807
138,1503973,2-THIOURACIL,C4H4N2OS,128.15328,2-THIOURACIL,DB00550,Propylthiouracil,C7H10N2OS,170.232
