In [3]:
import pandas
import pubchempy

In [44]:
# Read DrugBank compounds and keep only the rows that have an InChI string.
# dropna(subset=...) replaces the original `-mask` negation: unary minus on a
# boolean mask is rejected by NumPy and is not the documented way to invert a
# pandas mask.
drugbank_df = pandas.read_table('data/drugbank.tsv').dropna(subset=['inchi'])

In [38]:
# Map DrugBank compounds to PubChem using InChI.
# A compound is kept only when the lookup returns exactly one PubChem record;
# every skipped compound is printed so it can be inspected afterwards.
rows = []
for i, row in drugbank_df.iterrows():
    try:
        compounds = pubchempy.get_compounds(row.inchi, namespace='inchi')
    except pubchempy.BadRequestError:
        # pubchempy raised BadRequestError for this InChI: log it and move on.
        print('BadRequestError', row)
        continue
    if len(compounds) != 1:
        # Zero or several matches means the mapping is not unique.
        print(row, compounds)
        continue
    compound = compounds[0]
    row['pubchem_cid'] = compound.cid
    rows.append(row)

BadRequestError drugbank_id                                              DB00115
name                                              Cyanocobalamin
type                                              small molecule
groups                                    approved|nutraceutical
atc_codes                                        B03BB01|B03BA01
categories         Vitamin B Complex|Vitamins|Anti-anemic Agents
inchikey                    InChIKey=SEKGMJVHSBBHRD-WZHZPDAFSA-M
inchi          InChI=1S/C62H90N13O14P.CN.Co/c1-29-20-39-40(21...
Name: 109, dtype: object
BadRequestError drugbank_id                                              DB00116
name                                        Tetrahydrofolic acid
type                                              small molecule
groups                                    approved|nutraceutical
atc_codes                                                    NaN
categories        Dietary Supplements|Micronutrients|Supplements
inchikey                    InChI

In [52]:
# Create a DataFrame of the mapping.
# mapped_df holds every successfully looked-up row (original DrugBank columns
# plus pubchem_cid); mapping_df keeps just the two ID columns.
mapped_df = pandas.DataFrame(rows)
# Build mapping_df in a single chain instead of assigning a column into the
# result of a selection + dropna, which can emit SettingWithCopyWarning.
# Rows with a missing value in either column are dropped before the integer cast.
mapping_df = (
    mapped_df[['drugbank_id', 'pubchem_cid']]
    .dropna()
    .astype({'pubchem_cid': int})
)
mapping_df.head()

Unnamed: 0,drugbank_id,pubchem_cid
13,DB00014,5311128
34,DB00035,16051933
48,DB00050,25074887
86,DB00091,5280754
88,DB00093,14257662


In [53]:
# Write the DrugBank -> PubChem mapping as a tab-separated file, without the
# DataFrame index column.
mapping_df.to_csv(
    'data/pubchem-mapping.tsv',
    sep='\t',
    index=False,
)

In [54]:
# Number of DrugBank compounds (rows of drugbank_df) that did not end up with
# a unique PubChem mapping (a row of mapping_df).
drugbank_df.shape[0] - mapping_df.shape[0]

510