# Summary

Download the IDD dataset from Mendeley Data and transform it into a format useful for literature and pathway analysis.

See: https://www.sciencedirect.com/science/article/pii/S2352340921009768

In [6]:
import pandas as pd
import sys
import json
import requests

In [7]:
# Install openpyxl with the interpreter that runs this kernel (sys.executable),
# so the package lands in the environment the notebook actually imports from.
# NOTE(review): presumably needed as the .xlsx engine for pd.read_excel below — confirm.
!{sys.executable} -m pip install openpyxl


distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m


In [8]:
def download_file(url, local_filename, chunk_size=1024, timeout=60):
    """Stream the resource at ``url`` into ``local_filename`` and return that path.

    Parameters
    ----------
    url : str
        Address to fetch; redirects are followed.
    local_filename : str
        Destination path, opened in binary write mode (an existing file is
        overwritten).
    chunk_size : int, optional
        Number of bytes requested from the response per iteration.
    timeout : float or None, optional
        Forwarded to ``requests.get`` so a stalled server cannot hang the
        notebook forever. Pass ``None`` to wait indefinitely.

    Returns
    -------
    str
        ``local_filename``, unchanged.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The destination file is
        not created in that case.
    """
    # stream=True keeps the body out of memory; using the response as a context
    # manager releases the connection even if writing to disk fails.
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as response:
        # Check the status before opening the destination so an HTTP error page
        # is never saved under the target filename.
        response.raise_for_status()
        with open(local_filename, 'wb') as out:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    out.write(chunk)
    return local_filename

In [18]:
# Direct download link to the IDD spreadsheet and the name of the local copy.
IDD_XLSX_URL = (
    "https://data.mendeley.com/public-files/datasets/9nmgzttxhm/files/"
    "f1c4e9b4-a79c-4cd1-9bfc-b13612c8f557/file_downloaded"
)
IDD_XLSX_PATH = "idd.xlsx"

# download_file returns the path it wrote to, so `file` holds the local filename.
file = download_file(IDD_XLSX_URL, IDD_XLSX_PATH)

In [19]:
# header=1: column names are taken from the sheet's second row (0-indexed row 1);
# the row above it is not treated as data.
df = pd.read_excel(file, header=1)

In [20]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,DRUGNAME,RXAUI,RXCUI,STR,SAB,TTY,CODE
0,(4-isobutylphenyl)-a-methylacetic acid,12254458,5640,ibuprofen,RXNORM,IN,5640
1,"(4Meq sodium + 3mmol phosphate)/ml, 100 ml",464336,36709,sodium phosphate,RXNORM,IN,36709
2,(4R)-1-methyl-4-isopropenylcyclohex-1-ene,12256019,1426476,"limonene, (+)-",RXNORM,PIN,1426476
3,"(4R)-2'-Deoxy-2',2'-difluoro-3,4,5,6-tetrahydr...",12415812,2384449,cedazuridine,RXNORM,IN,2384449
4,(4R)-4-isopropenyl-1-methylcyclohexene,12256019,1426476,"limonene, (+)-",RXNORM,PIN,1426476


In [21]:
# Lower-case every drug name so the (all lower-case) synonym keys added later
# can be compared against this column by exact match.
lowercase_names = df['DRUGNAME'].str.lower()
df['DRUGNAME'] = lowercase_names

In [22]:
# Hand-curated misspellings, brand names and regimen abbreviations mapped to a
# standard name. Keys are lower case to match df['DRUGNAME']; values containing
# "|" appear to list several standard names for a combination regimen.
additional_synonyms = {
    '5 fluoruoracil': 'fluorouracil',
    "5 fu": "fluorouracil",
    "5 fluorouracil": "fluorouracil",
    "c1 folfiri": "fluorouracil|leucovorin|irinotecan",
    "folfiri": "fluorouracil|leucovorin|irinotecan",
    "folfox": "fluorouracil|leucovorin|oxaliplatin",
    "erbitux": "cetuximab",
    "avastin": "bevacizumab",
    "folinic acid": "leucovorin",
    "leucovorin calcium": "leucovorin",
    'levcovorin': 'leucovorin',
    "mayo 425-20": "fluorouracil|leucovorin",
    "amg 655": "conatumumab",
    "oxaliplatinum": "oxaliplatin",
    "fluorouracilum": "fluorouracil",
    "5 fluorouracilum": "fluorouracil",
    "calcium foliatum": "leucovorin",
    "irinotecan hcl": "irinotecan",
    "pegfilgrastim (peg g-csf)": "pegfilgrastim",
    "filgrastim (g-csf)": "filgrastim",
    'dexamethassone': 'dexamethasone',
    'camptosar': 'irinotecan',
    "cpt-11": 'irinotecan',
    'xeloda': 'capecitabine',
    'mitomycin c': 'mitomycin',
    'folfirinox': 'fluorouracil|leucovorin|irinotecan|oxaliplatin'
}

# Collect the known names once instead of rescanning the whole frame for every
# synonym. The dict keys above are unique, so rows appended inside the loop can
# never change the outcome of a later membership check.
known_drug_names = set(df['DRUGNAME'])

for syn, name in additional_synonyms.items():
    if syn in known_drug_names:
        # The dataset already knows this spelling; keep its own mapping.
        continue
    # Build the row from column names rather than positions so it stays correct
    # if the column order ever changes; all other columns are left as None.
    new_row = {'DRUGNAME': syn, 'STR': name}
    # len(df.index) is the next free label, assuming the default 0..n-1 index.
    df.loc[len(df.index)] = [new_row.get(column) for column in df.columns]

In [23]:
# Spot check: all names recorded for one RxNorm concept (warfarin in the output below).
WARFARIN_RXCUI = 11289
df.loc[df['RXCUI'] == WARFARIN_RXCUI]

Unnamed: 0,DRUGNAME,RXAUI,RXCUI,STR,SAB,TTY,CODE
2517,4-hydroxy-3-(3-oxo-1-phenylbutyl)coumarin,12253984,11289,warfarin,RXNORM,IN,11289
16219,aldocumar,12253984,11289,warfarin,RXNORM,IN,11289
16220,aldocumar 1 mg comprimidos,12253984,11289,warfarin,RXNORM,IN,11289
16221,aldocumar 10 mg comprimidos,12253984,11289,warfarin,RXNORM,IN,11289
16222,aldocumar 3 mg comprimidos,12253984,11289,warfarin,RXNORM,IN,11289
...,...,...,...,...,...,...,...
448857,warfin,12253984,11289,warfarin,RXNORM,IN,11289
448858,warfmadin,12253984,11289,warfarin,RXNORM,IN,11289
448859,"warfmadi̇n 10 mg tablet, 28 adet",12253984,11289,warfarin,RXNORM,IN,11289
448860,"warfmadi̇n 5 mg tablet, 28 adet",12253984,11289,warfarin,RXNORM,IN,11289


In [26]:
# Map every drug name to its standard name (STR). Pairing the two columns
# directly yields the same dict as walking df.iterrows(), without building a
# Series object per row; if a name occurs more than once, the last row wins.
synonyms = dict(zip(df['DRUGNAME'], df['STR']))

In [29]:
# Map every drug name to its RXCUI value. Pairing the two columns directly
# avoids the per-row Series that df.iterrows() would build; if a name occurs
# more than once, the last row wins.
rx_norm = dict(zip(df['DRUGNAME'], df['RXCUI']))

In [30]:
def _unique_values(column):
    """Return the distinct values of one grouped column as a plain list."""
    return column.unique().tolist()


# One row per standard name (STR); every other column collapses to the list of
# distinct values seen within that group.
agg = df.groupby('STR', as_index=False).agg(_unique_values)

In [31]:
# Preview the grouped frame: one row per STR, list-valued remaining columns.
agg.head()

Unnamed: 0,STR,DRUGNAME,RXAUI,RXCUI,SAB,TTY,CODE
0,(2-benzhydryloxyethyl)diethyl-methylammonium i...,[(2-benzhydryloxyethyl)diethyl-methylammonium ...,[5477182],[1429915],[ATC],[IN],[A03AB16]
1,(3S)-3-methyl-D-aspartic acid,"[(3s)-3-methyl-d-aspartic acid, 3-methyl-beta-...",[9704237],[1992843],[DRUGBANK],[IN],[DB04313]
2,(R)-Praziquantel,"[(-)-praziquantel, (r)-(-)-praziquantel, (r)-p...",[9183371],[1923634],[DRUGBANK],[IN],[DB11749]
3,(S)-Propafenone,"[(-)-(s)-propafenone, (-)-propafenone, (s)-pro...",[11419633],[2169450],[DRUGBANK],[IN],[DB15410]
4,(S)-Warfarin,"[(s)-warfarin, (-)-warfarin, (s)-4-hydroxy-3-(...",[10277859],[2048011],[DRUGBANK],[IN],[DB14055]


In [28]:
# Sanity check: list the drug names recorded under the standard name 'birinapant'.
is_birinapant = df['STR'] == 'birinapant'
df.loc[is_birinapant, 'DRUGNAME'].tolist()

[]

In [32]:
# Map each standard name to the list of drug names grouped under it. Zipping
# the two columns gives the same dict as iterating agg.iterrows() without the
# per-row Series overhead.
standard_name_to_synonyms = dict(zip(agg['STR'], agg['DRUGNAME']))

In [33]:
# Bundle the three lookup tables into a single JSON document.
drug_lookup_tables = {
    "synonyms": synonyms,
    "standard_name_to_synonyms": standard_name_to_synonyms,
    'rx_norm_codes': rx_norm,
}
# The handle gets its own name so that `file` (the spreadsheet path passed to
# pd.read_excel earlier) is not rebound to a closed file object by this cell.
with open('./drug_synonyms.json', 'w') as json_out:
    json.dump(drug_lookup_tables, json_out)

In [34]:
# Read the JSON back to confirm it round-trips. A dedicated handle name keeps
# `file` (the spreadsheet path passed to pd.read_excel earlier) from being
# rebound to a closed file object.
with open('./drug_synonyms.json') as json_in:
    syn_json = json.load(json_in)

In [35]:
# Show the first three (standard name, synonyms) pairs from the reloaded JSON.
list(syn_json['standard_name_to_synonyms'].items())[:3]

[('(2-benzhydryloxyethyl)diethyl-methylammonium iodide',
  ['(2-benzhydryloxyethyl)diethyl-methylammonium iodide',
   'ethylbenzhydramine methyl iodide',
   'emetonium iodide']),
 ('(3S)-3-methyl-D-aspartic acid',
  ['(3s)-3-methyl-d-aspartic acid',
   '3-methyl-beta-d-aspartic acid',
   'd-methyl aspartic acid']),
 ('(R)-Praziquantel',
  ['(-)-praziquantel',
   '(r)-(-)-praziquantel',
   '(r)-praziquantel',
   'l-praziquantel',
   'l-pzq'])]