In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os
from functools import reduce
from itertools import chain
import pandas as pd
import json
from tqdm import tqdm
from pprint import pprint
from collections import defaultdict
from ema.utils import get_names_synonyms_doid, get_doid_names, clean_text, contains_word
import pickle
from wikidataintegrator import wdi_helpers, wdi_core
pd.set_option('display.max_colwidth', -1)

In [2]:
source_url = 'http://ec.europa.eu/health/documents/community-register/html/alforphreg.htm'
scrap_res = pd.read_html(source_url, attrs={'id': 'wmtable'}, header=0, index_col=None)

In [3]:
eu_orphan_drugs = scrap_res[0] # get first table in table list
eu_orphan_drugs = eu_orphan_drugs.loc[eu_orphan_drugs['Product'].notnull(), :]
eu_orphan_drugs.reset_index(inplace=True)
eu_orphan_drugs = eu_orphan_drugs.drop('index', 1)
#eu_orphan_drugs.Product = eu_orphan_drugs.Product.str.lower()
eu_orphan_drugs.head(10)

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on
0,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,26-triaza-tetracyclo[19.3.1.1(2,6).1(8,12)] heptacosa-1(25),2(26),3,5,8,10,12(27),16,21,23-decaene",EU/3/10/767,Treatment of post-essential thrombocythaemia myelofibrosis,CTI Life Sciences Ltd,25/08/2010,
1,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,26-triaza-tetracyclo[19.3.1.1(2,6).1(8,12)] heptacosa-1(25),2(26),3,5,8,10,12(27),16,21,23-decaene",EU/3/10/768,Treatment of primary myelofibrosis,CTI Life Sciences Ltd,25/08/2010,
2,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,26-triaza-tetracyclo[19.3.1.1(2,6).1(8,12)] heptacosa-1(25),2(26),3,5,8,10,12(27),16,21,23-decaene",EU/3/10/769,Treatment of post-polycythaemia vera myelofibrosis,CTI Life Sciences Ltd,25/08/2010,
3,"11-(4-Dimethylamino-3-hydroxy-6-methyl-tetrahydro-pyran-2-yloxy)-2-ethyl-3,4,10-trihydroxy-3,5,6,8,10,12,14-heptamethyl-1-oxa-6-aza-cyclopentadecane-13,15-dione",EU/3/14/1239,Treatment of cystic fibrosis,Synovo GmbH,19/02/2014,
4,"1-(2,2-difluoro-1,3-benzodioxol-5-yl)-N-{1-[(2R)-2,3-dihydroxypropyl]-6-fluoro-2-(1-hydroxy-2-methylpropan-2-yl)-1H-indol-5-yl}cyclopropanecarboxamide",EU/3/14/1281,Treatment of cystic fibrosis,Vertex Pharmaceuticals (Europe) Limited,04/07/2014,
5,"1,2:5,6-Dianhydrogalactitol",EU/3/12/1093,Treatment of glioma,IDIS Ltd,24/01/2013,
6,"1,2-bis(methylsulphonyl)-1-(2-chloroethyl)-2-[(methylamino)carbonyl]hydrazine",EU/3/05/332,Treatment of acute myeloid leukaemia,"Vion (UK) Limited, ℅ i3 Research",14/12/2005,
7,"1-[(2-Chloro-4-methoxyphenoxy)methyl]-4-[(2,6-dichlorophenoxy)methyl]benzene",EU/3/12/1021,Prevention of poliomyelitis in patients with immunodeficiencies deemed at risk,ViroDefense Ltd,17/07/2012,
8,"1-(2-isopropoxyethyl)-2-thioxo-1,2,3,5-tetrahydro-pyrrolo[3,2-d]pyrimidin-4-one",EU/3/14/1404,Treatment of multiple system atrophy,AstraZeneca AB,16/12/2014,
9,"1-{3-[3-(4-chlorophenyl)propoxy]propyl}piperidine, hydrochloride",EU/3/07/459,Treatment of narcolepsy,Bioprojet Pharma,10/07/2007,WakixEU/1/15/106804/04/2016


In [4]:
len(eu_orphan_drugs)

1433

In [5]:
products = list(set(eu_orphan_drugs.Product))
len(products)

1148

In [10]:
# use OPSIN to convert IUPAC names to InChI keys https://bitbucket.org/dan2097/opsin/
with open('input.txt', 'w') as f:
    for x in products:
        f.write(x + '\n')

In [11]:
!java -jar data/opsin-2.2.0-jar-with-dependencies.jar -ostdinchikey input.txt output.txt 2> output.err

In [12]:
prod_inchi = dict()
with open('output.txt', 'r') as f:
    for count, x in enumerate(f.readlines()):
        ikey = x.strip()
        if ikey:
            prod_inchi[products[count]] = ikey

In [13]:
len(prod_inchi)

154

In [14]:
list(prod_inchi.items())[:10]

[('Adeno-associated viral vector serotype 8 containing the human glucose-6-phosphatase gene',
  'POMVPJBWDDJCMP-RUKDTIIFSA-M'),
 ('Autologous CD34+ cells transduced with lentiviral vector encoding the human beta globin gene',
  'OLDWOVJAXRWCGT-FBHGDYMESA-N'),
 ('3-methoxy-pregnenolone', 'MDSQOJYHHZBZKA-GBXCKJPGSA-N'),
 ('2-((2-ethyl-6-(4-(2-(3-hydroxyazetidin-1-yl)-2-oxoethyl)-piperazin-1-yl)-8-methylimidazo[1,2-alpha]pyridin-3-yl)-(methyl)amino)-4-(4-fluorophenyl)-thiazole-5-carbonitrile',
  'SUKJFIGYRHOWBL-UHFFFAOYSA-N'),
 ('Tideglusib', 'PVCULFYROUOVGJ-UHFFFAOYSA-N'),
 ('Budesonide', 'WUBBRNOQWQTFEX-UHFFFAOYSA-N'),
 ('Recombinant human tissue non-specific alkaline phosphatase - Fc - deca-aspartate fusion protein',
  'RCQXSQPPHJPGOF-UHFFFAOYSA-N'),
 ('Autologous T cells transduced with lentiviral vector containing a chimeric antigen receptor directed against CD19',
  'PXHANKVTFWSDSG-QLOBERJESA-N'),
 ('1-deoxygalactonojirimycin hydrochloride', 'TUWMKPVJGGWGNL-UHFFFAOYSA-N'),
 ('Pegyla

In [15]:
inchi_wd = wdi_helpers.id_mapper("P235")
wd_inchi = {v:k for k,v in inchi_wd.items()}

In [16]:
# map inchikey back to wdid
prod_wdid = {prod: inchi_wd[inchi] for prod,inchi in prod_inchi.items() if inchi in inchi_wd}
len(prod_wdid)

97

In [17]:
def get_drug_qid_map():
    # https://github.com/sebotic/wikidata_notebooks/blob/master/ema_annotations.ipynb
    drug_query = '''
    SELECT ?compound ?label ?who_name (GROUP_CONCAT(DISTINCT(?alias); separator="|") AS ?aliases) WHERE {{
      {{?compound wdt:P31 wd:Q11173 .}} UNION  # chemical compound
      {{?compound wdt:P31 wd:Q12140 .}} UNION  # pharmaceutical drug
      {{?compound wdt:P31 wd:Q79529 .}} UNION  # chemical substance
      {{?compound wdt:P2275 ?who_name FILTER (LANG(?who_name) = "en") .}}

      OPTIONAL {{
        ?compound rdfs:label ?label FILTER (LANG(?label) = "en") .
      }}
      OPTIONAL {{
        ?compound skos:altLabel ?alias FILTER (LANG(?alias) = "en") .
      }}
    }}
    GROUP BY ?compound ?label ?who_name ?aliases
    OFFSET {0}
    LIMIT 100000
    '''
    drug_qid_map = {}
    cc = 0
    while True:
        print(cc)
        r = wdi_core.WDItemEngine.execute_sparql_query(query=drug_query.format(100000 * cc))
        print(cc)
        cc += 1
        if len(r['results']['bindings']) == 0:
            break
        for x in r['results']['bindings']:
            qid = x['compound']['value']

            if 'who_name' in x:
                drug_qid_map.update({x['who_name']['value'].lower(): qid})

            if 'label' in x:
                drug_qid_map.update({x['label']['value'].lower(): qid})

            if 'aliases' in x:
                drug_qid_map.update({y.lower(): qid for y in x['aliases']['value'].split('|')})

    print('Drug to QID map has {} entries!'.format(len(drug_qid_map)))
    drug_qid_map = {k: v.replace("http://www.wikidata.org/entity/", "") for k, v in drug_qid_map.items()}
    with open(os.path.join(DATA_DIR, "drug_qid_map.pkl"), 'wb') as f:
        pickle.dump(drug_qid_map, f)

In [19]:
drug_qid_map = pickle.load(open("../data/drug_qid_map.pkl", 'rb'))
#drug_qid_map = get_drug_qid_map()
wdid_search_name = {x:drug_qid_map[x.lower()] for x in products if x.lower() in drug_qid_map}

In [21]:
len(wdid_search_name)

310

In [22]:
prod_wdid.update(wdid_search_name)

In [23]:
eu_orphan_drugs['drug_qid'] = eu_orphan_drugs.Product.apply(prod_wdid.get)

In [24]:
eu_orphan_drugs.count()

Product                                     1433
EU Designation                              1433
Designated Orphan Indication                1433
Sponsor                                     1433
Designation date                            1433
TradenameEU Centralised NrImplemented on    113 
drug_qid                                    536 
dtype: int64

In [25]:
eu_orphan_drugs.to_csv("eu_orphan.csv")