In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os
from functools import reduce
from itertools import chain
import pandas as pd
import json
from tqdm import tqdm
from pprint import pprint
from collections import defaultdict
from ema.utils import get_names_synonyms_doid, get_doid_names, clean_text, contains_word
from wikidataintegrator import wdi_helpers, wdi_core

DATA_DIR = "../data"

In [2]:
# run match_drugs first

In [3]:
df = pd.read_csv("eu_orphan.csv", index_col=0)
df.head()

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid
0,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/767,Treatment of post-essential thrombocythaemia m...,CTI Life Sciences Ltd,25/08/2010,,
1,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/768,Treatment of primary myelofibrosis,CTI Life Sciences Ltd,25/08/2010,,
2,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/769,Treatment of post-polycythaemia vera myelofibr...,CTI Life Sciences Ltd,25/08/2010,,
3,11-(4-Dimethylamino-3-hydroxy-6-methyl-tetrahy...,EU/3/14/1239,Treatment of cystic fibrosis,Synovo GmbH,19/02/2014,,
4,"1-(2,2-difluoro-1,3-benzodioxol-5-yl)-N-{1-[(2...",EU/3/14/1281,Treatment of cystic fibrosis,Vertex Pharmaceuticals (Europe) Limited,04/07/2014,,


In [4]:
first_word = df['Designated Orphan Indication'].str.split(" ").apply(lambda x:x[0].lower())
first_word.value_counts().head()

treatment       1341
prevention        73
diagnosis         12
conditioning       2
adjunctive         2
Name: Designated Orphan Indication, dtype: int64

In [5]:
# these don't fit the pattern!! Skip them for now
baddies = ~first_word.isin({'treatment', 'prevention', 'diagnosis'})
df[baddies]

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid
134,5-aminolevulinic acid hydrochloride,EU/3/02/121,Intra-operative photodynamic diagnosis of resi...,medac Gesellschaft für klinische Spezialpräpar...,13/11/2002,GliolanEU/1/07/41312/09/2007,
185,Adeno-associated viral vector serotype 2 conta...,EU/3/14/1278,'Treatment of choroideremia,Alan Boyd Consultants Ltd,04/06/2014,,
408,Carboxypeptidase G2,EU/3/02/128,Adjunctive treatment in patients at risk of me...,BTG Management Services Limited,03/02/2003,,Q5572303
650,Herpes simplex 1 virus-thymidine kinase and tr...,EU/3/03/168,Adjunctive treatment in hematopoietic cell tra...,MolMed S.p.A.,20/10/2003,ZalmoxisEU/1/16/112123/08/2016,
1306,Somatropin,EU/3/00/001,AIDS wasting,Merck Serono Europe Limited,08/08/2000,,Q20801765
1353,Thiotepa,EU/3/06/424,Conditioning treatment prior to haematopoietic...,ADIENNE S.r.l.,29/01/2007,TepadinaEU/1/10/62217/03/2010,Q416507
1373,Treosulfan,EU/3/04/186,Conditioning treatment prior to haematopoietic...,medac Gesellschaft für klinische Spezialpräpar...,23/02/2004,,Q7838652


In [6]:
df = df[~baddies]

In [7]:
df.loc[:, 'indication'] = df['Designated Orphan Indication'].str.split(" ").apply(lambda x: " ".join(x[2:]).lower())
df.head()

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid,indication
0,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/767,Treatment of post-essential thrombocythaemia m...,CTI Life Sciences Ltd,25/08/2010,,,post-essential thrombocythaemia myelofibrosis
1,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/768,Treatment of primary myelofibrosis,CTI Life Sciences Ltd,25/08/2010,,,primary myelofibrosis
2,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/769,Treatment of post-polycythaemia vera myelofibr...,CTI Life Sciences Ltd,25/08/2010,,,post-polycythaemia vera myelofibrosis
3,11-(4-Dimethylamino-3-hydroxy-6-methyl-tetrahy...,EU/3/14/1239,Treatment of cystic fibrosis,Synovo GmbH,19/02/2014,,,cystic fibrosis
4,"1-(2,2-difluoro-1,3-benzodioxol-5-yl)-N-{1-[(2...",EU/3/14/1281,Treatment of cystic fibrosis,Vertex Pharmaceuticals (Europe) Limited,04/07/2014,,,cystic fibrosis


In [25]:
# get rid of ' ’ -
rep = {'’': '', '\'': '', '-': '', 'ae': 'e', '`': '', '‘': '', 'ou': 'o', 'ze': 'se', '&#8217;': '', '&#945;': 'α'}
replace_multi = lambda s: reduce(lambda a, kv: a.replace(*kv), rep.items(), s)
df.indication = df.indication.str.lower().apply(replace_multi)
diseases = set(list(df['indication'].dropna().str.strip())) - {""}
len(diseases)

525

## orphanet
http://data.bioontology.org/ontologies/ORDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv

In [26]:
ordo = pd.read_csv("ORDO.csv.gz")
ordo.dropna(subset=['Preferred Label'], inplace=True)
name_ordo = dict(zip(ordo['Preferred Label'].str.lower(), ordo['Class ID']))
name_ordo = {replace_multi(k): v for k,v in name_ordo.items()}
len(name_ordo)

13318

In [27]:
for _, row in ordo[ordo.Synonyms.notnull()].iterrows():
    for syn in row.Synonyms.split("|"):
        name_ordo[replace_multi(syn.lower())] = row['Class ID']
del name_ordo['disease']
len(name_ordo)

33194

In [28]:
ordo_matches = {x:name_ordo[x] for x in diseases if x in name_ordo}
len(ordo_matches)

213

### DO and HPO

In [29]:
# match to DO or HPO
def onto_synonyms(json_path):
    doid = json.loads(open(json_path).read())
    graph = doid['graphs'][0]
    nodes = graph['nodes']
    name_doid = dict()
    for node in nodes:
        if 'meta' in node and 'deprecated' in node['meta'] and node['meta']['deprecated']:
            continue
        if 'lbl' not in node:
            continue
        name_doid[replace_multi(node['lbl'].lower())] = node['id']
        if 'meta' in node and 'synonyms' in node['meta']:
            for syn in node['meta']['synonyms']:
                name_doid[replace_multi(syn['val'].lower())] = node['id']
    return name_doid
name_doid = onto_synonyms("../data/doid.json")
name_hpo = onto_synonyms("../data/hp.json")
print(len(name_doid))
print(len(name_hpo))

21023
26569


In [30]:
doid_matches = {x:name_doid[x] for x in diseases if x in name_doid}
len(doid_matches)

193

In [31]:
hpo_matches = {x:name_hpo[x] for x in diseases if x in name_hpo}
len(hpo_matches)

69

### mesh

In [32]:
# create mesh title synonyms dict
# json is created from https://github.com/stuppie/mesh-parser
mesh = json.load(open("/home/gstupp/projects/mesh/mesh.json"))
name_mesh = dict()
for record in mesh.values():
    if record['record_type'] == 'C':
        continue
    name_mesh[replace_multi(record['term']).lower()] = record['_id']
    for syn in record.get('synonyms', []):
        name_mesh[replace_multi(syn).lower()] = record['_id']
mesh_name = {v:k for k,v in name_mesh.items()}

In [33]:
mesh_matches = {x:name_mesh[x] for x in diseases if x in name_mesh}
len(mesh_matches)

228

In [34]:
diseases_not_matched = {x for x in diseases if x not in doid_matches and x not in ordo_matches and x not in mesh_matches and x not in hpo_matches}
len(diseases_not_matched)

229

In [35]:
diseases_not_matched

{'5fluororacil overdose',
 '5q spinal muscular atrophy',
 'achromatopsia caused by mutations in the cnga3 gene',
 'achromatopsia caused by mutations in the cngb3',
 'achromatopsia caused by mutations in the cngb3 gene',
 'active ulcerative colitis',
 'acute myleoid leukemia',
 'acute peripheral arterial occlusion',
 'acute sensorineural hearing loss (acute acostic trauma, sudden deafness and surgery induced acostic trauma)',
 'adenosine deaminasedeficientsevere combined immunodeficiency',
 'adenovirus infection following hematopoietic stem cell transplantation',
 'adenovirus infection in allogeneic hematopoietic stemcell transplant recipients',
 'adenovirus infection in immunocompromised patients',
 'adrenocorticotropindependent cushings syndrome',
 'aicardigoti&egrave;res syndrome',
 'alstr&oml;m syndrome',
 'amyloid lightchain amyloidosis',
 'arteriovenos access dysfunction in hemodialysis patients',
 'arteriovenos access dysfunction in patients undergoing surgical creation of an art

In [36]:
df['doid'] = df.indication.apply(doid_matches.get)
df['hpo'] = df.indication.apply(hpo_matches.get)
df['ordo'] = df.indication.apply(ordo_matches.get)
df['mesh'] = df.indication.apply(mesh_matches.get)

In [37]:
df.head()

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid,indication,doid,hpo,ordo,mesh
0,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/767,Treatment of post-essential thrombocythaemia m...,CTI Life Sciences Ltd,25/08/2010,,,postessential thrombocythemia myelofibrosis,,,,
1,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/768,Treatment of primary myelofibrosis,CTI Life Sciences Ltd,25/08/2010,,,primary myelofibrosis,http://purl.obolibrary.org/obo/DOID_4971,,http://www.orpha.net/ORDO/Orphanet_824,D055728
2,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/769,Treatment of post-polycythaemia vera myelofibr...,CTI Life Sciences Ltd,25/08/2010,,,postpolycythemia vera myelofibrosis,,,,
3,11-(4-Dimethylamino-3-hydroxy-6-methyl-tetrahy...,EU/3/14/1239,Treatment of cystic fibrosis,Synovo GmbH,19/02/2014,,,cystic fibrosis,http://purl.obolibrary.org/obo/DOID_1485,,http://www.orpha.net/ORDO/Orphanet_586,D003550
4,"1-(2,2-difluoro-1,3-benzodioxol-5-yl)-N-{1-[(2...",EU/3/14/1281,Treatment of cystic fibrosis,Vertex Pharmaceuticals (Europe) Limited,04/07/2014,,,cystic fibrosis,http://purl.obolibrary.org/obo/DOID_1485,,http://www.orpha.net/ORDO/Orphanet_586,D003550


In [38]:
doid_wd = wdi_helpers.id_mapper("P699")
ordo_wd = wdi_helpers.id_mapper("P1550")
mesh_wd = wdi_helpers.id_mapper("P486")

In [39]:
df.loc[:,'indication_qid_doid'] = df.doid.dropna().apply(lambda x:doid_wd.get(x.split("/")[-1].replace("_",":")))
df.loc[:,'indication_qid_ordo'] = df.ordo.dropna().apply(lambda x:ordo_wd.get(x.split("_")[-1]))
df.loc[:,'indication_qid_mesh'] = df.mesh.dropna().apply(mesh_wd.get)

In [40]:
df.head()

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid,indication,doid,hpo,ordo,mesh,indication_qid_doid,indication_qid_ordo,indication_qid_mesh
0,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/767,Treatment of post-essential thrombocythaemia m...,CTI Life Sciences Ltd,25/08/2010,,,postessential thrombocythemia myelofibrosis,,,,,,,
1,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/768,Treatment of primary myelofibrosis,CTI Life Sciences Ltd,25/08/2010,,,primary myelofibrosis,http://purl.obolibrary.org/obo/DOID_4971,,http://www.orpha.net/ORDO/Orphanet_824,D055728,Q1752571,,Q1752571
2,"11-(2-pyrrolidin-1-yl-ethoxy)-14,19-dioxa-5,7,...",EU/3/10/769,Treatment of post-polycythaemia vera myelofibr...,CTI Life Sciences Ltd,25/08/2010,,,postpolycythemia vera myelofibrosis,,,,,,,
3,11-(4-Dimethylamino-3-hydroxy-6-methyl-tetrahy...,EU/3/14/1239,Treatment of cystic fibrosis,Synovo GmbH,19/02/2014,,,cystic fibrosis,http://purl.obolibrary.org/obo/DOID_1485,,http://www.orpha.net/ORDO/Orphanet_586,D003550,Q178194,,Q178194
4,"1-(2,2-difluoro-1,3-benzodioxol-5-yl)-N-{1-[(2...",EU/3/14/1281,Treatment of cystic fibrosis,Vertex Pharmaceuticals (Europe) Limited,04/07/2014,,,cystic fibrosis,http://purl.obolibrary.org/obo/DOID_1485,,http://www.orpha.net/ORDO/Orphanet_586,D003550,Q178194,,Q178194


In [42]:
df.to_csv("eu_orphan_match.csv")

In [43]:
good_df=df[df.indication_qid_doid.notnull() & df.drug_qid.notnull()]
good_df

Unnamed: 0,Product,EU Designation,Designated Orphan Indication,Sponsor,Designation date,TradenameEU Centralised NrImplemented on,drug_qid,indication,doid,hpo,ordo,mesh,indication_qid_doid,indication_qid_ordo,indication_qid_mesh
23,"17a,21-dihydroxy-16a-methyl-pregna-1,4,9(11)-t...",EU/3/14/1309,Treatment of Duchenne muscular dystrophy,ReveraGen BioPharma Limited,22/08/2014,,Q27270940,duchenne muscular dystrophy,http://purl.obolibrary.org/obo/DOID_11723,,http://www.orpha.net/ORDO/Orphanet_98896,D020388,Q1648484,,Q1648484
25,1-deoxygalactonojirimycin hydrochloride,EU/3/06/368,Treatment of Fabry disease,Amicus Therapeutics UK Ltd,22/05/2006,GalafoldEU/1/15/108231/05/2016,Q27291314,fabry disease,http://purl.obolibrary.org/obo/DOID_14499,,http://www.orpha.net/ORDO/Orphanet_324,D000795,Q615645,,Q615645
42,2-((2-ethyl-6-(4-(2-(3-hydroxyazetidin-1-yl)-2...,EU/3/16/1712,Treatment of idiopathic pulmonary fibrosis,Galapagos NV,29/08/2016,,Q407204,idiopathic pulmonary fibrosis,http://purl.obolibrary.org/obo/DOID_0050156,,http://www.orpha.net/ORDO/Orphanet_2032,D054990,Q2290446,,Q2290446
86,"3,5-diiodothyropropionic acid",EU/3/13/1193,Treatment of Allan-Herndon-Dudley syndrome,CATS Consultants GmbH,07/10/2013,,Q27252433,allanherndondudley syndrome,http://purl.obolibrary.org/obo/DOID_0050631,,http://www.orpha.net/ORDO/Orphanet_59,,Q4731121,Q4731121,
93,(3S)-1-azabicyclo[2.2.2]oct-3-yl{2-[2-(4-fluor...,EU/3/14/1310,Treatment of Fabry disease,Genzyme Europe B.V.,22/08/2014,,Q27077129,fabry disease,http://purl.obolibrary.org/obo/DOID_14499,,http://www.orpha.net/ORDO/Orphanet_324,D000795,Q615645,,Q615645
94,(3S)-1-azabicyclo[2.2.2]oct-3-yl{2-[2-(4-fluor...,EU/3/14/1374,Treatment of Gaucher disease,Genzyme Europe B.V.,19/11/2014,,Q27077129,gaucher disease,http://purl.obolibrary.org/obo/DOID_1926,,http://www.orpha.net/ORDO/Orphanet_355,D005776,Q861645,Q861645,Q861645
102,"4’-[(2-butyl-4-oxo-1,3-diazaspiro[4.4]non-1-en...",EU/3/15/1574,Treatment of focal segmental glomerulosclerosis,Retrophin Europe Limited,11/11/2015,,Q27896056,focal segmental glomerulosclerosis,http://purl.obolibrary.org/obo/DOID_1312,http://purl.obolibrary.org/obo/HP_0000097,,D005923,Q1435223,,Q1435223
166,Acadesine,EU/3/11/881,Treatment of multiple myeloma,Advancell - Advanced In Vitro Cell Technologie...,05/08/2011,,Q4671562,multiple myeloma,http://purl.obolibrary.org/obo/DOID_9538,http://purl.obolibrary.org/obo/HP_0006775,http://www.orpha.net/ORDO/Orphanet_29073,D009101,Q467635,,Q467635
168,Acalabrutinib,EU/3/16/1625,Treatment of mantle cell lymphoma,"Acerta Pharma, BV",21/03/2016,,Q27074143,mantle cell lymphoma,http://purl.obolibrary.org/obo/DOID_0050746,,http://www.orpha.net/ORDO/Orphanet_52416,D020522,Q268713,,
169,Acalabrutinib,EU/3/16/1626,Treatment of lymphoplasmacytic lymphoma,"Acerta Pharma, BV",21/03/2016,,Q27074143,lymphoplasmacytic lymphoma,http://purl.obolibrary.org/obo/DOID_0050747,,,,Q1778287,,


In [44]:
len(good_df)

300