# Description

It extracts from the EFO ontology all the xrefs from efo labels to other ontologies/datasets (such as Disease Ontology, ICD9, etc).

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict

import pandas as pd
import networkx
import obonet

import conf

# Functions

In [3]:
def groupby(data, sep=':'):
    if data is None:
        return {}
    res = defaultdict(set)
    for d in data:
        ds = d.split(sep)
        res[ds[0]].add(d)
    return res

In [4]:
test_data = [
  'DOID:2841',
  'HP:0002099',
  'ICD10:J45',
  'ICD10:J45.90',
  'ICD9:493',
  'ICD9:493.81',
  'ICD9:493.9',
  'KEGG:05310',
]

In [5]:
_tmp = groupby(test_data)
assert _tmp is not None
assert len(_tmp) == 5

assert len(_tmp['DOID']) == 1
assert len(_tmp['HP']) == 1
assert len(_tmp['ICD10']) == 2
assert len(_tmp['ICD9']) == 3
assert len(_tmp['KEGG']) == 1

In [6]:
_tmp

defaultdict(set,
            {'DOID': {'DOID:2841'},
             'HP': {'HP:0002099'},
             'ICD10': {'ICD10:J45', 'ICD10:J45.90'},
             'ICD9': {'ICD9:493', 'ICD9:493.81', 'ICD9:493.9'},
             'KEGG': {'KEGG:05310'}})

In [7]:
def get_parents(node):
    for t in graph.successors(node):
        yield t

def _is_disease_single_node(node):
    return node == 'EFO:0000408'

def is_disease(node):
    if _is_disease_single_node(node):
        return True
    
    for parent_node in get_parents(node):
        if is_disease(parent_node):
            return True
    
    return False

# Read the EFO ontology

In [8]:
url = conf.GENERAL["EFO_ONTOLOGY_OBO_FILE"]
graph = obonet.read_obo(url)

In [9]:
# Number of nodes
len(graph)

26727

In [10]:
# Number of edges
graph.number_of_edges()

52865

In [11]:
assert graph.nodes['EFO:0000270'].get('name') == 'asthma'

# Map EFO to other references

In [12]:
efo_terms = {
    node_id for node_id in graph.nodes.keys()# if node_id.startswith('EFO:')# and is_disease(node_id)
}

In [13]:
len(efo_terms)

26727

In [14]:
graph.nodes['EFO:0000270']

{'name': 'asthma',
 'def': '"Tendency of the smooth muscle of the tracheobronchial tree to contract more intensely in response to a given stimulus than it does in the response seen in normal individuals. This condition is present in virtually all symptomatic patients with asthma. The most prominent manifestation of this smooth muscle contraction is a decrease in airway caliber that can be readily measured in the pulmonary function laboratory." []',
 'synonym': ['"Airway hyperreactivity" EXACT []',
  '"asthma" EXACT []',
  '"Asthma (disorder)" EXACT []',
  '"Asthma NOS" EXACT []',
  '"Asthma NOS (disorder)" EXACT []',
  '"ASTHMA NOS W (AC) EXAC" EXACT []',
  '"Asthma unspecified" EXACT []',
  '"Asthma unspecified (disorder)" EXACT []',
  '"Asthma, Bronchial" EXACT []',
  '"Asthma, unspecified" EXACT []',
  '"Asthma, unspecified type, with acute exacerbation" EXACT []',
  '"Asthma, unspecified type, without mention of status asthmaticus" EXACT []',
  '"Asthmas" EXACT []',
  '"Asthmatic" 

## EFO to label

In [15]:
efo_full_data = []

for efo in efo_terms:
    efo_data = {}
    
    efo_data['term_id'] = efo
    efo_data['label'] = graph.nodes[efo].get('name')
    
    efo_full_data.append(efo_data)

In [16]:
efo_label = pd.DataFrame(efo_full_data).set_index('term_id')

In [17]:
efo_label.shape

(26727, 1)

In [18]:
assert efo_label.index.is_unique

In [19]:
efo_label.head()

Unnamed: 0_level_0,label
term_id,Unnamed: 1_level_1
EFO:1001070,ocular tuberculosis
Orphanet:284139,"Larsen-like syndrome, B3GAT3 type"
EFO:0007776,prothrombin fragments F1+2 measurement
FMA:66762,synovial membrane
Orphanet:2153,Hirschsprung disease - nail hypoplasia - dysmo...


In [20]:
assert efo_label.loc['EFO:0000270', 'label'] == 'asthma'

In [21]:
outfile = conf.GENERAL["TERM_ID_LABEL_FILE"]
display(outfile)

efo_label.to_csv(outfile, sep='\t')

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/term_id_labels.tsv.gz')

## Map xrefs

In [22]:
efo_full_data = []

for efo in efo_terms:
    efo_data = {}
    
    efo_data['term_id'] = efo
    
    for xref_id, xref_data in groupby(graph.nodes[efo].get('xref')).items():
        efo_data['target_id_type'] = xref_id
        
        for xref in xref_data:
            efo_data['target_id'] = xref
            efo_full_data.append(efo_data.copy())

In [23]:
efo_full_data = pd.DataFrame(efo_full_data).set_index('term_id')

In [24]:
efo_full_data.shape

(104094, 2)

In [25]:
efo_full_data.head()

Unnamed: 0_level_0,target_id_type,target_id
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EFO:1001070,DOID,DOID:233
EFO:1001070,ICD9,ICD9:017.30
EFO:1001070,ICD9,ICD9:017.32
EFO:1001070,MESH,MESH:D014392
EFO:1001070,MONDO,MONDO:0006876


In [26]:
graph.nodes['EFO:0002669']

{'name': 'ENBREL',
 'def': '"Etanercept (trade name Enbrel) is a drug that treats autoimmune diseases by interfering with tumor necrosis factor (TNF; a soluble inflammatory cytokine) by acting as a TNF inhibitor. Pfizer describes in a SEC filing that the drug is used to treat rheumatoid, juvenile rheumatoid and psoriatic arthritis, plaque psoriasis and ankylosing spondylitis. Sales reached record $3.3 billion in 2010.[1]\\nEtanercept is a fusion protein produced through expression of recombinant DNA. That is, it is a product of a DNA \\"construct\\" engineered to link the human gene for soluble TNF receptor 2 to the gene for the Fc component of human immunoglobulin G1 (IgG1). Expression of the construct produces a continuous protein \\"fusing\\" TNF receptor 2 to IgG1. Production of Etanercept is accomplished by the large-scale culturing of cells that have been \\"cloned\\" to express this recombinant DNA construct." []',
 'synonym': ['"etanercept" EXACT []'],
 'xref': ['NCIt:C2381', '

In [27]:
efo_full_data.loc['EFO:0002669']

Unnamed: 0_level_0,target_id_type,target_id
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
EFO:0002669,NCIt,NCIt:C2381
EFO:0002669,SNOMEDCT,SNOMEDCT:387045004
EFO:0002669,SNOMEDCT,SNOMEDCT:118259007


In [28]:
# some testing
assert efo_full_data.loc['EFO:0002669'].shape[0] == 3

_tmp = efo_full_data.loc['EFO:0002669'].sort_values('target_id')

assert _tmp.iloc[0]['target_id_type'] == 'NCIt'
assert _tmp.iloc[0]['target_id'] == 'NCIt:C2381'

assert _tmp.iloc[1]['target_id_type'] == 'SNOMEDCT'
assert _tmp.iloc[1]['target_id'] == 'SNOMEDCT:118259007'

assert _tmp.iloc[2]['target_id_type'] == 'SNOMEDCT'
assert _tmp.iloc[2]['target_id'] == 'SNOMEDCT:387045004'

In [29]:
outfile = conf.GENERAL["TERM_ID_XREFS_FILE"]
display(outfile)

efo_full_data.to_csv(outfile, sep='\t')

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/term_id_xrefs.tsv.gz')