### Import modules

In [1]:
import getpass
import sys
import time

import obonet
from collections import defaultdict
from tqdm import tqdm, tqdm_notebook

import pandas as pd
import numpy as np
import networkx as nx

In [2]:
# Show which account executed this run.
# NOTE(review): the rendered output exposes a local username — consider
# clearing this cell's output before sharing the notebook.
getpass.getuser()

'yojana'

### Loading data files

In [3]:
# Root folder for every file read or written below (relative path, so it is
# resolved against the kernel's working directory).
DATA_DIR = '../data'

In [4]:
# Load the tab-separated plant-disease association table and preview it.
plant_disease_data = pd.read_table(
    f'{DATA_DIR}/processed/plant_disease_associations.tsv'
)
plant_disease_data.head(3)

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8
1,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10094290_4
2,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10336604_1


In [5]:
# Distinct disease CURIEs that occur in the association table.
diseases = {curie for curie in plant_disease_data['disease_curie']}
len(diseases)

2205

### Collapse diseases based on MONDO tree

In [6]:
# Fetch the MONDO ontology in OBO format and parse it into a graph.
# NOTE(review): this PURL carries no version, so it presumably serves the
# latest release — results may drift between runs; consider pinning a release.
# NOTE(review): obonet is documented to draw edges from child term to parent
# term — confirm, because the collapsing step below depends on edge direction.
url = 'http://purl.obolibrary.org/obo/mondo.obo'
nx_graph = obonet.read_obo(url)

In [7]:
# Broad MONDO classes that the specific diseases are collapsed into.
# CURIEs are written in lowercase, like the `disease_curie` column; they are
# upper-cased only at the point where the ontology graph is queried.
# The trailing comment on each entry is the class label.
disease_areas_of_interest = [
    "mondo:0037940",  # auditory system disorder
    "mondo:0002657",  # breast disorder
    "mondo:0004992",  # cancer or benign tumor
    "mondo:0004995",  # cardiovascular disorder
    "mondo:0003900",  # connective tissue disorder
    "mondo:0004335",  # digestive system disorder
    "mondo:0021084",  # disorder of visual system
    "mondo:0005151",  # endocrine system disorder
    "mondo:0005570",  # hematologic disorder
    "mondo:0005046",  # immune system disorder
    "mondo:0021166",  # inflammatory disease
    "mondo:0021178",  # injury
    "mondo:0002051",  # integumentary system disorder
    "mondo:0005550",  # infectious disease or post-infectious disorder
    "mondo:0005066",  # metabolic disease
    "mondo:0002081",  # musculoskeletal system disorder
    "mondo:0005071",  # nervous system disorder
    "mondo:0005137",  # nutritional disorder
    "mondo:0024623",  # otorhinolaryngologic disease
    "mondo:0029000",  # poisoning
    "mondo:0002025",  # psychiatric disorder
    "mondo:0005087",  # respiratory system disorder
    "mondo:0002118",  # urinary system disorder
]

In [8]:
# Lookup table: lowercase disease CURIE -> set of disease areas it falls under.
disease_mapper = defaultdict(set)

for area_curie in tqdm(disease_areas_of_interest):
    # The graph is queried with upper-case CURIEs. `nx.ancestors` returns every
    # node that has a path *to* the area term; judging by the mapping produced
    # downstream (specific diseases land under broad areas) these are the
    # area's subtypes rather than its superclasses.
    members = {
        node.lower()
        for node in nx.ancestors(nx_graph, area_curie.upper())
    }
    # An area of interest also counts as a member of itself.
    members.add(area_curie)

    # Register the area as a parent of each of its members.
    for member_curie in members:
        disease_mapper[member_curie].add(area_curie)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 209.59it/s]


In [9]:
from pyenveda.resolver import curie_to_name

In [10]:
# Build one record per (disease, disease area) pair; a disease that belongs to
# no area of interest gets a single record with empty parent columns.
#
# Name resolution is the slow part of this cell, so every name is resolved at
# most once: the disease name once per disease (instead of once per parent),
# and each area name once overall.
# NOTE(review): this assumes `curie_to_name` returns the same name for the
# same CURIE on every call — confirm.
parent_names = {}
mapping_data = []
for disease_id in tqdm(diseases):
    disease_name = curie_to_name(disease_id, 'pathology')

    # Membership test (not indexing) so the defaultdict gains no empty entries.
    if disease_id not in disease_mapper:
        mapping_data.append({
            'MONDO Disease id': disease_id,
            'Disease name': disease_name,
            'Parent MONDO id': '',
            'Parent name': '',
        })
        continue

    for parent in disease_mapper[disease_id]:
        if parent not in parent_names:
            parent_names[parent] = curie_to_name(parent, 'pathology')
        mapping_data.append({
            'MONDO Disease id': disease_id,
            'Disease name': disease_name,
            'Parent MONDO id': parent,
            'Parent name': parent_names[parent],
        })

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2205/2205 [04:22<00:00,  8.40it/s]


In [11]:
# Tabulate the records: one row per (disease, disease area) pair, with empty
# parent columns for diseases that matched no area.
mapped_disease_df = pd.DataFrame(mapping_data)
mapped_disease_df.head(2)

Unnamed: 0,MONDO Disease id,Disease name,Parent MONDO id,Parent name
0,mondo:0017885,chromophobe renal cell carcinoma,mondo:0004992,cancer
1,mondo:0017885,chromophobe renal cell carcinoma,mondo:0002118,urinary system disease


In [12]:
mapped_disease_df.shape

(4142, 4)

In [13]:
# Persist the full mapping. This runs before the unmapped diseases are dropped,
# so the file still contains the rows with empty parent columns.
mapped_disease_df.to_csv(f'{DATA_DIR}/mapping/mondo_upper_level_mapping.tsv', sep='\t', index=False)

Drop diseases that do not fall under any of the selected disease areas (i.e. rows with an empty parent).

In [14]:
# Keep only the rows whose disease was assigned to at least one disease area.
has_parent = mapped_disease_df['Parent MONDO id'].ne('')
mapped_disease_df = mapped_disease_df.loc[has_parent]
mapped_disease_df.shape

(3910, 4)

In [15]:
# Distinct diseases remaining in the mapping table.
filtered_diseases = set(mapped_disease_df['MONDO Disease id'])
len(filtered_diseases)

1973

### Saving the updated plant-disease edge list

In [16]:
plant_disease_data.shape

(97066, 5)

In [17]:
# Restrict the association table to diseases present in `filtered_diseases`.
is_covered = plant_disease_data['disease_curie'].isin(filtered_diseases)
plant_disease_data = plant_disease_data.loc[is_covered]
plant_disease_data.shape

(83906, 5)

In [18]:
# Save the filtered edge list; disease CURIEs are still the original, specific
# ones at this point (the collapse to disease areas happens afterwards).
plant_disease_data.to_csv(f'{DATA_DIR}/processed/plant_disease_filtered.tsv', sep='\t', index=False)

In [19]:
plant_disease_data.head()

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8
1,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10094290_4
2,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10336604_1
3,ncbitaxon:3311,Ginkgo biloba,mondo:0002643,bern2,10345150_1
4,ncbitaxon:203270,Berberis aquifolium,mondo:0008334,bern2,10352377_1


In [20]:
# Fan every association out into one record per disease area: the specific
# disease CURIE is replaced by the area's CURIE, all other fields are copied.
harmonized_data = []

for plant_curie, plant_name, disease_curie, database, evidence in tqdm(plant_disease_data.values):
    harmonized_data.extend(
        {
            'plant_curie': plant_curie,
            'plant_name': plant_name,
            'disease_curie': area_curie,
            'database': database,
            'evidence': evidence,
        }
        for area_curie in disease_mapper[disease_curie]
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83906/83906 [00:00<00:00, 443149.62it/s]


In [21]:
# Tabulate the collapsed records; `disease_curie` now holds disease-area CURIEs.
harmonized_df = pd.DataFrame(harmonized_data)
harmonized_df.head()

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0024623,bern2,10067319_8
1,ncbitaxon:3369,Cryptomeria japonica,mondo:0005087,bern2,10067319_8
2,ncbitaxon:3369,Cryptomeria japonica,mondo:0021166,bern2,10067319_8
3,ncbitaxon:3369,Cryptomeria japonica,mondo:0005046,bern2,10067319_8
4,ncbitaxon:3369,Cryptomeria japonica,mondo:0024623,bern2,10094290_4


In [22]:
harmonized_df.shape

(172702, 5)

In [23]:
# Save the collapsed edge list (plant -> disease area) as a tab-separated file.
harmonized_df.to_csv(f'{DATA_DIR}/processed/plant_disease_collapsed.tsv', sep='\t', index=False)