# Investigating the correlation between disease composition for a specific genus

### Import modules

In [11]:
from collections import defaultdict
from itertools import combinations
import random

import obonet
import networkx as nx

import logging
import pandas as pd
from tqdm import tqdm

# Viz
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
# Silence logging for the whole session: called without an argument,
# logging.disable() uses CRITICAL as the threshold, so every standard level
# is suppressed.
logging.disable()
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

In [4]:
# Relative path of the directory the input tables are read from.
DATA_DIR = '../data/processed'

### Load plant-disease evidence

In [7]:
# Load the gzip-compressed, tab-separated plant-disease table from DATA_DIR.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred dtype.
collapsed_plant_disease_df = pd.read_csv(
    f'{DATA_DIR}/plant_disease_collapsed.tsv.gz',
    compression='gzip',
    sep='\t',
    low_memory=False
)

In [8]:
# Keep only the rows whose `database` column equals 'bern2'.
literature_data = collapsed_plant_disease_df.loc[
    collapsed_plant_disease_df['database'].eq('bern2')
]

# Distinct plant identifiers found in that subset.
plant_lit_ids = set(literature_data['plant_curie'].tolist())

# Cell output: (rows, columns) of the subset and the number of distinct plants.
(literature_data.shape, len(plant_lit_ids))

((129539, 5), 4413)

In [9]:
# Total number of distinct plant CURIEs among the 'bern2' rows
len(plant_lit_ids)

4413

### Get genus mapping for plants

In [13]:
# Download and parse the NCBI Taxonomy ontology from the OBO PURL.
graph = obonet.read_obo('http://purl.obolibrary.org/obo/ncbitaxon.obo')

# Get the children of Viridiplantae (all plants).
# NOTE(review): this relies on obonet orienting edges child -> parent, so that
# nx.ancestors() yields the taxa *below* the given node — confirm against the
# obonet documentation. nx.ancestors() does not include the node itself, so
# 'NCBITaxon:33090' is not part of `plant_childs`.
plant_childs = nx.ancestors(graph, 'NCBITaxon:33090')
# Subset the graph to the relevant part (plants only) to make it faster
graph = graph.subgraph(plant_childs)

### Prepare plant-disease dicts

In [19]:
def get_non_binary_dict(df: pd.DataFrame):
    """Group citation identifiers by plant and disease.

    Reads the ``plant_curie``, ``disease_curie`` and ``evidence`` columns of
    ``df``. For every row, the part of ``evidence`` before the first ``'_'``
    is added to the set stored under ``result[plant_curie][disease_curie]``.

    Returns:
        A ``dict`` mapping each plant to a ``defaultdict(set)`` that maps each
        disease to the set of collected evidence prefixes.
    """
    rows = df[['plant_curie', 'disease_curie', 'evidence']].values
    citations_by_plant = {}

    for plant, disease, evidence_id in tqdm(rows):
        # Only the prefix before the first underscore identifies the citation.
        citation, _, _ = evidence_id.partition('_')
        per_disease = citations_by_plant.setdefault(plant, defaultdict(set))
        per_disease[disease].add(citation)

    return citations_by_plant


def get_binary_dict(df: pd.DataFrame):
    """Get plant-disease data dictionary based on presence/absence of association.

    Only the ``plant_curie`` and ``disease_curie`` columns of ``df`` are read:
    every (plant, disease) pair that occurs in at least one row is marked
    with ``1``. The ``evidence`` column is not needed for a presence flag, so
    it is neither parsed nor required.

    Returns:
        A ``dict`` mapping each plant to a ``defaultdict(int)`` whose keys are
        the diseases seen with that plant and whose values are ``1``.
    """
    binary_dict = {}

    for plant_curie, disease_curie in tqdm(
        df[['plant_curie', 'disease_curie']].values
    ):
        if plant_curie not in binary_dict:
            binary_dict[plant_curie] = defaultdict(int)

        # Repeated rows for the same pair simply re-assign the same flag.
        binary_dict[plant_curie][disease_curie] = 1

    return binary_dict

In [20]:
# Count-based mapping for the 'bern2' rows: plant -> disease -> set of
# evidence prefixes (presumably PMIDs, going by the builder's variable name).
disease_non_binary_dict = get_non_binary_dict(literature_data)

100%|██████████| 129539/129539 [00:00<00:00, 495397.70it/s]


In [21]:
# Presence/absence mapping for the 'bern2' rows: plant -> disease -> 1.
disease_binary_dict = get_binary_dict(literature_data)

100%|██████████| 129539/129539 [00:00<00:00, 654435.32it/s]


### Calculate disease similarity for plants in a specific genus or family

In [15]:
def calculate_pearson(plant_1, plant_2, species_to_vectors, disease_ids=None):
    """Calculate the Pearson coefficient between two plants' disease profiles.

    Each plant is turned into a vector with one entry per disease in
    ``disease_ids``: the entry is ``len()`` of the collection stored for that
    disease in the plant's mapping, or 0 when the disease is absent.

    Args:
        plant_1: Key of the first plant in ``species_to_vectors``.
        plant_2: Key of the second plant in ``species_to_vectors``.
        species_to_vectors: Mapping plant -> {disease -> sized collection},
            e.g. the output of ``get_non_binary_dict``.
        disease_ids: Ordered iterable of diseases defining the vector axes.
            When omitted, the module-level ``disease_vector`` is used.

    Returns:
        The result of ``scipy.stats.pearsonr`` (statistic, p-value), or
        ``(None, None)`` when ``pearsonr`` rejects the input with a
        ``ValueError``.

    Raises:
        KeyError: If either plant is missing from ``species_to_vectors``.
    """
    # Imported here so the function does not depend on another notebook cell
    # having imported it.
    from scipy.stats import pearsonr

    if disease_ids is None:
        disease_ids = disease_vector

    diseases_plant_1_dict = species_to_vectors[plant_1]
    diseases_plant_2_dict = species_to_vectors[plant_2]

    elements_in_plant_1 = []
    elements_in_plant_2 = []

    for disease in disease_ids:
        # .get() looks each plant up in its *own* mapping and, unlike
        # indexing, never inserts missing keys into a defaultdict.
        elements_in_plant_1.append(len(diseases_plant_1_dict.get(disease, ())))
        elements_in_plant_2.append(len(diseases_plant_2_dict.get(disease, ())))

    try:
        corr = pearsonr(elements_in_plant_1, elements_in_plant_2)
    except ValueError:
        # Raised by pearsonr for unusable input (e.g. fewer than two entries).
        corr = None, None

    return corr

### MONDO names

In [23]:
# Download and parse the MONDO ontology from the OBO PURL.
mondo_ontology = obonet.read_obo('http://purl.obolibrary.org/obo/mondo.obo')

In [31]:
# Lower-cased MONDO identifier -> name, skipping nodes without a 'name'.
mondo_ids_to_names = dict(
    (node_id.lower(), attributes['name'])
    for node_id, attributes in mondo_ontology.nodes(data=True)
    if 'name' in attributes
)

In [34]:
mondo_ids_to_names

{'mondo:0000001': 'disease or disorder',
 'mondo:0000004': 'adrenocortical insufficiency',
 'mondo:0000005': 'alopecia, isolated',
 'mondo:0000009': 'inherited bleeding disorder, platelet-type',
 'mondo:0000014': 'colorblindness, partial',
 'mondo:0000015': 'classic complement early component deficiency',
 'mondo:0000022': 'nocturnal enuresis',
 'mondo:0000023': 'infantile liver failure',
 'mondo:0000030': 'sleep-related hypermotor epilepsy',
 'mondo:0000032': 'febrile seizures, familial',
 'mondo:0000044': 'hereditary hypophosphatemic rickets',
 'mondo:0000045': 'hypothyroidism, congenital, nongoitrous',
 'mondo:0000050': 'isolated congenital growth hormone deficiency',
 'mondo:0000060': 'microcephalic osteodysplastic primordial dwarfism',
 'mondo:0000062': 'isolated microphthalmia',
 'mondo:0000065': 'microvascular complications of diabetes, susceptibility',
 'mondo:0000066': 'mitochondrial complex deficiency',
 'mondo:0000070': 'mycobacterium tuberculosis, susceptibility',
 'mondo:0