# Import libraries

In [1]:
import pandas as pd
import obonet
import networkx

## NCBITaxonomy ontology

In [2]:
# Parse the NCBITaxon ontology (OBO format) into a networkx graph
graph = obonet.read_obo('http://purl.obolibrary.org/obo/ncbitaxon.obo')

# Collect the child taxa of Viridiplantae (NCBITaxon:33090), i.e. all plants.
# In this graph the child taxa are reached via networkx.ancestors
# (presumably because obonet edges point from child term to parent term).
plant_childs = networkx.ancestors(graph, 'NCBITaxon:33090')
# Restrict the graph to the plant taxa only, so that later queries run faster
graph = networkx.subgraph(graph, plant_childs)

Load plant-disease associations

In [4]:
# Tab-separated, gzip-compressed table of plant-disease associations
diseases = pd.read_csv(
    '../data/processed/plant_disease_associations.tsv.gz',
    sep='\t',
    compression='gzip',
)

Quick inspection of the data structure

In [5]:
# Display only the first row to check the column layout
diseases.head(1)

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8


## Example (get all plants of a given genus)

From https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=4059&lvl=3&lin=f&keep=1&srchmode=1&unlock

In [6]:
# All taxa reached from NCBITaxon:4059 via networkx.ancestors in the plant graph.
# The identifiers are lowercased so they match the 'ncbitaxon:' prefix used in
# the plant_curie column of the association tables.
list_of_plants = list(map(str.lower, networkx.ancestors(graph, 'NCBITaxon:4059')))

In [7]:
# Number of taxa collected for the genus
len(list_of_plants)

62

Subset the disease dataset to the plants of the genus

In [8]:
# Distinct plant CURIEs of the genus that occur in the disease associations
diseases.loc[
    diseases['plant_curie'].isin(list_of_plants),
    'plant_curie',
].unique()

array(['ncbitaxon:4060', 'ncbitaxon:403115', 'ncbitaxon:329875',
       'ncbitaxon:498201', 'ncbitaxon:545371', 'ncbitaxon:930817',
       'ncbitaxon:1217299'], dtype=object)

Count how many plants in the genus/family are medicinal plants (i.e., appear in the plant-disease associations)

In [9]:
# Number of distinct plants of the genus with at least one disease association.
# dropna=False keeps the count identical to len(<series>.unique()).
diseases.loc[
    diseases['plant_curie'].isin(list_of_plants),
    'plant_curie',
].nunique(dropna=False)

7

Read chemical dataset (plant-chemical relations)

In [10]:
# Tab-separated, zip-compressed table of plant-chemical associations
chemicals = pd.read_csv(
    '../data/processed/plant_chemical_associations.tsv.zip',
    sep='\t',
    compression='zip',
)

In [11]:
# Display only the first row to check the column layout
chemicals.head(1)

Unnamed: 0,plant_curie,plant name,chemical_curie,chemical name,database,evidence
0,ncbitaxon:1000425,Pimpinella major,pubchem.compound:3527,"3-[(3,4-Dihydroxyphenyl)(hydroxy)methylidene]-...",coconut,Relation from COCONUT


Plants in the genus/family with chemical information

In [12]:
# Number of distinct plants of the genus with at least one chemical association.
# dropna=False keeps the count identical to len(<series>.unique()).
chemicals.loc[
    chemicals['plant_curie'].isin(list_of_plants),
    'plant_curie',
].nunique(dropna=False)

25

Overlap between the plants with chemical information and the plants with disease associations

In [13]:
# Number of plants of the genus that appear in BOTH the chemical and the
# disease tables (size of the intersection of the two sets of plant CURIEs)
len(
    set(chemicals.loc[chemicals['plant_curie'].isin(list_of_plants), 'plant_curie'])
    & set(diseases.loc[diseases['plant_curie'].isin(list_of_plants), 'plant_curie'])
)

4

See how many plants have a specific chemical (querying by PubChem ID)

In [14]:
# Number of distinct plants associated with the chemical 'pubchem.compound:5770'.
# dropna=False keeps the count identical to len(<series>.unique()).
chemicals.loc[
    chemicals['chemical_curie'].eq('pubchem.compound:5770'),
    'plant_curie',
].nunique(dropna=False)

0

See how many plants have a specific chemical (querying by name)

In [15]:
# Keep only the rows that have a chemical name, so the string search on that
# column does not have to deal with missing values
subset = chemicals[chemicals['chemical name'].notna()]

In [16]:
# Rows whose chemical name contains 'eserpine', ignoring case; this is a
# substring search, so names that merely embed the pattern are returned too
subset.loc[
    subset['chemical name'].str.contains('eserpine', case=False)
]

Unnamed: 0,plant_curie,plant name,chemical_curie,chemical name,database,evidence
280900,ncbitaxon:4060,Rauvolfia serpentina,pubchem.compound:12442718,Reserpine N-Oxide,lotus,From LOTUS
303931,ncbitaxon:545371,Rauvolfia tetraphylla,pubchem.compound:5701996,Isoreserpine,lotus,From LOTUS
327962,ncbitaxon:947881,Rauvolfia grandiflora,pubchem.compound:5701996,Isoreserpine,lotus,From LOTUS
