# Generating taxonomy for the plant-chemical space

### Import modules

In [1]:
import os
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm

# Import taxonomy
from utils import get_genus_and_family_info_for_plants, ncbitaxon_curies_to_names, create_taxon_compound_vectors
from ete3 import NCBITaxa
from io import StringIO
from Bio import Phylo

In [2]:
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

### Load chemicals

In [3]:
# Directory (relative to this notebook) where cached and generated files are stored.
DATA_DIR = '../data'

In [4]:
# Plant-chemical associations; only the two CURIE columns are loaded.
plant_chemical_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_chemical_associations.tsv.gz',
    sep='\t',
    compression='gzip',
    usecols=['plant_curie', 'chemical_curie'],
)

In [5]:
# Preview the first association row.
plant_chemical_df.head(1)

Unnamed: 0,plant_curie,chemical_curie
0,ncbitaxon:1000425,pubchem.compound:3527


In [6]:
# Number of distinct chemical CURIEs in the associations.
len(plant_chemical_df.chemical_curie.unique())

87019

## Generating family chemical space vectors

In [7]:
# Plant-disease associations; only the two CURIE columns are loaded.
plant_disease_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_disease_associations.tsv.gz',
    sep='\t',
    compression='gzip',
    usecols=['plant_curie', 'disease_curie'],
)

In [8]:
# Plants that appear in the plant-disease associations; used below as the medicinal plants.
medicinal_plants = set(plant_disease_df['plant_curie'].unique())

In [9]:
# Build the family-level table once and cache it as a TSV; reuse the cache when it exists.
family_count_path = f'{DATA_DIR}/family_chemical_count.tsv'

if not os.path.exists(family_count_path):
    genus_to_species, family_to_species = get_genus_and_family_info_for_plants(
        set(plant_chemical_df.plant_curie.unique())
    )
    family_df = create_taxon_compound_vectors(
        family_to_species,
        plant_chemical_df,
        med_plants=medicinal_plants,
    )
    family_df.to_csv(family_count_path, sep='\t', index=False)
else:
    family_df = pd.read_csv(family_count_path, sep='\t')
    

In [10]:
# Preview the first two families.
family_df.head(2)

Unnamed: 0,name,# chemicals,# level specific chemicals,# med plants,# plants in level,plants in level
0,ncbitaxon:51506,6,0,0,1,['ncbitaxon:49702']
1,ncbitaxon:16712,30,16,1,1,['ncbitaxon:13523']


# Mapping families to their names

In [11]:
# Resolve every family CURIE in the table to its taxon name.
family_curies = family_df['name'].unique()
plant_names = ncbitaxon_curies_to_names(family_curies)
len(plant_names)

513

In [12]:
# Attach the resolved family name next to each family CURIE.
family_df['fam_name'] = family_df['name'].map(plant_names)
family_df.head(2)

Unnamed: 0,name,# chemicals,# level specific chemicals,# med plants,# plants in level,plants in level,fam_name
0,ncbitaxon:51506,6,0,0,1,['ncbitaxon:49702'],Blandfordiaceae
1,ncbitaxon:16712,30,16,1,1,['ncbitaxon:13523'],Eupteleaceae


In [13]:
# Distinct family CURIEs present in the table, and how many there are.
unique_family_ids = set(family_df['name'].unique())
len(unique_family_ids)

513

In [14]:
# Keep the count columns only and index the table by family name.
family_columns = [
    '# chemicals',
    '# level specific chemicals',
    '# med plants',
    'fam_name',
    '# plants in level',
]
family_data = family_df[family_columns].set_index('fam_name')
family_data.head(2)

Unnamed: 0_level_0,# chemicals,# level specific chemicals,# med plants,# plants in level
fam_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blandfordiaceae,6,0,0,1
Eupteleaceae,30,16,1,1


## Normalizing the scores

In [15]:
# Share of each family's plants that are counted as medicinal.
family_data['proportion of med plants'] =  family_data['# med plants'] / family_data['# plants in level']

In [16]:
# Share of each family's chemicals that are level-specific.
family_data['proportion of specific chemicals'] = family_data['# level specific chemicals'] / family_data['# chemicals']

In [17]:
# Express each family's chemical count as a share of the total, then
# min-max scale those shares.
chemical_share = family_data['# chemicals'] / family_data['# chemicals'].sum()
max_chem_val = chemical_share.max()
min_chem_val = chemical_share.min()
chemical_range = max_chem_val - min_chem_val
family_data['normalized # of chemicals'] = (chemical_share - min_chem_val) / chemical_range

In [18]:
# Keep only the three normalised / proportional columns.
norm_family_data = family_data[['normalized # of chemicals', 'proportion of med plants', 'proportion of specific chemicals']]

In [19]:
# Export the normalised table as TSV; the family-name index is written as the first column.
norm_family_data.to_csv(f'{DATA_DIR}/overview_chemicals_family.tsv', sep='\t')

# Generate taxonomy tree

In [20]:
# ete3 NCBI taxonomy interface, used below for lineage and name lookups.
ncbi = NCBITaxa()

In [21]:
# Collect the NCBI taxon CURIEs whose resolved name is a family present in `family_data`.
all_plants = []
for ncbitax in plant_names:
    if not ncbitax.startswith('ncbitaxon'):
        continue
    if plant_names[ncbitax] in family_data.index:
        all_plants.append(ncbitax)
len(all_plants)

513

In [22]:
# Map each parent taxon name to {child taxon name: position of the child in its lineage}.
lineage_dict = defaultdict(dict)

for plant in tqdm(all_plants):
    # CURIEs have the form 'ncbitaxon:<taxid>'. The empty-identifier check has
    # to run before the int conversion: int('') raises instead of skipping.
    taxid = plant.split(':')[1]

    if taxid == '':
        continue

    # Full lineage of the taxon; no taxa are filtered out.
    tree = ncbi.get_lineage(int(taxid))

    # Get the names of the nodes
    names = ncbi.get_taxid_translator(tree)

    # Replace ids with names
    tree = [names[node_id] for node_id in tree]

    # Consecutive lineage entries are treated as parent -> child; the value
    # stored for each edge is the child's position in the lineage (1-based).
    for depth, (parent, child) in enumerate(zip(tree, tree[1:]), start=1):
        lineage_dict[parent][child] = depth

100%|██████████| 513/513 [00:00<00:00, 1754.99it/s]


In [23]:
# Number of distinct parent (internal) nodes collected.
len(lineage_dict)

238

In [24]:
def newickify(node_to_children, root_node) -> str:
    """Serialize a parent -> children mapping into a Newick string.

    Args:
        node_to_children: Mapping of each internal node name to a dict of
            ``{child name: branch length}``. A name that is not a key of this
            mapping is rendered as a leaf.
        root_node: Name of the node to start rendering from. It is written
            with a branch length of 0.

    Returns:
        The Newick representation of the tree, terminated by ``;``.

    Raises:
        AssertionError: If an internal node is reached a second time (a cycle
            or a node with two parents), or if some keys of
            ``node_to_children`` are not reachable from ``root_node``.
    """
    visited_nodes = set()

    def newick_render_node(name, distance: float) -> str:
        # Raised explicitly rather than via ``assert`` so that the check is
        # not stripped when Python runs with -O.
        if name in visited_nodes:
            raise AssertionError("Error: The tree may not be circular!")

        if name not in node_to_children:
            # Leaf: just the label and its branch length.
            return f'{name}:{distance}'

        # Internal node: render every child first, then close the group.
        visited_nodes.add(name)
        children_strings = ",".join(
            newick_render_node(child, child_distance)
            for child, child_distance in node_to_children[name].items()
        )
        return f'({children_strings}){name}:{distance}'

    newick_string = newick_render_node(root_node, 0) + ';'

    # Ensure no entries in the dictionary are left unused; report only the
    # nodes that could not be reached from the root.
    unused_nodes = set(node_to_children.keys()) - visited_nodes
    if unused_nodes:
        raise AssertionError(
            f"Error: some nodes aren't in the tree {sorted(unused_nodes)}"
        )

    return newick_string

In [25]:
# Render the collected lineages as a single Newick string, starting at the 'root' node.
string = newickify(
    node_to_children=lineage_dict,
    root_node='root',  # root node
)

In [26]:
# Parse the Newick string into a Bio.Phylo tree.
biophylo_tree = Phylo.read(StringIO(string), "newick")


In [27]:
# Number of leaves (terminal clades) in the parsed tree.
len(list(biophylo_tree.get_terminals()))


513

In [28]:
# Save the tree in Newick format.
Phylo.write(biophylo_tree, f'{DATA_DIR}/taxonomy_tree_1A.nwk', "newick")


1