# Generating taxonomy for the plant-chemical space

### Import modules

In [1]:
import os
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm

# Import taxonomy
from utils import get_genus_and_family_info_for_plants, ncbitaxon_curies_to_names, create_taxon_compound_vectors
from ete3 import NCBITaxa

In [2]:
# Register tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
tqdm.pandas()
# Do not truncate columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)

### Load chemicals

In [3]:
# Directory (relative to this notebook) holding the cached and generated files.
DATA_DIR = '../data'

In [4]:
# Load the plant-chemical associations, keeping only the two identifier columns.
plant_chemical_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_chemical_associations.tsv.gz',
    compression='gzip',
    sep='\t',
    usecols=[
        'plant_curie',
        'chemical_curie',
    ]
)

In [5]:
# Preview the first association to check the CURIE format of both columns.
plant_chemical_df.head(1)

Unnamed: 0,plant_curie,chemical_curie
0,ncbitaxon:1000425,pubchem.compound:3527


In [6]:
# Number of distinct chemical identifiers in the association table.
len(plant_chemical_df.chemical_curie.unique())

87019

## Generating family chemical space vectors

In [7]:
# Load the plant-disease associations, keeping only the two identifier columns.
plant_disease_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_disease_associations.tsv.gz',
    compression='gzip',
    sep='\t',
    usecols=[
        'plant_curie',
        'disease_curie',
    ]
)

In [8]:
# Plants with at least one disease association are the ones treated as medicinal.
medicinal_plants = set(plant_disease_df.plant_curie.unique())

In [9]:
# Reuse the cached per-family table when it exists; otherwise build it and cache it.
# NOTE(review): values read back from the TSV are plain strings/numbers, so any
# list-valued column produced by the fresh computation will differ in type
# between the two branches - confirm downstream code does not depend on it.
family_count_path = f'{DATA_DIR}/family_chemical_count.tsv'

if os.path.exists(family_count_path):
    family_df = pd.read_csv(family_count_path, sep='\t')
else:
    genus_to_species, family_to_species = get_genus_and_family_info_for_plants(
        set(plant_chemical_df.plant_curie.unique())
    )
    family_df = create_taxon_compound_vectors(family_to_species, plant_chemical_df, med_plants=medicinal_plants)
    # Create the output directory first so the freshly computed table is not
    # lost to a missing-directory error when it is written.
    os.makedirs(DATA_DIR, exist_ok=True)
    family_df.to_csv(family_count_path, sep='\t', index=False)
    

In [10]:
# Preview the per-family counts.
family_df.head(2)

Unnamed: 0,family,# chemicals,# family specific chemicals,# med plants,# plants in fam,plants in fam
0,ncbitaxon:3536,243,59,5,15,"['ncbitaxon:122400', 'ncbitaxon:427792', 'ncbi..."
1,ncbitaxon:4037,4652,1487,105,436,"['ncbitaxon:692015', 'ncbitaxon:1534653', 'ncb..."


# Mapping families to their names

In [11]:
# Look up a name for every distinct family CURIE (helper defined in `utils`).
plant_names = ncbitaxon_curies_to_names(family_df.family.unique())
# Number of families for which a name was returned.
len(plant_names)

513

In [12]:
# Attach the looked-up family name to each row via its family CURIE.
family_df['fam_name'] = family_df.family.map(plant_names)
family_df.head(2)

Unnamed: 0,family,# chemicals,# family specific chemicals,# med plants,# plants in fam,plants in fam,fam_name
0,ncbitaxon:3536,243,59,5,15,"['ncbitaxon:122400', 'ncbitaxon:427792', 'ncbi...",Nyctaginaceae
1,ncbitaxon:4037,4652,1487,105,436,"['ncbitaxon:692015', 'ncbitaxon:1534653', 'ncb...",Apiaceae


In [13]:
# Distinct family identifiers; the count can be compared with len(plant_names).
unique_family_ids = set(family_df['family'].unique())
len(unique_family_ids)

513

In [14]:
# Index the table by family name and keep only the three count columns.
family_data = family_df.set_index('fam_name')[
    ['# chemicals', '# family specific chemicals', '# med plants']
]

In [15]:
# Preview the count table that will be normalized.
family_data.head(2)

Unnamed: 0_level_0,# chemicals,# family specific chemicals,# med plants
fam_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nyctaginaceae,243,59,5
Apiaceae,4652,1487,105


## Normalizing the scores

In [16]:
def convert_to_percent(df: pd.DataFrame) -> pd.DataFrame:
    """Scale every row so that its values sum to 1.

    Despite the name, the result holds fractions of the row total, not
    values multiplied by 100.

    Args:
        df: Numeric table of counts; each row is normalized independently.

    Returns:
        A new DataFrame with the same index and columns as ``df`` in which
        every value has been divided by the total of its row.

    Raises:
        AssertionError: If any row does not sum to 1 after scaling. This
            happens, for example, when a row total is 0, because the
            division then yields NaN.
    """
    res = df.div(df.sum(axis=1), axis=0)

    row_totals = res.sum(axis=1)
    is_normalized = np.isclose(row_totals.values, 1)
    # Raised explicitly (not via ``assert``) so the check still runs under
    # ``python -O``, and so the message names the rows that failed.
    if not is_normalized.all():
        bad_rows = list(row_totals.index[~is_normalized])
        raise AssertionError(f'Not all rows sum to 1; offending rows: {bad_rows}')
    return res

In [17]:
# Row-normalize the three count columns for every family.
norm_family_data = convert_to_percent(family_data)

In [18]:
# Save the normalized table as a tab-separated file under DATA_DIR.
norm_family_data.to_csv(f'{DATA_DIR}/normalized_family_chemical_counts.tsv', sep='\t')

# Generate taxonomy tree

In [19]:
# ete3 interface to the NCBI taxonomy, used below for lineage and name lookups.
ncbi = NCBITaxa()

In [20]:
# Keep the NCBI Taxonomy CURIEs whose looked-up name is a row of `family_data`.
all_plants = [
    curie
    for curie, taxon_name in plant_names.items()
    if curie.startswith('ncbitaxon') and taxon_name in family_data.index
]
len(all_plants)

513

In [21]:
# Nested mapping {parent name: {child name: position of the child in the lineage}}.
lineage_dict = defaultdict(dict)

for plant in tqdm(all_plants):
    # CURIEs have the form '<prefix>:<taxid>'. Check for an empty identifier
    # *before* converting, because int('') raises and an int never equals ''.
    taxid_str = plant.split(':')[1]
    if taxid_str == '':
        continue

    # Lineage of the taxon as a list of taxids, ordered from the root down.
    lineage_ids = ncbi.get_lineage(int(taxid_str))

    # Get the names of the nodes
    names = ncbi.get_taxid_translator(lineage_ids)

    # Replace ids with names
    tree = [names[taxid] for taxid in lineage_ids]

    # Record every consecutive (parent, child) pair. Iterating over the
    # lineage list itself (not the size of the translator dict) keeps the
    # edge count tied to the actual path.
    for depth, (parent, child) in enumerate(zip(tree, tree[1:]), start=1):
        lineage_dict[parent][child] = depth

100%|██████████| 513/513 [00:00<00:00, 2604.87it/s]


In [22]:
# Number of taxa that have at least one child in the collected lineages.
len(lineage_dict)

238

In [23]:
def newickify(node_to_children, root_node) -> str:
    """Render a parent -> children mapping as a Newick string.

    Args:
        node_to_children: Mapping from each internal node name to a mapping
            ``{child_name: distance}``. A name that is not a key of this
            mapping is rendered as a leaf.
        root_node: Name of the node at which rendering starts; it is written
            with distance ``0``.

    Returns:
        The Newick representation of the tree, terminated by ``;``.

    Raises:
        AssertionError: If an internal node is reached more than once (a
            cycle, or a node listed under several parents), or if some keys
            of ``node_to_children`` are not reachable from ``root_node``.
    """
    visited_nodes = set()

    def newick_render_node(name, distance: float) -> str:
        # Raised explicitly (not via ``assert``) so the guard still runs
        # under ``python -O``; without it a cycle would recurse without end.
        if name in visited_nodes:
            raise AssertionError("Error: The tree may not be circular!")

        if name not in node_to_children:
            # Leaf: the name has no registered children.
            return f'{name}:{distance}'

        # Internal node: render all children first, then the node itself.
        visited_nodes.add(name)
        children_strings = ",".join(
            newick_render_node(child, child_distance)
            for child, child_distance in node_to_children[name].items()
        )
        return f'({children_strings}){name}:{distance}'

    newick_string = newick_render_node(root_node, 0) + ';'

    # Ensure no entries in the dictionary are left unused, and report only
    # the unreachable ones so the message stays readable for large trees.
    unused_nodes = set(node_to_children) - visited_nodes
    if unused_nodes:
        raise AssertionError(
            f"Error: some nodes aren't in the tree {sorted(unused_nodes, key=str)}"
        )

    return newick_string

In [24]:
# Serialize the collected lineages to Newick, starting from the 'root' node.
string = newickify(
    node_to_children=lineage_dict,
    root_node='root',  # root node
)

In [25]:
from io import StringIO
from Bio import Phylo

In [26]:
# Parse the Newick string into a Bio.Phylo tree.
biophylo_tree = Phylo.read(StringIO(string), "newick")


In [27]:
# Number of leaves in the parsed tree.
biophylo_tree.count_terminals()


513

In [28]:
# Save the tree in Newick format under DATA_DIR.
Phylo.write(biophylo_tree, f'{DATA_DIR}/taxonomy_tree_1A.nwk', "newick")


1