# Exploring the NP classifier chemical classes 

### Import modules

In [1]:
from io import StringIO
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
import numpy as np

# get names from ncbitaxon ontology
from utils import ncbitaxon_curies_to_names

# Taxonomy tree
from ete3 import NCBITaxa
from Bio import Phylo

In [2]:
DATA_DIR = '../data'

### Load NP classifier file

In [3]:
# Load the NP classifier vector table (tab-separated) straight from S3.
# Later cells rely on a 'taxon' column holding the taxon identifier and treat
# every other column as a chemical class.
# NOTE(review): reading an s3:// URL through pandas needs the optional s3fs package — confirm it is installed.
npclassifier_family_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/chemical_classification/np_classifier_family_vector.tsv',
    sep='\t'
)

In [4]:
npclassifier_family_df.shape

(381, 567)

In [5]:
# Use the taxon identifier as the row index
npclassifier_family_df = npclassifier_family_df.set_index('taxon')

In [6]:
# Merge the non-medicinal and medicinal rows of each taxon by summing their values.
# .sum() is called directly instead of .agg(sum): handing the Python builtin
# `sum` to agg is deprecated in recent pandas releases and emits a FutureWarning.
npclassifier_family_df = npclassifier_family_df.groupby('taxon').sum()

### Calculating the number of plant families per class

In [7]:
npclassifier_family_df = npclassifier_family_df.T

In [8]:
# For every class (row), count the taxon columns that hold a non-zero value
families_per_class = npclassifier_family_df.astype(bool).sum(axis=1)

# Turn the counts into a two-column table, most widespread classes first
family_class_score = (
    families_per_class
    .to_frame(name='no.of.families')
    .reset_index()
    .rename(columns={'index': 'class'})
    .sort_values(by='no.of.families', ascending=False)
)
family_class_score.head(2)

Unnamed: 0,class,no.of.families
378,Flavonols,305
118,Stigmastane steroids,290


## Distribution of subselected classes

In [9]:
selected_classes = """Flavonols
Flavones
Oleanane triterpenoids
Simple coumarins
Acyclic monoterpenoids
Gallotannins
Lupane triterpenoids
Flavanones
Menthane monoterpenoids
Furofuranoid lignans
Neolignans
Isoquinoline alkaloids
Pyridine alkaloids
Anthraquinones and anthrones
Iridoids monoterpenoids
Simple indole alkaloids
Labdane diterpenoids
Kaurane and Phyllocladane diterpenoids
Isocoumarins
Quinoline alkaloids"""

In [10]:
# Keep only the score rows whose class is one of the hand-picked classes
is_selected_class = family_class_score['class'].isin(selected_classes.split('\n'))
family_class_score = family_class_score.loc[is_selected_class]
len(family_class_score)

20

### Saving subselected class data

In [11]:
npclassifier_family_df.head(2)

taxon,ncbitaxon:1003244,ncbitaxon:1003247,ncbitaxon:1003248,ncbitaxon:1003255,ncbitaxon:104773,ncbitaxon:112800,ncbitaxon:1131839,ncbitaxon:114201,ncbitaxon:1203500,ncbitaxon:1203515,...,ncbitaxon:91850,ncbitaxon:91851,ncbitaxon:91852,ncbitaxon:91896,ncbitaxon:94394,ncbitaxon:95774,ncbitaxon:984489,ncbitaxon:984509,ncbitaxon:984539,ncbitaxon:984551
Monomeric stilbenes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,9,1,0,0
Coloratane sesquiterpenoids,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Move the class names out of the index into a regular 'class_name' column
npclassifier_family_df = (
    npclassifier_family_df
    .reset_index()
    .rename(columns={'index': 'class_name'})
)

In [13]:
npclassifier_family_df.shape

(566, 382)

In [14]:
# Keep 'class_name' plus the columns labelled with an ncbitaxon identifier;
# any other column (e.g. WFO identifiers) is dropped.
remove_cols = [
    column
    for column in npclassifier_family_df.columns
    if column != 'class_name' and not column.startswith('ncbitaxon')
]
npclassifier_family_df = npclassifier_family_df.drop(columns=remove_cols)
npclassifier_family_df.shape

(566, 382)

In [15]:
# Index the table by chemical class name
npclassifier_family_df = npclassifier_family_df.set_index('class_name')

In [16]:
npclassifier_family_df.head(1)

taxon,ncbitaxon:1003244,ncbitaxon:1003247,ncbitaxon:1003248,ncbitaxon:1003255,ncbitaxon:104773,ncbitaxon:112800,ncbitaxon:1131839,ncbitaxon:114201,ncbitaxon:1203500,ncbitaxon:1203515,...,ncbitaxon:91850,ncbitaxon:91851,ncbitaxon:91852,ncbitaxon:91896,ncbitaxon:94394,ncbitaxon:95774,ncbitaxon:984489,ncbitaxon:984509,ncbitaxon:984539,ncbitaxon:984551
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Monomeric stilbenes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,9,1,0,0


In [17]:
def normalize_df(df):
    """Return a new DataFrame in which every column is divided by its own sum.

    Each column of the result therefore holds fractions that add up to 1
    (not percentages on a 0-100 scale). The input frame is left untouched,
    so re-running the calling cell never normalizes already-normalized data
    held by another reference.

    Args:
        df: DataFrame with numeric columns.

    Returns:
        A DataFrame with the same index and columns as ``df`` holding the
        column-wise fractions. A column whose sum is 0 yields NaN values
        (0 / 0), as no fraction is defined for it.
    """
    column_totals = df.sum(axis=0)
    # Vectorized column-wise division: totals are aligned on the column labels
    return df.div(column_totals, axis='columns')

In [18]:
npclassifier_family_df = normalize_df(npclassifier_family_df)

In [19]:
# Restrict the class rows to the hand-picked classes
is_selected_row = npclassifier_family_df.index.isin(selected_classes.split('\n'))
npclassifier_family_df = npclassifier_family_df[is_selected_row]
len(npclassifier_family_df)

20

In [20]:
plant_names = ncbitaxon_curies_to_names(npclassifier_family_df.columns)

In [21]:
# Swap the ncbitaxon column labels for the names returned by the lookup
npclassifier_family_df = npclassifier_family_df.rename(columns=plant_names)

In [22]:
npclassifier_family_df.head(1)

taxon,Aptandraceae,Thesiaceae,Cervantesiaceae,Viscaceae,Siparunaceae,Achariaceae,Anastrophyllaceae,Nartheciaceae,Cystopteridaceae,Rhachidosoraceae,...,Biebersteiniaceae,Nitrariaceae,Muntingiaceae,Orobanchaceae,Sciadopityaceae,Schistochilaceae,Dumortieraceae,Lophocoleaceae,Myliaceae,Solenostomataceae
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Menthane monoterpenoids,0.0,0.0,0.0,0.016194,0.0,0.0,0.0,0.017857,0.0,0.0,...,0.0,0.009709,0.0,0.005922,0.0,0.083333,0.0,0.0,0.0,0.009434


In [23]:
# transpose matrix
npclassifier_family_df = npclassifier_family_df.T

In [24]:
npclassifier_family_df.shape

(381, 20)

In [25]:
# Keep only the rows that have a non-zero value in at least one column
has_nonzero_value = npclassifier_family_df.ne(0).any(axis=1)
npclassifier_family_df = npclassifier_family_df.loc[has_nonzero_value]

In [26]:
npclassifier_family_df.shape

(373, 20)

In [27]:
npclassifier_family_df.head(1)

class_name,Menthane monoterpenoids,Gallotannins,Isoquinoline alkaloids,Labdane diterpenoids,Oleanane triterpenoids,Flavanones,Lupane triterpenoids,Acyclic monoterpenoids,Furofuranoid lignans,Anthraquinones and anthrones,Isocoumarins,Neolignans,Iridoids monoterpenoids,Flavonols,Simple indole alkaloids,Kaurane and Phyllocladane diterpenoids,Quinoline alkaloids,Flavones,Pyridine alkaloids,Simple coumarins
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Aptandraceae,0.0,0.0,0.0,0.0,0.090909,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.045455,0.0,0.0


In [28]:
npclassifier_family_df.to_csv(f'{DATA_DIR}/subselected_family_chemical_class.tsv', sep='\t')

### Generate taxonomy tree

In [29]:
len(plant_names)

381

In [30]:
ncbi = NCBITaxa()

In [31]:
# Identifiers of the ncbitaxon entries whose name is still a row of the table
retained_names = npclassifier_family_df.index
all_plants = [
    curie
    for curie, plant_name in plant_names.items()
    if curie.startswith('ncbitaxon') and plant_name in retained_names
]
len(all_plants)

373

In [32]:
# Taxonomy as nested dicts: lineage_dict[parent_name][child_name] = depth of the child
lineage_dict = defaultdict(dict)

for curie in tqdm(all_plants):
    # Identifiers look like 'ncbitaxon:<taxid>'. Skip entries without an id;
    # the check must run before int(), which raises on an empty string.
    taxid_str = curie.partition(':')[2]
    if not taxid_str:
        continue

    # Lineage of the taxon as a list of taxids
    # (presumably ordered root-first, since the tree is later rendered from 'root')
    lineage = ncbi.get_lineage(int(taxid_str))

    # Translate the taxids into names, keeping the lineage order
    taxid_to_name = ncbi.get_taxid_translator(lineage)
    lineage_names = [taxid_to_name[taxid] for taxid in lineage]

    # Register every consecutive (parent, child) pair of the lineage.
    # Iterating over the lineage itself (not the translator dict) guarantees
    # that no trailing edge is lost if the two ever differ in length.
    for depth, (parent, child) in enumerate(zip(lineage_names, lineage_names[1:]), start=1):
        lineage_dict[parent][child] = depth

100%|██████████| 373/373 [00:00<00:00, 1178.99it/s]


In [33]:
len(lineage_dict)

189

In [34]:
def newickify(node_to_children, root_node) -> str:
    """Serialise a ``parent -> {child: distance}`` mapping as a Newick string.

    Args:
        node_to_children: mapping from each internal node name to a dict of
            its children and the distance attached to each child.
        root_node: name of the node the rendering starts from; it is written
            with distance 0.

    Returns:
        The Newick representation of the tree, terminated by ';'.

    Raises:
        AssertionError: if an internal node is reached a second time, or if
            some keys of ``node_to_children`` are never reached from
            ``root_node``.
    """
    expanded = set()

    def render(label, branch_length: float) -> str:
        assert label not in expanded, "Error: The tree may not be circular!"

        # A name without an entry in the mapping is a leaf
        if label not in node_to_children:
            return f'{label}:{branch_length}'

        # Internal node: record it before descending so a revisit is detected
        expanded.add(label)
        rendered_children = []
        for child_label, child_length in node_to_children[label].items():
            rendered_children.append(render(child_label, child_length))
        joined_children = ",".join(rendered_children)
        return f'({joined_children}){label}:{branch_length}'

    newick_string = render(root_node, 0) + ';'

    # Every key of the mapping must have been reached from the root
    assert expanded == set(node_to_children.keys()), f"Error: some nodes aren't in the tree {set(node_to_children.keys())}"

    return newick_string

In [35]:
string = newickify(
    node_to_children=lineage_dict,
    root_node='root',  # root node
)

In [36]:
biophylo_tree = Phylo.read(StringIO(string), "newick")

In [37]:
# Number of leaves in the parsed tree
len(biophylo_tree.get_terminals())

373

In [38]:
Phylo.write(biophylo_tree, f'{DATA_DIR}/taxonomy_trees.nwk', "newick")

1