# Exploring the NP classifier chemical classes 

### Import modules

In [1]:
from io import StringIO
from collections import defaultdict
import os

import pandas as pd
from tqdm import tqdm

# get names from ncbitaxon ontology
from utils import ncbitaxon_curies_to_names

# Taxonomy tree
from ete3 import NCBITaxa
from Bio import Phylo

In [2]:
# Directory the input TSV is read from and the result files are written to
DATA_DIR = '../data'

### Load NP classifier file

In [3]:
# Load the tab-separated NP classifier family table from DATA_DIR
npclassifier_family_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'np_classifier_family_vector.tsv'),
    delimiter='\t',
)

In [4]:
# Merge the non-medicinal and medicinal rows of each taxon by summing them.
# Grouping on 'taxon' already makes it the index of the result, so no separate
# in-place set_index step is needed (which also keeps this cell re-runnable),
# and `.sum()` replaces passing the builtin `sum`, which pandas deprecates as
# a groupby aggregation callable.
npclassifier_family_df = npclassifier_family_df.groupby('taxon').sum()

### Get family-based counts

In [5]:
# Flip the table: the former columns become the rows, taxa become the columns
npclassifier_family_df = npclassifier_family_df.transpose()

In [6]:
# For every row (class), count how many columns (taxa) hold a non-zero value
family_class_score = (
    npclassifier_family_df
    .astype(bool)
    .sum(axis=1)
    .to_frame(name='no.of.families')
    .reset_index()
    .rename(columns={'index': 'class'})
)
family_class_score.sort_values(by='no.of.families', ascending=False).head(4)

Unnamed: 0,class,no.of.families
344,Flavonols,305
170,Stigmastane steroids,290
233,Cinnamic acids and derivatives,275
445,Flavones,274


In [7]:
# Same table sorted ascending: the four classes found in the fewest families
family_class_score.sort_values(by='no.of.families', ascending=True).head(4)

Unnamed: 0,class,no.of.families
439,3-oligoenoyltetramic acids,0
12,Sterpurane sesquiterpenoids,0
198,Polyene macrolides,1
477,RiPPs-Lasso peptides,1


## Subselecting chemical classes

In [8]:
# Newline-separated names of the chemical classes to keep; the string is split
# on '\n' when the table is subset further down.
selected_classes = """Dihydroflavonols
Neolignans
Eudesmane sesquiterpenoids
Chalcones
Monocyclic monoterpenoids
Furofuranoid lignans
Gallotannins
Acyclic monoterpenoids
Flavan-3-ols
Ursane and Taraxastane triterpenoids
Menthane monoterpenoids
Flavanones
Shikimic acids and derivatives
Lupane triterpenoids
Simple coumarins
Oleanane triterpenoids
Flavones
Cinnamic acids and derivatives
Stigmastane steroids
Flavonols"""

In [9]:
# Give the row index the name 'class_name':
# move it into a column, rename that column, then set it back as the index
npclassifier_family_df = (
    npclassifier_family_df
    .reset_index()
    .rename(columns={'index': 'class_name'})
    .set_index('class_name')
)

In [10]:
def normalize_df(df):
    """Scale every column of ``df`` so that its values sum to 1.

    Each value is divided by the total of its column, so the result holds
    fractions of the column total (0-1), not values multiplied by 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to normalize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns. The input frame is left
        untouched. A column whose total is 0 cannot be scaled; its entries are
        whatever the division yields (NaN for 0/0).
    """
    # One vectorized division aligned on the column labels, instead of
    # overwriting the caller's columns one at a time.
    return df.div(df.sum(axis=0), axis='columns')

In [11]:
# Turn the per-taxon counts into fractions of each taxon's column total
npclassifier_family_df = npclassifier_family_df.pipe(normalize_df)

In [12]:
# Keep only the rows whose class name is listed in `selected_classes`
npclassifier_family_df = npclassifier_family_df.loc[
    npclassifier_family_df.index.isin(selected_classes.split('\n'))
]
npclassifier_family_df.shape[0]

20

In [13]:
# Look up a name for every column label via the project helper.
# NOTE(review): the result is used below as a mapping (column renamer, and
# iterated/indexed by key) — presumably a dict of CURIE -> name; confirm in utils.
plant_names = ncbitaxon_curies_to_names(npclassifier_family_df.columns)

In [14]:
# Swap the column labels for the plant names found in `plant_names`
# (assignment instead of inplace=True, since the frame comes from a filter)
npclassifier_family_df = npclassifier_family_df.rename(columns=plant_names)

In [15]:
# Show the first row to check the renamed columns
npclassifier_family_df.head(1)

taxon,Aptandraceae,Thesiaceae,Cervantesiaceae,Viscaceae,Siparunaceae,Achariaceae,Anastrophyllaceae,Nartheciaceae,Cystopteridaceae,Rhachidosoraceae,...,Biebersteiniaceae,Nitrariaceae,Muntingiaceae,Orobanchaceae,Sciadopityaceae,Schistochilaceae,Dumortieraceae,Lophocoleaceae,Myliaceae,Solenostomataceae
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Furofuranoid lignans,0.0,0.0,0.026667,0.020243,0.0,0.0,0.030769,0.017857,0.0,0.0,...,0.0,0.0,0.0,0.020305,0.0,0.0,0.0,0.027027,0.0,0.009434


In [16]:
# Transpose so that each row is a taxon and each column a chemical class
npclassifier_family_df = npclassifier_family_df.transpose()

In [17]:
# Keep only the rows that have at least one non-zero column, then save as TSV
npclassifier_family_df = npclassifier_family_df.loc[
    npclassifier_family_df.ne(0).any(axis='columns')
]
npclassifier_family_df.to_csv(f'{DATA_DIR}/chemical_classes_family.tsv', sep='\t')

In [18]:
# Show the first two rows of the table that was just written
npclassifier_family_df.head(2)

class_name,Furofuranoid lignans,Dihydroflavonols,Shikimic acids and derivatives,Eudesmane sesquiterpenoids,Stigmastane steroids,Lupane triterpenoids,Chalcones,Cinnamic acids and derivatives,Menthane monoterpenoids,Flavan-3-ols,Oleanane triterpenoids,Flavonols,Ursane and Taraxastane triterpenoids,Gallotannins,Acyclic monoterpenoids,Simple coumarins,Flavanones,Monocyclic monoterpenoids,Flavones,Neolignans
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Aptandraceae,0.0,0.0,0.045455,0.045455,0.136364,0.045455,0.0,0.045455,0.0,0.0,0.090909,0.045455,0.090909,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455
Thesiaceae,0.0,0.0,0.0,0.0,0.033333,0.233333,0.033333,0.0,0.0,0.0,0.233333,0.033333,0.033333,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0


### Generate taxonomy tree

In [19]:
# ete3 handle on the NCBI taxonomy; used below for get_lineage() and
# get_taxid_translator().
# NOTE(review): ete3 may need to download/build its local taxonomy database
# the first time this runs — confirm for fresh environments.
ncbi = NCBITaxa()

In [20]:
# CURIEs with the 'ncbitaxon' prefix whose name is still a row of the table
all_plants = [
    curie
    for curie, taxon_name in plant_names.items()
    if curie.startswith('ncbitaxon') and taxon_name in npclassifier_family_df.index
]
len(all_plants)

373

In [21]:
# parent taxon name -> {child taxon name: position of the child in the lineage}
lineage_dict = defaultdict(dict)

for plant in tqdm(all_plants):
    # CURIEs have the form '<prefix>:<taxid>'. Validate the id part while it
    # is still a string: an int never compares equal to '', and int('')
    # would raise ValueError before any such check could run.
    curie_parts = plant.split(':')
    taxid_text = curie_parts[1] if len(curie_parts) > 1 else ''
    if taxid_text == '':
        continue
    taxid = int(taxid_text)

    # Lineage of taxon ids as returned by ete3; the whole lineage is kept
    # (no restriction to the Viridiplantae subtree).
    lineage = ncbi.get_lineage(taxid)

    # Replace the ids with their names
    names = ncbi.get_taxid_translator(lineage)
    lineage_names = [names[node_id] for node_id in lineage]

    # Register every consecutive (parent, child) pair. The stored value is the
    # child's 1-based position in the lineage; it is later emitted as the
    # branch length in the Newick string.
    for depth, (parent, child) in enumerate(zip(lineage_names, lineage_names[1:]), start=1):
        lineage_dict[parent][child] = depth

100%|██████████| 373/373 [00:00<00:00, 2323.12it/s]


In [22]:
def newickify(node_to_children, root_node) -> str:
    """Render a ``{parent: {child: distance}}`` mapping as a Newick string.

    Rendering starts at ``root_node`` (written with distance 0) and descends
    recursively. A name that is not a key of ``node_to_children`` is written
    as a leaf.

    Raises
    ------
    AssertionError
        If an already expanded node is reached again, or if some keys of
        ``node_to_children`` are never reached from ``root_node``.
    """
    expanded = set()

    def render(label, branch_length: float) -> str:
        assert label not in expanded, "Error: The tree may not be circular!"

        # Leaves: names that have no entry of their own
        if label not in node_to_children:
            return f'{label}:{branch_length}'

        # Internal nodes: mark as expanded before descending into the children
        expanded.add(label)
        rendered_children = ",".join(
            render(child_label, child_length)
            for child_label, child_length in node_to_children[label].items()
        )
        return f'({rendered_children}){label}:{branch_length}'

    newick = render(root_node, 0) + ';'

    # Every key of the mapping must have been expanded exactly once.
    assert expanded == set(node_to_children.keys()), f"Error: some nodes aren't in the tree {set(node_to_children.keys())}"

    return newick

In [23]:
# Serialise the lineage mapping, starting from the node named 'root'
string = newickify(lineage_dict, 'root')

In [24]:
# Parse the Newick text into a Bio.Phylo tree object
biophylo_tree = Phylo.read(StringIO(string), "newick")

In [25]:
# Number of leaves in the parsed tree
len(list(biophylo_tree.get_terminals()))

373

In [26]:
# Save the tree in Newick format under DATA_DIR. The displayed value is
# Phylo.write's return value (presumably the number of trees written — confirm).
Phylo.write(biophylo_tree, f'{DATA_DIR}/taxonomy_tree_1B.nwk', "newick")

1