# Taxonomy Finder

## Platform dependent part
- Resolve platform setup
- The difference from local imports should be resolved by setting up the Blue Cloud VRE properly; Colab will still be an issue.

In [1]:
import sys
import os
import logging
from IPython import get_ipython
logger = logging.getLogger(name="Taxonomic finder")

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # NOTE: os.system does not raise when the command fails; it returns the
    # command's exit status, so success must be judged from that value.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        # A non-zero status is also what git reports when the target
        # directory already exists (e.g. when this cell is re-run).
        logger.warning(
            f"An error occurred while cloning the repository: exit status {clone_status}"
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.warning(
            f"An error occurred while installing marine-omics: exit status {install_status}"
        )

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging
# NOTE(review): reconfig_logger comes from momics.utils; presumably it installs
# the handlers/format seen in the cell output -- confirm against the package.
reconfig_logger()

# Determine the notebook environment
# `env` is reused at the end of the notebook when the app is served.
env = get_notebook_environment()

# NOTE(review): init_setup is an opaque momics.utils helper; its side effects
# are not visible from this notebook.
init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Taxonomic finder | Environment: vscode
INFO | Taxonomic finder | Environment: vscode


## Imports

In [2]:
import warnings
import holoviews as hv
from skbio.stats.ordination import pcoa

from functools import partial
warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn

# All low level functions are imported from the momics package
import momics.plotting as pl
from momics.panel_utils import (
    tax_finder_selector, create_indicators_diversity,
    serve_app, close_server,
)

from momics.diversity import (
    beta_diversity_parametrized,
    find_taxa_in_table,
)
from momics.utils import load_and_clean

In [3]:
from typing import Tuple

def beta_plot_abund_taxa(
    table: pd.DataFrame,
    metadata: pd.DataFrame,
    found_taxa: pd.DataFrame,
    taxon: str = "ncbi_tax_id",
    **kwargs,
) -> Tuple[hv.element.Scatter, Tuple[float, float]]:
    """
    Creates a beta diversity PCoA plot colored by the abundance of the found taxa.

    The Bray-Curtis beta diversity of ``table`` is ordinated with PCoA, the
    sample coordinates are joined with ``metadata``, and every sample gets a
    ``found_abundance`` value: the summed ``abundance`` of its rows in
    ``found_taxa`` (0 for samples without any match).

    Args:
        table (pd.DataFrame): DataFrame containing species abundances.
        metadata (pd.DataFrame): A DataFrame containing metadata. Its index
            must hold exactly the same labels as the PCoA sample index.
        found_taxa (pd.DataFrame): Matching taxa; must provide the columns
            'source material ID' (sample label) and 'abundance'.
        taxon (str, optional): The taxon level for beta diversity calculation.
            Defaults to "ncbi_tax_id".
        **kwargs: ``log_scale`` (default False) is forwarded to the plotting
            function; other keys are ignored.

    Returns:
        Tuple[hv.element.Scatter, Tuple[float, float]]: A tuple containing the
        beta diversity PCoA plot and the explained variance for PC1 and PC2.

    Raises:
        ValueError: If the metadata index labels differ from the PCoA samples.
    """
    log_scale = kwargs.get('log_scale', False)
    beta = beta_diversity_parametrized(
        table, taxon=taxon, metric="braycurtis"
    )
    pcoa_result = pcoa(beta, method="eigh")

    # Use explicit positional access: plain [0]/[1] on a label-indexed Series
    # relies on a deprecated positional fallback.
    proportion = pcoa_result.proportion_explained
    explained_variance = (proportion.iloc[0], proportion.iloc[1])

    if set(pcoa_result.samples.index) != set(metadata.index):
        raise ValueError("Metadata index name does not match PCoA result.")

    pcoa_df = pd.merge(
        pcoa_result.samples,
        metadata,
        left_index=True,
        right_index=True,
        how="inner",
    )

    # Align the per-sample abundance sums to the PCoA rows in one step.
    # Samples without matches get 0; sample IDs that are absent from the PCoA
    # frame are dropped instead of being appended as all-NaN rows, which is
    # what label-wise `.loc` assignment would do.
    abundance_sum = found_taxa.groupby('source material ID')['abundance'].sum()
    pcoa_df['found_abundance'] = abundance_sum.reindex(
        pcoa_df.index, fill_value=0
    ).to_numpy()

    return (
        pl.hvplot_plot_pcoa_black(
            pcoa_df, color_by='found_abundance', explained_variance=explained_variance,
            log_scale=log_scale,
            pallette="Viridis",
        ),
        explained_variance,
    )

## User settings

In [4]:
# When True, the loaded table names and the categorical/numerical metadata
# column lists are logged after loading.
DEBUG = True  # enable stdout logging

## Loading

In [5]:
# parquet files
# On Google Colab the repository is cloned to /content/momics-demos; in every
# other environment the repository root is the parent of the working directory.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

assets_folder = os.path.join(root_folder, 'assets')

In [6]:
def get_valid_samples():
    """Read the valid-samples CSV from the repository's data folder.

    Returns:
        pd.DataFrame: Contents of 'data/shipment_b1b2_181.csv' under root_folder.
    """
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

valid_samples = get_valid_samples()

In [7]:
# High level function from the momics.utils module
# full_metadata is used below as a DataFrame (select_dtypes); mgf_parquet_dfs is
# used as a dict of tables keyed by table name (e.g. 'lsu', 'ssu').
full_metadata, mgf_parquet_dfs = load_and_clean(valid_samples=valid_samples)

In [8]:
# Names of the metadata columns with object or boolean dtype, sorted.
categorical_columns = sorted(
    full_metadata.select_dtypes(include=['object', "boolean"]).columns
)

# Names of the metadata columns with int64 or float64 dtype, sorted.
numerical_columns = sorted(
    full_metadata.select_dtypes(include=['int64', 'float64']).columns
)

if DEBUG:
    # Report what was loaded: table names first, then both column lists.
    for _debug_message in (
        f"Data table names are:\n{mgf_parquet_dfs.keys()}",
        f"Categorical metadata columns are:\n{categorical_columns}",
        f"Numerical metadata columns are:\n{numerical_columns}",
    ):
        logger.info(_debug_message)

INFO | Taxonomic finder | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Taxonomic finder | Categorical metadata columns are:
['ammonium method', 'chlorophyll method', 'conductivity method', 'country', 'density method', 'dissolved oxygen method', 'environment (biome)', 'environment (feature)', 'environment (material)', 'environmental package', 'investigation type', 'month name', 'nitrate method', 'nitrite method', 'observatory ID', 'observatory local location', 'observatory location ocean or sea', 'observatory regional location', 'organism count', 'organism count method', 'organization', 'organization country', 'pH method', 'phaeopigments method', 'phosphate method', 'pigments (ug/l)', 'pigments method', 'pressure method', 'project name', 'replicate info', 'replicate number', 'sample collection device or method', 'sea subsurface salinity method', 'sea subsurface temperature method', 'sea surface salinity method', 'sea surface temperature me

In [9]:
# Keep independent copies of the two taxonomy tables only (LSU and SSU).
tables = {
    table_name: mgf_parquet_dfs[table_name].copy()
    for table_name in ("lsu", "ssu")
}

TAXONOMY = pd.DataFrame()
TAXONOMY_RANKS = [
    'superkingdom',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

## APP setup

In [10]:
# Load the Panel/HoloViews extensions needed by the widgets and plots below.
pn.extension("tabulator")
hv.extension("bokeh", "plotly")
if 'google.colab' in str(get_ipython()):
    # Colab needs its own comms setting.
    pn.extension(comms='colab')
ACCENT = "teal"

# CSS applied to the tabs container.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# Sidebar widgets whose `.value` attributes drive the taxonomy search.
(select_table_tax,
 tax_level,
 search_term,
 checkbox_exact_match,
 log_scale_checkbox,
) = tax_finder_selector()

# NOTE(review): `backend` is not referenced anywhere else in the visible
# notebook -- confirm whether it is still needed.
backend = pn.widgets.RadioBoxGroup(
    name='Backend',
    options=['matplotlib', 'hvplot'],
    inline=True,
)
backend.value = 'hvplot'

# Memory-usage widgets; both are refreshed by update_used_gb.
progress_bar, indicator_usage = create_indicators_diversity()

def update_used_gb(event):
    """Refresh the memory widgets from the current memory load.

    Args:
        event: Any value; nothing is updated when it is falsy.
    """
    if event:
        used, total = memory_load()
        # Progress bar shows the used share as a whole-number percentage.
        progress_bar.value = int(used / total * 100)
        indicator_usage.value = used

# show indicator of the explained variance
# The value is a percentage (set to sum(PC1, PC2) * 100 after each search);
# the color thresholds are given on that same scale.
explained_var_indicator = pn.indicators.Number(
    name='Explained variance by PC1 + PC2', value=0, format='{value:.1f}%',
    font_size='20pt',
    title_size='12pt',
    colors=[(33, 'red'), (50, 'gold'), (66, 'green')]
)

# Empty pane; its `.object` is filled with the PCoA plot after each search.
tax_plot_beta = pn.pane.HoloViews(
    name="Beta PCoA",
    width=1200,
    height=500,
)

### Update methods

In [11]:
def update_tax_plot_beta(tables, metadata):
    """Run the taxonomy search and refresh the results table, plot and indicator.

    Args:
        tables: Mapping of table name to abundance table; the one chosen in
            ``select_table_tax`` is searched and plotted.
        metadata: Sample metadata passed on to ``beta_plot_abund_taxa``.
    """
    level = tax_level.value
    selected_table = tables[select_table_tax.value]

    found_taxa = find_taxa_in_table(
        table=selected_table,
        tax_level=level,
        search_term=search_term.value,
        ncbi_tax_id=(level == 'ncbi_tax_id'),
        exact_match=checkbox_exact_match.value,
    )
    logger.info(f"Found {len(found_taxa)} matching taxa for search term '{search_term.value}' at level '{tax_level.value}'")

    # Replace the second tab with a fresh table of the matches.
    tabs[1] = pn.widgets.Tabulator(
        found_taxa,
        name='Found taxa table',
        page_size=25,
    )

    # 'all' and 'ncbi_tax_id' are computed at phylum level for beta diversity.
    if level in ['all', 'ncbi_tax_id']:
        beta_taxon = 'phylum'
    else:
        beta_taxon = level

    plot, explained_var = beta_plot_abund_taxa(
        table=selected_table,
        metadata=metadata,
        found_taxa=found_taxa,
        taxon=beta_taxon,
        log_scale=log_scale_checkbox.value,
    )
    tax_plot_beta.object = plot
    pcoa_tab.sizing_mode = "stretch_both"

    # convert to percentage
    explained_var_indicator.value = sum(explained_var) * 100

### Buttons

In [12]:
# Sidebar button that triggers the taxonomy search.
button_run_finder = pn.widgets.Button(
    name="Find taxa",
    button_type="primary",
    description="This will find the taxa and update the plots.",
    width=200,
)

# The click event itself is ignored; the search always runs on the
# module-level `tables` and `full_metadata`.
button_run_finder.on_click(
    lambda event: update_tax_plot_beta(
        tables,
        full_metadata,
    )
)

Watcher(inst=Button(button_type='primary', description='This will find t..., name='Find taxa', width=200), cls=<class 'panel.widgets.button.Button'>, fn=<function <lambda> at 0x7acc37ebc400>, mode='args', onlychanged=False, parameter_names=('clicks',), what='value', queued=False, precedence=0)

### Tabs

In [13]:
# First tab: explained-variance indicator stacked above the PCoA plot.
pcoa_tab = pn.Column(
    explained_var_indicator,
    tax_plot_beta,
    sizing_mode="stretch_both",
    min_width=600,
    min_height=400,
    scroll=True,
)

# The second tab starts as an empty placeholder; update_tax_plot_beta replaces
# the item at index 1 with a Tabulator of the found taxa.
tabs = pn.Tabs(
    ('PCoA', pcoa_tab),
    ('Found taxa table', ""),
    styles=styles,
    margin=10,
)

## APP

In [14]:
def app():
    """Assemble the Taxonomic Finder dashboard.

    Registers a periodic memory-usage refresh (once per second, switchable
    with a toggle) and lays out the sidebar search widgets and the main tabs.

    Returns:
        pn.template.FastListTemplate: The assembled application template.
    """
    # update_used_gb refreshes BOTH the progress bar and the usage indicator,
    # so a single periodic callback is enough; a second one would only call
    # memory_load() twice per period. The widget passed as the first argument
    # only serves as a truthy "event" value.
    cb = pn.state.add_periodic_callback(
        partial(update_used_gb, indicator_usage),
        period=1000,
        timeout=None,
    )
    # The toggle starts/stops the periodic refresh through its `running` flag.
    toggle = pn.widgets.Toggle(name='Toggle callback', value=True)
    toggle.link(cb, bidirectional=True, value='running')

    indicators = pn.FlexBox(
        progress_bar, indicator_usage, toggle)

    template = pn.template.FastListTemplate(
        title="Taxonomic Finder",
        sidebar=[
            "# Search Parameters", select_table_tax, tax_level, search_term, checkbox_exact_match, log_scale_checkbox,
            pn.layout.Divider(),
            button_run_finder,
        ],
        main=[pn.Column(
                indicators,
                tabs,
                sizing_mode="stretch_both",
                ),
        ],
        main_layout=None,
        accent=ACCENT,
    )
    return template

# Build the dashboard once; how it is shown depends on the environment.
template = app()

# On Colab the app is served through serve_app and the handle is kept in `s`
# (used by close_server below); elsewhere the template is marked as servable.
# NOTE(review): name="diversity_analysis" looks copied from another notebook --
# confirm it is intended for the Taxonomic Finder.
if 'google.colab' in str(get_ipython()):  
    s = serve_app(template, env=env, name="diversity_analysis")
else:
    template.servable()

### Uncomment this if you are running an ngrok tunnel and want to shut it down

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)