# Identify samples which contain certain taxa

## Platform dependent part
- Resolve platform setup
- Differences from local imports should be resolved by configuring the Blue Cloud VRE properly; Colab will still be an issue.

In [1]:
import sys
import os
import logging
from IPython import get_ipython
logger = logging.getLogger(name="Taxonomic finder")

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # Clone the momics-demos repository so it can be used to load data.
    # os.system() does not raise when the command itself fails; it returns the
    # command's exit status, so that value must be checked explicitly.
    try:
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.info(f"An error occurred while cloning the repository: {e}")
    else:
        if clone_status == 0:
            logger.info("Repository cloned")
        else:
            logger.warning(
                f"An error occurred while cloning the repository: exit status {clone_status}"
            )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.warning(
            f"An error occurred while installing marine-omics: exit status {install_status}"
        )

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging
reconfig_logger()

# Determine the notebook environment
env = get_notebook_environment()

init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Taxonomic finder | Environment: vscode
INFO | Taxonomic finder | Environment: vscode


## Imports

In [2]:
import warnings
import holoviews as hv
from skbio.stats.ordination import pcoa

warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn

# from mgo.udal import UDAL

# All low level functions are imported from the momics package
import momics.plotting as pl
from momics.panel_utils import tax_finder_selector

from momics.diversity import (
    beta_diversity_parametrized,
    find_taxa_in_table,
)
from momics.utils import load_and_clean

In [12]:
from typing import Tuple

def beta_plot_abund_taxa(
    table: pd.DataFrame,
    metadata: pd.DataFrame,
    found_taxa: pd.DataFrame,
    taxon: str = "ncbi_tax_id",
    **kwargs,
) -> Tuple[hv.element.Scatter, Tuple[float, float]]:
    """
    Creates a beta diversity PCoA plot colored by the abundance of found taxa.

    Args:
        table (pd.DataFrame): DataFrame containing species abundances.
        metadata (pd.DataFrame): A DataFrame containing metadata. Its index
            must contain exactly the same labels as the PCoA sample index.
        found_taxa (pd.DataFrame): Taxa search result. Must be groupable by
            'source material ID' and have an 'abundance' column; abundances
            are summed per 'source material ID' and used to color the points.
        taxon (str, optional): The taxon level for beta diversity calculation. Defaults to "ncbi_tax_id".
        **kwargs: Optional settings. Only ``log_scale`` (default ``False``) is
            read; it is forwarded to the plotting function.

    Returns:
        Tuple[hv.element.Scatter, Tuple[float, float]]: A tuple containing the beta diversity PCoA plot and the explained variance for PC1 and PC2.

    Raises:
        ValueError: If the metadata index and the PCoA sample index do not
            hold the same set of labels.
    """
    log_scale = kwargs.get('log_scale', False)
    beta = beta_diversity_parametrized(
        table, taxon=taxon, metric="braycurtis"
    )
    pcoa_result = pcoa(beta, method="eigh")

    # proportion_explained is label-indexed, so take the first two entries
    # by position explicitly rather than with integer keys.
    proportions = pcoa_result.proportion_explained
    explained_variance = (proportions.iloc[0], proportions.iloc[1])

    if set(pcoa_result.samples.index) != set(metadata.index):
        raise ValueError("Metadata index name does not match PCoA result.")

    pcoa_df = pd.merge(
        pcoa_result.samples,
        metadata,
        left_index=True,
        right_index=True,
        how="inner",
    )

    # Total abundance of the found taxa per sample. Reindexing onto the
    # ordination samples gives 0 to samples without a hit and ignores IDs
    # that are absent from the ordination (a label-based `.loc` assignment
    # would silently append NaN-filled rows for those).
    abundance_sum = found_taxa.groupby('source material ID')['abundance'].sum()
    pcoa_df['found_abundance'] = abundance_sum.reindex(
        pcoa_df.index, fill_value=0
    ).to_numpy()

    return (
        pl.hvplot_plot_pcoa_black(
            pcoa_df, color_by='found_abundance', explained_variance=explained_variance,
            log_scale=log_scale,
            pallette="Viridis",
        ),
        explained_variance,
    )

## User settings

In [3]:
DEBUG = True  # enable stdout logging

## Loading

In [4]:
# parquet files
# On Colab the repository is cloned to a fixed location; everywhere else the
# repository root is taken to be the parent of the working directory.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Assets directory inside the repository root
assets_folder = os.path.join(root_folder, 'assets')

In [5]:
def get_valid_samples():
    """Read ``data/shipment_b1b2_181.csv`` under ``root_folder`` into a DataFrame."""
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

valid_samples = get_valid_samples()

In [6]:
# High level function from the momics.utils module
full_metadata, mgf_parquet_dfs = load_and_clean(valid_samples=valid_samples)

In [7]:
# Names of the metadata columns with object/boolean dtype, sorted alphabetically
categorical_columns = sorted(
    full_metadata.select_dtypes(include=['object', "boolean"]).columns
)

# Names of the metadata columns with int64/float64 dtype, sorted alphabetically
numerical_columns = sorted(
    full_metadata.select_dtypes(include=['int64', 'float64']).columns
)

# With DEBUG enabled, report which tables and metadata columns were loaded
if DEBUG:
    for _msg in (
        f"Data table names are:\n{mgf_parquet_dfs.keys()}",
        f"Categorical metadata columns are:\n{categorical_columns}",
        f"Numerical metadata columns are:\n{numerical_columns}",
    ):
        logger.info(_msg)

INFO | Taxonomic finder | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Taxonomic finder | Categorical metadata columns are:
['ammonium method', 'chlorophyll method', 'conductivity method', 'country', 'density method', 'dissolved oxygen method', 'environment (biome)', 'environment (feature)', 'environment (material)', 'environmental package', 'investigation type', 'month name', 'nitrate method', 'nitrite method', 'observatory ID', 'observatory local location', 'observatory location ocean or sea', 'observatory regional location', 'organism count', 'organism count method', 'organization', 'organization country', 'pH method', 'phaeopigments method', 'phosphate method', 'pigments (ug/l)', 'pigments method', 'pressure method', 'project name', 'replicate info', 'replicate number', 'sample collection device or method', 'sea subsurface salinity method', 'sea subsurface temperature method', 'sea surface salinity method', 'sea surface temperature me

In [8]:
# Keep independent copies of just the two taxonomy tables (LSU and SSU)
tables = {key: mgf_parquet_dfs[key].copy() for key in ("lsu", "ssu")}

# Empty placeholder frame and the ordered list of taxonomic rank names
TAXONOMY = pd.DataFrame()
TAXONOMY_RANKS = [
    'superkingdom',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

## Taxonomy finder

In [11]:
# Build the selector controls for the taxonomy finder. tax_finder_selector()
# is imported from momics.panel_utils; its five return values are unpacked in
# order. NOTE(review): presumably Panel widgets — their `.value` attributes are
# read when the search is run.
(select_table_tax,
 tax_level,
 search_term,
 checkbox_exact_match,
 log_scale_checkbox,
) = tax_finder_selector()

# HoloViews pane created without content; its `.object` is assigned once the
# beta-diversity PCoA plot has been computed.
tax_plot_beta = pn.pane.HoloViews(
    name="Beta PCoA",
    width=1200,
    height=500,
)

In [24]:
# Show all selector controls together in a single Panel column so the user can
# set the table, taxonomic level, search term and options before searching.
pn.Column(
    select_table_tax,
    tax_level,
    search_term,
    checkbox_exact_match,
    log_scale_checkbox,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'59326a34-8cb4-496d-87ad-eca9066b0552': {'version…

In [26]:
# Search by NCBI taxonomy ID only when that level is selected
ncbi_tax_id = tax_level.value == 'ncbi_tax_id'

# Look up the requested taxa in the currently selected taxonomy table
found_taxa = find_taxa_in_table(
    table=tables[select_table_tax.value],
    tax_level=tax_level.value,
    search_term=search_term.value,
    ncbi_tax_id=ncbi_tax_id,
    exact_match=checkbox_exact_match.value,
)

# Recompute the PCoA plot; the 'all' and 'ncbi_tax_id' selections fall back to
# 'phylum' as the level used for the beta diversity calculation.
tax_plot_beta.object, explained_var = beta_plot_abund_taxa(
    table=tables[select_table_tax.value],
    metadata=full_metadata,
    found_taxa=found_taxa,
    taxon=tax_level.value if tax_level.value not in ('all', 'ncbi_tax_id') else 'phylum',
    log_scale=log_scale_checkbox.value,
)

# Combined explained variance of PC1 and PC2, expressed as a percentage
explained_var_indicator = 100 * sum(explained_var)

### Visualize

In [21]:
tax_plot_beta

BokehModel(combine_events=True, render_bundle={'docs_json': {'52f37651-4fb9-4e74-b407-266812c2ac2d': {'version…

In [27]:
found_taxa

Unnamed: 0_level_0,Unnamed: 1_level_0,abundance,superkingdom,kingdom,phylum,class,order,family,genus,species
source material ID,ncbi_tax_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
EMOBON_BPNS_So_5,286,2.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_BPNS_So_6,286,16.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_BPNS_So_13,286,23.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_NRMCB_So_1,286,3.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_NRMCB_So_7,286,2.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
...,...,...,...,...,...,...,...,...,...,...
EMOBON_VB_Wa_94,286,2.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_VB_Wa_140,286,8.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_VB_Wa_141,286,4.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
EMOBON_VB_Wa_137,286,3.0,Bacteria,,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,
