# Visualize taxonomy and alpha/beta diversities

## Platform dependent part
- Resolve platform setup
- Differences from local imports should be resolved by configuring the Blue Cloud VRE properly; Colab will still be an issue.

In [1]:
import sys
import os
import logging
from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")

# On Google Colab the demo repository and the package have to be fetched
# before the `momics` imports further down can succeed.
if 'google.colab' in str(get_ipython()):
    print(
        'Setting Google colab, you will need a ngrok account to make the '
        'dashboard display over the tunnel. https://ngrok.com/'
    )
    # Clone the momics-demos repository to use it to load data.
    # os.system() does not raise when the command fails; it returns the exit
    # status, so success has to be checked on the returned value.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        logger.error(
            f"An error occurred while cloning the repository: exit status {clone_status}"
        )

    sys.path.insert(0, '/content/momics-demos')

    # This step takes time because of the many dependencies.
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.error(f"Installing marine-omics failed: exit status {install_status}")

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging through the momics helper before anything is logged below.
# NOTE(review): the recorded output shows the "Environment" message twice,
# which suggests the record reaches two handlers — confirm in reconfig_logger.
reconfig_logger()

# Determine the notebook environment (the recorded run reports "vscode")
env = get_notebook_environment()

# momics.utils helper; what it configures is not visible from this notebook.
init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Diversity analysis app | Environment: vscode
INFO | Diversity analysis app | Environment: vscode


## Imports

In [2]:
import warnings
import holoviews as hv

from functools import partial
warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn

from mgo.udal import UDAL

# All low level functions are imported from the momics package
from momics.loader import load_parquets_udal
from momics.metadata import get_metadata_udal, enhance_metadata
import momics.plotting as pl
from momics.panel_utils import (
    diversity_select_widgets, create_indicators_diversity,
    serve_app, close_server,
)

from momics.diversity import (
    beta_diversity_parametrized,
)

from momics.taxonomy import (
    fill_taxonomy_placeholders,
    remove_high_taxa,
    prevalence_cutoff_taxonomy,
)

## User settings

In [3]:
DEBUG = True  # when True, extra diagnostics (table names, column lists, ref_code counts) are logged

## Loading

In [4]:
# UDAL instance from mgo.udal.
# NOTE(review): `udal` is not referenced again in the visible cells (the
# *_udal loaders are called without arguments) — confirm it is still needed.
udal = UDAL()

In [5]:
# Resolve the repository root: the cloned repository on Google Colab,
# otherwise the parent of the current working directory.
colab_session = 'google.colab' in str(get_ipython())
root_folder = os.path.abspath('/content/momics-demos' if colab_session else '../')

assets_folder = os.path.join(root_folder, 'assets')

In [6]:
@pn.cache()
def get_data():
    """Return the tables produced by ``load_parquets_udal`` (wrapped in ``pn.cache``)."""
    parquet_tables = load_parquets_udal()
    return parquet_tables

# Metadata loader
@pn.cache()
def get_full_metadata():
    """Return the metadata produced by ``get_metadata_udal`` (wrapped in ``pn.cache``)."""
    metadata = get_metadata_udal()
    return metadata

@pn.cache()
def get_valid_samples():
    """Read ``data/shipment_b1b2_181.csv`` under ``root_folder`` into a DataFrame."""
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

In [7]:
# Raw metadata as returned by the loader
raw_metadata = get_full_metadata()

# Table of the valid 181 samples, used to filter the metadata
valid_samples = get_valid_samples()
full_metadata = enhance_metadata(raw_metadata, valid_samples)

# Load the data tables
mgf_parquet_dfs = get_data()

In [8]:
# Categorical metadata columns (object / boolean dtypes), minus the columns
# that are not needed for the analysis.
cat_to_remove = [
    "ref_code", "samp_description", "source_mat_id", "source_mat_id_orig",
    'ENA_accession_number_sample',
]
categorical_columns = [
    col
    for col in sorted(full_metadata.select_dtypes(include=['object', "boolean"]).columns)
    if col not in cat_to_remove
]

# Numerical metadata columns (int64 / float64 dtypes), minus the excluded ones.
# Filtering against a set instead of calling list.remove() per column means a
# column that is absent from the metadata (or has a different dtype in another
# data release) is skipped rather than raising ValueError.
num_to_remove = {
    "chem_administration",
    "bac_prod",
    "bac_prod_method",
    "biomass",
    "biomass_method",
    "diss_carb_dioxide",
    "diss_org_carb",
    "diss_org_carb_method",
    "diss_inorg_carb",
    "diss_inorg_carb_method",
    "diss_org_nitro",
    "diss_carb_dioxide_method",
    "diss_org_nitro_method",
    "down_par",
    "down_par_method",
    "long_store",
    "membr_cut",
    "n_alkanes",
    "n_alkanes_method",
    "part_org_carb",
    "part_org_carb_method",
    "part_org_nitro",
    "part_org_nitro_method",
    "petroleum_hydrocarb",
    "petroleum_hydrocarb_method",
    "sulfate",
    "sulfate_method",
    "sulfide",
    "sulfide_method",
    "water_current",
    "water_current_method",
}
numerical_columns = [
    col
    for col in sorted(full_metadata.select_dtypes(include=['int64', 'float64']).columns)
    if col not in num_to_remove
]

# assert len(full_metadata.columns) == len(numerical_columns) + len(categorical_columns) + len(cat_to_remove)  # + for removed cats

if DEBUG:
    logger.info(f"Data table names are:\n{mgf_parquet_dfs.keys()}")
    logger.info(f"Categorical metadata columns are:\n{categorical_columns}")
    logger.info(f"Numerical metadata columns are:\n{numerical_columns}")

INFO | Diversity analysis app | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Diversity analysis app | Categorical metadata columns are:
['ENA_accession_number_project', 'ENA_accession_number_umbrella', 'ammonium_method', 'arr_date_hq', 'arr_date_seq', 'chlorophyll_method', 'conduc_method', 'contact_email', 'contact_name', 'contact_orcid', 'density_method', 'diss_oxygen_method', 'env_broad_biome', 'env_local', 'env_material', 'env_package', 'extra_site_info', 'failure', 'failure_comment', 'geo_loc_name', 'investigation_type', 'loc_broad_ocean', 'loc_loc', 'loc_regional', 'month_name', 'nitrate_method', 'nitrite_method', 'obs_id', 'organism_count', 'organism_count_method', 'organization', 'organization_country', 'organization_edmoid', 'other_person', 'other_person_orcid', 'ph_method', 'phaeopigments_method', 'phosphate_method', 'pigments', 'pigments_method', 'pressure_method', 'project_name', 'replicate', 'replicate_info', 'samp_collect_dev

In [9]:
# Copy of the 'ssu' table; with DEBUG on, report how many distinct ref_codes it holds
df = mgf_parquet_dfs['ssu'].copy()
if DEBUG:
    n_ref_codes = df['ref_code'].nunique()
    logger.info(f'Number of unique ref_codes: {n_ref_codes}')

INFO | Diversity analysis app | Number of unique ref_codes: 181


In [10]:
# Build the six selection widgets from the metadata column lists
diversity_widgets = diversity_select_widgets(categorical_columns, numerical_columns)
(
    select_table,
    select_cat_factor,
    select_table_beta,
    select_taxon,
    select_beta_factor,
    beta_norm,
) = diversity_widgets

In [11]:
# Independent copies of the two tables used for the beta-diversity part
tables = {name: mgf_parquet_dfs[name].copy() for name in ("lsu", "ssu")}

# Filled by process_taxonomy(); empty until that has run
TAXONOMY = pd.DataFrame()
TAXONOMY_RANKS = [
    'superkingdom',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

# Replace the beta table selector so that it offers only the tables above
select_table_beta = pn.widgets.Select(
    name='Select table for beta diversity',
    options=list(tables),
    value='ssu',
)

## Alpha diversity

In [12]:
# Load the Panel / HoloViews extensions; Colab additionally needs its own comms
running_in_colab = 'google.colab' in str(get_ipython())
pn.extension("tabulator")
hv.extension("bokeh", "plotly")
if running_in_colab:
    pn.extension(comms='colab')

In [13]:
# Radio buttons choosing the sort order of the alpha-diversity plots
sort_alpha = pn.widgets.RadioBoxGroup(
    name='Sort by',
    options=['factor', 'values'],
    value='factor',
    inline=True,
)

# Radio buttons choosing the plotting backend
backend = pn.widgets.RadioBoxGroup(
    name='Backend',
    options=['matplotlib', 'hvplot'],
    inline=True,
)

alpha_selectors = pn.Row(select_table, select_cat_factor)
pn.Column(alpha_selectors, sort_alpha, backend)

BokehModel(combine_events=True, render_bundle={'docs_json': {'e1a839ef-4681-4735-8c0c-41acf0ac10f6': {'version…

In [14]:
# Alpha-diversity plot for the current widget selection
alpha_plot_kwargs = {
    "tables_dict": mgf_parquet_dfs,
    "table_name": select_table.value,
    "factor": select_cat_factor.value,
    "metadata": full_metadata,
    "order": sort_alpha.value,
    "backend": backend.value,
}
pl.alpha_plot(**alpha_plot_kwargs)

Key column: id
length of the ref_codes: 181
table shape: (2763, 181)


BokehModel(combine_events=True, render_bundle={'docs_json': {'be5de34f-c2e2-4fb3-b6c9-503d3372b4de': {'version…

In [15]:
# Second alpha-diversity plot (pl.av_alpha_plot) for the same widget selection
av_alpha_plot_kwargs = {
    "tables_dict": mgf_parquet_dfs,
    "table_name": select_table.value,
    "factor": select_cat_factor.value,
    "metadata": full_metadata,
    "order": sort_alpha.value,
    "backend": backend.value,
}
pl.av_alpha_plot(**av_alpha_plot_kwargs)

Key column: id
length of the ref_codes: 181
table shape: (2763, 181)


BokehModel(combine_events=True, render_bundle={'docs_json': {'4749638c-4690-467a-84da-a2cd07a142f9': {'version…

## Beta diversity

In [16]:
# Checkbox: passed as `strict` to remove_high_taxa by process_taxonomy
mapping = pn.widgets.Checkbox(
    value=True,
    name="strict mapping to selected taxonomic level (takes time)",
)

# Prevalence threshold, in percent, passed to prevalence_cutoff_taxonomy
low_prevalence_cutoff = pn.widgets.FloatInput(
    name='Low prevalence cutoff [%]',
    value=10,
    start=0,
    end=100,
    step=1,
    description="Percentage of samples in which the taxon must be present not to be removed.",
)

# Button that triggers the taxonomy pre-processing
button_process_taxonomy = pn.widgets.Button(
    name="Process taxonomy",
    button_type="primary",
    width=200,
    description="This will process the taxonomy and update the plots.",
)

# Progress bar updated by process_taxonomy; value=-1 until the first run
progress1 = pn.indicators.Progress(
    name='Pre-processing progress',
    value=-1,
    active=True,
    width=200,
)

## Pre-process taxonomy

In [17]:
def process_taxonomy(table, high_taxon, mapping, prevalence_cutoff_value):
    """
    Preprocess the taxonomy data and store the result in the global ``TAXONOMY``.

    Steps: fill taxonomy placeholders, optionally remove high taxa at the
    requested level, then apply the low-prevalence cutoff. ``progress1`` is
    updated along the way (0 -> 50 -> 100).

    Args:
        table: Key into the module-level ``tables`` dict.
        high_taxon: Taxonomic level passed to ``remove_high_taxa`` as
            ``tax_level``; the string ``'None'`` skips that step.
        mapping: Passed to ``remove_high_taxa`` as ``strict``.
        prevalence_cutoff_value: Passed to ``prevalence_cutoff_taxonomy`` as
            ``percent``.

    Returns:
        None. The processed table is written to the global ``TAXONOMY``.
    """
    global TAXONOMY
    TAXONOMY = pd.DataFrame()
    # Reset unconditionally: without this, a re-run that skips the high-taxa
    # step keeps displaying the 100 % left over from the previous run.
    progress1.value = 0

    logger.info("Preprocessing taxonomy...")
    df_filt = fill_taxonomy_placeholders(tables[table], TAXONOMY_RANKS)

    if high_taxon != 'None':
        rows_before = df_filt.shape[0]
        df_filt = remove_high_taxa(df_filt, TAXONOMY_RANKS, tax_level=high_taxon, strict=mapping)
        rows_after = df_filt.shape[0]
        logger.info(f"Removed {rows_before - rows_after} high taxa at level: {high_taxon}")
    progress1.value = 50

    # low prevalence cutoff
    TAXONOMY = prevalence_cutoff_taxonomy(df_filt, percent=prevalence_cutoff_value)
    progress1.value = 100

def _process_taxonomy_from_widgets(event):
    """Button callback: run process_taxonomy with the current widget values."""
    process_taxonomy(
        select_table_beta.value,
        select_taxon.value,
        mapping.value,
        low_prevalence_cutoff.value,
    )


button_process_taxonomy.on_click(_process_taxonomy_from_widgets)

Watcher(inst=Button(button_type='primary', description='This will process t..., name='Process taxonomy', width=200), cls=<class 'panel.widgets.button.Button'>, fn=<function <lambda> at 0x730c6821e700>, mode='args', onlychanged=False, parameter_names=('clicks',), what='value', queued=False, precedence=0)

In [18]:
# Layout of the beta-diversity controls; the process button is left out of
# the layout (kept as a comment below).
beta_selectors = pn.Row(select_table_beta, select_taxon, select_beta_factor)
pn.Column(
    beta_selectors,
    beta_norm,
    pn.layout.Divider(),
    mapping,
    low_prevalence_cutoff,
    progress1,
    # button_process_taxonomy,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'6f80b5e4-cfce-41bf-803e-62ba5c1bf5ce': {'version…

In [19]:
# Run the pre-processing once, directly, with the current widget values
process_taxonomy(
    table=select_table_beta.value,
    high_taxon=select_taxon.value,
    mapping=mapping.value,
    prevalence_cutoff_value=low_prevalence_cutoff.value,
)

INFO | Diversity analysis app | Preprocessing taxonomy...
100%|██████████| 218/218 [00:52<00:00,  4.18it/s]
INFO | momics.taxonomy | Number of bad taxa at phylum: 9
INFO | momics.taxonomy | Unmapped taxa at phylum: ['Candidatus_Altiarchaeota', 'Hemichordata', 'Aquificae', 'Coprothermobacterota', 'Candidatus_Verstraetearchaeota', 'Orthonectida', 'Chrysiogenetes', 'Parabasalia', 'Candidatus_Rokubacteria']
INFO | Diversity analysis app | Removed 91394 high taxa at level: phylum


In [23]:
# (rows, columns) of the table that process_taxonomy stored in TAXONOMY
TAXONOMY.shape

(4929, 11)

In [21]:
from skbio.stats.ordination import pcoa

In [22]:
# Build the beta-diversity ordination plot and report the variance explained
# by the first two axes.
if TAXONOMY.empty:
    # No pre-processed taxonomy: let the plotting helper work from `tables`.
    # The result must be bound to the local name `beta_pc_plot`; assigning to
    # `pl.beta_pc_plot.object` targets an attribute of the plotting module and
    # leaves `beta_pc_plot` undefined for the .opts() call below.
    # NOTE(review): assumes pl.beta_plot_pc returns (plot, explained variances)
    # with the variances iterable, as sum() below requires, and that the plot
    # object supports .opts() — confirm against momics.plotting.
    beta_pc_plot, explained_var = pl.beta_plot_pc(
        tables_dict=tables,
        metadata=full_metadata,
        table_name=select_table_beta.value,
        factor=select_beta_factor.value,
        taxon=select_taxon.value,
    )
else:
    beta = beta_diversity_parametrized(
        TAXONOMY, taxon=select_taxon.value, metric="braycurtis"
    )
    pcoa_result = pcoa(beta, method="eigh")  # , number_of_dimensions=3)
    # Use .iloc for the first two axes: integer keys on a label-indexed
    # Series are deprecated as positional lookups in pandas.
    explained_variance = (
        pcoa_result.proportion_explained.iloc[0],
        pcoa_result.proportion_explained.iloc[1],
    )
    # Attach the metadata to the ordination coordinates via ref_code
    pcoa_df = pd.merge(
        pcoa_result.samples,
        full_metadata,
        left_index=True,
        right_on="ref_code",
        how="inner",
    )
    # Explicit raise instead of assert, so the check survives `python -O`
    if 'source_mat_id' not in pcoa_df.columns:
        raise KeyError("Missing 'source_mat_id' column in PCoA DataFrame")
    beta_pc_plot = pl.hvplot_plot_pcoa_black(
        pcoa_df,
        color_by=select_beta_factor.value,
        explained_variance=explained_variance,
    )
    explained_var = explained_variance

explained_var_indicator = sum(explained_var) * 100  # convert to percentage
print('Explained variance:', explained_var_indicator)
# NOTE(review): the ordination above is a PCoA while the title says "PCA" — confirm wording.
beta_pc_plot.opts(
    title=f'Beta diversity PCA plot for {select_table_beta.value} table',
    width=1200,
    height=800,
)

Explained variance: 58.05568542263515


In [24]:
# Bray-Curtis beta diversity of the pre-processed taxonomy at the selected taxon
selected_taxon = select_taxon.value
beta = beta_diversity_parametrized(TAXONOMY, taxon=selected_taxon, metric="braycurtis")

In [24]:
# Beta-diversity plot of the selected table, drawn from the full set of loaded tables
beta_plot_kwargs = {
    "tables_dict": mgf_parquet_dfs,
    "table_name": select_table_beta.value,
    "norm": beta_norm.value,
    "taxon": select_taxon.value,
    "backend": backend.value,
}
pl.beta_plot(**beta_plot_kwargs)

BokehModel(combine_events=True, render_bundle={'docs_json': {'23daf491-790e-4535-8fec-091a43d14ebc': {'version…

In [34]:
# beta_pc_plot, explained_var_indicator = pl.beta_plot_pc(
#         tables_dict=mgf_parquet_dfs,
#         metadata=full_metadata,
#         table_name=select_table_beta.value,
#         factor=select_beta_factor.value,
#         taxon=select_taxon.value,
#     )
# print('Explained variance:', explained_var_indicator)
# beta_pc_plot.opts(
#     title=f'Beta diversity PCA plot for {select_table_beta.value} table',
#     width=1200,
#     height=800,
# )