# Visualize pivoted taxonomy (LSU and SSU tables)
- PCoA beta diversities
- Permanova calculations

## Platform dependent part
- Resolve platform setup
- The difference from local imports should be resolved by setting up the Blue Cloud VRE properly; Colab will still be an issue.

In [None]:
import sys
import os
import io
import gc
import logging
import psutil

from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")
# Default number of PERMANOVA permutations; lowered below when running on Binder.
NUMBER_PERMUTATIONS = 999

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system() signals failure through its return value rather than by
    # raising, so the exit status is checked explicitly.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        # A non-zero status is also what git returns when the target folder
        # already exists, e.g. when this cell is re-run.
        logger.warning(
            f"Cloning the repository returned exit status {clone_status}; "
            "check that /content/momics-demos exists."
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.warning(
            f"Installing marine-omics returned exit status {install_status}; "
            "the momics imports below may fail."
        )

elif not psutil.users():
    # An empty list of logged-in users is used here as the marker for Binder.
    logger.info("Binder")
    NUMBER_PERMUTATIONS = 29  # permanova extremely slow on binder, therefore a change here
else:
    logger.info("Local")

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
    load_and_clean, taxonomy_common_preprocess01
)
from momics.metadata import filter_metadata_table, filter_data

# Set up logging
reconfig_logger()

# Determine the notebook environment (the value is later passed to serve_app)
env = get_notebook_environment()

# Package-level initialisation from momics.utils.
# NOTE(review): what init_setup() configures is not visible in this notebook.
init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Diversity analysis app | Environment: vscode
INFO | Diversity analysis app | Environment: vscode


## Imports

In [None]:
# This needs to be repeated here for the Panel dashboard to work, WEIRD
# TODO: report as possible bug
import os
import warnings

from functools import partial
warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn
from dotenv import load_dotenv
load_dotenv()

# All low level functions are imported from the momics package
from momics.diversity import run_permanova, update_subset_indicator, update_taxa_count_indicator

import momics.plotting as pl
from momics.panel_utils import (
    create_indicators_diversity,
    serve_app,
    close_server,
)
from momics.constants import TAXONOMY_RANKS

from momics.taxonomy import (
    pivot_taxonomic_data,
    separate_taxonomy,
)

### User settings

In [3]:
DEBUG = True  # enable stdout logging

## Loading

In [5]:
# Locate the repository root: the cloned repo on Colab, the parent folder otherwise.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Static assets (e.g. the logo shown in the sidebar) live under <root>/assets.
assets_folder = os.path.join(root_folder, 'assets')

In [None]:
def get_valid_samples():
    """Read ``data/shipment_b1b2_181.csv`` (relative to ``root_folder``) into a DataFrame."""
    samples_csv = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(samples_csv)

valid_samples = get_valid_samples()

### Enhance metadata, merge IDs to data

In [None]:
# High level function from the momics.utils module
full_metadata, mgf_parquet_dfs = load_and_clean(valid_samples=valid_samples)


### Clean metadata

In [None]:
# Split the metadata columns by dtype: object/boolean columns are treated as
# categorical, 64-bit integer/float columns as numerical.
categorical_columns = sorted(
    full_metadata.select_dtypes(include=['object', "boolean"]).columns
)
numerical_columns = sorted(
    full_metadata.select_dtypes(include=['int64', 'float64']).columns
)

if DEBUG:
    logger.info(f"Data table names are:\n{mgf_parquet_dfs.keys()}")
    logger.info(f"Categorical metadata columns are:\n{categorical_columns}")
    logger.info(f"Numerical metadata columns are:\n{numerical_columns}")

# Categorical columns that are not offered as PCoA / PERMANOVA factors
# (this is for PCoA from Andrzej more or less).
factors_to_remove = [
    "organization country",
    "project_name",
    'env_broad_biome',
    'env_local',
    "extra_site_info",
    'failure_comment',
    'obs_id',
    'size_frac',
    'ship_date',
    'ship_date_seq',
    'sampling_event',
    'organism_count',
    'sample collection device or method',
    'ammonium method',
    'chlorophyll method',
    'conductivity method',
    'density method',
    'dissolved oxygen method',
    'nitrate method',
    'nitrite method',
    'ph method',
    'phaeopigments method',
    'phosphate method',
    'pigments method',
    "project name",
    'pressure method',
    'sea subsurface salinity method',
    'sea subsurface temperature method',
    'sea surface salinity method',
    'sea surface temperature method',
    'silicate method',
    'turbidity method',
    'pigments (ug/l)',
    'organism count',
    'investigation type',
    'pH method',
    'environment (feature)',
    'observatory local location',
    'environment (biome)',
    'observatory ID',
    'organism count method',
    'replicate info',
]

# Remaining categorical columns, in sorted order, are the usable factors.
_excluded_factors = set(factors_to_remove)
factor_cols = [name for name in categorical_columns if name not in _excluded_factors]

INFO | Diversity analysis app | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Diversity analysis app | Categorical metadata columns are:
['ammonium method', 'chlorophyll method', 'conductivity method', 'country', 'density method', 'dissolved oxygen method', 'environment (biome)', 'environment (feature)', 'environment (material)', 'environmental package', 'investigation type', 'month name', 'nitrate method', 'nitrite method', 'observatory ID', 'observatory local location', 'observatory location ocean or sea', 'observatory regional location', 'organism count', 'organism count method', 'organization', 'organization country', 'pH method', 'phaeopigments method', 'phosphate method', 'pigments (ug/l)', 'pigments method', 'pressure method', 'project name', 'replicate info', 'replicate number', 'sample collection device or method', 'sea subsurface salinity method', 'sea subsurface temperature method', 'sea surface salinity method', 'sea surface te

In [None]:
# Report how the metadata columns split by type; "extra" columns are the ones
# that ended up in neither the numerical nor the categorical list.
_extra_columns = set(full_metadata.columns) - set(numerical_columns) - set(categorical_columns)

logger.info(f'Total metadata columns: {len(full_metadata.columns)}')
logger.info(f'Numerical columns: {len(numerical_columns)}')
logger.info(f'Categorical columns: {len(categorical_columns)}')
logger.info(f'Extra columns: {_extra_columns}')

INFO | Diversity analysis app | Total metadata columns: 98
INFO | Diversity analysis app | Numerical columns: 57
INFO | Diversity analysis app | Categorical columns: 40
INFO | Diversity analysis app | Columns removed: 4
INFO | Diversity analysis app | Extra columns: {'collection date'}
INFO | Diversity analysis app | Numerical columns: 57
INFO | Diversity analysis app | Categorical columns: 40
INFO | Diversity analysis app | Columns removed: 4
INFO | Diversity analysis app | Extra columns: {'collection date'}


In [12]:
# Placeholder for the processed taxonomy table: process_taxonomy() replaces it,
# and update_filtered_data() reads it when "Use processed taxonomy?" is ticked.
TAXONOMY = pd.DataFrame()

## Pivot the tables

In [13]:
# Keep only the LSU and SSU taxonomy tables; they are the inputs for pivot()
# and process_taxonomy(). A name assigned at module level is already global,
# so no `global` statement is needed here.
original_tables = dict(
    LSU=mgf_parquet_dfs['lsu'],
    SSU=mgf_parquet_dfs['ssu'],
)

In [14]:
# Free memory: drop the dict holding all loaded tables. The LSU and SSU frames
# stay alive because original_tables still references them.
del mgf_parquet_dfs

### Pivoting into update method

In [None]:
def pivot(lsu, ssu):
    """
    Pivot the LSU and SSU tables and publish the result as ``granular_tables``.

    Each input is passed through ``pivot_taxonomic_data`` and the pivoted table
    through ``separate_taxonomy``. The two results are stored in the
    module-level dict ``granular_tables`` under the keys "LSU" and "SSU".
    Nothing is returned.
    """
    global granular_tables

    # Pivot both tables first, then split each one.
    pivoted = {
        "LSU": pivot_taxonomic_data(lsu),
        "SSU": pivot_taxonomic_data(ssu),
    }
    granular_tables = {
        marker: separate_taxonomy(standard_table)
        for marker, standard_table in pivoted.items()
    }


def pivot_single(table):
    """
    Pivot one taxonomic table and split it with ``separate_taxonomy``.

    Returns whatever ``separate_taxonomy`` produces for the pivoted table.
    """
    return separate_taxonomy(pivot_taxonomic_data(table))

In [16]:
pivot(original_tables['LSU'],
      original_tables['SSU'])

## Granular PCoA page for the app


### Dropdowns for the PCoA
- credits for inspiration to Andrzej Tkacz's NB

In [None]:
# Load the Tabulator extension (a Tabulator widget shows the PERMANOVA result).
pn.extension("tabulator")


# Which pivoted table (LSU or SSU) the PCoA is computed on.
select_granular_table = pn.widgets.Select(
    name="Granular analysis",
    options= list(granular_tables.keys()),
    description="Select a table for granular analysis",
)

# Which sub-table of the selected table is analysed.
# NOTE(review): the options are taken from the table selected at construction
# time; no callback in this notebook refreshes them when the table selection
# changes - confirm that LSU and SSU expose the same keys.
select_granular_level = pn.widgets.Select(
    name="Subset taxonomic level",
    options=list(granular_tables[select_granular_table.value].keys()),
    description="Select a table for analysis",
)

# One multi-select per factor column, offering 'All' plus every unique value of
# that metadata column; 'All' is preselected.
# NOTE(review): unique() is used without dropna() here (update_groups() drops
# missing values), so missing values may show up as options - confirm intended.
pcoa_factor_dropdowns = {
    categorical_col: pn.widgets.MultiSelect(
        name=categorical_col,
        value=['All'],
        options=['All'] + list(full_metadata[categorical_col].unique()),
        size=6, width=180,)
        for categorical_col in factor_cols
}

# Lay the dropdowns out in a grid, five per row.
box_granular = pn.GridBox(
    *pcoa_factor_dropdowns.values(),
    ncols=5,
    )

# Metadata column used to color the points of the PCoA plot.
color_factor_granular = pn.widgets.Select(
    name="Color by",
    value=factor_cols[0],
    options=factor_cols,
)

# show indicator of the explained variance
# (the value is set by update_beta_pc_plot_granular(), in percent)
explained_var_indicator = pn.indicators.Number(
    name='Explained variance by PC1 + PC2', value=0, format='{value:.1f}%',
    font_size='20pt',
    title_size='12pt',
    colors=[(33, 'red'), (50, 'gold'), (66, 'green')]
)

# Empty pane; its .object is filled by update_beta_pc_plot_granular().
beta_pc_plot_granular = pn.pane.HoloViews(
    name="Beta PCoA",
    width=1000,
    height=600,
    )

# Filtering runs only when this button is clicked (see the bindings cell).
button_filter_table = pn.widgets.Button(
    name="Filter table",
    button_type="primary",
    width=200,
)

if DEBUG:
    logger.info(f"Granular levels are:\n{list(granular_tables[select_granular_table.value].keys())}")

## Sidebar methods

In [None]:
# Taxonomic rank handed to process_taxonomy() as `high_taxon`.
# NOTE(review): the ranks are hard-coded here although TAXONOMY_RANKS is
# imported - confirm the two lists agree.
select_taxon = pn.widgets.Select(
        name="Taxon",
        value="phylum",
        options=["kingdom", "phylum", "class", "order", "family", "genus", "species"],
        description="At which taxon level is beta diversity calculated",
    )

# Forwarded to process_taxonomy() as its `mapping` argument.
mapping = pn.widgets.Checkbox(
    name="strict mapping to selected taxonomic level (takes time)",
    value=True,
)

# Forwarded to process_taxonomy() as `prevalence_cutoff_value`.
# NOTE(review): the description speaks of a percentage of samples, while the
# status message written by process_taxonomy() says "% of abundance" -
# confirm which one taxonomy_common_preprocess01 actually applies.
low_prevalence_cutoff = pn.widgets.FloatInput(
    name='Low prevalence cutoff [%]',
    value=10, step=1, start=0, end=100,
    description="Percentage of samples in which the taxon must be present not to be removed.",
)

# Clicking this button runs process_taxonomy() with the widget values above.
button_process_taxonomy = pn.widgets.Button(
    name="Process taxonomy",
    button_type="primary",
    description="This will process the taxonomy and update the plots.",
    width=200,
)

# Status text; overwritten by process_taxonomy() after each run.
taxonomy_process_status = pn.pane.Markdown(
    """No processed taxonomy yet.""",
    hard_line_break=True,
)

# When ticked, update_filtered_data() uses the processed TAXONOMY table
# instead of the pre-pivoted granular_tables.
use_processed = pn.widgets.Checkbox(
    name="Use processed taxonomy?",
    value=False,
)

### Methods
- filter data and metadata
- update widgets

In [None]:
def get_filtered_metadata():
    """Return ``full_metadata`` filtered by the current factor-dropdown selections."""
    # Snapshot each dropdown's current selection, keyed by its metadata column.
    selection_by_factor = {}
    for factor_name in factor_cols:
        selection_by_factor[factor_name] = pcoa_factor_dropdowns[factor_name].value

    return filter_metadata_table(full_metadata, selection_by_factor)


def filter_all_box_selection(df):
    """
    Filter the metadata by the dropdown selections and restrict ``df`` to match.

    Args:
        df: data table passed to ``filter_data`` together with the filtered
            metadata.

    Returns:
        Tuple ``(filtered_metadata, filtered_data)``.

    Raises:
        ValueError: if no data is left after filtering.
    """
    # Retrieve the filtered metadata
    filtered_metadata = get_filtered_metadata()
    # Filter the data
    filtered_data = filter_data(df, filtered_metadata)
    # Explicit raise instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let an empty table reach the PCoA code.
    if filtered_data.empty:
        raise ValueError("Filtered data is empty. Check your selections.")
    return filtered_metadata, filtered_data


def update_beta_pc_plot_granular(filtered_data, metadata, factor):
    """
    Redraw the PCoA pane and refresh the explained-variance indicator.

    The three arguments are forwarded unchanged to ``pl.beta_plot_pc_granular``,
    which returns the plot object and the explained-variance values; their sum,
    multiplied by 100, is shown by the indicator.
    """
    plot, explained_var = pl.beta_plot_pc_granular(
        filtered_data=filtered_data,
        metadata=metadata,
        factor=factor,
    )
    beta_pc_plot_granular.object = plot
    # convert to percentage
    explained_var_indicator.value = 100 * sum(explained_var) if False else sum(explained_var) * 100


def update_filtered_data():
    """
    Re-filter the selected table and refresh the plot and sidebar indicators.

    Source table:
      * "Use processed taxonomy?" ticked: the processed ``TAXONOMY`` table is
        pivoted with ``pivot_single`` and the all-zero rows and columns of the
        selected level are dropped.
      * otherwise: the pre-pivoted table from ``granular_tables``.

    If the checkbox is ticked while ``TAXONOMY`` is empty (nothing processed
    yet, or processing removed everything), a warning is logged and the
    unprocessed table is used instead of failing inside the pivot.

    Side effects: rebinds the module-level ``filtered_metadata`` and
    ``filtered_data`` and updates the PCoA pane, the explained-variance
    indicator and the two sidebar indicators.
    """
    global filtered_metadata
    global filtered_data

    level = select_granular_level.value

    use_processed_table = use_processed.value
    if use_processed_table and TAXONOMY.empty:
        logger.warning(
            "Processed taxonomy is empty (run 'Process taxonomy' first or relax "
            "the cutoff); falling back to the unprocessed table."
        )
        use_processed_table = False

    if use_processed_table:
        # Use the processed taxonomy
        level_table = pivot_single(TAXONOMY)[level]

        # how many rows / columns are all zeros
        all_zero_rows = (level_table == 0).all(axis=1)
        all_zero_cols = (level_table == 0).all(axis=0)
        logger.info(f"Number of all-zero rows in split taxo: {all_zero_rows.sum()}")
        logger.info(f"Number of all-zero columns in split taxo: {all_zero_cols.sum()}")

        # remove the all-zero rows first, then the columns that are all zero
        # in what is left
        level_table = level_table.loc[~all_zero_rows, :]
        level_table = level_table.loc[:, ~(level_table == 0).all(axis=0)]
    else:
        level_table = granular_tables[select_granular_table.value][level]

    filtered_metadata, filtered_data = filter_all_box_selection(level_table)

    # Update the beta plot
    update_beta_pc_plot_granular(filtered_data, filtered_metadata, color_factor_granular.value)
    # subset_selected / taxa_selected are created in a later cell; they only
    # need to exist by the time this function is called.
    update_subset_indicator(subset_selected, filtered_metadata)
    update_taxa_count_indicator(taxa_selected, filtered_data)

### Placeholders, strict taxa filtering and low prevalence filter.

In [None]:
def process_taxonomy(table, high_taxon, mapping, prevalence_cutoff_value):
    """
    Preprocess one of the original taxonomy tables and refresh the dashboard.

    Args:
        table: key into ``original_tables`` ("LSU" or "SSU").
        high_taxon: forwarded to ``taxonomy_common_preprocess01``.
        mapping: forwarded to ``taxonomy_common_preprocess01``; reported in the
            status text as "strict mapping".
        prevalence_cutoff_value: forwarded to ``taxonomy_common_preprocess01``;
            reported in the status text as the low prevalence cutoff.

    Side effects: rebinds the module-level ``TAXONOMY``, rewrites the status
    pane and calls ``update_filtered_data()``.
    """
    global TAXONOMY
    # Start from an empty table, so TAXONOMY stays empty if the steps below raise.
    TAXONOMY = pd.DataFrame()

    TAXONOMY = taxonomy_common_preprocess01(
        original_tables[table],
        high_taxon,
        mapping,
        prevalence_cutoff_value,
        TAXONOMY_RANKS,
    )

    n_taxa = TAXONOMY.shape[0]
    taxonomy_process_status.object = f"""
        Processed taxonomy with high taxon: {high_taxon} (strict mapping: {mapping})
        and low prevalence cutoff: {prevalence_cutoff_value}% of abundance.
        Number of taxa after processing: {n_taxa}.
        """
    update_filtered_data()


def _on_process_taxonomy_click(event):
    """Run process_taxonomy() with the current values of the sidebar widgets."""
    process_taxonomy(
        table=select_granular_table.value,
        high_taxon=select_taxon.value,
        mapping=mapping.value,
        prevalence_cutoff_value=low_prevalence_cutoff.value,
    )


button_process_taxonomy.on_click(_on_process_taxonomy_click)

In [None]:
filtered_metadata, filtered_data = filter_all_box_selection(granular_tables['LSU']['Bacteria_phylum'])

### Bindings

In [None]:
# Filtering (and the plot refresh it triggers) runs only on button click.
button_filter_table.on_click(
    lambda event: update_filtered_data(),
)


def _recolor_beta_pc_plot(factor):
    """Redraw the PCoA plot from the currently filtered data when "Color by" changes."""
    # The tables are looked up when the callback runs. update_filtered_data()
    # rebinds the module-level names, so passing the DataFrames to pn.bind
    # directly would freeze the ones that existed at bind time and every
    # recolor would fall back to that initial table.
    update_beta_pc_plot_granular(filtered_data, filtered_metadata, factor)


pn.bind(
    _recolor_beta_pc_plot,
    factor=color_factor_granular,
    watch=True,
)

# User-facing instructions shown at the top of the PCoA tab.
pcoa_instructions = pn.pane.Markdown(
    """
    ### Instructions
    1. Side panel filters LSU/SSU tables by taxonomy levels.
    2. Color_by is used to color the beta diversity plot.
    3. Main panel filter further the table by the metadata values.
        - `Ctrl`-click to select multiple values in the dropdowns.
    4. Filtering and update of the plot happens only after clicking the `Filter table` button to save CPU.
    """
)

# PCoA tab, top to bottom: instructions, factor dropdowns, filter button,
# explained-variance indicator and the plot.
pcoa_tab_granular = pn.Column(
    pcoa_instructions,
    box_granular,
    button_filter_table,
    explained_var_indicator,
    beta_pc_plot_granular,
    scroll=True,
)

## Permanova page for the app
- Credits to Andrzej Tkacz

### Widgets

In [None]:
# PERMANOVA Dropdowns
# PERMANOVA Dropdowns
# Factor whose groups limit the analysis; 'All' is offered in addition to the
# factor columns.
permanova_factor = pn.widgets.Select(
    name="Main Permanova factor",
    options=['All'] + factor_cols,
    description='Limit by group(s) in factor:',
)

# Starts without options; update_groups() fills it whenever the factor changes.
permanova_group = pn.widgets.MultiSelect(
    name="Groups of unique values of the factor",
    options=[],
    description='Groups:',
)

# Factors handed to run_permanova() as the additional factors to test.
permanova_additional_factors = pn.widgets.MultiSelect(
    name="Factors to test vs ALL the rest",
    options=factor_cols,
    description='PERMANOVA Factors:',
)

# Clicking this button runs update_permanova_result().
permanova_button = pn.widgets.Button(
    name="PERMANOVA",
    button_type="primary",
    width=200,
)

# Result table; starts empty and is overwritten by update_permanova_result().
permanova_result_indicator = pn.widgets.Tabulator(pd.DataFrame(), name='Permanova Result')

# User-facing instructions shown at the top of the Permanova tab.
permanova_instructions = pn.pane.Markdown(
    """
    ### Instructions
    1. Select a factor to limit the analysis.
    2. Select groups in the factor (`Ctrl`-click to select multiple).
    3. Select additional factors for against which PERMANOVA will be run (`Ctrl`-click to select multiple).
    4. Click the `PERMANOVA` button to run the analysis.
    5. **NOTE**, locally permanova with 999 permutations is instant, however takes extremely long on binder.
        - the number of permutations is set to 29 (for binder) and does not lead to correct p-value.
        - Both locally and on GColab, number of premutations is set to 999.
    """
)

### Updates and bindings

In [None]:
def update_permanova_result():
    """
    Run PERMANOVA on the currently selected granular table and show the result.

    Always reads the table from ``granular_tables`` (table and level taken from
    the sidebar selects) and tests against the full, unfiltered metadata. The
    returned dict is converted to a DataFrame and assigned to the result
    Tabulator.
    """
    abundance_table = granular_tables[select_granular_table.value][select_granular_level.value]

    results = run_permanova(
        abundance_table,
        full_metadata,
        permanova_factor.value,
        permanova_group.value,
        permanova_additional_factors.value,
        permutations=NUMBER_PERMUTATIONS,  # 29 for binder, 999 for local
        verbose=True,
    )
    permanova_result_indicator.value = pd.DataFrame.from_dict(results)

def update_groups(permanova_factor):
    """
    Refresh the options of ``permanova_group`` for the selected factor.

    For a factor column the options are its sorted, non-missing unique values;
    for 'All' they are the sorted index values of ``full_metadata``.

    Raises:
        ValueError: if ``permanova_factor`` is neither a factor column nor 'All'.
    """
    logger.info(f"Permanova factor value: {permanova_factor}")

    is_factor_column = permanova_factor in factor_cols
    if not is_factor_column and permanova_factor != 'All':
        raise ValueError(f"Unknown factor: {permanova_factor}")

    if is_factor_column:
        candidates = full_metadata[permanova_factor].dropna().unique()
    else:
        candidates = full_metadata.index.to_list()
    permanova_group.options = sorted(candidates)
    
# Re-run update_groups() whenever the value of permanova_factor changes.
pn.bind(update_groups,
    permanova_factor,
    watch=True,
)

# PERMANOVA is computed only when the button is clicked.
permanova_button.on_click(
    lambda event: update_permanova_result()
)

In [None]:
# Permanova tab, top to bottom: instructions, the three selectors side by
# side, the run button and the result table.
permanova_tab = pn.Column(
    permanova_instructions,
    pn.Row(
        permanova_factor,
        permanova_group,
        permanova_additional_factors,
    ),
    permanova_button,
    permanova_result_indicator,
    scroll=True,
)

### Add to the side panel

In [None]:
# Initial numbers for the sidebar indicators, taken from the tables filtered
# in an earlier cell.
total_samplings = full_metadata.index.nunique()
subset = filtered_metadata.index.nunique()
taxa_count = len(filtered_data)

# Both indicators share the same look.
_indicator_look = {"width": 150, "font_size": "34px", "title_size": "14px"}

# Shown as "<selected>/<total>"; refreshed through update_subset_indicator().
subset_selected = pn.indicators.Number(
    name="Subset of samples you filtered",
    value=subset,
    format=f"{{value}}/{total_samplings}",
    **_indicator_look,
)

# Refreshed through update_taxa_count_indicator().
taxa_selected = pn.indicators.Number(
    name="Taxa in the selection.",
    value=taxa_count,
    format="{value}",
    **_indicator_look,
)

## APP setup

In [None]:
# Load the Tabulator extension; on Colab additionally switch to the Colab comms.
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    pn.extension(comms='colab')
# Accent color handed to the template in app().
ACCENT = "teal"

# CSS applied to the tabs container.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# TODO: there is a bug in the panel library that does not allow opening png files; renaming does not help
image = pn.pane.JPG(os.path.join(assets_folder, "figs/metaGOflow_logo_italics.jpg"),
                    width=200,
                    height=100,
                    )

# Main area: one tab for the PCoA page, one for the PERMANOVA page.
tabs = pn.Tabs(
    ('PCoA', pcoa_tab_granular),
    ('Permanova', permanova_tab),
    styles=styles,
    margin=10
)
# Only the second returned indicator is used; update_used_gb() writes the
# used memory into it.
_, indicator_usage = create_indicators_diversity()

def update_used_gb(event):
    """
    Write the currently used memory into ``indicator_usage``.

    ``event`` only gates the update: a falsy value makes the call a no-op.
    In ``app()`` the function is registered as a periodic callback through
    ``partial`` with ``indicator_usage`` as this argument.
    """
    if event:
        # memory_load() returns (used, total); only the used part is displayed.
        used_gb, _total_gb = memory_load()
        indicator_usage.value = used_gb


filtered_metadata, filtered_data = filter_all_box_selection(
        granular_tables[select_granular_table.value][select_granular_level.value],
        )

def app():
    """
    Assemble the dashboard template.

    Registers a periodic callback (every 1000 ms, no timeout) that refreshes
    the memory indicator, adds a toggle linked to the callback's ``running``
    state, and returns a ``FastListTemplate`` with the sidebar controls and
    the PCoA / Permanova tabs in the main area.
    """
    memory_poll = pn.state.add_periodic_callback(
        partial(update_used_gb, indicator_usage),
        period=1000,
        timeout=None,
    )

    # Lets the user pause and resume the memory polling from the sidebar.
    toggle = pn.widgets.Toggle(
        name='Toggle callback',
        value=True,
        button_type='success',
    )
    toggle.link(memory_poll, bidirectional=True, value='running')

    sidebar_items = [
        image,
        "# Beta granular",
        select_granular_table,
        select_granular_level,
        color_factor_granular,
        pn.layout.Divider(),
        subset_selected,
        taxa_selected,
        pn.layout.Divider(),
        select_taxon,
        mapping,
        low_prevalence_cutoff,
        button_process_taxonomy,
        taxonomy_process_status,
        use_processed,
        pn.layout.Divider(),
        indicator_usage,
        toggle,
    ]

    return pn.template.FastListTemplate(
        title="Diversity Analysis",
        sidebar=sidebar_items,
        main=[pn.Column(tabs)],
        main_layout=None,
        accent=ACCENT,
    )

template = app()

# Draw the initial plot explicitly from the tables filtered above. This
# replaces flipping color_factor_granular to options[1] and back to fire its
# watcher: that raised IndexError when only one factor column was available
# and computed the PCoA twice.
update_beta_pc_plot_granular(filtered_data, filtered_metadata, color_factor_granular.value)

if 'google.colab' in str(get_ipython()):
    # On Colab the app is served explicitly; `s` is kept so the server can be
    # closed later with close_server().
    s = serve_app(template, env=env, name="diversity_analysis")
else:
    template.servable()

### Uncomment this if you are running an ngrok tunnel which you want to quit

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)