# Visualize taxonomy and alpha/beta diversities

## Platform dependent part
- Resolve platform setup
- The differences from local imports should be resolved by setting up the Blue Cloud VRE properly; Colab will still be an issue.

In [None]:
import sys
import os
import io
import logging
from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")

if 'google.colab' in str(get_ipython()):
    # Clone the momics-demos repository to use the utils module from there.
    # TODO: eventually utils from momics will be used for that
    # Local import: subprocess is only needed on Colab, and importing it here
    # keeps this cell independent of the import lines above.
    import subprocess

    try:
        # check=True turns a non-zero git exit status into CalledProcessError.
        # os.system only returns the status, so a failed clone used to go
        # unnoticed and "Repository cloned" was logged regardless.
        subprocess.run(
            ['git', 'clone', 'https://github.com/palec87/momics-demos.git'],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError: git could not be started; CalledProcessError: git itself
        # failed (e.g. the destination already exists when the cell is re-run).
        logger.error(f"An error occurred while cloning the repository: {e}")
    else:
        logger.info("Repository cloned")

    # Make the cloned checkout importable.
    sys.path.insert(0,'/content/momics-demos')
# elif "zmqshell" in str(get_ipython()):
#     logger.info("Binder")
#     sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# else:
#     sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # local utils, to be removed in the future

#     # downside of this is that all the deps need to be installed in the current (momics-demos) environment
#     sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging
# NOTE(review): reconfig_logger comes from momics.utils; its exact effect on
# handlers/levels is not visible in this notebook - confirm in the package.
reconfig_logger()

# from utils import init_setup, get_notebook_environment
# Determine the notebook environment
# `env` is reused at the end of the notebook when the app is served
# (serve_app / close_server receive it as `env=env`).
env = get_notebook_environment()

# NOTE(review): init_setup is an opaque momics.utils helper; presumably it does
# environment-specific installation/setup - confirm before reordering these calls.
init_setup()
logger.info(f"Environment: {env}")

# Disabled alternative: put a local checkout of the momics package on sys.path.
# if path exists add sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too
# local_momics_path = os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics'))
# if os.path.exists(local_momics_path):
#     sys.path.append(local_momics_path)
#     logger.info(f"Added local momics path: {local_momics_path}")
#     print(f"Added local momics path: {local_momics_path}")

Local IPython, nothing else to install


## Imports

In [2]:
# This needs to be repeated here for the Panel dashboard to work, WEIRD
# TODO: report as possible bug
import sys
import os
import io
import warnings
import psutil


from functools import partial
warnings.filterwarnings('ignore')

# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import panel as pn
from dotenv import load_dotenv
load_dotenv()

from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa

# All low level functions are imported from the momics package
from momics.loader import load_parquets
import momics.plotting as pl
from momics.panel_utils import (
    diversity_select_widgets, create_indicators_diversity,
    serve_app, close_server,
)


# Note: This is breaking the panel preview functionality
# %load_ext autoreload
# %autoreload 2

## User settings

In [None]:
# Verbose switch: when True, the loading cells log extra diagnostics through
# `logger.info` (table names, metadata column lists, number of unique ref_codes).
DEBUG = True  # enable stdout logging

## Loading

In [None]:
def fill_na_for_object_columns(df):
    """
    Replace missing values in every object-dtype column with the string 'NA'.

    Columns of any other dtype are left untouched. The dataframe is modified
    in place and the same object is returned, so the result can be used either
    way.

    Args:
        df (pd.DataFrame): The dataframe to clean.

    Returns:
        pd.DataFrame: ``df`` itself, with NA values in object columns filled.
    """
    # Determine the object-typed columns once and fill only that slice.
    object_cols = df.select_dtypes(include=['object']).columns
    df[object_cols] = df[object_cols].fillna('NA')
    return df

@pn.cache()
def get_data(folder):
    """
    Load the parquet tables found in ``folder``; the call is cached by Panel.

    Args:
        folder (str): Directory handed to ``momics.loader.load_parquets``.

    Returns:
        Whatever ``load_parquets`` returns. The notebook uses it as a mapping
        from table name (e.g. 'SSU') to a dataframe.
    """
    tables = load_parquets(folder)
    return tables

@pn.cache()
def get_metadata(folder):
    """
    Build the combined sample/observatory metadata table; cached by Panel.

    Reads the sample logsheet and the observatory logsheet from ``folder``,
    inner-joins them on ``obs_id`` and ``env_package``, sorts the result by
    ``ref_code`` and normalises missing values in text columns to 'NA'.

    Args:
        folder (str): Directory containing the two logsheet CSV files.

    Returns:
        pd.DataFrame: The merged, sorted and NA-filled metadata.
    """
    samples_csv = os.path.join(folder, "Batch1and2_combined_logsheets_2024-11-12.csv")
    observatories_csv = os.path.join(folder, "Observatory_combined_logsheets_validated.csv")

    samples = pd.read_csv(samples_csv)
    observatories = pd.read_csv(observatories_csv)

    # Keep only the samples that have a matching observatory record.
    merged = samples.merge(
        observatories,
        on=["obs_id", "env_package"],
        how="inner",
    )
    merged = merged.sort_values(by="ref_code", ascending=True)

    # 'failure' is turned into text; the stringified missing value 'nan'
    # becomes 'NA', the same marker used for the other text columns.
    merged["failure"] = merged["failure"].astype(str).replace("nan", "NA")

    # Remaining object columns get the same 'NA' placeholder.
    return fill_na_for_object_columns(merged)

In [None]:
# Locate the repository root: the Colab clone directory when running on
# Google Colab, otherwise the parent of the current working directory.
if 'google.colab' in str(get_ipython()):
    root_folder = os.path.abspath('/content/momics-demos')
else:
    root_folder = os.path.abspath('../')

# Input tables (parquet) and static assets (logo) live under the root.
data_folder = os.path.join(root_folder, 'data/parquet_files')
assets_folder = os.path.join(root_folder, 'assets')

# Load the parquet tables through the cached loader.
mgf_parquet_dfs = get_data(data_folder)

In [None]:
# Merged sample + observatory metadata (cached loader).
full_metadata = get_metadata(os.path.join(root_folder, 'data'))

# Identifier-like / free-text columns that are not offered as categorical factors.
cat_to_remove = ["ref_code", "samp_description", "source_mat_id", "source_mat_id_orig"]

# Categorical factors: object/boolean columns, alphabetically sorted, minus the exclusions.
categorical_columns = [
    col
    for col in sorted(full_metadata.select_dtypes(include=['object', "boolean"]).columns)
    if col not in cat_to_remove
]

# Numerical factors: int64/float64 columns, alphabetically sorted.
numerical_columns = sorted(full_metadata.select_dtypes(include=['int64', 'float64']).columns)

# Sanity check: every metadata column must be numerical, categorical or excluded.
assert len(full_metadata.columns) == len(numerical_columns) + len(categorical_columns) + len(cat_to_remove)  # + for removed cats

if DEBUG:
    logger.info(f"Data table names are:\n{mgf_parquet_dfs.keys()}")
    logger.info(f"Categorical metadata columns are:\n{categorical_columns}")
    logger.info(f"Numerical metadata columns are:\n{numerical_columns}")

In [None]:
# Working copy of the SSU table; it is shown in the "Data View" tab of the app.
df = mgf_parquet_dfs['SSU'].copy()
if DEBUG:
    logger.info(f'Number of unique ref_codes: {df["ref_code"].nunique()}')

## Development of the beta diversity part

In [None]:
# TODO: link these functions to the indicator
# TODO: put them in the momics package
def get_missing_taxa(df):
    """
    Log how many rows of ``df`` are unclassified at each taxonomic rank.

    One info message is written per rank, from superkingdom down to species.

    Args:
        df (pd.DataFrame): Table with one column per taxonomic rank.

    Returns:
        None
    """
    ranks = (
        "superkingdom", "kingdom", "phylum", "class",
        "order", "family", "genus", "species",
    )
    for rank in ranks:
        n_missing = get_missing_taxa_single(df, rank)
        logger.info(f'Not classified on {rank}: {n_missing}')

def get_missing_taxa_single(df, taxon):
    """
    Count the rows of ``df`` whose ``taxon`` column is null.

    Args:
        df (pd.DataFrame): Table containing the ``taxon`` column.
        taxon (str): Name of the taxonomic-rank column to inspect.

    Returns:
        int: Number of rows with a missing value in that column.
    """
    missing_mask = df[taxon].isnull()
    return int(missing_mask.sum())

## App setup

In [None]:
# Load the Panel extension with Tabulator support (used by the "Data View" table).
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    # On Colab, additionally select the Colab comms backend.
    pn.extension(comms='colab')
# Accent colour passed to the dashboard template.
ACCENT = "teal"

# CSS applied to the tab container.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# TODO: there is a bug in the panel library that does not allow opening png files; renaming does not help
# Logo shown at the top of the sidebar.
image = pn.pane.JPG(os.path.join(assets_folder, "figs/metaGOflow_logo_italics.jpg"),
                    width=200,
                    height=100,
                    )

# Selector widgets built by momics.panel_utils from the metadata column lists:
# table + categorical factor for the alpha-diversity tabs; table, taxon, factor
# and the `beta_norm` control for the beta-diversity tabs.
(select_table, select_cat_factor, 
 select_table_beta, select_taxon,
 select_beta_factor, beta_norm,
 ) = diversity_select_widgets(categorical_columns, numerical_columns)

# Memory-usage indicators; their values are refreshed by update_used_gb.
progress_bar, indicator_usage = create_indicators_diversity()

def update_used_gb(event):
    """
    Refresh the memory-usage widgets from the current memory load.

    Args:
        event: Value supplied by the caller; when it is falsy nothing is
            updated. It is not inspected otherwise.

    Returns:
        None
    """
    if event:
        used, total = memory_load()
        # Progress bar shows the used share as a whole-number percentage,
        # the number indicator shows the used amount itself.
        progress_bar.value = int(used / total * 100)
        indicator_usage.value = used

# Bind the momics plotting functions to the selector widgets. Widget arguments
# (select_*, beta_norm) are passed as widgets, everything else as fixed values.
# NOTE(review): per Panel's pn.bind contract the bound functions are expected to
# be re-evaluated whenever one of the widget values changes - confirm if in doubt.

# Alpha diversity per sample, driven by the alpha table and categorical-factor selectors.
bplot_alpha = pn.bind(
    pl.alpha_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table,
    factor=select_cat_factor,
    metadata=full_metadata,
)

# Averaged alpha diversity, driven by the same two selectors.
bplot_av_alpha = pn.bind(
    pl.av_alpha_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table,
    factor=select_cat_factor,
    metadata=full_metadata,
)

# Beta diversity heatmap, driven by the beta table, taxon and norm controls.
bplot_beta_heatmap = pn.bind(
    pl.beta_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table_beta,
    norm=beta_norm,
    taxon=select_taxon,
)

def update_beta_pc_plot(tables_dict, metadata, table_name, taxon, factor):
    """
    Recompute the beta-diversity PCoA and push the result into the dashboard.

    The figure returned by ``pl.beta_plot_pc`` is placed in the ``beta_pc_plot``
    pane and the accompanying value in ``explained_var_indicator``.

    Args:
        tables_dict: Mapping of table names to data tables.
        metadata: Sample metadata passed through to the plotting function.
        table_name: Name of the table to analyse.
        taxon: Taxonomic level passed through to the plotting function.
        factor: Metadata factor passed through to the plotting function.

    Returns:
        None
    """
    figure, explained_variance = pl.beta_plot_pc(
        tables_dict=tables_dict,
        metadata=metadata,
        table_name=table_name,
        taxon=taxon,
        factor=factor,
    )
    beta_pc_plot.object = figure
    explained_var_indicator.value = explained_variance

# Pane that displays the PCoA figure; its content is set by update_beta_pc_plot.
beta_pc_plot = pn.pane.Matplotlib(
    name="Beta PCoA",
    height=600,
    )

# show indicator of the explained variance
# Created before the watcher below is registered: update_beta_pc_plot writes to
# this indicator, so it must exist by the time the watcher can fire.
explained_var_indicator = pn.indicators.Number(
    name='Explained variance by PC1 + PC2', value=0, format='{value:.1f}%',
    font_size='20pt',
    title_size='12pt',
    colors=[(33, 'red'), (50, 'gold'), (66, 'green')]
)

# Re-run update_beta_pc_plot whenever one of the beta selectors changes
# (watch=True registers the callback; the return value of pn.bind is not needed).
pn.bind(update_beta_pc_plot,
    tables_dict=mgf_parquet_dfs,
    metadata=full_metadata,
    table_name=select_table_beta,
    taxon=select_taxon,
    factor=select_beta_factor,
    watch=True,
    )

# Raw view of the SSU working copy.
atable = pn.widgets.Tabulator(
    df,
    name="Data View")

# assemble tab with the matrix and checkbox
heatmap_tab = pn.Column(
    beta_norm,
    bplot_beta_heatmap,
    height=600,
    scroll=True,
)

# PCoA tab: explained-variance indicator above the ordination plot.
pcoa_tab = pn.Column(
    explained_var_indicator,
    beta_pc_plot,
    height=600,
    scroll=True,
)

tabs = pn.Tabs(
    ('Alpha div.', bplot_alpha),
    # Label typo fixed ('Aplpha' -> 'Alpha').
    ('Av Alpha div.', bplot_av_alpha),
    ('Beta div.', heatmap_tab),
    ('PCoA', pcoa_tab),
    atable,
    styles=styles,
    margin=10,
)

def app():
    """
    Assemble the dashboard template and start the memory-usage refresh.

    A single periodic callback calls ``update_used_gb`` once per second.
    That function refreshes both the progress bar and the usage indicator on
    every call, so one callback is enough; registering a second one only
    repeated the same work. A toggle in the indicator row starts and stops
    the callback.

    Returns:
        pn.template.FastListTemplate: The dashboard with the selector widgets
        in the sidebar and the indicators plus tabs in the main area.
    """
    usage_cb = pn.state.add_periodic_callback(
        # update_used_gb only checks its argument for truthiness.
        partial(update_used_gb, indicator_usage),
        period=1000,
        timeout=None,
        )
    toggle = pn.widgets.Toggle(name='Toggle callback', value=True)
    # Keep the toggle state and the callback's running state in sync.
    toggle.link(usage_cb, bidirectional=True, value='running')

    indicators = pn.FlexBox(
        progress_bar, indicator_usage, toggle)

    template = pn.template.FastListTemplate(
        title="Diversity Analysis",
        sidebar=[image,
                "# Alpha diversity", select_table, select_cat_factor,
                pn.layout.Divider(),
                "# Beta diversity", select_table_beta, select_taxon, select_beta_factor,
                ],
        main=[pn.Column(indicators,
                        tabs,
                    )],
        main_layout=None,
        accent=ACCENT,
    )
    return template

# Build the dashboard template.
template = app()

# Trick to trigger the first update of the PCoA tab: change the beta factor to
# the second option and back to the first, so that the watcher registered for
# update_beta_pc_plot fires and fills the pane and the indicator.
# NOTE(review): this indexes options[0] and options[1], so it assumes `options`
# is index-able and holds at least two entries - confirm for small metadata sets.
select_beta_factor.value = select_beta_factor.options[1]
select_beta_factor.value = select_beta_factor.options[0]

# On Colab the app is served through serve_app (the handle `s` is kept so the
# server can be closed later); elsewhere the template is marked as servable.
if 'google.colab' in str(get_ipython()):  
    s = serve_app(template, env=env, name="diversity_analysis")
else:
    template.servable()

### Uncomment this if you are running an ngrok tunnel that you want to quit

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)