# Visualize taxonomy and alpha/beta diversities

## Platform dependent part
- Resolve platform setup
- the difference to local imports should be resolved by setting the Blue Cloud VRE well, Colab will still be an issue.

In [1]:
import sys
import os
import io

if 'google.colab' in str(get_ipython()):
    # clone the momics-demos repository to use the utils module from there
    # TODO: eventually utils from momics will be used for that
    try:
        os.system('git clone https://github.com/palec87/momics-demos.git')
        print(f"Repository cloned")
    except OSError as e:
        print(f"An error occurred while cloning the repository: {e}")

    sys.path.insert(0,'/content/momics-demos')

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from utils import init_setup, get_notebook_environment
init_setup()

# Initialize the environment variable
notebook_environment = 'unknown'
# Determine the notebook environment
env = get_notebook_environment()
print(f"Environment: {env}")

Platform: local Linux
Environment: vscode


## Imports

In [2]:
# This needs to be repeated here for the Pannel dashboard to work, WEIRD
# TODO: report as possible bug
import sys
import os
import io
import warnings
warnings.filterwarnings('ignore')

# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import panel as pn
from dotenv import load_dotenv
load_dotenv()

from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa

# All low level functions are imported from the momics package
from momics.loader import load_parquets
import momics.plotting as pl
from momics.panel_utils import diversity_select_widgets, create_indicators
from momics.utils import memory_load

# Note: This is breaking the panel preview functionality
# %load_ext autoreload
# %autoreload 2

## Loading

In [3]:
def fill_na_for_object_columns(df):
    """
    Fill NA values with 'NA' for object columns in the dataframe.

    Args:
        df (pd.DataFrame): The input dataframe.

    Returns:
        pd.DataFrame: The dataframe with NA values filled for object columns.
    """
    # Apply fillna only to object columns
    df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).apply(lambda col: col.fillna('NA'))
    return df

@pn.cache()
def get_data(folder):
    return load_parquets(folder)

@pn.cache()
def get_metadata(folder):
    # Load metadata
    sample_metadata = pd.read_csv(
        os.path.join(folder, "Batch1and2_combined_logsheets_2024-09-11.csv")
    )

    observatory_metadata = pd.read_csv(
        os.path.join(folder, "Observatory_combined_logsheets_validated.csv")
    )

    # Merge metadata
    full_metadata = pd.merge(
        sample_metadata,
        observatory_metadata,
        on=["obs_id", "env_package"],  # Matching conditions
        how="inner"  # Inner join
    )

    # Sort the merged dataframe by 'ref_code' column in ascending order
    full_metadata = full_metadata.sort_values(by="ref_code", ascending=True)

    # first convert some of the boolean cols
    full_metadata["failure"] = full_metadata["failure"].astype(str)
    # replace the 'nan' values with 'NA'
    full_metadata["failure"] = full_metadata["failure"].replace("nan", "NA")


    # adding replacement for the missing values for object type columns
    full_metadata = fill_na_for_object_columns(full_metadata)
    
    return full_metadata

In [4]:
# parquet files
if 'google.colab' in str(get_ipython()):
    root_folder = os.path.abspath(os.path.join('/content/momics-demos'))
else:
    root_folder = os.path.abspath(os.path.join('../'))


data_folder = os.path.join(root_folder, 'data/parquet_files')
assets_folder = os.path.join(root_folder, 'assets')


mgf_parquet_dfs = get_data(data_folder)

In [5]:
# Load and merge metadata
full_metadata = get_metadata(data_folder)

# select categorical columns from metadata
categorical_columns = sorted(full_metadata.select_dtypes(include=['object', "boolean"]).columns)
cat_to_remove = ["ref_code", "samp_description", "source_mat_id", "source_mat_id_orig"]
categorical_columns = [k for k in categorical_columns if k not in cat_to_remove]
# print(type(categorical_columns))

# select numerical columns from metadata
numerical_columns = sorted(full_metadata.select_dtypes(include=['int64', 'float64']).columns)

assert len(full_metadata.columns) == len(numerical_columns) + len(categorical_columns) + len(cat_to_remove)  # + for removed cats
# print(f"Data table names are:\n{mgf_parquet_dfs.keys()}")
# print(f"Categorical metadata columns are:\n{categorical_columns}")
# print(f"Numerical metadata columns are:\n{numerical_columns}")
# mgf_parquet_dfs['SSU'].sort_values(by='abundance', ascending=False)

In [6]:
df = mgf_parquet_dfs['SSU'].copy()
df.ref_code.nunique(), df.reads_name.nunique()

(54, 54)

`ref_code` and `reads_name` are the same length so I will use the `ref_code`

## Development of the beta diversity part

In [7]:
# TODO: link these functions to the indicator
# TODO: put them in the momics package
def get_missing_taxa(df):
    for taxon in ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]:
        print(f'Not classified on {taxon}: {get_missing_taxa_single(df, taxon)}')
    return

def get_missing_taxa_single(df, taxon):
    return len(df[df[taxon].isnull()])

## App setup

In [12]:
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    pn.extension(comms='colab')
ACCENT = "teal"

styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# TODO: there is a bug in the panel library that does not allow to open png files, renoming does not help 
image = pn.pane.JPG(os.path.join(assets_folder, "figs/metaGOflow_logo_italics.jpg"),
                    width=200, height=100)

(select_table, select_cat_factor, 
 select_table_beta, select_taxon,
 select_beta_factor, beta_norm,
 ) = diversity_select_widgets(categorical_columns, numerical_columns)

indicators = create_indicators()
used_gb, total_gb = memory_load()
indicator_test = pn.FlexBox(
    pn.indicators.Number(
        value=used_gb, name="RAM usage [GB]",
        format="{value:,.1f}",
    ),
)

# TODO: learn the periodic callbacks
# indicators.append(indicator_test)
# cb = pn.state.add_periodic_callback(memory_load, 200, timeout=5000)
# toggle = pn.widgets.Toggle(name='Toggle callback', value=True)
# toggle.link(cb, bidirectional=True, value='running')

bplot_alpha = pn.bind(
    pl.alpha_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table,
    factor=select_cat_factor,
    metadata=full_metadata,
)

bplot_av_alpha = pn.bind(
    pl.av_alpha_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table,
    factor=select_cat_factor,
    metadata=full_metadata,
)


bplot_beta_heatmap = pn.bind(
    pl.beta_plot,
    tables_dict=mgf_parquet_dfs,
    table_name=select_table_beta,
    norm=beta_norm,
    taxon=select_taxon,
)

# show indicator of the explained variance
explained_var_indicator = pn.indicators.Number(
    name='Explained variance by PC1 + PC2', value=0, format='{value:.1f}%',
    font_size='20pt',
    title_size='12pt',
    colors=[(33, 'red'), (50, 'gold'), (66, 'green')]
)

def update(tables_dict, metadata, table_name, taxon, factor):
    beta_plot.object, explained_var_indicator.value = pl.beta_plot_pc(
        tables_dict=tables_dict,
        metadata=metadata,
        table_name=table_name,
        taxon=taxon,
        factor=factor)


beta_plot = pn.pane.Matplotlib(
    sizing_mode="stretch_both",
    name="Beta PCoA")

imodel = pn.bind(update, tables_dict=mgf_parquet_dfs,
    metadata=full_metadata,
    table_name=select_table_beta,
    taxon=select_taxon,
    factor=select_beta_factor,
    watch=True,
    )

atable = pn.widgets.Tabulator(df, sizing_mode="stretch_both", name="Data View")

# assemble tab with the matrix and checkbox
heatmap_tab = pn.Column(
    beta_norm,
    bplot_beta_heatmap,
)

pcoa_tab = pn.Column(
    explained_var_indicator,
    beta_plot,
)

tabs = pn.Tabs(
    ('Alpha div.', bplot_alpha),
    ('Av Aplpha div.', bplot_av_alpha),
    ('Beta div.', heatmap_tab),
    ('PCoA', pcoa_tab),
    atable,
    styles=styles, sizing_mode="stretch_width", height=500, margin=10
)

template = pn.template.FastListTemplate(
    title="Diversity Analysis",
    sidebar=[image,
            #  toggle,
             "# Alpha diversity", select_table, select_cat_factor,
             pn.layout.Divider(),
             "# Beta diversity", select_table_beta, select_taxon, select_beta_factor,
             ],
    main=[pn.Column(indicators,
                    tabs,
                    sizing_mode="stretch_both",
                   )],
    main_layout=None,
    accent=ACCENT,
)

# stupid trick to trigger updata()
select_beta_factor.value = select_beta_factor.options[1]
select_beta_factor.value = select_beta_factor.options[0]

if "google.colab" in str(get_ipython()) or env == "vscode":
    # server=pn.serve({"": template}, port=80, address="0.0.0.0", threaded=True, websocket_origin="*")
    server=pn.serve({"": template}, port=8080, address="127.0.0.1", threaded=True, websocket_origin="*")
    os.system("curl http://localhost:8080")
    from pyngrok import ngrok

    # Terminate open tunnels if exist
    ngrok.kill()

    # Setting the authtoken (optional)
    # Get your authtoken from https://dashboard.ngrok.com/auth
    NGROK_AUTH_TOKEN = os.getenv("NGROK_TOKEN")
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

    # Open an HTTPs tunnel on port 80 for http://localhost:8080
    public_url = ngrok.connect(port='8080')
    print("Tracking URL:", public_url)
    # tunnels = ngrok.get_tunnels()
    # print(f"Number of open tunnels: {len(tunnels)}, {tunnels}")
else:
    template.servable()

Key column: id
length of the ref_codes: 54
table shape: (2637, 54)
Key column: id
length of the ref_codes: 54
table shape: (2637, 54)




float64
numerical 0 50
object
categorical
Launching server at http://127.0.0.1:8080


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 14396  100 14396    0     0  12699      0  0:00:01  0:00:01 --:--:-- 12706


<!DOCTYPE html>
<html lang="en" >
  <head>
    <meta charset="utf-8">
    <title>Diversity Analysis</title>
    <link rel="apple-touch-icon" sizes="180x180" href="static/extensions/panel/images/apple-touch-icon.png">
    <link rel="icon" href="/static/extensions/panel/images/favicon.ico" type="">
    <meta name="name" content="Diversity Analysis">
    <style>
      html, body {
	display: flow-root;
        box-sizing: border-box;
        height: 100%;
        margin: 0;
        padding: 0;
      }
    </style>
    <link rel="stylesheet" href="static/extensions/panel/bundled/datatabulator/tabulator-tables@6.3.0/dist/css/tabulator_fast.min.css?v=1.5.5" type="text/css" />
<script type="esms-options">{"shimMode": true}</script>

<script type="text/javascript" src="static/extensions/panel/bundled/reactiveesm/es-module-shims@^1.10.0/dist/es-module-shims.min.js"></script>
<script type="text/javascript" src="static/extensions/panel/bundled/datatabulator/tabulator-tables@6.3.0/dist/js/tabulator

t=2025-02-06T17:36:49+0000 lvl=warn msg="invalid tunnel configuration" pg=/api/tunnels id=0e2117343ed256a1 err="yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"


PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}


In [13]:
server.stop()
ngrok.disconnect(server)
ngrok.kill()