# Visualize pivoted taxonomy (LSU and SSU tables)
- PCoA beta diversities
- Permanova calculations

## Platform dependent part
- Resolve platform setup
- Differences from the local import setup should be resolved by configuring the Blue Cloud VRE properly; Colab will still be an issue.

In [None]:
import sys
import os
import gc
import logging
import psutil

from IPython import get_ipython
# Notebook-wide logger, registered under the name "Diversity analysis app".
logger = logging.getLogger(name="Diversity analysis app")
# Default number of PERMANOVA permutations; lowered below when running on Binder.
NUMBER_PERMUTATIONS = 999

# Detect the runtime platform (Google Colab, Binder or local) and prepare it.
if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # Clone the momics-demos repository; it is used to load data.
    # os.system() does not raise when the command fails, it returns the
    # command's status, so success has to be checked explicitly.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        # A non-zero status also occurs when the target directory already
        # exists (e.g. the cell is re-run), hence a warning and not an abort.
        logger.warning(
            f"An error occurred while cloning the repository: status {clone_status}"
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.warning(
            f"An error occurred while installing marine-omics: status {install_status}"
        )

elif not psutil.users():
    # No logged-in users reported by psutil is used as the Binder marker.
    logger.info("Binder")
    NUMBER_PERMUTATIONS = 29  # permanova extremely slow on binder, therefore a change here
else:
    logger.info("Local")
from momics.utils import reconfig_logger, init_setup

# Set up logging
# (reconfig_logger and init_setup are imported from momics.utils; their exact
# effects are not visible here — presumably logging and environment setup.)
reconfig_logger()
init_setup()

INFO | root | Logging.basicConfig completed successfully


## Imports

In [None]:
import os
import warnings

warnings.filterwarnings('ignore')

# import numpy as np
import pandas as pd
import panel as pn

from mgo.udal import UDAL

# All low level functions are imported from the momics package
from momics.diversity import run_permanova
from momics.loader import load_parquets_udal
from momics.metadata import get_metadata_udal, enhance_metadata, filter_metadata_table, filter_data
import momics.plotting as pl
from momics.taxonomy import (
    pivot_taxonomic_data,
    separate_taxonomy)

### User settings

In [3]:
DEBUG = True  # when True, table names and metadata column lists are logged via logger.info

## Loading

In [4]:
# Instantiate the UDAL client imported from mgo.udal.
# NOTE(review): `udal` is not referenced by any other visible cell — confirm it is still needed.
udal = UDAL()

In [5]:
# Repository root: the Colab clone location, or the parent directory otherwise.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Folder holding the static assets, relative to the repository root.
assets_folder = os.path.join(root_folder, 'assets')

In [6]:
@pn.cache()
def get_data():
    """Return the tables produced by ``load_parquets_udal`` (wrapped in ``pn.cache``)."""
    tables = load_parquets_udal()
    return tables

# Metadata loader
@pn.cache()
def get_full_metadata():
    """Return the metadata produced by ``get_metadata_udal`` (wrapped in ``pn.cache``)."""
    metadata = get_metadata_udal()
    return metadata

@pn.cache()
def get_valid_samples():
    """Read the valid-samples CSV located under ``root_folder`` into a DataFrame."""
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

### Enhance metadata

In [7]:
# Load the full metadata table (through the cached loader)
full_metadata = get_full_metadata()

# Load the table of the 181 valid samples and hand it to enhance_metadata
# together with the metadata (presumably restricts/augments it — see momics.metadata)
valid_samples = get_valid_samples()
full_metadata = enhance_metadata(full_metadata, valid_samples)

# Load the data tables; the result is indexed by table name below (e.g. 'lsu', 'ssu')
mgf_parquet_dfs = get_data()

In [8]:
# Identifier-like / free-text columns that are never offered as categorical factors
cat_to_remove = ["ref_code", "samp_description", "source_mat_id", "source_mat_id_orig"]

# Categorical metadata columns (object or boolean dtype), sorted, minus the exclusions
categorical_columns = [
    column
    for column in sorted(full_metadata.select_dtypes(include=['object', "boolean"]).columns)
    if column not in cat_to_remove
]

# Numerical metadata columns (int64 or float64 dtype), sorted
numerical_columns = sorted(
    full_metadata.select_dtypes(include=['int64', 'float64']).columns
)

if DEBUG:
    logger.info(f"Data table names are:\n{mgf_parquet_dfs.keys()}")
    logger.info(f"Categorical metadata columns are:\n{categorical_columns}")
    logger.info(f"Numerical metadata columns are:\n{numerical_columns}")

# Categorical columns that are excluded from the PCoA / PERMANOVA factor choices
factors_to_remove = [
    'ENA_accession_number_project',
    'ENA_accession_number_umbrella',
    'arr_date_hq',
    'arr_date_seq',
    'contact_email',
    'contact_name',
    'contact_orcid',
    'investigation_type',
    'long_store',
    'organism_count_method',
    'organization_edmoid',
    'other_person',
    'other_person_orcid',
    'organization_country',
    'project_name',
    'samp_store_date',
    'samp_mat_process',
    'samp_mat_process_dev',
    'samp_store_loc',
    'sampl_person',
    'sampl_person_orcid',
    'store_person',
    'store_person_orcid',
    'time_fi',
    'wa_id',
    'env_broad_biome',
    'env_local',
    'extra_site_info',
    'failure_comment',
    'obs_id',
    'size_frac',
    'ship_date',
    'ship_date_seq',
    'sampling_event',
    'organism_count',
    'samp_collect_device',
    'ammonium_method',
    'chlorophyll_method',
    'conduc_method',
    'density_method',
    'diss_oxygen_method',
    'nitrate_method',
    'nitrite_method',
    'ph_method',
    'phaeopigments_method',
    'phosphate_method',
    'pigments_method',
    'pressure_method',
    'sea_subsurf_salinity_method',
    'sea_subsurf_temp_method',
    'sea_surf_salinity_method',
    'sea_surf_temp_method',
    'silicate_method',
    'turbidity_method',
]

# Remaining categorical columns, in their existing (sorted) order
factor_cols = [name for name in categorical_columns if name not in factors_to_remove]

INFO | Diversity analysis app | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Diversity analysis app | Categorical metadata columns are:
['ENA_accession_number_project', 'ENA_accession_number_umbrella', 'ammonium_method', 'arr_date_hq', 'arr_date_seq', 'chlorophyll_method', 'conduc_method', 'contact_email', 'contact_name', 'contact_orcid', 'density_method', 'diss_oxygen_method', 'env_broad_biome', 'env_local', 'env_material', 'env_package', 'extra_site_info', 'failure', 'failure_comment', 'geo_loc_name', 'investigation_type', 'loc_broad_ocean', 'loc_loc', 'loc_regional', 'month_name', 'nitrate_method', 'nitrite_method', 'obs_id', 'organism_count', 'organism_count_method', 'organization', 'organization_country', 'organization_edmoid', 'other_person', 'other_person_orcid', 'ph_method', 'phaeopigments_method', 'phosphate_method', 'pigments', 'pigments_method', 'pressure_method', 'project_name', 'replicate', 'replicate_info', 'samp_collect_dev

In [9]:
# Counts: all metadata columns, numerical columns, categorical columns (after exclusions), excluded columns
len(full_metadata.columns), len(numerical_columns),len(categorical_columns), len(cat_to_remove)

(142, 70, 68, 4)

## Pivot the tables

In [10]:
# Pick the LSU and SSU tables out of the loaded data ...
lsu, ssu = mgf_parquet_dfs['lsu'], mgf_parquet_dfs['ssu']

# ... and pivot each of them
lsu_standard, ssu_standard = pivot_taxonomic_data(lsu), pivot_taxonomic_data(ssu)

In [11]:
# Drop the names bound to the raw tables, then run the garbage collector
del mgf_parquet_dfs, lsu, ssu

gc.collect()

60

In [12]:
# Index both pivoted tables by their 'taxonomic_concat' column (modified in place)
lsu_standard.set_index('taxonomic_concat', inplace=True)
ssu_standard.set_index('taxonomic_concat', inplace=True)

# Split each table with separate_taxonomy; the results are later indexed by a
# level name (e.g. 'Bacteria_phylum')
split_taxo_tables_lsu = separate_taxonomy(lsu_standard)
split_taxo_tables_ssu = separate_taxonomy(ssu_standard)

In [13]:
# The pivoted tables are no longer needed once they have been split
del lsu_standard, ssu_standard
gc.collect()

6

## Granular PCoA


In [14]:
pn.extension("tabulator")

# Table label shown in the UI -> split taxonomy tables for that marker
granular_tables = dict(LSU=split_taxo_tables_lsu, SSU=split_taxo_tables_ssu)

# Which table (LSU / SSU) to analyse
select_granular_table = pn.widgets.Select(
    name="Granular analysis",
    options=[*granular_tables.keys()],
    description="Select a table for granular analysis",
)

# Which taxonomic level of the currently selected table to analyse
select_granular_level = pn.widgets.Select(
    name="Subset taxonomic level",
    options=[*granular_tables[select_granular_table.value].keys()],
    description="Select a table for analysis",
)

# One multi-select per factor column, offering 'All' plus the column's unique values
pcoa_factor_dropdowns = {
    factor: pn.widgets.MultiSelect(
        name=factor,
        value=['All'],
        options=['All', *full_metadata[factor].unique()],
        size=6,
        width=180,
    )
    for factor in factor_cols
}

# Lay the factor selectors out in a five-column grid
box_granular = pn.GridBox(*pcoa_factor_dropdowns.values(), ncols=5)

# Factor used to colour the PCoA plot
color_factor_granular = pn.widgets.Select(
    name="Color by",
    value=factor_cols[0],
    options=factor_cols,
)

In [None]:
# NOTE(review): at module level a `global` statement has no effect and does not
# create these names; a function that assigns to them needs its own `global`
# declaration inside its body.
global subset_selected, taxa_selected, subset


def get_filtered_metadata():
    """Filter ``full_metadata`` by the current values of the factor selectors.

    Returns:
        The result of ``filter_metadata_table`` for the current selection.
    """
    # Collect the current selection of every factor widget
    selection = {}
    for column in factor_cols:
        selection[column] = pcoa_factor_dropdowns[column].value
    return filter_metadata_table(full_metadata, selection)


def filter_all_box_selection(df):
    """Filter the metadata by the widget selection, then filter ``df`` accordingly.

    Args:
        df: Table passed to ``filter_data`` together with the filtered metadata.

    Returns:
        Tuple ``(filtered_metadata, filtered_data)``.
    """
    metadata_subset = get_filtered_metadata()
    return metadata_subset, filter_data(df, metadata_subset)


def update_filtered_data(button):
    """Re-filter the selected table and publish the resulting counts as globals.

    Intended as a click callback: it filters the table chosen by the
    table/level selectors with the current factor selection, logs the shapes
    and stores the counts in the module-level names ``subset_selected`` and
    ``taxa_selected``.

    Args:
        button: Object handed over by the click callback; only its ``name``
            attribute is read, for logging.
    """
    # Without this declaration the assignments below only create function-local
    # names that are discarded on return, so the globals are never updated.
    global subset_selected, taxa_selected

    logger.info(f"Button clicked: {button.name}")
    # Retrieve the filtered metadata and data for the current selection
    filtered_metadata, filtered_data = filter_all_box_selection(
        granular_tables[select_granular_table.value][select_granular_level.value])
    logger.info(f"matadata shape {filtered_metadata.shape}")
    logger.info(f"data shape {filtered_data.shape}")
    # Update the global variables
    subset_selected = filtered_metadata['ref_code'].nunique()
    taxa_selected = len(filtered_data)

In [16]:
# Initial filtering on a fixed table (LSU, bacterial phyla)
filtered_metadata, filtered_data = filter_all_box_selection(
    granular_tables['LSU']['Bacteria_phylum']
)

# Overall number of distinct ref_code values
total_samplings = full_metadata['ref_code'].nunique()
# Distinct ref_code values and number of rows left after filtering
subset, taxa_count = filtered_metadata['ref_code'].nunique(), len(filtered_data)


In [None]:
# Button that triggers re-filtering of the selected table
button_filter_table = pn.widgets.Button(name="Filter table", button_type="primary", width=200)
button_filter_table.on_click(update_filtered_data)

# Selectors on top, factor grid below, filter button last
pn.Column(
    pn.Row(select_granular_table, select_granular_level, color_factor_granular),
    pn.layout.Divider(),
    box_granular,
    button_filter_table,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'f8a5d60c-f48a-484b-8c33-b1f74115dbb9': {'version…

In [None]:
# Filter the currently selected table/level with the current factor selection
filtered_metadata, filtered_data = filter_all_box_selection(
    granular_tables[select_granular_table.value][select_granular_level.value]
)

# Build the PCoA plot, coloured by the chosen factor
beta_pc_plot_granular, explained_var = pl.beta_plot_pc_granular(
    filtered_data=filtered_data,
    metadata=filtered_metadata,
    factor=color_factor_granular.value,
)

beta_pc_plot_granular.opts(title="Beta diversity PCoA", width=1200, height=800)

## Permanova testing

In [None]:
# Widgets driving the PERMANOVA test

# Main factor ('All' or one of the factor columns)
permanova_factor = pn.widgets.Select(
    name="Main Permanova factor",
    options=['All', *factor_cols],
    description='Limit by group(s) in factor:',
)

# Groups of the main factor; options are filled in when the factor changes
permanova_group = pn.widgets.MultiSelect(
    name="Groups of unique values of the factor",
    options=[],
    description='Groups:',
)

# Additional factors to test
permanova_additional_factors = pn.widgets.MultiSelect(
    name="Factors to test vs ALL the rest",
    options=factor_cols,
    description='PERMANOVA Factors:',
)

# Button that starts the computation
permanova_button = pn.widgets.Button(name="PERMANOVA", button_type="primary", width=200)

In [None]:
# Run the PERMANOVA computation on click; the click event itself is ignored
permanova_button.on_click(lambda _click: update_permanova_result())


def update_permanova_result():
    """Run PERMANOVA for the current widget selection.

    The outcome of ``run_permanova`` is converted to a DataFrame and stored in
    the module-level name ``permanova_result``.
    """
    global permanova_result
    selected_table = granular_tables[select_granular_table.value][select_granular_level.value]
    raw_results = run_permanova(
        selected_table,
        full_metadata,
        permanova_factor.value,
        permanova_group.value,
        permanova_additional_factors.value,
        permutations=NUMBER_PERMUTATIONS,  # 29 for binder, 999 for local
        verbose=True,
    )
    permanova_result = pd.DataFrame.from_dict(raw_results)


# Update groups based on selected factor
def update_groups(permanova_factor):
    """Refresh the group selector's options for the chosen factor.

    Args:
        permanova_factor: Name of a factor column, or ``'All'`` to list the
            ``ref_code`` values instead.

    Raises:
        ValueError: If the value is neither a factor column nor ``'All'``.
    """
    logger.info(f"Permanova factor value: {permanova_factor}")
    # Decide which metadata column supplies the groups
    if permanova_factor in factor_cols:
        source_column = permanova_factor
    elif permanova_factor == 'All':
        source_column = 'ref_code'
    else:
        raise ValueError(f"Unknown factor: {permanova_factor}")
    permanova_group.options = sorted(full_metadata[source_column].dropna().unique())
    
# Re-run update_groups whenever the main factor selector changes
pn.bind(update_groups, permanova_factor, watch=True)

INFO | Diversity analysis app | Permanova factor value: All


<function param.reactive.bind.<locals>.wrapped(*wargs, **wkwargs)>

In [24]:
# PERMANOVA controls side by side
pn.Row(permanova_factor, permanova_group, permanova_additional_factors, permanova_button)

BokehModel(combine_events=True, render_bundle={'docs_json': {'c3e9c8ea-7626-4fb8-8b7e-29ae828f3ac7': {'version…

### Updates and bindings

In [None]:
# Display the latest PERMANOVA result.
# NOTE: this name is only assigned inside update_permanova_result, so it does
# not exist until the PERMANOVA button has been clicked at least once.
permanova_result

Unnamed: 0,env_package
method name,PERMANOVA
test statistic name,pseudo-F
sample size,181
number of groups,2
test statistic,31.561698
p-value,0.001
number of permutations,999
