# Visualize pivoted taxonomy (LSU and SSU tables)
- PCoA beta diversities
- Permanova calculations

## Platform dependent part
- Resolve platform setup
- Differences from the local imports should be resolved by setting up the Blue Cloud VRE properly; Colab will still be an issue.

In [1]:
import sys
import os
import gc
import logging
import psutil

from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")
# Default number of PERMANOVA permutations; lowered below when running on Binder.
NUMBER_PERMUTATIONS = 999

# NOTE(review): the logger.info calls in this cell run before reconfig_logger()
# is called further down, so they may not be shown under the default logging
# configuration -- confirm.
if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system reports a failing command through its return value instead of
    # raising, so the status has to be checked explicitly.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        # A non-zero status is also returned when the target directory already
        # exists (e.g. the cell is re-run), hence a warning rather than a crash.
        logger.warning(
            "An error occurred while cloning the repository: "
            f"os.system status {clone_status}"
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.warning(
            f"Installing marine-omics failed: os.system status {install_status}"
        )

elif not psutil.users():
    # No logged-in users reported by psutil: treated as the Binder environment.
    logger.info("Binder")
    NUMBER_PERMUTATIONS = 29  # permanova extremely slow on binder, therefore a change here
else:
    logger.info("Local")

from momics.utils import (
    reconfig_logger, init_setup, 
    load_and_clean, taxonomy_common_preprocess01
)

# Set up logging
# Both helpers are imported from momics.utils; the cell output shows a
# "Logging.basicConfig completed successfully" record after they run.
# NOTE(review): what init_setup() configures is not visible here -- confirm.
reconfig_logger()
init_setup()

INFO | root | Logging.basicConfig completed successfully


## Imports

In [2]:
import os
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn

# All low level functions are imported from the momics package
from momics.diversity import run_permanova
from momics.metadata import (
    filter_metadata_table, filter_data,
)
import momics.plotting as pl
from momics.taxonomy import (
    pivot_taxonomic_data,
    separate_taxonomy)

### User settings

In [3]:
DEBUG = True  # enable stdout logging

## Loading

In [4]:
# Repository root: the cloned repository on Colab, otherwise the parent of the
# current working directory.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Folder holding the static assets of the repository.
assets_folder = os.path.join(root_folder, 'assets')

In [5]:
def get_valid_samples():
    """Read the table of valid samples.

    The CSV file ``data/shipment_b1b2_181.csv`` is looked up relative to the
    module-level ``root_folder``.

    Returns:
        The ``DataFrame`` produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)


valid_samples = get_valid_samples()

### Enhance metadata

In [6]:
# High level function from the momics.utils module
# Yields the metadata table and a mapping of data tables; the next cell logs
# the mapping's keys ('go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu').
# NOTE(review): presumably both are restricted to `valid_samples` -- confirm in momics.utils.
full_metadata, mgf_parquet_dfs = load_and_clean(valid_samples=valid_samples)

In [7]:
# Metadata column names grouped by dtype, each list sorted alphabetically.
# Categorical: object and boolean columns.
categorical_columns = sorted(
    full_metadata.select_dtypes(include=['object', "boolean"]).columns
)

# Numerical: int64 and float64 columns.
numerical_columns = sorted(
    full_metadata.select_dtypes(include=['int64', 'float64']).columns
)

# Report what was loaded, only when DEBUG is switched on.
if DEBUG:
    logger.info("Data table names are:\n%s", mgf_parquet_dfs.keys())
    logger.info("Categorical metadata columns are:\n%s", categorical_columns)
    logger.info("Numerical metadata columns are:\n%s", numerical_columns)

INFO | Diversity analysis app | Data table names are:
dict_keys(['go', 'go_slim', 'ips', 'ko', 'pfam', 'lsu', 'ssu'])
INFO | Diversity analysis app | Categorical metadata columns are:
['ammonium method', 'chlorophyll method', 'conductivity method', 'country', 'density method', 'dissolved oxygen method', 'environment (biome)', 'environment (feature)', 'environment (material)', 'environmental package', 'investigation type', 'month name', 'nitrate method', 'nitrite method', 'observatory ID', 'observatory local location', 'observatory location ocean or sea', 'observatory regional location', 'organism count', 'organism count method', 'organization', 'organization country', 'pH method', 'phaeopigments method', 'phosphate method', 'pigments (ug/l)', 'pigments method', 'pressure method', 'project name', 'replicate info', 'replicate number', 'sample collection device or method', 'sea subsurface salinity method', 'sea subsurface temperature method', 'sea surface salinity method', 'sea surface te

In [8]:
# Categorical columns that must not be offered as grouping factors.
# The list mixes underscore-style and space-separated spellings of column names.
# NOTE(review): 'organization country' (with a space) appears in the logged
# categorical columns, but only "organization_country" is listed here --
# confirm whether it should be excluded as well.
factors_to_remove = [
    "organization_country",
    "project_name",
    "env_broad_biome",
    "env_local",
    "extra_site_info",
    "failure_comment",
    "obs_id",
    "size_frac",
    "ship_date",
    "ship_date_seq",
    "sampling_event",
    "organism_count",
    "sample collection device or method",
    "ammonium method",
    "chlorophyll method",
    "conductivity method",
    "density method",
    "dissolved oxygen method",
    "nitrate method",
    "nitrite method",
    "ph method",
    "phaeopigments method",
    "phosphate method",
    "pigments method",
    "project name",
    "pressure method",
    "sea subsurface salinity method",
    "sea subsurface temperature method",
    "sea surface salinity method",
    "sea surface temperature method",
    "silicate method",
    "turbidity method",
    "pigments (ug/l)",
    "organism count",
    "investigation type",
    "pH method",
    "environment (feature)",
    "observatory local location",
    "environment (biome)",
]

# Remaining categorical columns, in the (sorted) order of categorical_columns.
factor_cols = [
    name for name in categorical_columns if name not in factors_to_remove
]

In [9]:
len(full_metadata.columns), len(numerical_columns),len(categorical_columns)

(98, 57, 40)

## Pivot the tables

In [10]:
# Pull the LSU and SSU tables out of the loaded mapping; they keep their own
# names because the following cell deletes them individually.
lsu, ssu = mgf_parquet_dfs['lsu'], mgf_parquet_dfs['ssu']

# Pivot each taxonomy table with the momics helper.
lsu_standard = pivot_taxonomic_data(lsu)
ssu_standard = pivot_taxonomic_data(ssu)

In [11]:
# The raw tables are no longer referenced after pivoting: drop the names and
# run a garbage-collection pass.
del mgf_parquet_dfs, lsu, ssu

gc.collect()

60

In [12]:
# Split each pivoted table with separate_taxonomy; the results are later
# indexed by level name (e.g. 'Bacteria_phylum').
split_taxo_tables_lsu, split_taxo_tables_ssu = (
    separate_taxonomy(lsu_standard),
    separate_taxonomy(ssu_standard),
)

In [13]:
# The pivoted tables have been split; drop them and collect garbage.
del lsu_standard, ssu_standard
gc.collect()

24

## Granular PCoA


In [14]:
pn.extension("tabulator")

# Split taxonomy tables offered for the granular analysis, keyed by table name.
granular_tables = {
    "LSU": split_taxo_tables_lsu,
    "SSU": split_taxo_tables_ssu
}

select_granular_table = pn.widgets.Select(
    name="Granular analysis",
    options=list(granular_tables.keys()),
    description="Select a table for granular analysis",
)

select_granular_level = pn.widgets.Select(
    name="Subset taxonomic level",
    options=list(granular_tables[select_granular_table.value].keys()),
    description="Select a taxonomic level for analysis",
)


def _sync_granular_levels(event):
    """Offer only the taxonomic levels that exist in the newly selected table.

    Args:
        event: param change event of ``select_granular_table.value``; its
            ``new`` attribute carries the selected table name.
    """
    levels = list(granular_tables[event.new].keys())
    select_granular_level.options = levels
    # Keep the selection valid when the previous level is absent from the new table.
    if levels and select_granular_level.value not in levels:
        select_granular_level.value = levels[0]


# The level options above are computed once from the initial table; without
# this watcher they would go stale as soon as the user switches tables.
select_granular_table.param.watch(_sync_granular_levels, 'value')

# One multi-select per factor column, pre-set to 'All' followed by the unique
# values found in the metadata for that column.
pcoa_factor_dropdowns = {
    categorical_col: pn.widgets.MultiSelect(
        name=categorical_col,
        value=['All'],
        options=['All'] + list(full_metadata[categorical_col].unique()),
        size=6,
        width=180,
    )
    for categorical_col in factor_cols
}

# Factor selectors laid out in a grid, five per row.
box_granular = pn.GridBox(
    *pcoa_factor_dropdowns.values(),
    ncols=5,
)

# Factor whose value is passed to the PCoA plot for colouring.
color_factor_granular = pn.widgets.Select(
    name="Color by",
    value=factor_cols[0],
    options=factor_cols,
)

In [15]:
# NOTE: a `global` statement at module level has no effect; any function that
# rebinds these names has to declare them `global` in its own body.
global subset_selected, taxa_selected, subset


def get_filtered_metadata():
    """Filter the full metadata table by the current dropdown selections.

    Reads the ``value`` of every widget in ``pcoa_factor_dropdowns`` (one per
    entry of ``factor_cols``) and passes the resulting mapping to
    ``filter_metadata_table`` together with ``full_metadata``.

    Returns:
        Whatever ``filter_metadata_table`` returns for that selection.
    """
    selection = {}
    for factor in factor_cols:
        selection[factor] = pcoa_factor_dropdowns[factor].value
    return filter_metadata_table(full_metadata, selection)


def filter_all_box_selection(df):
    """Apply the current dropdown selection to the metadata and to ``df``.

    Args:
        df: data table handed to ``filter_data`` together with the filtered
            metadata.

    Returns:
        A ``(filtered_metadata, filtered_data)`` tuple.
    """
    metadata_subset = get_filtered_metadata()
    return metadata_subset, filter_data(df, metadata_subset)


def update_filtered_data(button):
    """Click callback: re-filter the selected table and publish the results.

    Filters the table chosen through ``select_granular_table`` and
    ``select_granular_level`` by the current dropdown selection, logs the
    resulting shapes and stores two summaries in module-level variables.

    Args:
        button: object passed in by the click handler; only its ``name``
            attribute is read, for logging.
    """
    # Without this declaration the assignments below would only create locals
    # that are discarded when the callback returns.
    global subset_selected, taxa_selected

    logger.info(f"Button clicked: {button.name}")
    # Retrieve the filtered metadata
    filtered_metadata, filtered_data = filter_all_box_selection(
        granular_tables[select_granular_table.value][select_granular_level.value])
    logger.info(f"matadata shape {filtered_metadata.shape}")
    logger.info(f"data shape {filtered_data.shape}")
    # Update the global variables
    subset_selected = filtered_metadata.index.to_list()
    taxa_selected = len(filtered_data)

In [16]:
# Filter the LSU bacterial-phylum table with the current dropdown selection.
filtered_metadata, filtered_data = filter_all_box_selection(
    df=granular_tables['LSU']['Bacteria_phylum'],
)

# Distinct index values in the full and in the filtered metadata, and the
# length of the filtered data table.
total_samplings, subset = (
    full_metadata.index.nunique(),
    filtered_metadata.index.nunique(),
)
taxa_count = len(filtered_data)


In [17]:
total_samplings, subset, taxa_count

(181, 181, 127)

In [18]:
filtered_data.head()

Unnamed: 0_level_0,EMOBON_AAOT_Wa_1,EMOBON_AAOT_Wa_2,EMOBON_AAOT_Wa_22,EMOBON_AAOT_Wa_26,EMOBON_AAOT_Wa_27,EMOBON_AAOT_Wa_41,EMOBON_AAOT_Wa_42,EMOBON_AAOT_Wa_46,EMOBON_AAOT_Wa_47,EMOBON_AAOT_Wa_6,...,EMOBON_VB_Wa_4,EMOBON_VB_Wa_41,EMOBON_VB_Wa_42,EMOBON_VB_Wa_43,EMOBON_VB_Wa_44,EMOBON_VB_Wa_5,EMOBON_VB_Wa_93,EMOBON_VB_Wa_94,EMOBON_VB_Wa_96,EMOBON_VB_Wa_97
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p_,17.195929,15.745879,19.840803,1.287185,1.262165,18.515775,25.153573,2.582925,2.419307,3.440877,...,3.512297,24.734255,28.302716,2.249799,2.58262,3.598311,20.867119,22.624616,2.591653,2.402028
p_Acidobacteria,0.010178,0.002734,0.080132,0.004524,0.0,0.051844,0.048497,0.002103,0.0,0.007573,...,0.00283,0.04572,0.052738,0.0,0.0,0.0,0.148933,0.080182,0.003817,0.0
p_Actinobacteria,2.498728,2.06938,2.131524,3.617238,3.993676,1.66642,1.584222,7.025219,8.252198,13.829143,...,0.684912,0.754372,0.817439,3.586733,2.666353,0.81644,1.02598,0.48109,3.437088,2.44742
p_Aquificae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003817,0.0
p_Armatimonadetes,0.0,0.0,0.005342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017579,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Button that re-applies the dropdown selection through update_filtered_data.
button_filter_table = pn.widgets.Button(
    name="Filter table", button_type="primary", width=200
)
button_filter_table.on_click(update_filtered_data)

# Selectors on top, the factor grid below a divider, the button last.
pn.Column(
    pn.Row(select_granular_table, select_granular_level, color_factor_granular),
    pn.layout.Divider(),
    box_granular,
    button_filter_table,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'7db6cd7b-11dd-429a-94e6-33f24aa62d77': {'version…

In [20]:
# Re-filter the table currently chosen in the two selectors.
filtered_metadata, filtered_data = filter_all_box_selection(
    df=granular_tables[select_granular_table.value][select_granular_level.value]
)

# The plotting helper returns the plot together with a second value that is
# stored as `explained_var`.
beta_pc_plot_granular, explained_var = pl.beta_plot_pc_granular(
    metadata=filtered_metadata,
    filtered_data=filtered_data,
    factor=color_factor_granular.value,
)

beta_pc_plot_granular.opts(title="Beta diversity PCoA", width=1200, height=800)

## Permanova testing

In [21]:
# Widgets driving the PERMANOVA test.

# Main factor; 'All' is offered in addition to the factor columns.
permanova_factor = pn.widgets.Select(
    description='Limit by group(s) in factor:',
    name="Main Permanova factor",
    options=['All', *factor_cols],
)

# Groups of the chosen factor; starts empty and is filled by update_groups.
permanova_group = pn.widgets.MultiSelect(
    description='Groups:',
    name="Groups of unique values of the factor",
    options=[],
)

# Additional factors handed to run_permanova.
permanova_additional_factors = pn.widgets.MultiSelect(
    description='PERMANOVA Factors:',
    name="Factors to test vs ALL the rest",
    options=factor_cols,
)

# Button that triggers the test.
permanova_button = pn.widgets.Button(
    name="PERMANOVA", button_type="primary", width=200
)

In [22]:
# The click event itself is not needed; the lambda resolves
# update_permanova_result only when the button is pressed.
permanova_button.on_click(lambda _event: update_permanova_result())


def update_permanova_result():
    """Run PERMANOVA on the selected table and store the outcome globally.

    Takes the table picked through ``select_granular_table`` and
    ``select_granular_level``, the full metadata and the values of the three
    PERMANOVA widgets, and passes them to ``run_permanova``. The returned
    mapping is converted with ``pd.DataFrame.from_dict`` and bound to the
    module-level name ``permanova_result``.
    """
    global permanova_result

    selected_table = granular_tables[select_granular_table.value][
        select_granular_level.value
    ]
    raw_results = run_permanova(
        selected_table,
        full_metadata,
        permanova_factor.value,
        permanova_group.value,
        permanova_additional_factors.value,
        # NUMBER_PERMUTATIONS is 29 on Binder and 999 otherwise.
        permutations=NUMBER_PERMUTATIONS,
        verbose=True,
    )
    permanova_result = pd.DataFrame.from_dict(raw_results)


def update_groups(permanova_factor):
    """Refresh the options of ``permanova_group`` for the chosen factor.

    Args:
        permanova_factor: selected factor name, or ``'All'``.

    Raises:
        ValueError: if the value is neither a factor column nor ``'All'``.
    """
    logger.info(f"Permanova factor value: {permanova_factor}")

    # A real factor: offer its sorted, non-null unique values.
    if permanova_factor in factor_cols:
        permanova_group.options = sorted(
            full_metadata[permanova_factor].dropna().unique()
        )
        return

    if permanova_factor != 'All':
        raise ValueError(f"Unknown factor: {permanova_factor}")

    # 'All': offer every metadata index entry.
    permanova_group.options = sorted(full_metadata.index.to_list())
    
# Call update_groups with the factor widget's value; watch=True asks for it
# to run on changes (the captured output shows a call with 'All' right away).
pn.bind(update_groups, permanova_factor, watch=True)

INFO | Diversity analysis app | Permanova factor value: All


<function param.reactive.bind.<locals>.wrapped(*wargs, **wkwargs)>

In [23]:
# PERMANOVA controls side by side: factor, groups, extra factors, run button.
pn.Row(
    permanova_factor, permanova_group, permanova_additional_factors, permanova_button
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'ad4e54cb-5cdf-47cc-907b-341f37ac3e58': {'version…

### Display permanova results

In [None]:
# NBVAL_SKIP
# Only the last top-level expression of a cell is rendered automatically; a
# name evaluated inside ``try`` is not, so the table is displayed explicitly.
from IPython.display import display

try:
    _latest_permanova = permanova_result
except NameError:
    # permanova_result is only created by update_permanova_result, i.e. after
    # the PERMANOVA button has been clicked.
    logger.info("No PERMANOVA result available yet; click the PERMANOVA button first.")
else:
    display(_latest_permanova)

Unnamed: 0,country
method name,PERMANOVA
test statistic name,pseudo-F
sample size,181
number of groups,9
test statistic,3.788864
p-value,0.001
number of permutations,999
