## Faster testing outside of the panel app integrations

In [None]:
import sys
import os
import io

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import init_setup
init_setup()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import panel as pn

from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa

# All low level functions are imported from the momics package
from momics.loader import load_parquets
import momics.plotting as pl
# from momics.panel_utils import diversity_select_widgets, create_indicators
# from momics.utils import memory_load
from momics.diversity import (
    alpha_diversity_parametrized,
    beta_diversity_parametrized,
)

# Note: This is breaking the panel preview functionality
%load_ext autoreload
%autoreload 2

In [2]:
def get_data(folder):
    """
    Load the parquet tables found in a folder.

    Thin wrapper that forwards ``folder`` to ``load_parquets`` from the
    momics package.

    Args:
        folder (str): Path of the folder to read from.

    Returns:
        Whatever ``load_parquets`` returns for ``folder``; this notebook
        indexes the result by table name (e.g. ``'SSU'``).
    """
    parquet_tables = load_parquets(folder)
    return parquet_tables

def fill_na_for_object_columns(df):
    """
    Replace missing values in object-dtype columns with the string 'NA'.

    Columns of any other dtype are left untouched. The dataframe is modified
    in place and is also returned, so both call styles work.

    Args:
        df (pd.DataFrame): The dataframe to patch.

    Returns:
        pd.DataFrame: The same dataframe object, with its object columns filled.
    """
    # Pick out the object-dtype part once, fill it, and write it back
    object_part = df.select_dtypes(include=['object'])
    df[object_part.columns] = object_part.fillna('NA')
    return df

def get_metadata(folder):
    """
    Build the combined sample + observatory metadata table.

    Reads the two logsheet CSV files from ``folder``, inner-joins them on
    ``obs_id`` and ``env_package``, sorts the rows by ``ref_code`` and
    replaces missing values in text columns with the string 'NA'.

    Args:
        folder (str): Folder containing both logsheet CSV files.

    Returns:
        pd.DataFrame: The merged and cleaned metadata, sorted by ``ref_code``.
    """
    sample_path = os.path.join(folder, "Batch1and2_combined_logsheets_2024-09-11.csv")
    observatory_path = os.path.join(folder, "Observatory_combined_logsheets_validated.csv")

    samples = pd.read_csv(sample_path)
    observatories = pd.read_csv(observatory_path)

    # Keep only samples that have a matching observatory record
    merged = samples.merge(
        observatories,
        on=["obs_id", "env_package"],
        how="inner",
    )
    merged = merged.sort_values(by="ref_code", ascending=True)

    # Turn 'failure' into text; missing entries stringify to 'nan', which is
    # then mapped to the same 'NA' marker used for the other text columns
    merged["failure"] = merged["failure"].astype(str).replace("nan", "NA")

    # Remaining object columns get their missing values filled the same way
    return fill_na_for_object_columns(merged)

In [3]:
# Repository root: fixed checkout path on Google Colab, parent directory otherwise
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)
data_folder = os.path.join(root_folder, 'data/parquet_files')
assets_folder = os.path.join(root_folder, 'assets')

# The parquet tables and the metadata logsheets are both read from the data folder
mgf_parquet_dfs = get_data(data_folder)
full_metadata = get_metadata(data_folder)

# Columns left out of the categorical factor list
cat_to_remove = ["ref_code", "samp_description", "source_mat_id", "source_mat_id_orig"]

# Categorical factors: object/boolean metadata columns, minus the excluded ones
categorical_columns = sorted(
    col
    for col in full_metadata.select_dtypes(include=['object', "boolean"]).columns
    if col not in cat_to_remove
)

# Numerical factors: integer and float metadata columns
numerical_columns = sorted(full_metadata.select_dtypes(include=['int64', 'float64']).columns)

# Sanity check: every metadata column is numerical, categorical, or explicitly excluded
assert len(full_metadata.columns) == (
    len(numerical_columns) + len(categorical_columns) + len(cat_to_remove)
)

In [4]:
# Beta diversity for the 'SSU' table, parametrized by 'class'
# (presumably the taxonomic rank to aggregate on; confirm in momics.diversity).
# This call works as is. TODO: add the normalization option.
beta = beta_diversity_parametrized(mgf_parquet_dfs['SSU'], 'class')
# beta.to_data_frame().columns

In [None]:
# Principal coordinates analysis of the beta-diversity result,
# using the "eigh" method and keeping 3 dimensions
pcoa_result = pcoa(beta, method="eigh", number_of_dimensions=3)
# Preview the first rows of the per-sample coordinates
pcoa_result.samples.head()

In [12]:
# Attach metadata to the PCoA coordinates: the index of the coordinates table
# is matched against the 'ref_code' column of the metadata (inner join)
pcoa_df = pcoa_result.samples.merge(
    full_metadata,
    left_index=True,
    right_on="ref_code",
    how="inner",
)

In [None]:
# Preview the first rows of the merged PCoA + metadata table
pcoa_df.head()

In [None]:
# Summary statistics of the merged PCoA + metadata table
pcoa_df.describe()

In [None]:
# Number of rows per distinct 'contact_name' value
pcoa_df['contact_name'].value_counts()

In [None]:
# Produce one PCoA plot per categorical metadata column, passing the column as color_by
for factor in categorical_columns:
    fig = pl.plot_pcoa_black(pcoa_df, color_by=factor)
    # Explicit display(): values produced inside a loop are not auto-rendered by the notebook
    display(fig)
    # plt.show(fig)
# pl.plot_pcoa_black(pcoa_df, color_by="contact_name")

In [None]:
# PCoA plot with color_by set to the 'alkalinity' column
# NOTE(review): the return value is not passed to display() here, unlike in the
# loop over categorical_columns; only the last expression of a cell is
# auto-rendered, so confirm the figure still shows.
pl.plot_pcoa_black(pcoa_df, color_by="alkalinity")
# Number of rows per distinct 'alkalinity' value in the merged table
pcoa_df['alkalinity'].value_counts()

In [None]:
# Distinct 'alkalinity' values present in the metadata
full_metadata['alkalinity'].unique()