# Display EMO-BON sequencing efforts across European sites
- First, show the validated and released data from [emo-bon-data-validation](https://github.com/emo-bon/emo-bon-data-validation/tree/main/validated-data)
- Second, ask Cymon what metadata can be shown about data which are not ready/released yet.
- I use `leafmap` for GIS integration

In [None]:
# system dependent setup
import sys
import os
import io
import logging
from IPython import get_ipython

logger = logging.getLogger(name="Sequencing Progress")

# On Google Colab the helper repository and the dependencies are not present,
# so they are fetched and installed here before anything from `momics` is imported.
if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system() reports a failing command through its return value instead of
    # raising, so the exit status has to be checked explicitly.
    try:
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.error("An error occurred while cloning the repository: %s", e)
    else:
        if clone_status == 0:
            logger.info("Repository cloned")
        else:
            # A non-zero status is also returned when the target directory
            # already exists, e.g. when this cell is executed a second time.
            logger.error(
                "An error occurred while cloning the repository: git exited with status %s",
                clone_status,
            )

    # make the cloned repository importable
    sys.path.insert(0, '/content/momics-demos')

    # this step takes time because of many dependencies
    if os.system('pip install marine-omics') != 0:
        logger.error("Installing marine-omics failed")

    if os.system('pip install leafmap') == 0:
        logger.info("leafmap installed")
    else:
        logger.error("Installing leafmap failed")

from momics.utils import (
    init_setup, get_notebook_environment,
    memory_load, reconfig_logger
)

# Set up logging
# (reconfig_logger comes from momics.utils; its exact effect is not visible here)
reconfig_logger()

# Determine the notebook environment
# `env` is reused at the end of the notebook when the app is served.
env = get_notebook_environment()


# NOTE(review): init_setup() is an opaque momics.utils helper — presumably it
# prepares the runtime for the notebook; confirm against the momics package.
init_setup()
logger.info(f"Environment: {env}")

## Import

In [None]:
import sys
import os
import io

import pandas as pd
import leafmap
from ipywidgets import Layout

import panel as pn
from dotenv import load_dotenv
load_dotenv()

from momics.panel_utils import serve_app, close_server, create_indicators_landing_page

## User settings

In [None]:
# NOTE(review): DEBUG is not referenced anywhere else in the visible notebook;
# confirm that something actually consumes it before relying on this switch.
DEBUG = True  # enable stdout logging

## Loading

In [None]:
# Load the sequencing tracker CSV and work out where the repository root is.
# Locally the CSV is read from the current working directory and the root is
# the parent folder; on Google Colab both live inside the cloned repository.
if 'google.colab' not in str(get_ipython()):
    root_folder = os.path.abspath('../')
    df_tracker = pd.read_csv("emobon_sequencing_master.csv", index_col=False)
else:
    root_folder = os.path.abspath('/content/momics-demos')
    df_tracker = pd.read_csv(
        os.path.join(root_folder, "wf0_landing_page/emobon_sequencing_master.csv"),
        index_col=False,
    )

# static assets sit next to the workflow folders under the repository root
assets_folder = os.path.join(root_folder, 'assets')

### Data from the validated-data repo, run through pydantic by Cymon

In [None]:
# Raw GitHub URLs of the validated logsheets
url_obs = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Observatory_combined_logsheets_validated.csv"
url_metadata = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Batch1and2_combined_logsheets_2024-11-12.csv"

# Both tables use their first CSV column as the index.
df_obs = pd.read_csv(url_obs, index_col=0)
df_metadata = pd.read_csv(url_metadata, index_col=0)

# Normalise the organization separator to exactly '; ':
# first collapse '; ' to ';', then expand every ';' back to '; '.
df_obs['organization'] = (
    df_obs['organization']
    .str.replace('; ', ';')
    .str.replace(';', '; ')
)

# df_tracker.columns, df_obs.columns

In [None]:
# df_obs.head()

In [None]:
# df_tracker.head()

In [None]:
# Hand-maintained lookup from observatory id (obs_id) to a short, readable
# organization name; the names in the source data are long and inconsistent.
# The keys are also used by get_stats() to decide which observatories are kept
# in the aggregated table.
org_hash = {
    'EMT21': 'Toralla Marine Science Station',
    'UMF': 'Umea Marine Sciences Centre',
    'ROSKOGO': 'Station Biologique de Roscoff',
    'LMO': 'Linnaeus University',
    'BPNS': 'Flanders Marine Institute (VLIZ)',
    'ESC68N': 'The Arctic University of Norway (UiT)',
    'OOB': 'Observatoire Océanologique de Banyuls',
    'RFormosa': 'Centre of Marine Sciences (CCMAR)',
    'Bergen': 'University of Bergen (UiB)',
    'OSD74': 'CIIMAR Porto',
    'VB': 'Institut de la Mer de Villefranche',
    'MBAL4': 'Marine Biological Association, Plymouth',
    'HCMR-1': 'Hellenic Centre for Marine Research',
    'NRMCB': 'Stazione Zoologica Anton Dohrn',
    'PiEGetxo': 'Plentzia Marine Station',
    'IUIEilat': 'Interuniversity Institute for Marine Sciences in Eilat',
    'AAOT': 'Institute of Marine Science, (ISMAR)'
}


# statistics part
def get_stats(df: pd.DataFrame, hash) -> tuple:
    """Build the per-observatory sequencing statistics tables.

    Args:
        df: Tracker table. The columns ``obs_id``, ``run_status``, ``batch``,
            ``organization``, ``ref_code`` and ``sample_type`` are read.
            The frame is not modified.
        hash: Mapping whose keys are the ``obs_id`` values to keep in the
            second (aggregated) table.

    Returns:
        A ``(df_pivot, df_pivot2)`` tuple:

        * ``df_pivot`` - sample counts per ``obs_id`` for every
          ``(run_status, batch)`` pair, plus the columns ``sequenced``
          (number of COMPLETED samples), ``total`` (number of ``ref_code``
          entries) and ``percentage`` (sequenced / total * 100, 2 decimals).
        * ``df_pivot2`` - sample counts per ``obs_id`` for every
          ``(run_status, sample_type)`` pair, with missing ``run_status``
          counted as ``'queued'`` and restricted to the keys of ``hash``.
    """
    # pivot table on run_status
    df_pivot = df.pivot_table(
        index="obs_id",
        columns=["run_status", "batch"],
        values="organization",
        aggfunc="count",
    )

    # Sum all batches whose run_status is COMPLETED. Selecting the columns with
    # a mask avoids the deprecated groupby(axis=1) and yields 0 (instead of a
    # KeyError) when nothing has been completed yet.
    completed_cols = df_pivot.columns.get_level_values(0) == "COMPLETED"
    df_pivot["sequenced"] = df_pivot.loc[:, completed_cols].sum(axis=1).astype(int)
    df_pivot["total"] = df.groupby("obs_id")["ref_code"].count().astype(int)
    df_pivot["percentage"] = (df_pivot["sequenced"] / df_pivot["total"] * 100).round(2)

    # Replace NaN in run_status with 'queued' on a copy, so the caller's
    # DataFrame is left untouched.
    queued = df.assign(run_status=df["run_status"].fillna("queued"))

    # remove lines whose obs_id is not a key of `hash`
    queued = queued[queued["obs_id"].isin(hash.keys())]
    df_pivot2 = queued.pivot_table(
        index="obs_id",
        columns=["run_status", "sample_type"],
        values="organization",
        aggfunc="count",
    )
    return df_pivot, df_pivot2


def progress_per_station(df: pd.DataFrame, hash_table: dict = None) -> list:
    """Create one horizontal progress gauge per station.

    Args:
        df: Statistics table indexed by station id; the columns ``sequenced``
            and ``total`` are read for every row.
        hash_table: Optional mapping from station id to a display name.
            When omitted, the module-level ``org_hash`` is used. Stations
            missing from the mapping are labelled with their id.

    Returns:
        A list of ``pn.indicators.LinearGauge`` widgets, in index order.
    """
    # Honour the caller's mapping; fall back to the global one so existing
    # calls without the argument keep their labels.
    names = org_hash if hash_table is None else hash_table

    return [
        pn.indicators.LinearGauge(
            name=names.get(station, station),
            value=int(df.loc[station, "sequenced"]),
            format='{value} sequenced',
            # the gauge runs from 0 to the number of samples of the station
            bounds=(0, int(df.loc[station, "total"])),
            colors=[(0.4, '#c2270c'), (0.8, '#f5c011'), (1, '#0b8c21')],
            horizontal=True,
            width=60,
        )
        for station in df.index
    ]

### This is the table to display in the workflow

In [None]:
# Organization names as found in the observatory logsheets.
# (An attempt to clean and shorten these names automatically did not work out;
# the manual org_hash table is used for display names instead.)
# Duplicate names are removed first, then the alternative spelling of the
# Toralla station is dropped.
organizations = df_obs[['organization']].drop_duplicates()
organizations = organizations.loc[
    organizations['organization'].ne(
        'Estación de Ciencias Mariñas de Toralla - Centre of Marine Research, University of Vigo'
    )
]

In [None]:
# add organization column from the df_obs to the df_tracker based on the obs_id
# `organizations` only has the 'organization' column, so "obs_id" has to resolve
# to its index level — presumably df_obs is indexed by obs_id; TODO confirm.
# A left join keeps every tracker row, including those without a match.
df_tracker = pd.merge(df_tracker, organizations, on="obs_id", how="left")

In [None]:
# TODO: refactor to func and put to the momics module.

# Per-observatory statistics and the per-sample-type aggregate
df_stats, df_aggregated = get_stats(df_tracker, org_hash)

# Turn obs_id into a regular column so it can serve as the merge key
df_stats = df_stats.reset_index()

# Collapse the two-level column labels into single strings joined by '_';
# labels with an empty second level end up with a trailing '_' (e.g. 'obs_id_').
df_stats.columns = [
    '_'.join(label).strip() if isinstance(label, tuple) else label
    for label in df_stats.columns
]

# Attach the organization name, matching obs_id against the index of `organizations`
df_stats = df_stats.merge(organizations, left_on="obs_id_", right_index=True, how="left")

# Drop the trailing '_' left over from flattening, then index by obs_id again
df_stats.columns = df_stats.columns.str.rstrip('_')
df_stats = df_stats.set_index('obs_id')


print(df_stats['total'].sum())
df_stats.head()

In [None]:
# Preview of the per-observatory counts by (run_status, sample_type)
df_aggregated.head()

### Notes on what I need
- I want to show NUMBER + PERCENTAGE of processed samplings per station
- granular per batch

## GIS methods

In [None]:
def create_map(df: pd.DataFrame) -> leafmap.Map:
    """Return a leafmap map with one marker per row of ``df``.

    Marker positions are read from the ``longitude`` and ``latitude`` columns;
    the popup shows the columns listed in ``popup_fields``.
    """
    popup_fields = [
        'organization',
        "contact_name",
        "contact_email",
        "ENA_accession_number_umbrella",
        'tot_depth_water_col',
    ]

    # Initial view: centre (50, 10) at zoom level 4, rendered 800px high
    observatory_map = leafmap.Map(
        center=(50, 10),
        zoom=4,
        layout=Layout(height='800px'),
    )
    observatory_map.add_points_from_xy(
        df,
        x="longitude",
        y="latitude",
        popup=popup_fields,
        layer_name="EMO-BON Observatories",
        max_cluster_radius=10,
    )
    return observatory_map

## APP setup

In [None]:
# Load the panel extensions before any widget is created.
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    # Colab needs its own comms backend
    pn.extension(comms='colab')
# accent colour passed to the template in app()
ACCENT = "teal"

# NOTE(review): `styles` is not referenced anywhere in the visible notebook —
# confirm whether it is still needed.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# Building blocks of the dashboard; app() reads these module-level names.
obs_map = create_map(df_obs)
indicators = progress_per_station(df_stats)
# Indicators built by the momics helper from the aggregated table,
# laid out in a single row.
row = pn.FlexBox(
    pn.Row(
        *create_indicators_landing_page(df_aggregated),
        sizing_mode="stretch_both",
    )
)


def app():
    """Assemble the landing-page dashboard and return its template.

    The layout has two columns: a heading, the aggregated indicator row and
    the observatory map on the left, and the per-station gauges on the right.
    Uses the module-level ``row``, ``obs_map``, ``indicators`` and ``ACCENT``.
    """
    heading = pn.pane.Markdown("## Aggregated stats for <a href= 'https://www.fairease.eu/' target=_blank>FAIR-EASE project</a>")
    overview_column = pn.Column(
        heading,
        row,
        pn.FlexBox(obs_map),
    )
    stations_column = pn.Column(*indicators)

    return pn.template.FastListTemplate(
        title="EMOBON Sequencing Progress Tracker",
        main=[pn.Row(overview_column, stations_column)],
        main_layout=None,
        accent=ACCENT,
    )


# Build the dashboard once and expose it in a way that suits the environment.
template = app()
if 'google.colab' in str(get_ipython()):
    # On Colab the app is served through the momics helper; the returned
    # handle `s` is what close_server() expects in the last cell.
    s = serve_app(template, env=env, name="landing_page")
else:
    # Elsewhere the template is only marked as servable.
    template.servable()

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)