# Display EMO-BON sequencing efforts across European sites
- First, show the validated and released data from [emo-bon-data-validation](https://github.com/emo-bon/emo-bon-data-validation/tree/main/validated-data)
- Second, ask Cymon what metadata can be shown about data which are not ready/released yet.
- I use `leafmap` for GIS integration

In [1]:
# system dependent setup
import sys
import os
import io
import logging
from IPython import get_ipython

logger = logging.getLogger(name="Diversity analysis app")

# On Google Colab the helper code is not installed, so it is fetched from GitHub;
# locally the helper locations are added relative to the current working directory.
if 'google.colab' in str(get_ipython()):
    # clone the momics-demos repository to use the utils module from there
    # TODO: eventually utils from momics will be used for that
    try:
        # os.system() reports failure through its return value instead of raising,
        # so the exit status must be inspected to know whether the clone worked.
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.error("An error occurred while cloning the repository: %s", e)
    else:
        if clone_status == 0:
            logger.info("Repository cloned")
        else:
            logger.error(
                "An error occurred while cloning the repository: git exited with status %s",
                clone_status,
            )

    sys.path.insert(0,'/content/momics-demos')

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # local utils, to be removed in the future

    # downside of this is that all the deps need to be installed in the current (momics-demos) environment
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

# `utils` only becomes importable after the sys.path changes above
from utils import init_setup, get_notebook_environment
init_setup()

# Initialize the environment variable
notebook_environment = 'unknown'
# Determine the notebook environment
env = get_notebook_environment()
logger.info("Environment: %s", env)

Platform: local Linux


## Import

In [2]:
import sys
import os
import io

import requests
import pandas as pd
# import leafmap.leafmap as leafmap
import leafmap

import panel as pn

from momics.utils import memory_load, reconfig_logger
from momics.panel_utils import serve_app, close_server

In [3]:
# parquet files
# The repository root is the cloned checkout on Google Colab and the parent
# directory otherwise; the assets folder sits directly beneath it.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)
assets_folder = os.path.join(root_folder, 'assets')

## Data from the validated-data repo, run through pydantic by Cymon

In [4]:
# Remote CSV files from the emo-bon-data-validation repository
url_obs = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Observatory_combined_logsheets_validated.csv"
url_metadata = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Batch1and2_combined_logsheets_2024-11-12.csv"

# The tracking CSV is read from the working directory; the two remote tables
# use their first column as the index.
df_tracker = pd.read_csv("min_merged.csv", index_col=False)
df_obs = pd.read_csv(url_obs, index_col=0)
df_metadata = pd.read_csv(url_metadata, index_col=0)

# Show the column names of the tracker and observatory tables
# (alternative: df_obs.columns, df_metadata.columns)
(df_tracker.columns, df_obs.columns)

(Index(['ref_code', 'obs_id', 'batch', 'sample_type', 'reads_name',
        'seq_run_ro_crate_fname', 'forward_read_fname', 'backward_read_fname',
        'run_status', 'version', 'date_started', 'system_run', 'output_loc',
        'output_size'],
       dtype='object'),
 Index(['project_name', 'latitude', 'longitude', 'geo_loc_name',
        'loc_broad_ocean', 'loc_broad_ocean_mrgid', 'loc_regional',
        'loc_regional_mrgid', 'loc_loc', 'loc_loc_mrgid', 'env_broad_biome',
        'env_local', 'env_package', 'tot_depth_water_col', 'organization',
        'organization_country', 'organization_edmoid', 'wa_id',
        'extra_site_info', 'contact_name', 'contact_email', 'contact_orcid',
        'ENA_accession_number_umbrella', 'ENA_accession_number_project'],
       dtype='object'))

In [5]:
# Check that the label "RFormosa" is present in the index of df_obs
"RFormosa" in df_obs.index

True

In [6]:
# Preview the first rows of the tracking table
df_tracker.head()

Unnamed: 0,ref_code,obs_id,batch,sample_type,reads_name,seq_run_ro_crate_fname,forward_read_fname,backward_read_fname,run_status,version,date_started,system_run,output_loc,output_size
0,EMOBON00084,BPNS,1,sediment,DBH_AAANOSDA_1_HMNJKDSX3.UDI248,,DBH_AAANOSDA_1_1_HMNJKDSX3.UDI248_clean.fastq.gz,,COMPLETED,1.0,25/09/2023,REDI,,
1,EMOBON00085,BPNS,1,sediment,DBH_AAAOOSDA_1_HMNJKDSX3.UDI260,,DBH_AAAOOSDA_1_1_HMNJKDSX3.UDI260_clean.fastq.gz,,COMPLETED,1.0,02/10/2023,REDI,,7.7
2,EMOBON00087,BPNS,1,sediment,DBH_AAAIOSDA_1_HMNJKDSX3.UDI224,DBH_AAAIOSDA_1_HMNJKDSX3.UDI224.zip,DBH_AAAIOSDA_1_1_HMNJKDSX3.UDI224_clean.fastq.gz,DBH_AAAIOSDA_1_2_HMNJKDSX3.UDI224_clean.fastq.gz,COMPLETED,1.0,01/06/2023,HCMR-HPC,HCMR-HPC,12.0
3,EMOBON00094,NRMCB,1,sediment,DBH_AAACOSDA_1_HWLTKDRXY.UDI211,,DBH_AAACOSDA_1_1_HWLTKDRXY.UDI211_clean.fastq.gz,,COMPLETED,1.0,18/07/2023,REDI,,19.0
4,EMOBON00095,NRMCB,1,sediment,DBH_AAAFOSDA_1_HMNJKDSX3.UDI283,,DBH_AAAFOSDA_1_1_HMNJKDSX3.UDI283_clean.fastq.gz,,COMPLETED,1.0,11/08/2023,REDI,,9.3


In [18]:
# statistics part
def get_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise sequencing progress per observatory.

    Args:
        df: Tracking table with at least the columns ``obs_id``,
            ``run_status``, ``batch`` and ``ref_code``.

    Returns:
        A table indexed by ``obs_id``. Its columns are the ``ref_code`` counts
        for every (``run_status``, ``batch``) pair, followed by:

        * ``sequenced``: number of rows whose ``run_status`` is "COMPLETED",
          summed over all batches (0 when no row is completed),
        * ``total``: number of non-null ``ref_code`` entries per observatory,
        * ``percentage``: ``sequenced / total * 100`` rounded to two decimals.
    """
    # Count samples per observatory, split by run status and batch.
    df_pivot = df.pivot_table(
        index="obs_id",
        columns=["run_status", "batch"],
        values="ref_code",
        aggfunc="count",
    )

    if "COMPLETED" in df_pivot.columns.get_level_values(0):
        # Selecting the top-level key leaves one column per batch; summing
        # across them skips the NaN cells of batches without completed runs.
        # (Replaces groupby(level=0, axis=1), which newer pandas rejects.)
        sequenced = df_pivot["COMPLETED"].sum(axis=1).astype(int)
    else:
        # No completed run anywhere: report zero instead of raising KeyError.
        sequenced = 0

    total = df.groupby("obs_id")["ref_code"].count().astype(int)

    # Assignments align on obs_id, so only observatories present in the pivot are kept.
    df_pivot["sequenced"] = sequenced
    df_pivot["total"] = total
    df_pivot["percentage"] = (sequenced / total * 100).round(2)
    return df_pivot


def progress_per_station_old(df: pd.DataFrame) -> list:
    """Build one Dial indicator per station (older variant of the progress view).

    Args:
        df: Table indexed by station with a ``percentage`` column.

    Returns:
        A list of ``pn.indicators.Dial`` objects, one per index entry of ``df``,
        in index order.
    """
    list_indicators = []

    # get the station data
    for station in df.index:
        # The dial is labelled in percent with bounds 0-100, so it has to show
        # the percentage column rather than the raw sample count.
        value = df.loc[station, "percentage"]
        if isinstance(value, pd.Series):
            # With multi-level columns the lookup may come back as a
            # one-element Series; take the element explicitly because
            # float(Series) is deprecated in recent pandas.
            value = value.iloc[0]
        list_indicators.append(
            pn.indicators.Dial(
                name=station,
                value=float(value), format='{value} %',
                bounds=(0, 100),
                colors=[(0.4, 'red'), (0.8, 'yellow'), (1, 'green')],
                sizing_mode='stretch_width',
                ),
            )
    return list_indicators


def progress_per_station(df: pd.DataFrame) -> list:
    """Build one horizontal LinearGauge per station showing its progress.

    Args:
        df: Table indexed by station with a ``percentage`` column.

    Returns:
        A list of ``pn.indicators.LinearGauge`` objects, one per index entry of
        ``df``, in index order. The result is a plain list, not a ``pn.Column``.
    """
    list_indicators = []

    # get the station data
    for station in df.index:
        value = df.loc[station, "percentage"]
        if isinstance(value, pd.Series):
            # With multi-level columns the lookup may come back as a
            # one-element Series; take the element explicitly because
            # float(Series) is deprecated in recent pandas.
            value = value.iloc[0]
        list_indicators.append(
            pn.indicators.LinearGauge(
                name=station,
                value=float(value), format='{value} %',
                bounds=(0, 100),
                colors=[(0.4, 'red'), (0.8, 'yellow'), (1, 'green')],
                horizontal=True,
                width=70,
                ),
            )
    return list_indicators

## This is the table to display in the workflow

In [19]:
# Build the per-observatory statistics from the tracking table
df_stats = get_stats(df_tracker)
# Sum of the 'total' column over all observatories in the statistics table
print(df_stats['total'].sum())
df_stats.head()

565


run_status,COMPLETED,COMPLETED,sequenced,total,percentage
batch,001,002,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
obs_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AAOT,7.0,8.0,15,34,44.12
BPNS,11.0,12.0,23,81,28.4
EMT21,4.0,8.0,12,36,33.33
ESC68N,4.0,2.0,6,30,20.0
HCMR-1,3.0,,3,38,7.89


### Notes on what I need
- I want to show NUMBER + PERCENTAGE of processed samplings per station
- granular per batch

## GIS methods

In [20]:
def create_map(df: pd.DataFrame) -> leafmap.Map:
    """Return a leafmap map with one point per row of ``df``.

    The map is centred at (50, 10) with zoom level 4. Point positions are read
    from the ``longitude`` and ``latitude`` columns, and the listed columns are
    passed as popup fields. The points go into the "EMO-BON Observatories" layer.
    """
    popup_fields = [
        'organization',
        "contact_name",
        "contact_email",
        "ENA_accession_number_umbrella",
        'tot_depth_water_col',
    ]

    # A layout={'width': '60%'} argument was tried here and is left disabled.
    europe_map = leafmap.Map(center=(50, 10), zoom=4)
    europe_map.add_points_from_xy(
        df,
        x="longitude",
        y="latitude",
        popup=popup_fields,
        layer_name="EMO-BON Observatories",
    )
    return europe_map

## APP

In [None]:
# Load the Panel extension with the "tabulator" component
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    # On Google Colab, additionally request the 'colab' comms
    pn.extension(comms='colab')
ACCENT = "teal"  # passed as `accent` to the template built in app()

# NOTE(review): `styles` is not referenced in the visible code — confirm whether it is still needed
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# Module-level objects that app() reads when it assembles the layout
obs_map = create_map(df_obs)
indicators = progress_per_station(df_stats)
# dials1 = pn.Row(*indicators[:len(indicators)//2], sizing_mode='stretch_width')
# dials2 = pn.Row(*indicators[len(indicators)//2:], sizing_mode='stretch_width')


def app():
    """Assemble the progress-tracking page and return its template.

    Reads the module-level ``obs_map``, ``indicators`` and ``ACCENT``. The page
    shows the map on the left and the indicators stacked in a column beside it.

    Returns:
        A ``pn.template.FastListTemplate`` titled "Sequencing Progress Tracking".
    """
    map_and_progress = pn.Row(
        obs_map,
        pn.Column(*indicators),
    )
    return pn.template.FastListTemplate(
        title="Sequencing Progress Tracking",
        main=[map_and_progress],
        main_layout=None,
        accent=ACCENT,
    )


# Build the page template
template = app()

# Hand the template to serve_app() from momics.panel_utils; the returned handle
# is kept in `s` because close_server() is called with it later.
s = serve_app(template, env=env, name="landing_page")



Port 4040 is in use, trying another port
Port 4040 is in use, trying another port
Port 4040 is in use, trying another port
Port 4040 is in use, trying another port
Port 4040 is in use, trying another port
Port 4040 is in use, trying another port
Using port 4046
Launching server at http://127.0.0.1:4046


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
t=2025-03-06T16:16:44+0000 lvl=warn msg="can't bind default web address, trying alternatives" obj=web addr=127.0.0.1:4040


Tracking URL: NgrokTunnel: "https://cbff-194-210-251-99.ngrok-free.app" -> "http://localhost:4046"




In [11]:
# Pass the handle returned by serve_app() to close_server() — presumably this
# stops the running app; confirm in momics.panel_utils.
close_server(s, env=env)