# Display EMO-BON sequencing efforts across European sites
- First, show the validated and released data from [emo-bon-data-validation](https://github.com/emo-bon/emo-bon-data-validation/tree/main/validated-data)
- Second, ask Cymon what metadata can be shown about data which are not ready/released yet.
- I use `leafmap` for GIS integration

In [None]:
# system dependent setup
import sys
import os
import io
import logging
logger = logging.getLogger(name="Diversity analysis app")

if 'google.colab' in str(get_ipython()):
    # On Colab, clone the momics-demos repository to use the utils module from there
    # TODO: eventually utils from momics will be used for that
    # os.system() does not raise when the command fails; it returns the
    # command's exit status, so that value tells us whether the clone worked.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        # git also exits non-zero when the destination directory already
        # exists, e.g. when this cell is re-run in the same session.
        logger.warning(
            f"An error occurred while cloning the repository: exit status {clone_status}"
        )

    # Make the cloned checkout importable ahead of anything else on the path.
    sys.path.insert(0,'/content/momics-demos')

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # local utils, to be removed in the future

    # downside of this is that all the deps need to be installed in the current (momics-demos) environment
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

# Imported only after sys.path has been extended above, because `utils`
# is resolved through one of those added locations.
from utils import init_setup, get_notebook_environment
init_setup()

# Initialize the environment variable
# NOTE(review): `notebook_environment` is not read anywhere in the visible
# part of the notebook; kept in case a later cell relies on it.
notebook_environment = 'unknown'
# Determine the notebook environment
env = get_notebook_environment()
logger.info(f"Environment: {env}")

## Imports

In [None]:
import requests
import pandas as pd
# import leafmap.leafmap as leafmap
import leafmap

## Data from the validated-data repo, run through pydantic by Cymon

In [None]:
# Raw CSV files from the validated-data folder of the emo-bon-data-validation repository.
url_obs = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Observatory_combined_logsheets_validated.csv"
url_metadata = "https://raw.githubusercontent.com/emo-bon/emo-bon-data-validation/refs/heads/main/validated-data/Batch1and2_combined_logsheets_2024-11-12.csv"

# In both files the first CSV column becomes the DataFrame index.
df_obs = pd.read_csv(url_obs, index_col=0)
df_metadata = pd.read_csv(url_metadata, index_col=0)

# Last expression of the cell: shows the column labels of both tables.
df_obs.columns, df_metadata.columns

In [None]:
# Columns of df_obs handed to leafmap as the popup fields of each point.
popup_columns = [
    'organization',
    "contact_name",
    "contact_email",
    "ENA_accession_number_umbrella",
    'tot_depth_water_col',
]

m = leafmap.Map(center=(50, 10), zoom=4)
m.add_points_from_xy(
    df_obs,
    x="longitude",
    y="latitude",
    popup=popup_columns,
    layer_name="EMO-BON Observatories",
)
# m

## Sequencing data tracking from Cymon's spreadsheet

In [None]:
def get_seq_track_data(kind="FILTERS"):
    """Download one sheet of the sequencing-tracking spreadsheet and clean it.

    ``kind`` is inserted verbatim into the ``sheet=`` query parameter of the
    Google Sheets CSV export URL. The downloaded table is handed to
    ``process_seq_track_data`` and that function's result is returned.
    """
    sheet_url = f"https://docs.google.com/spreadsheets/d/1j9tRRsRCcyViDMTB1X7lx8POY1P5bV7UijxKKSebZAM/gviz/tq?tqx=out:csv&sheet={kind}"
    raw_table = pd.read_csv(sheet_url)
    return process_seq_track_data(raw_table, kind)


def process_seq_track_data(df, kind):
    """Trim and tidy a raw tracking sheet.

    Keeps the first 17 columns, renames ``"Unnamed: 16"`` to ``"comments"``
    and drops the ``"Comp. Resources"`` and ``"Run Duration"`` columns.

    Parameters:
        df: raw table as read from the tracking spreadsheet.
        kind: sheet name; currently unused, kept so existing callers keep
            working.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.

    Raises:
        KeyError: if ``"Comp. Resources"`` or ``"Run Duration"`` is not among
            the first 17 columns.
    """
    # Positional slice: drop empty cols at the end.
    trimmed = df.iloc[:, :17]
    # Non-inplace chain: modifying a frame derived from a selection in place
    # is what triggers pandas' SettingWithCopyWarning.
    return (
        trimmed
        .rename(columns={"Unnamed: 16": "comments"})
        .drop(columns=["Comp. Resources", "Run Duration"])
    )


In [None]:
# Fetch both tracking sheets: sediments first, then filters.
df_sediment = get_seq_track_data(kind="SEDIMENTS")
df_filters = get_seq_track_data(kind="FILTERS")

# for col in df_sediment.columns:
#     print(f"column: {col}", df_sediment[col].unique())

In [None]:
# for col in df_filters.columns:
#     print(f"column: {col}", df_filters[col].unique())

## Create a template for the TRACKING table from the `sequencing-crate` repo
- sequence [repo](https://github.com/emo-bon/sequencing-crate/tree/main/shipment)
- the full version will combine emo-bon-validation, sequencing-crate and the metagoflow tracker

In [None]:
# Display the column labels of the filters tracking table.
df_filters.columns

### Merging tables

In [None]:
# min version
# columns from shipment: source_mat_id, scientific_name, ref_code
# this should run through the validation process

# columns from MGflow tracker: ref_code (to merge on), batch_number, seq_run_ro_crate_fname, forward_read_fname, backward_read_fname, run_status,
# version, date_started, who, system_run, output_loc, output_size, notes

# Batch identifiers; each one is inserted into the shipment CSV URL
# (``batch-<id>/run-information-batch-<id>.csv``) by query_batch_shipment_data.
ALL_SHIPMENTS = ["001", "002", "003-0", "003-1", "003-2"]

def _sample_type_from_id(material_id):
    """Classify one source material ID.

    Returns ``"filters"`` when the ID contains ``"_Wa_"`` and ``"sediment"``
    otherwise, with ``"_blank"`` appended when the lower-cased ID contains
    ``"blank"``. A missing (non-string) ID yields ``None``.
    """
    if not isinstance(material_id, str):
        return None
    sample_type = "filters" if "_Wa_" in material_id else "sediment"
    if "blank" in material_id.lower():
        sample_type += "_blank"
    return sample_type


def query_batch_shipment_data(batch_string):
    """Download the run-information CSV of one shipment batch.

    Parameters:
        batch_string: batch identifier as used in the sequencing-crate
            repository paths (e.g. ``"001"``).

    Returns:
        The table read from the CSV with two added columns: ``batch`` (set to
        ``batch_string``) and ``sample_type`` (derived from the source
        material ID, see ``_sample_type_from_id``).

    Raises:
        KeyError: if the CSV has neither a ``source_material_id`` nor an
            ``old_source_mat_id`` column.
    """
    url = f"https://raw.githubusercontent.com/emo-bon/sequencing-crate/refs/heads/main/shipment/batch-{batch_string}/run-information-batch-{batch_string}.csv"
    df = pd.read_csv(url)
    df['batch'] = batch_string

    # The ID column is named differently across batches; prefer the newer
    # name and fall back to the old one.
    if "source_material_id" in df.columns:
        id_column = "source_material_id"
    else:
        id_column = "old_source_mat_id"

    # A single column-wise map replaces the two duplicated row loops. It also
    # creates the sample_type column when the table has no rows.
    df["sample_type"] = df[id_column].map(_sample_type_from_id)
    return df

def query_all_shipment_data():
    """Fetch every batch listed in ``ALL_SHIPMENTS`` and stack them into one table.

    The batches are concatenated in list order and the result gets a fresh
    integer index.
    """
    batch_tables = []
    for batch in ALL_SHIPMENTS:
        batch_tables.append(query_batch_shipment_data(batch))
    return pd.concat(batch_tables, ignore_index=True)

def query_track_data():
    """Return the sediment and filter tracking sheets as one normalised table.

    Both sheets are fetched with ``get_seq_track_data`` and stacked
    (sediments first) with a fresh integer index. Four columns get explicit
    short names; afterwards every column label has its spaces replaced by
    underscores and is lower-cased.
    """
    sheets = [get_seq_track_data(sheet) for sheet in ("SEDIMENTS", "FILTERS")]
    tracking = pd.concat(sheets, ignore_index=True)

    # Columns whose final name is more than a lower-cased, underscored copy
    # of the spreadsheet header.
    explicit_names = {
        "Seq Run RO-Crate Filename": "seq_run_ro_crate_fname",
        "Forward Read Filename": "forward_read_fname",
        "BackwardRead Filename": "backward_read_fname",
        "Output Location": "output_loc",
    }
    tracking = tracking.rename(columns=explicit_names)

    # Generic normalisation for every remaining header.
    tracking.columns = tracking.columns.str.replace(" ", "_").str.lower()
    return tracking

def infer_sample_type(df):
    """Add a ``sample_type`` column derived from the source material IDs.

    A row is ``"filters"`` when its ``old_source_mat_id`` contains ``"_Wa_"``
    and ``"sediment"`` otherwise; ``"_blank"`` is appended when the
    lower-cased ``source_mat_id_orig`` of the same row contains ``"blank"``.

    Parameters:
        df: table with ``old_source_mat_id`` and ``source_mat_id_orig``
            columns. It is modified in place.

    Returns:
        The same ``df`` object, with the ``sample_type`` column set.

    Raises:
        KeyError: if either ID column is missing.
        TypeError: if an ``old_source_mat_id`` value is not a string.
    """
    # NOTE(review): the base type and the blank flag are read from two
    # different ID columns, as in the original code - confirm this is intended.
    labels = []
    for old_id, orig_id in zip(df["old_source_mat_id"], df["source_mat_id_orig"]):
        label = "filters" if "_Wa_" in old_id else "sediment"
        # Check this row's value. `"blank" in <Series>` tests the index
        # labels rather than the values, so it never matched.
        # str() lets a missing ID pass through as a non-blank.
        if "blank" in str(orig_id).lower():
            label += "_blank"
        labels.append(label)

    df["sample_type"] = labels
    return df

def min_merge(df_shipment, df_tracking):
    """Left-join the shipment table with the tracking table on ``ref_code``.

    Spaces are removed from ``ref_code`` on both sides before joining, and
    only the minimal set of columns is kept from each table.

    Parameters:
        df_shipment: table with at least ``ref_code``, ``batch`` and
            ``sample_type`` columns.
        df_tracking: table with ``ref_code`` plus the run-tracking columns
            listed in ``tracking_columns`` below.

    Returns:
        A new DataFrame with one row per shipment row (more if a ``ref_code``
        occurs several times in the tracking table). Neither input is modified.

    Raises:
        KeyError: if a required column is missing from either input.
    """
    shipment_columns = ["ref_code", "batch", "sample_type"]
    tracking_columns = [
        "ref_code", "seq_run_ro_crate_fname", "forward_read_fname",
        "backward_read_fname", "run_status", "version", "date_started",
        "who", "system_run", "output_loc", "output_size",
    ]

    # Work on copies so the ref_code clean-up does not leak into the caller's
    # DataFrames.
    shipment = df_shipment[shipment_columns].copy()
    tracking = df_tracking[tracking_columns].copy()

    # Reference codes are sometimes typed with stray spaces; strip them so
    # both sides match.
    shipment["ref_code"] = shipment["ref_code"].str.replace(" ", "", regex=False)
    tracking["ref_code"] = tracking["ref_code"].str.replace(" ", "", regex=False)

    return shipment.merge(tracking, on="ref_code", how="left")

In [None]:
# Download the shipment tables of all batches and the combined tracking sheets.
df_shipments = query_all_shipment_data()
df_tracking = query_track_data()
# df_shipments.columns

In [None]:
# Left-join the tracking information onto the shipment rows via ref_code.
df_full = min_merge(df_shipments, df_tracking)

In [None]:
# Preview the first rows of the merged table.
df_full.head()

In [None]:
# Save the merged table as CSV (relative path, so it lands in the current
# working directory); the DataFrame index is not written.
df_full.to_csv("min_merged.csv", index=False)