# Query data based on RO-Crates

1. This specifically looks at the intermediate steps of the metaGOflow pipeline.
2. `fastp` outputs
3. ...

**Steps:** (for each metaGOflow step)
1. Access the ro-crate metadata file and extract the needed data sources
2. Get the data
3. Visualize

In [None]:
import sys
import os
import io
import gc
import logging
import psutil

from IPython import get_ipython
# Dedicated logger for the messages emitted by this notebook.
logger = logging.getLogger(name="Quality Control app")
# Default number of permutations; lowered in the Binder branch below.
NUMBER_PERMUTATIONS = 999

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system() does not raise when the command fails; it returns the
    # command's status, so success has to be decided from that value.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        logger.warning(
            "Cloning the repository failed (git returned status %s)",
            clone_status,
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    os.system('pip install momics@git+https://github.com/emo-bon/marine-omics-methods.git@main')

elif not psutil.users():
    # No logged-in users reported by psutil: treated as a Binder session.
    logger.info("Binder")
    NUMBER_PERMUTATIONS = 29  # permanova extremely slow on binder, therefore a change here
else:
    logger.info("Local")

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging through the momics helper (imported from momics.utils above).
reconfig_logger()

# Determine the notebook environment; the value is reused later when the
# app is served.
env = get_notebook_environment()

# NOTE(review): init_setup() is an opaque momics helper; what it configures
# is not visible from this notebook -- confirm before reordering these calls.
init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Diversity analysis app | Environment: vscode
INFO | Diversity analysis app | Environment: vscode


## Imports

In [None]:
# These imports need to be repeated here for the Panel dashboard to work, WEIRD
# TODO: report as possible bug
import sys
import os
import io
import requests
import html

from functools import partial
import panel as pn
from dotenv import load_dotenv
load_dotenv()

# All low level functions are imported from the momics package
from momics.loader import get_rocrate_metadata_gh
from momics.panel_utils import (
    create_indicators_diversity,
    serve_app,
    close_server,
)

### Data

In [None]:
# Repository root: the cloned checkout on Google Colab, otherwise the parent
# of the current working directory.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Static assets (e.g. the logo) are stored under <root>/assets.
assets_folder = os.path.join(root_folder, 'assets')

### Trying new public access

In [4]:
def get_rocrate_metadata(sample_id, timeout=30):
    """
    Get the metadata from the ro-crate

    Requests ``<sample_id>-ro-crate/ro-crate-metadata.json`` from the
    ``emo-bon/analysis-results-cluster-01-crate`` repository through the
    GitHub contents API, asking for the raw file.

    Args:
        sample_id: Archive identifier used to build the ro-crate folder name.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the dashboard indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = (
        "https://api.github.com/repos/emo-bon/analysis-results-cluster-01-crate/"
        f"contents/{sample_id}-ro-crate/ro-crate-metadata.json"
    )
    req = requests.get(
        url,
        headers={
            "accept": "application/vnd.github.v3.raw",
        },
        timeout=timeout,
    )
    print("ro-crate-metadata.json request status", req.status_code)
    # Fail here on 4xx/5xx instead of handing an error payload to the callers,
    # which would otherwise break later on a missing '@graph' key.
    req.raise_for_status()
    return req.json()


## Methods

- I would rather not write a custom parser for this HTML report
  - TODO: check whether an existing parser for `fastp` output is available

In [7]:

def extract_data_by_name(metadata, name='FASTP analysis of raw sequence data'):
    """
    Return the first entry of the ro-crate graph whose 'name' equals *name*.

    Args:
        metadata: Parsed ro-crate metadata; its '@graph' key is iterated.
        name: Exact value of the 'name' field to look for.

    Returns:
        The first matching graph entry (entries without a 'name' key are
        ignored).

    Raises:
        LookupError: If no entry in the graph carries that name.
    """
    for entry in metadata['@graph']:
        if 'name' in entry and entry['name'] == name:
            return entry
    # Previously this case fell through to an unbound local variable; report
    # the missing name explicitly so the caller knows what was not found.
    raise LookupError(f"No entry named {name!r} found in the ro-crate metadata")


def extract_all_datafiles(metadata):
    """
    Collect a summary of every named File entity in the ro-crate graph.

    Args:
        metadata: Parsed ro-crate metadata; its '@graph' key is iterated.

    Returns:
        A list of dicts with the keys 'name', 'sizeMB' and 'downloadUrl'.
        'sizeMB' is 'contentSize' divided by 1e6 and truncated to an int, or
        'unknown' when the size is missing or not numeric. 'downloadUrl' is
        'unknown' when the entity has none.
    """
    datafiles = []
    for entity in metadata['@graph']:
        if 'name' not in entity:
            continue

        # '@type' may be absent, a single string, or an array of types.
        entity_type = entity.get('@type')
        is_file = entity_type == 'File' or (
            isinstance(entity_type, (list, tuple)) and 'File' in entity_type
        )
        if not is_file:
            continue

        try:
            # in MB
            size_mb = int(int(entity['contentSize']) / 1e6)
        except (KeyError, TypeError, ValueError):
            # Missing or malformed size must not abort the whole listing.
            size_mb = 'unknown'

        datafiles.append({
            'name': entity['name'],
            'sizeMB': size_mb,
            'downloadUrl': entity.get('downloadUrl', 'unknown'),
        })

    return datafiles

## Dropdown and tabs

In [None]:
# Sample archives offered in the dropdown.
archives = [
    "EMOBON_MBAL4_Wa_2",
    "EMOBON_NRMCB_So_1",
    "EMOBON_NRMCB_So_7",
    "EMOBON_OSD74_Wa_2",
    "EMOBON_PiEGetxo_Wa_4",
    "EMOBON_RFormosa_Wa_6",
    "EMOBON_ROSKOGO_So_1",
    "EMOBON_VB_Wa_93",
]

# Text every pane displays until an archive has been loaded into it.
_PLACEHOLDER_TEXT = """No data loaded yet. Please select an archive from the dropdown menu"""

# Dropdown driving the whole dashboard; the first archive is preselected.
select_archive = pn.widgets.Select(
    name="Archive",
    options=archives,
    value=archives[0],
    description="Select an archive for inspection",
)

# One pane per dashboard tab, all starting with the placeholder text.
fastap_tab = pn.pane.HTML(_PLACEHOLDER_TEXT, sizing_mode="stretch_both")
metagoflow_tab1 = pn.pane.Str(_PLACEHOLDER_TEXT, sizing_mode="stretch_both")
metagoflow_tab2 = pn.pane.Str(_PLACEHOLDER_TEXT, sizing_mode="stretch_both")
reads_qc_tab = pn.pane.Markdown(_PLACEHOLDER_TEXT, hard_line_break=True)
functional_tab = pn.pane.Str(_PLACEHOLDER_TEXT)
krona1 = pn.pane.HTML(_PLACEHOLDER_TEXT)
krona2 = pn.pane.HTML(_PLACEHOLDER_TEXT)

### Bindings

In [None]:
# Load the Panel extensions requested by this notebook: 'mathjax' and 'plotly'.
pn.extension('mathjax', 'plotly')

def _fetch_text(metadata, name, timeout=30):
    """Download the file registered under *name* in the ro-crate and return it as UTF-8 text."""
    entry = extract_data_by_name(metadata, name=name)
    response = requests.get(entry['downloadUrl'], timeout=timeout)
    # Do not display an HTTP error page as if it were pipeline output.
    response.raise_for_status()
    return response.content.decode('utf-8')


def _as_iframe(document, style):
    """Wrap an HTML document in an iframe, passing the escaped markup through ``srcdoc``."""
    return f'<iframe srcdoc="{html.escape(document)}" style="{style}"></iframe>'


def redraw_tabs(archive):
    """
    Redraw the tabs with the selected archive

    Loads the ro-crate metadata of *archive*, downloads each output file the
    dashboard shows and assigns the content to the matching pane.

    Args:
        archive: Archive identifier from the dropdown. With ``None`` every
            pane is reset to the placeholder text and nothing is downloaded.

    Raises:
        requests.HTTPError: If one of the file downloads returns an error
            status.
    """
    if archive is None:
        placeholder = "No data loaded yet. Please select an archive from the dropdown menu"
        # Reset every pane so no tab keeps showing a previous archive's data.
        for pane in (
            fastap_tab,
            metagoflow_tab1,
            metagoflow_tab2,
            reads_qc_tab,
            functional_tab,
            krona1,
            krona2,
        ):
            pane.object = placeholder
        return

    # Get the metadata from the ro-crate
    metadata = get_rocrate_metadata(archive)

    # fastp HTML report, embedded as an iframe.
    fastap_tab.object = _as_iframe(
        _fetch_text(metadata, 'FASTP analysis of raw sequence data'),
        "height:100%; width:900px;",
    )

    # The two metaGOflow YAML files are shown verbatim.
    metagoflow_tab1.object = _fetch_text(metadata, 'MetaGOflow YAML configuration file')
    metagoflow_tab2.object = _fetch_text(metadata, 'MetaGOflow configuration in YAML')

    # Read QC summaries, combined into one markdown document.
    reverse_qc = _fetch_text(metadata, 'Trimmed reverse reads QC summary')
    forward_qc = _fetch_text(metadata, 'Trimmed forward reads QC summary')
    merged_qc = _fetch_text(metadata, 'QC summary of merged reads')
    reads_qc_tab.object = (
        """# Trimmed reads\n## Forward reads\n""" + forward_qc
        + """\n## Reverse reads\n""" + reverse_qc
        + """\n# Merged reads\n""" + merged_qc
    )

    # Functional summaries, concatenated in this fixed order.
    # NOTE(review): 'Geno Ontology' is kept verbatim because it has to match
    # the entry name in the ro-crate metadata -- confirm the spelling there.
    functional_names = (
        'Geno Ontology summary statistics',
        'InterProScan summary statistics',
        'Kegg Ontology summary statistics',
        'ORF summary statistics',
        "Numbers of RNA's counted",
        'Merged contigs KO summary',
    )
    functional_tab.object = concat_to_markdown(
        [_fetch_text(metadata, entry_name) for entry_name in functional_names]
    )

    # Krona charts are HTML documents, embedded as iframes.
    krona_style = "height:700px; width:1000px;"
    krona1.object = _as_iframe(
        _fetch_text(metadata, 'Krona summary of LSU taxonomic inventory'),
        krona_style,
    )
    krona2.object = _as_iframe(
        _fetch_text(metadata, 'Krona summary of SSU taxonomic inventory'),
        krona_style,
    )


def concat_to_markdown(data: list) -> str:
    """
    Join the items into a single string for display.

    Each item is rendered as text followed by a newline, and the rendered
    items are separated by one more newline, so consecutive items end up
    divided by a blank line.
    """
    rendered_items = map("{}\n".format, data)
    return "\n".join(rendered_items)

# Tie redraw_tabs to the dropdown: the widget supplies the `archive` argument.
# NOTE(review): watch=True presumably re-runs the function on every selection
# change -- confirm against the Panel documentation.
pn.bind(redraw_tabs, archive=select_archive, watch=True)




<function param.reactive.bind.<locals>.wrapped(*wargs, **wkwargs)>

In [None]:
# Fill the tabs once for the archive currently selected in the dropdown.
redraw_tabs(select_archive.value)

# Query data based on RO-Crates

1. This specifically looks at the intermediate steps of the metaGOflow pipeline.
2. `fastp` outputs
3. ...

**Steps:** (for each metaGOflow step)
1. Access the ro-crate metadata file and extract the needed data sources
2. Get the data
3. Visualize

## APP setup

In [None]:
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    # Colab needs its own comms setting.
    pn.extension(comms='colab')

# Accent colour handed to the template.
ACCENT = "teal"

# CSS applied to the tab container.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# TODO: a bug in the panel library prevents opening png files (renaming the
# file does not help), hence the JPG logo.
logo_path = os.path.join(assets_folder, "figs/metaGOflow_logo_italics.jpg")
image = pn.pane.JPG(logo_path, width=200, height=100)

# Both Krona charts share a single tab, stacked under their headings.
krona_column = pn.Column(
    "# Krona from LSU",
    krona1,
    "# Krona from SSU",
    krona2,
)

# (title, content) pairs in display order.
tab_items = [
    ('Fastap QC', fastap_tab),
    ('Reads QC', reads_qc_tab),
    ('Krona', krona_column),
    ('Functional', functional_tab),
    ('MGF yml 1', metagoflow_tab1),
    ('MGF yml 2', metagoflow_tab2),
]
tabs = pn.Tabs(*tab_items, styles=styles, margin=10)

# Only the second returned indicator is used by this dashboard.
_, indicator_usage = create_indicators_diversity()

def update_used_gb(event):
    """
    Refresh the usage indicator with the current memory load.

    Does nothing when *event* is falsy. Otherwise the first value returned
    by ``memory_load()`` is written to ``indicator_usage.value``; the second
    value is not used.
    """
    if event:
        current_used_gb, _total_gb = memory_load()
        indicator_usage.value = current_used_gb


def app():
    """
    Assemble the Quality Control dashboard and return its template.

    Registers a periodic callback (period=1000) that calls
    ``update_used_gb`` with the usage indicator, links a toggle to that
    callback's ``running`` state, and lays out the sidebar and the tabs in a
    ``FastListTemplate``.
    """
    memory_poll = pn.state.add_periodic_callback(
        partial(update_used_gb, indicator_usage),
        period=1000,
        timeout=None,
    )

    # The toggle's value is linked in both directions to the callback's
    # `running` attribute.
    callback_switch = pn.widgets.Toggle(
        name='Toggle callback',
        value=True,
        button_type='success',
    )
    callback_switch.link(memory_poll, bidirectional=True, value='running')

    sidebar_items = [
        image,
        "# Archive",
        select_archive,
        pn.layout.Divider(),
        indicator_usage,
        callback_switch,
    ]
    return pn.template.FastListTemplate(
        title="Quality Control",
        sidebar=sidebar_items,
        main=[pn.Column(tabs)],
        main_layout=None,
        accent=ACCENT,
    )

# Build the dashboard once.
template = app()

if 'google.colab' not in str(get_ipython()):
    # Outside Colab the template is marked as servable.
    template.servable()
else:
    # On Colab the app is served through the momics helper; the handle is
    # kept so the server can be closed later.
    s = serve_app(template, env=env, name="quality_control")

### Uncomment this if you are running an ngrok tunnel that you want to shut down

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)