# App for running GECCO jobs in Galaxy

1. Upload and run workflow.
2. Monitor the job.
3. Receive completion notification with some basic summary provided by Galaxy.

Note:
- "Receiving" the results (tentatively, downloading them) is part of the analysis pipeline.
- The `.gb`, `.gbk` and `.embl` formats are allowed as input for GECCO.
  - `.embl` files from ENA actually arrive as `.txt` upon download.
  - NCBI GenBank downloads (full or normal) always come as `.gb`.
  
**IMPORTANT**
You need to have a `.env` file looking like this (the variable names must match the ones the notebook reads)
```
GALAXY_EARTH_URL="https://earth-system.usegalaxy.eu/"
GALAXY_EARTH_KEY="..."
```
For `binder` users, you cannot upload the `.env` file, so you will need to hardcode the `API token` in the first code cell.

## TODOs

- save the json; from the analysis dashboard, update the json (this will be heavily environment-dependent)
  - note for which environments it works (e.g. a GColab environment gets deleted, etc.).
  - So far tested: `linux local`
    - to test: `win local`, `gcolab`

In [1]:
import os
import sys
import logging
from IPython import get_ipython

# Logger shared by the cells of this notebook.
logger = logging.getLogger(name="GECCO galaxy runner")

# Make the local `utils` module importable; the location depends on where the
# notebook is running (Google Colab, a Jupyter/Binder kernel, or anything else).
if 'google.colab' in str(get_ipython()):
    # clone the momics-demos repository to use the utils module from there
    # TODO: eventually utils from momics will be used for that
    try:
        # os.system signals a failed command through its return value rather
        # than by raising, so the exit status has to be checked explicitly.
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.warning(f"An error occurred while cloning the repository: {e}")
    else:
        if clone_status == 0:
            logger.info("Repository cloned")
        else:
            logger.warning(
                f"An error occurred while cloning the repository: exit status {clone_status}"
            )

    sys.path.insert(0,'/content/momics-demos')
elif "zmqshell" in str(get_ipython()):
    # NOTE(review): "zmqshell" presumably matches every ipykernel-based session,
    # not only Binder -- confirm whether local Jupyter should end up here.
    logger.info("Binder")
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

    logger.info('Binder will not allow you to upload the ".env" file')
    os.environ["GALAXY_EARTH_URL"] = "https://earth-system.usegalaxy.eu/"
    #########################
    ### INPUT TOKEN HERE ####
    #########################
    galaxy_api_token = None  # replace None with your Galaxy API token as a string
    # os.environ only accepts str values, so validate *before* assigning;
    # otherwise the assignment itself fails with an unhelpful message.
    if not isinstance(galaxy_api_token, str):
        raise TypeError("token must be a string")
    os.environ["GALAXY_EARTH_KEY"] = galaxy_api_token

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # local utils, to be removed in the future

    # downside of this is that all the deps need to be installed in the current (momics-demos) environment
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

# `utils` only becomes importable after the sys.path changes above.
from utils import init_setup, get_notebook_environment
init_setup()

# Determine the notebook environment
env = get_notebook_environment()
logger.info(f"Environment: {env}")

Binder


Cloning into 'marine-omics'...


Repository marine-omics cloned
Collecting git+https://github.com/palec87/marine-omics.git
  Cloning https://github.com/palec87/marine-omics.git to /tmp/pip-req-build-c0m9qw_h


  Running command git clone --filter=blob:none --quiet https://github.com/palec87/marine-omics.git /tmp/pip-req-build-c0m9qw_h


  Resolved https://github.com/palec87/marine-omics.git to commit 5cd67a5082c4336018cd1e8aac1fe2ca45077d71
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: momics
  Building wheel for momics (pyproject.toml): started
  Building wheel for momics (pyproject.toml): finished with status 'done'
  Created wheel for momics: filename=momics-0.0.1-py3-none-any.whl size=27408 sha256=40227ff92ff320a664f7e54731c6aa850cbbf97a624b98af37d849aeaa440bdc
  Stored in directory: /tmp/pip-ephem-wheel-cache-sax2txdo/wheels/6a/2d/d9/3abe84bc0a11258798bbdfbbf5df57841c73165fd80e3fc016
Successfully built momics
Installing collected packages: momics
Successfully installed momics-0.0.1
momics instal

In [None]:
# The path setup and the imports below repeat what the first cell already did;
# without the repetition the Panel dashboard does not work (reason unknown).
# TODO: report as possible bug
import sys
import os

import bioblend.galaxy as g  # BioBlend is a Python library, wrapping the functionality of Galaxy and CloudMan APIs
import panel as pn

# The parent directory must be on sys.path before `utils` can be imported.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import init_setup
init_setup()

from bioblend.galaxy.datasets import DatasetClient

from momics.galaxy import RemGalaxy, Gecco
from momics.panel_utils import (
    serve_app, close_server,
)
from momics.utils import memory_load, reconfig_logger

# instead of the jupyter magic, you can also use python-dotenv to load the
# variables defined in the `.env` file
from dotenv import load_dotenv
load_dotenv()

## Session setup

In [None]:
DEBUG = True  # enable stdout logging
reconfig_logger()  # Set up logging

# RemGalaxy is given the *names* of the environment variables, not their values.
# These variables need to be set in the .env file at the root of the project
# (in the Binder branch of the first cell they are written to os.environ instead).
exp = RemGalaxy("GALAXY_EARTH_URL", "GALAXY_EARTH_KEY")
gecco_tool_id = "toolshed.g2.bx.psu.edu/repos/althonos/gecco/gecco/0.9.6"  # The id of the tool GECCO

In [None]:
# Environment-dependent paths: on Google Colab the repository lives in
# /content/momics-demos, everywhere else the project root is the parent directory.
# TODO: why is this not in the momics package already?
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# static files (e.g. the logo shown in the dashboard) are kept under <root>/assets
assets_folder = os.path.join(root_folder, 'assets')

## App setup

TODO: do I really need to duplicate the `current_history`, `current_file` name and id? I could handle the tuples in the "backend"

In [None]:
# buttons for the Galaxy session, histories and datasets
but_login = pn.widgets.Button(name="🔐 Galaxy Login")
but_get_histories = pn.widgets.Button(name="📚 Refresh Histories")
but_create_history = pn.widgets.Button(name="📝 Create New History")
but_get_datasets = pn.widgets.Button(name="📖 Refresh Datasets")

# input file handling
but_upload_dataset = pn.widgets.Button(name="📤 Upload Dataset to Galaxy")

# All files are listed ("*") for now; a combined pattern such as
# r'(*.fasta|*.gbk|*.embl|*.txt)' was suggested upstream, see
# https://github.com/holoviz/panel/issues/7726
file_input = pn.widgets.FileSelector('~', file_pattern="*")
# checked: pick a dataset already on Galaxy; unchecked: upload a local file
# (see _generate_data_input_flexbox)
file_source_checkbox = pn.widgets.Checkbox(name='Use a file from the Galaxy', value=True)

but_submit = pn.widgets.Button(name="🚀 Submit GECCO task")

# this perhaps can get solved with file selector
history_name = pn.widgets.TextInput(name='New History Name', placeholder='Enter a string here...')

# read-only displays of the currently selected history
current_history_name = pn.widgets.StaticText(name='Current History Name', value='No history selected')
current_history_id = pn.widgets.StaticText(name='Current History ID', value='')

# read-only displays of the file that will be passed to GECCO
current_file_name = pn.widgets.StaticText(name='Current filename for GECCO', value='No file selected')
current_file_id = pn.widgets.StaticText(name='Current file ID for GECCO', value='')

# selectors; both start without real entries
select_history = pn.widgets.Select(
    name="Select History",
    options=[],
    description="Your Galaxy histories, create a new one if needed",
)
select_dataset = pn.widgets.Select(
    name="Select Dataset",
    options=[("", "")],
    description="Your Galaxy fasta datasets",
    value=("", ""),
)

# Intro text shown in the "Introduction" tab of the dashboard.
# The output list names one file per GECCO result table (genes, features, clusters).
intro_text = pn.pane.Markdown(
    """
    # GECCO Galaxy Runner

    GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs). It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.

    ## Instructions

    1. Log in to your Galaxy instance.
    2. Select a history or create a new one.
    3. Select or upload a fasta/GenBank/EMBL file to the selected history.
    4. Submit the GECCO task.
    5. Continue with the post-processing of the results in the analysis NB.

    ## Output

    GECCO will create the following files once done (using the same prefix as the input file):

    - genes.tsv: The genes file, containing the genes identified in the input sequences.
    - features.tsv: The features file, containing the protein domains identified in the input sequences.
    - clusters.tsv: A clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type.
    - {sequence}_cluster_{N}.gbk: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.

    ## References
    [link](https://doi.org/10.1101/2021.05.03.442509) Carroll, L. M., Larralde, M., Fleck, J. S., Ponnudurai, R., Milanese, A., Cappio, E., & Zeller, G. (2021). Accurate de novo identification of biosynthetic gene clusters with GECCO.
    [GECCO tool page](https://usegalaxy.eu/?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Falthonos%2Fgecco%2Fgecco%2F0.9.6)
    """
)

# GECCO parameter widgets; they are handed to the Gecco helper via gecco_params
# and laid out on the "GECCO Parameters" tab.
mask = pn.widgets.Checkbox(name='Enable masking of regions with unknown nucleotides', value=False)
pad = pn.widgets.Checkbox(name='Enable padding of gene sequences smaller than the CRF window length', value=True)
# label repaired: it used to read 'IntInMinimum ... clusterput' (text pasted into "IntInput")
cds = pn.widgets.IntInput(name='Minimum number of genes required for a cluster',
                          value=3, step=1, start=2, end=1000,
                          )
threshold = pn.widgets.FloatInput(name='Probability threshold for cluster detection',
                                  value=0.05, step=0.01, start=0.0, end=1.0,
                                  )
postproc = pn.widgets.Select(
    name="Post-processing method for gene cluster validation",
    options=["gecco", "antiSMASH"],
)
gene_filter = pn.widgets.IntInput(
    name='Number of genes from the contig edges to filter out',
    value=0, step=1, start=0, end=100)

# NOTE: the variable name (sic, "antimash") is kept because it is also used as a key in gecco_params
antimash_sideload = pn.widgets.Checkbox(name='Generate an antiSMASH v6 sideload JSON file', value=False)

# this is currently not Implemented for Galaxy tools
# monitor here, https://github.com/galaxyproject/galaxy/issues/1364
# email_input = pn.widgets.TextInput(name='Email notification', placeholder='Enter a string here...')

In [None]:
# Every widget the Gecco helper receives, keyed by name.
gecco_params = {
    # history selection and its read-only displays
    "select_history": select_history,
    "current_history_name": current_history_name,
    "current_history_id": current_history_id,
    # dataset selection and its read-only displays
    "select_dataset": select_dataset,
    "current_file_name": current_file_name,
    "current_file_id": current_file_id,
    # local-file upload vs. file already on Galaxy
    "file_source_checkbox": file_source_checkbox,
    "file_input": file_input,
    # GECCO tool parameters
    "mask": mask,
    "pad": pad,
    "cds": cds,
    "threshold": threshold,
    "postproc": postproc,
    "gene_filter": gene_filter,
    "antimash_sideload": antimash_sideload,
    # name for a newly created history
    "history_name": history_name,
}

# helper whose handle_* methods are bound to the widgets in app()
gecco = Gecco(gecco_params)

In [None]:
@pn.depends(file_source=file_source_checkbox, watch=True)
def _generate_data_input_flexbox(file_source):
    """Build the content of the 'File Upload' tab.

    Args:
        file_source: current value of ``file_source_checkbox``; truthy means
            an existing Galaxy dataset is used, falsy means a local file is
            uploaded.

    Returns:
        A ``pn.Column`` with either the dataset refresh button and selector,
        or a format hint, the local file selector and the upload button.
    """
    # NOTE(review): watch=True presumably also invokes this function as a plain
    # callback on every checkbox change, in addition to the rendering call -- confirm.
    if not file_source:
        # local upload: hint about accepted formats, file browser, upload button
        return pn.Column(
            "GECCO accepts `fasta`, GenBank (`.gbk`, `gb`) and EMBL ('.embl'). In principle all of those could be `.txt` files too.",
            file_input,
            but_upload_dataset,
        )

    # dataset already on Galaxy: refresh button plus selector
    return pn.Column(but_get_datasets, select_dataset)

In [None]:
# Panel extension setup; the colab comms are only enabled on Google Colab.
pn.extension("tabulator")
if 'google.colab' in str(get_ipython()):
    pn.extension(comms='colab')
pn.extension(notifications=True)
ACCENT = "teal"  # accent colour handed to the template in app()

# CSS styles applied to the tabs container in app()
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# GECCO logo, placed at the top of the sidebar in app()
image = pn.pane.JPG(os.path.join(assets_folder, "figs/logo_gecco.jpeg"),
                    width=200,
                    )

def app():
    """Assemble the GECCO runner dashboard.

    Wires the module-level widgets to the ``gecco`` handler methods and lays
    them out in a sidebar (login, histories, datasets) and a tabbed main area
    (introduction, file upload, GECCO parameters).

    Returns:
        The configured ``pn.template.FastListTemplate``.
    """
    # --- sidebar: history management -------------------------------------
    refresh_histories = pn.bind(gecco.handle_get_histories, clicks=but_get_histories.param.clicks)
    create_history = pn.bind(gecco.handle_create_history, clicks=but_create_history.param.clicks)
    history_panel = pn.Column(
        pn.Row(but_get_histories, but_create_history),
        history_name,
        select_history,
        refresh_histories,
        create_history,
        sizing_mode="stretch_width",
    )

    # --- main area: GECCO parameters and the submit section ---------------
    parameter_column = pn.Column(
        mask, pad, cds, threshold, postproc, gene_filter, antimash_sideload,
        pn.layout.Divider(),
        "## Submit", current_file_name, current_history_name, but_submit,
    )
    parameter_panel = pn.FlexBox(parameter_column, sizing_mode="stretch_both")

    main_tabs = pn.Tabs(
        ("Introduction", intro_text),
        ('File Upload', _generate_data_input_flexbox),
        ('GECCO Parameters', parameter_panel),
        styles=styles, sizing_mode="stretch_both",
    )

    # --- watchers: run the handler whenever the bound widget changes -------
    watched_handlers = (
        (gecco.handle_update_current_file_name, select_dataset),
        (gecco.handle_update_current_history_name, select_history),
        (gecco.handle_update_current_history_id, select_history),
        (gecco.handle_submit_gecco, but_submit),
    )
    for handler, widget in watched_handlers:
        pn.bind(handler, widget, watch=True)

    # --- sidebar content ---------------------------------------------------
    sidebar_items = [
        image,
        but_login, pn.bind(gecco.handle_login, clicks=but_login.param.clicks),
        pn.layout.Divider(margin=(-20, 0, 0, 0)),
        "## Histories", history_panel,
        "## Datasets", file_source_checkbox,
        # this callback cannot be moved, do I need it with clicks?
        pn.bind(gecco.handle_get_datasets, clicks=but_get_datasets.param.clicks),
        pn.bind(gecco.handle_upload_dataset, clicks=but_upload_dataset.param.clicks),
    ]

    return pn.template.FastListTemplate(
        title="Run GECCO on Galaxy",
        sidebar=sidebar_items,
        main=[main_tabs],
        main_layout=None,
        accent=ACCENT,
    )


# build the dashboard once
template = app()
logger.info("Template created")

# serve the app: everywhere except Google Colab the template is simply marked
# as servable; on Colab it goes through serve_app and the returned handle is
# kept in `s` so the server can be closed later.
if 'google.colab' not in str(get_ipython()):
    template.servable()
else:
    s = serve_app(template, env=env, name="GECCO_galaxy_runner")

### Uncomment this if you are running an ngrok tunnel which you want to quit

In [None]:
# only use for the ngrok tunnel in GColab
# close_server(s, env=env)