# Run GECCO jobs in Galaxy

1. Upload and run workflow.
2. Monitor the job.
3. Receive completion notification with some basic summary provided by Galaxy.

Note: 
- "Receiving" the results (tentatively download) is part of the analysis pipeline.
- The `.gb`, `.gbk` and `.embl` formats are accepted as input for GECCO.
  - `.embl` files downloaded from ENA actually arrive with a `.txt` extension.
  - NCBI GenBank downloads (full or normal) always come as `.gb`.
  
**IMPORTANT**
You need to have a `.env` file looking like this
```
GALAXY_EARTH_URL="https://earth-system.usegalaxy.eu/"
GALAXY_EARTH_KEY="..."
```
For `binder` users, you cannot upload the `.env` so you will need to hardcode the `API token`.

In [None]:
import os
import sys
import logging
import psutil
from IPython import get_ipython

In [None]:
# NBVAL_SKIP
logger = logging.getLogger(name="GECCO galaxy runner")

# Detect where the notebook runs (Google Colab, Binder, or a local checkout)
# and prepare that environment accordingly.
if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. '
          'https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system() does not raise when the command fails; it returns the exit
    # status, so the status has to be inspected explicitly.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        logger.error(
            f"An error occurred while cloning the repository: exit status {clone_status}"
        )

    # make the cloned repository importable
    sys.path.insert(0, '/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install marine-omics')
    if install_status != 0:
        logger.error(
            f"An error occurred while installing marine-omics: exit status {install_status}"
        )

elif not psutil.users():
    # No logged-in users reported by psutil is used here as the Binder marker.
    logger.info("Binder")

    logger.info('Binder will not allow you to upload the ".env" file')
    os.environ["GALAXY_EARTH_URL"] = "https://earth-system.usegalaxy.eu/"
    #########################
    ### INPUT TOKEN HERE ####
    #########################
    os.environ["GALAXY_EARTH_KEY"] = ""
    # Fails on purpose until the user replaces the empty string above.
    assert os.environ["GALAXY_EARTH_KEY"] != "", "token cannot be an empty string, SET your API key."

else:
    print('Local environment, you should have .env file at root')

In [None]:
from momics.utils import reconfig_logger, init_setup

# Set up logging and run the momics initial setup before the remaining imports.
# NOTE(review): presumably the calls must precede the imports below — confirm
# before reordering.
reconfig_logger()
init_setup()

import bioblend.galaxy as g  # BioBlend is a Python library, wrapping the functionality of Galaxy and CloudMan APIs
import panel as pn

from momics.galaxy import RemGalaxy, Gecco

# Instead of the jupyter magic, python-dotenv can be used directly:
# load_dotenv() reads the `.env` file and exports its entries as environment variables.
from dotenv import load_dotenv
load_dotenv()

## User settings

In [None]:
# User-facing switch. NOTE(review): no visible cell reads DEBUG; presumably it
# is meant to control stdout logging — confirm where it is consumed.
DEBUG = True  # enable stdout logging

## Session setup

In [None]:
# DEBUG is deliberately NOT re-assigned here: it is set in the "User settings"
# cell, and resetting it to True in this cell would silently override the
# user's choice.
reconfig_logger()  # Set up logging

# These are the NAMES of the environment variables holding the Galaxy URL and
# API key. They need to be set in the .env file at the root of the project
# (or directly in os.environ, as the Binder branch of the setup cell does).
exp = RemGalaxy("GALAXY_EARTH_URL", "GALAXY_EARTH_KEY")
gecco_tool_id = "toolshed.g2.bx.psu.edu/repos/althonos/gecco/gecco/0.9.6"  # The id of the tool GECCO

In [None]:
# Environment-dependent paths: on Colab the repository is cloned under
# /content, otherwise the project root is the parent of the working directory.
# TODO: why is this not in the momics package already?
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# static assets live directly under the project root
assets_folder = os.path.join(root_folder, 'assets')

In [None]:
# Load the tabulator extension; on Colab, additionally select the 'colab' comms.
pn.extension("tabulator")
if "google.colab" in str(get_ipython()):
    pn.extension(comms="colab")

# --- action buttons -------------------------------------------------------
but_login = pn.widgets.Button(
    name="🔐 Galaxy Login",
)
but_get_histories = pn.widgets.Button(
    name="📚 Refresh Histories",
)
but_create_history = pn.widgets.Button(
    name="📝 Create New History",
)
but_get_datasets = pn.widgets.Button(
    name="📖 Refresh Datasets",
)

# --- input file handling --------------------------------------------------
but_upload_dataset = pn.widgets.Button(
    name="📤 Upload Dataset to Galaxy",
)

# The intended pattern would be r'(*.fasta|*.gbk|*.embl|*.txt)'; see the
# upstream request at https://github.com/holoviz/panel/issues/7726.
# Until then every file is listed.
file_input = pn.widgets.FileSelector(
    "~",
    file_pattern="*",
)
file_source_checkbox = pn.widgets.Checkbox(
    name="Use a file from the Galaxy",
    value=True,
)

but_submit = pn.widgets.Button(
    name="🚀 Submit GECCO task",
)

# Name for a history to be created (this perhaps can get solved with file selector).
history_name = pn.widgets.TextInput(
    name="New History Name",
    placeholder="Enter a string here...",
)

# --- read-only status fields ----------------------------------------------
current_history_name = pn.widgets.StaticText(
    name="Current History Name",
    value="No history selected",
)
current_history_id = pn.widgets.StaticText(
    name="Current History ID",
    value="",
)

current_file_name = pn.widgets.StaticText(
    name="Current filename for GECCO",
    value="No file selected",
)
current_file_id = pn.widgets.StaticText(
    name="Current file ID for GECCO",
    value="",
)

# --- selectors ------------------------------------------------------------
# Both selectors start without real entries; the option lists are expected to
# be filled in later.
select_history = pn.widgets.Select(
    name="Select History",
    options=[],
    description="Your Galaxy histories, create a new one if needed",
)
select_dataset = pn.widgets.Select(
    name="Select Dataset",
    options=[("", "")],
    description="Your Galaxy fasta datasets",
    value=("", ""),
)


# GECCO parameters: one widget per tool option, collected later into the
# dictionary handed to `Gecco`.
mask = pn.widgets.Checkbox(name='Enable masking of regions with unknown nucleotides', value=False)
pad = pn.widgets.Checkbox(name='Enable padding of gene sequences smaller than the CRF window length', value=True)
# The label used to read 'IntInMinimum number of genes required for a clusterput'
# (the text had been pasted into the middle of the word "IntInput").
cds = pn.widgets.IntInput(name='Minimum number of genes required for a cluster',
                          value=3, step=1, start=2, end=1000,
                          )
threshold = pn.widgets.FloatInput(name='Probability threshold for cluster detection',
                                  value=0.05, step=0.01, start=0.0, end=1.0,
                                  )
postproc = pn.widgets.Select(
    name="Post-processing method for gene cluster validation",
    options=["gecco", "antiSMASH"],
)
gene_filter = pn.widgets.IntInput(
    name='Number of genes from the contig edges to filter out',
    value=0, step=1, start=0, end=100)

# NOTE: the variable name (sic, "antimash") is kept because it is also used as
# a key of the parameter dictionary passed to `Gecco`.
antimash_sideload = pn.widgets.Checkbox(name='Generate an antiSMASH v6 sideload JSON file', value=False)

In [None]:
# Every widget the Gecco helper works with, keyed by the widget's variable name.
gecco_params = dict(
    # history selection and status
    select_history=select_history,
    current_history_name=current_history_name,
    current_history_id=current_history_id,
    # dataset selection and status
    select_dataset=select_dataset,
    current_file_name=current_file_name,
    current_file_id=current_file_id,
    # local file upload
    file_source_checkbox=file_source_checkbox,
    file_input=file_input,
    # GECCO tool options
    mask=mask,
    pad=pad,
    cds=cds,
    threshold=threshold,
    postproc=postproc,
    gene_filter=gene_filter,
    antimash_sideload=antimash_sideload,
    # name for a newly created history
    history_name=history_name,
)

gecco = Gecco(gecco_params)

In [None]:
@pn.depends(file_source=file_source_checkbox, watch=True)
def _generate_data_input_flexbox(file_source):
    """Build the data-input panel matching the chosen file source.

    Args:
        file_source: Truthy to pick a dataset that already lives in a Galaxy
            history, falsy to select and upload a local file.

    Returns:
        A ``pn.Column``: the history/dataset controls when ``file_source`` is
        truthy, otherwise the local file selector with the upload button.

    Side effects:
        In the Galaxy branch the history controls are also stored in the
        module-level ``history_flexbox``.
    """
    global history_flexbox

    # Local-file branch: nothing but the file selector and the upload button.
    if not file_source:
        return pn.Column(
            "GECCO accepts `fasta`, GenBank (`.gbk`, `gb`) and EMBL ('.embl'). In principle all of those could be `.txt` files too.",
            file_input,
            but_upload_dataset,
        )

    # Galaxy branch: history controls first, then the dataset picker.
    history_flexbox = pn.Column(
        pn.Row(),
        pn.Row(but_get_histories, but_create_history),
        history_name,
        select_history,
        pn.bind(gecco.handle_get_histories, clicks=but_get_histories.param.clicks),
        pn.bind(gecco.handle_create_history, clicks=but_create_history.param.clicks),
        sizing_mode="stretch_width",
    )
    return pn.Column(
        "Select a history to use a file from it.",
        history_flexbox,
        current_history_name,
        current_history_id,
        pn.Row(but_get_datasets, select_dataset),
    )


In [None]:
# Wire the widgets to their Gecco handlers. `watch=True` makes each handler run
# whenever the bound widget value / click counter changes.

# history management
pn.bind(gecco.handle_get_histories, clicks=but_get_histories.param.clicks, watch=True)
pn.bind(gecco.handle_create_history, clicks=but_create_history.param.clicks, watch=True)

# keep the status fields in sync with the selectors
pn.bind(gecco.handle_update_current_file_name, select_dataset, watch=True)
pn.bind(gecco.handle_update_current_history_name, select_history, watch=True)
pn.bind(gecco.handle_update_current_history_id, select_history, watch=True)

# job submission and dataset handling
pn.bind(gecco.handle_submit_gecco, but_submit, watch=True)
pn.bind(gecco.handle_get_datasets, clicks=but_get_datasets.param.clicks, watch=True)
# (a stray trailing comma used to turn this last statement into a 1-tuple)
pn.bind(gecco.handle_upload_dataset, clicks=but_upload_dataset.param.clicks, watch=True)

In [None]:
## data source: show the checkbox that chooses between a file already in Galaxy
## (checked) and a local file to upload (unchecked)
file_source_checkbox

In [None]:
# Build and display the data-input panel for the checkbox's current value.
# NOTE(review): this passes a snapshot of `.value`, so the displayed panel
# presumably does not switch when the checkbox is toggled; re-run the cell
# after changing it — confirm.
_generate_data_input_flexbox(file_source_checkbox.value)

## Gecco parameters

In [None]:
# Parameter panel: the GECCO option widgets, a divider, then the submission
# summary with the submit button.
gecco_flexbox = pn.FlexBox(
    pn.Column(
        # GECCO tool options
        mask,
        pad,
        cds,
        threshold,
        postproc,
        gene_filter,
        antimash_sideload,
        pn.layout.Divider(),
        # what will be submitted, and the trigger
        "## Submit",
        current_file_name,
        current_history_name,
        but_submit,
    ),
    sizing_mode="stretch_both",
)

# display the panel as the cell output
gecco_flexbox