# Interacting with Galaxy through the API to run the tool GECCO to identify putative novel Biosynthetic Gene Clusters (BGCs)

**Information about GECCO:** https://github.com/zellerlab/GECCO <br>

**Information about Galaxy** <br>
Training: https://training.galaxyproject.org/ <br>
Galaxy for Earth System and Environment: https://earth-system.usegalaxy.eu/ (DP: this one works)<br>
European Galaxy server: https://usegalaxy.eu/ (DP: I did not manage to run GECCO there as of 25-01-21)<br>

**Questions:**
How should I store the job and file `IDs` that I need to query later?
  - Should be compatible for running locally and on the BC VRE
  - For now use `.json`
  - This file is created upon submission, but needs to be updated after the job is done. How can this be done if the user logs out?
  - The analysis NB should be the one querying the results `IDs`

<h3> Installing and importing required modules </h3>

In [None]:
import os
import sys
import json
import logging
logger = logging.getLogger(name="GECCO galaxy runner")

if 'google.colab' in str(get_ipython()):
    # Running on Google Colab: clone the momics-demos repository so that its
    # utils module can be imported from there.
    # TODO: eventually utils from momics will be used for that
    try:
        # os.system() reports a failed command through its return value, not
        # by raising, so the exit status has to be checked explicitly.
        exit_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.error(f"An error occurred while cloning the repository: {e}")
    else:
        if exit_status == 0:
            logger.info("Repository cloned")
        else:
            # Not fatal here: the import of utils further down will fail
            # loudly if the checkout is really missing.
            logger.warning(
                f"Cloning the repository failed (exit status {exit_status})"
            )

    # Make the cloned checkout importable ahead of anything else on the path
    sys.path.insert(0, '/content/momics-demos')

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # local utils, to be removed in the future

    # downside of this is that all the deps need to be installed in the current (momics-demos) environment
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

from utils import init_setup, get_notebook_environment
init_setup()

# Placeholder label for the notebook environment.
# NOTE(review): this variable is not read anywhere else in the visible notebook — confirm whether it is still needed.
notebook_environment = 'unknown'
# Ask the helper imported from utils which environment the notebook runs in, and log the answer
env = get_notebook_environment()
logger.info(f"Environment: {env}")

In [None]:
import os
import sys
import json
from datetime import datetime
from platform import python_version
import logging

# Import
import bioblend.galaxy as g  # BioBlend is a Python library, wrapping the functionality of Galaxy and CloudMan APIs
# import boto3
import pandas as pd
from bioblend.galaxy import GalaxyInstance
from bioblend.galaxy.datasets import DatasetClient

from momics.galaxy.blue_cloud import BCGalaxy
# instead of the jupyter magic, you can also use
from dotenv import load_dotenv
load_dotenv()

## Galaxy setup

### How to create a galaxy API key?

Example code for creating a user and retrieving an API key is [here](https://github.com/galaxyproject/bioblend/blob/main/docs/examples/create_user_get_api_key.py). *If you already have a Galaxy login*, go to User (top right) -> Preferences -> Manage API Key.

In [None]:
# Read the Galaxy credentials from environment variables.
# NOTE(review): presumably these are populated from the .env file by the load_dotenv() call in the imports cell — confirm.
# To find your API key: log in -> click 'User' (top right) -> click 'Preferences' -> click 'Manage API Key' (menu on the left) -> click the icon to 'copy key'
GALAXY_URL = os.getenv("GALAXY_EARTH_URL")  # server URL, e.g. "https://earth-system.usegalaxy.eu/"; None when the variable is unset
GALAXY_KEY = os.getenv("GALAXY_EARTH_KEY")  # API key for that server; None when the variable is unset

history_name = "GECCO Run"  # name given to the Galaxy history created for this run
# setup for gecco and galaxy
upload_data_flag = False  # True: run GECCO on the file uploaded by this notebook; False: use an existing fasta dataset
gecco_tool_id = "toolshed.g2.bx.psu.edu/repos/althonos/gecco/gecco/0.9.6"  # The id of the tool GECCO

In [None]:
# Connect to Galaxy instance
# Fail early with a clear message when the credentials were not loaded:
# os.getenv() returns None for variables that are not set.
if not GALAXY_URL or not GALAXY_KEY:
    raise RuntimeError(
        "GALAXY_EARTH_URL and GALAXY_EARTH_KEY must be set (e.g. in your .env file) "
        "before connecting to Galaxy."
    )
gi = GalaxyInstance(url=GALAXY_URL, key=GALAXY_KEY)

In [None]:
# Fetch the histories returned by the Galaxy API and show them as the cell output
h = gi.histories.get_histories()
h

Create a new history for the GECCO run named `GECCO Run`

In [None]:

# Create the history on the Galaxy server and keep its ID: the upload and tool-run cells need it
history = gi.histories.create_history(name=history_name)
history_id = history["id"]
print(history_id)

#### Upload input files to the Galaxy history

In [None]:
# Local path of the fasta file to upload to Galaxy (here a sample contigs file in the folder '../input_gecco')
# Alternative location: file_path = "data/EMOBON00092_final_V2.contigs.fa"  # Ensure the file is in your working directory
file_path = "../input_gecco/EMOBON00092_final_V2.contigs.fa"

In [None]:
# Upload file
# Only upload when the notebook is configured to run GECCO on a fresh upload;
# with upload_data_flag = False an existing Galaxy dataset is selected further down.
if upload_data_flag:
    upload_data = gi.tools.upload_file(file_path, history_id)
    uploaded_dataset_id = upload_data["outputs"][0]["id"]
    print(
        f"File uploaded to Galaxy with dataset ID: {uploaded_dataset_id}"
    )  # this dataset ID is used as the GECCO input below
else:
    print("Skipping upload: upload_data_flag is False, an existing dataset will be used.")

In [None]:
# testing code: build a DatasetClient directly on top of the existing connection
dc = DatasetClient(gi)

In [None]:
# Show the datasets returned when no history filter is given
gi.datasets.get_datasets()

In [None]:
# Same query through the standalone client, restricted to the history created for this run
dc.get_datasets(history_id=history_id)

## Run GECCO in Galaxy

In [None]:
# Fetch the server's description of the GECCO tool and print it
tool_info = gi.tools.show_tool(gecco_tool_id)
print(tool_info)

In [None]:
## helper to pick out the datasets on galaxy that match a given property
# this method is called upon pressing a button in the webapp
def filter_datasets_by_key(datasets, key, value):
    """Return ``(name, id)`` pairs for the datasets whose ``key`` equals ``value``.

    ``datasets`` is an iterable of dict-like entries. Entries that do not
    contain ``key`` are skipped, and the result keeps the input order.
    """
    matches = []
    for entry in datasets:
        if key in entry and entry[key] == value:
            matches.append((entry["name"], entry["id"]))
    return matches

In [None]:
if not upload_data_flag:
    # No fresh upload: use the first fasta dataset returned by gi.datasets.get_datasets()
    fasta_datasets = filter_datasets_by_key(gi.datasets.get_datasets(), "extension", 'fasta')
    if not fasta_datasets:
        # Without this guard the [0] lookup would fail with a bare IndexError
        raise RuntimeError(
            "No dataset with extension 'fasta' was found on Galaxy; "
            "upload one or set upload_data_flag = True."
        )
    dname, did = fasta_datasets[0]

In [None]:
# Define inputs for the GECCO tool with additional parameters

# Pick the input dataset: the file uploaded by this notebook, or the existing
# fasta dataset selected in the previous cell. Only the chosen name is
# evaluated, so the other one does not have to be defined.
input_dataset_id = uploaded_dataset_id if upload_data_flag else did

# The tool parameters are the same for both input sources, so they are defined once
inputs = {
    "input": {
        "id": input_dataset_id,
        "src": "hda",  # History Dataset Association
    },
    "mask": True,  # Enable masking of regions with unknown nucleotides
    "cds": 3,  # Minimum number of genes required for a cluster
    "threshold": 0.05,  # Probability threshold for cluster detection
    "postproc": "gecco",  # Post-processing method for gene cluster validation
    "antismash_sideload": False,  # Generate an antiSMASH v6 sideload JSON file
    # 'email': 'email@email.pt'  # Email notification
}

# Run the GECCO tool
tool_run = gi.tools.run_tool(
    history_id=history_id, tool_id=gecco_tool_id, tool_inputs=inputs
)

# Get job ID to monitor
job_id = tool_run["jobs"][0]["id"]
print(f"GECCO tool job submitted with job ID: {job_id}")

In [None]:
# Cancel the job that was just submitted.
# NOTE(review): this looks like a testing step (a later cell checks that job info can still be
# stored for a deleted job) — skip this cell if GECCO should actually run to completion.
gi.jobs.cancel_job(job_id)

### Saving the `.json` file locally for the job

In [None]:
def store_galaxy_job_json(
    tool_id: str,
    job_id: str,
    history_id: str,
    galaxy_instance=None,
    output_dir: str = ".",
) -> str:
    """Fetch the Galaxy job record and store it as ``job_info_<job_id>.json``.

    The record returned by ``jobs.show_job(job_id)`` is extended with the
    ``tool_id``, ``history_id`` and ``job_id`` passed in, then written as JSON.

    Args:
        tool_id: ID of the tool that was run; stored under ``"tool_id"``.
        job_id: ID of the job to look up; stored under ``"job_id"`` and used
            in the file name.
        history_id: ID of the history the job ran in; stored under
            ``"history_id"``.
        galaxy_instance: Object exposing ``jobs.show_job``. When omitted, the
            notebook-level ``gi`` connection is used.
        output_dir: Existing directory to write the file into. Defaults to
            the current working directory.

    Returns:
        Path of the JSON file that was written.
    """
    client = gi if galaxy_instance is None else galaxy_instance

    job_info = client.jobs.show_job(job_id)
    job_info["tool_id"] = tool_id
    job_info["history_id"] = history_id
    job_info["job_id"] = job_id

    # The file name is keyed by the job ID only, so storing the same job
    # again overwrites the previous file (no timestamp suffix is added).
    job_info_file = os.path.join(output_dir, f"job_info_{job_id}.json")
    with open(job_info_file, "w", encoding="utf-8") as f:
        json.dump(job_info, f)
    return job_info_file

In [None]:
# Check the current state of the job (job_id comes from the cell that submitted the GECCO run)
# For the full job record use gi.jobs.show_job(job_id) instead  (author's note: 11ac94870d0bb33a4a74056d2ffeb889)
gi.jobs.get_state(job_id)

In [None]:
# Test: can the job info be stored in a JSON file for a deleted job?
# Result noted by the author: yes, it works
# Writes job_info_<job_id>.json for the job submitted above
store_galaxy_job_json(gecco_tool_id, job_id, history_id)

### TODO: this file needs to be also updated once the job is done and user accesses it.

#### List the Outputs from the Job

In [None]:
# history_id comes from the cell that created the history
# List datasets in the history after the tool run; the next cell reads each entry's "name" and "id"
datasets = gi.histories.show_history(history_id, contents=True)

### Download the `.tsv` table outputs

In [None]:
# Identify the output dataset ids
# To understand the output: https://git.lumc.nl/mflarralde/gecco
# Maps each expected output name to the variable that should receive its ID
target_names = {
    "GECCO summary of detected genes on data 1 (TSV)": "dataset_id_2",
    "GECCO summary of detected features on data 1 (TSV)": "dataset_id_3",
    "GECCO summary of detected BGCs on data 1 (TSV)": "dataset_id_4",
}

# Every slot starts out as None and stays None when its output is not found
found_ids = dict.fromkeys(target_names.values())

# Walk through the history contents; a later entry with the same name
# overrides an earlier one
for dataset in datasets:
    slot = target_names.get(dataset["name"])
    if slot is not None:
        found_ids[slot] = dataset["id"]

# Expose the results under the variable names used by the download cell
dataset_id_2 = found_ids["dataset_id_2"]
dataset_id_3 = found_ids["dataset_id_3"]
dataset_id_4 = found_ids["dataset_id_4"]

# Display the results
print(f"Dataset ID 2: {dataset_id_2}")
print(f"Dataset ID 3: {dataset_id_3}")
print(f"Dataset ID 4: {dataset_id_4}")

In [None]:
# Download here

# Save each GECCO summary table as a TSV file in the '../data' folder
def _download_tsv(dataset_id, destination):
    """Download one Galaxy dataset, using ``destination`` as the full file path."""
    return gi.datasets.download_dataset(
        dataset_id, file_path=destination, use_default_filename=False
    )


tsv_data2 = _download_tsv(dataset_id_2, "../data/summary_detected_genes.tsv")
tsv_data3 = _download_tsv(dataset_id_3, "../data/summary_detected_features.tsv")
tsv_data4 = _download_tsv(dataset_id_4, "../data/summary_detected_BGC.tsv")

In [None]:
# Read the downloaded TSV files into pandas DataFrames (tab-separated, hence sep="\t")

# Earlier variant: df_detected_BGC = pd.read_csv('detected_BGC.tsv', sep='\t')
df_summary_detected_genes = pd.read_csv("../data/summary_detected_genes.tsv", sep="\t")  # detected genes table
df_summary_detected_features = pd.read_csv("../data/summary_detected_features.tsv", sep="\t")  # detected features table
df_summary_detected_BGC = pd.read_csv("../data/summary_detected_BGC.tsv", sep="\t")  # detected BGCs table

### Display the first few rows of each DataFrame

In [None]:
# Preview the first rows of the detected genes table
df_summary_detected_genes.head()

In [None]:
# Preview the first rows of the detected features table
df_summary_detected_features.head()

In [None]:
# Preview the first rows of the detected BGCs table
df_summary_detected_BGC.head()