# Jupyter to Galaxy for nextflow with data from MGnify

* need to upload nextflow directory with the files and dependencies
* trigger the Galaxy workflow
* retrieve results

In [1]:
import sys
import os
import io

if 'google.colab' in str(get_ipython()):
    # clone the momics-demos repository to use the utils module from there
    # TODO: eventually utils from momics will be used for that
    try:
        os.system('git clone https://github.com/palec87/momics-demos.git')
        print(f"Repository cloned")
    except OSError as e:
        print(f"An error occurred while cloning the repository: {e}")

    sys.path.insert(0,'/content/momics-demos')

else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

    # downside of this is that all the deps need to be installed in the current (momics-demos) environment
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../../marine-omics')))  # local momics package, to be removed too

from utils import init_setup, get_notebook_environment
init_setup()

# Initialize the environment variable
notebook_environment = 'unknown'
# Determine the notebook environment
env = get_notebook_environment()
print(f"Environment: {env}")

Platform: local Linux
Environment: vscode


## XML for galaxy

**XML taken from [here](https://galaxyproject.org/blog/2022-08-15-making-nextflow-work-with-galaxy-at-cfsan-fda/)**
```xml
<requirements>
    <requirement type="package" version="22.04">nextflow</requirement>
    <requirement type="package">graphviz</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
    nextflow run $__tool_directory__/nf-pipelines/my_pipeline.nf --input /path/to/input
]]></command>
```

**nf-core/rnaseq**
```bash
nextflow run nf-core/rnaseq \
    --input <SAMPLESHEET> \
    --outdir <OUTDIR> \
    --gtf <GTF> \
    --fasta <GENOME FASTA> \
    -profile <docker/singularity/.../institute>
```

**with the input sample `.csv` sheet as follows**
```csv
sample,fastq_1,fastq_2,strandedness
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,auto
CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,auto
CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz,auto
```


In [2]:
import os
import sys
import json
from datetime import datetime
from platform import python_version
import logging

from jsonapi_client import Session as APISession
from jsonapi_client import Modifier
import requests

# Import
import bioblend.galaxy as g  # BioBlend is a Python library, wrapping the functionality of Galaxy and CloudMan APIs
# import boto3
import pandas as pd
from bioblend.galaxy import GalaxyInstance
from bioblend.galaxy.datasets import DatasetClient

from momics.galaxy.blue_cloud import BCGalaxy
# instead of the jupyter magic, you can also use
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Read your secrets from the .env file
# To see your API key login -> click 'user' (top right) -> click 'preferences' -> click 'Manage API Key' (menu to the left) -> click the icon to 'copy key'
GALAXY_URL = os.getenv("GALAXY_EARTH_URL")  # alternatively os.environ.get('GALAXY_URL'), "https://earth-system.usegalaxy.eu/"
GALAXY_KEY = os.getenv("GALAXY_EARTH_KEY")  # alternatively os.environ.get('GALAXY_KEY')

In [None]:
# my class init, but not using it for anything here
exp = BCGalaxy("GALAXY_EARTH_URL", "GALAXY_EARTH_KEY")

In [5]:
config = exp.gi.config.get_config()
# config

## Datasets

In [8]:
ds_client = g.datasets.DatasetClient(exp.gi)

In [9]:
ds_list = ds_client.get_datasets()
ds_list[0].keys()

dict_keys(['id', 'name', 'history_id', 'hid', 'deleted', 'visible', 'type_id', 'type', 'create_time', 'update_time', 'url', 'tags', 'history_content_type', 'dataset_id', 'state', 'extension', 'purged', 'genome_build', 'object_store_id', 'quota_source_label'])

In [16]:
# get only the datasets (dataset word is specific for the test workflow)
[k for k in ds_list if 'dataset' in k['name']]

[{'id': '4838ba20a6d867655f7d33d07e6b9e78',
  'name': 'Concatenate datasets on data 1 and data 2',
  'history_id': '7d721489019e2043',
  'hid': 1,
  'deleted': False,
  'visible': True,
  'type_id': 'dataset-4838ba20a6d867655f7d33d07e6b9e78',
  'type': 'file',
  'create_time': '2025-02-14T14:33:14.729402',
  'update_time': '2025-02-14T14:33:32.064405',
  'url': '/api/histories/7d721489019e2043/contents/4838ba20a6d867655f7d33d07e6b9e78',
  'tags': [],
  'history_content_type': 'dataset',
  'dataset_id': '4838ba20a6d8676553252dc7f1253d16',
  'state': 'queued',
  'extension': 'txt',
  'purged': False,
  'genome_build': '?',
  'object_store_id': 'files27',
  'quota_source_label': None},
 {'id': '4838ba20a6d8676558e4e84c5b89daec',
  'name': 'Concatenate datasets on data 1 and data 2',
  'history_id': '2e8a4915680c5db1',
  'hid': 3,
  'deleted': False,
  'visible': True,
  'type_id': 'dataset-4838ba20a6d8676558e4e84c5b89daec',
  'type': 'file',
  'create_time': '2025-02-13T15:43:38.597584',


In [11]:
# history_id, dataset_id
# TODO: find the right way to query those
# do i HAVE to initiate the dataset client for that?
dataset = exp.gi.histories.show_dataset('7011ecc923ced444', '4838ba20a6d867658edecd7071a62aef')
dataset, type(dataset)

({'model_class': 'HistoryDatasetAssociation',
  'id': '4838ba20a6d867658edecd7071a62aef',
  'name': 'dataset2.txt',
  'history_id': '2e8a4915680c5db1',
  'hid': 2,
  'deleted': False,
  'visible': True,
  'type_id': 'dataset-4838ba20a6d867658edecd7071a62aef',
  'type': 'file',
  'create_time': '2025-02-13T15:42:28.038460',
  'update_time': '2025-02-13T15:43:31.691163',
  'url': '/api/histories/2e8a4915680c5db1/contents/4838ba20a6d867658edecd7071a62aef',
  'tags': [],
  'history_content_type': 'dataset',
  'copied_from_ldda_id': None,
  'dataset_id': '4838ba20a6d86765c27d49db44672e67',
  'state': 'ok',
  'extension': 'txt',
  'purged': False,
  'genome_build': '?',
  'hda_ldda': 'hda',
  'accessible': True,
  'misc_info': 'uploaded txt file',
  'misc_blurb': '3 lines',
  'file_ext': 'txt',
  'file_size': 18,
  'resubmitted': False,
  'meta_files': [],
  'data_type': 'galaxy.datatypes.data.Text',
  'peek': '<table cellspacing="0" cellpadding="3"><tr><td>this</td></tr><tr><td>is</td></tr>

In [17]:
histories = exp.gi.histories.get_histories(history_id=None, name=None, deleted=False)
len(histories), histories[0].keys(), [(k['name'], k['id']) for k in histories]

(6,
 dict_keys(['model_class', 'id', 'name', 'deleted', 'purged', 'archived', 'url', 'published', 'count', 'annotation', 'tags', 'update_time', 'preferred_object_store_id']),
 [('History from planemo run tutorial workflow', '7d721489019e2043'),
  ('History from planemo run tutorial workflow', '7011ecc923ced444'),
  ('CWL Target History', '2e8a4915680c5db1'),
  ('GECCO Run', '115816a852b41534'),
  ('Unnamed history', '227d4225a27186af'),
  ('GECCO Run', '8d8d4bf21253beda')])

## Workflow and invocations

In [18]:
workflows = exp.gi.workflows.get_workflows(workflow_id=None, name=None, published=False)
workflows[0].keys(), [(k['name'], k['id']) for k in workflows]

(dict_keys(['model_class', 'id', 'latest_workflow_id', 'name', 'create_time', 'update_time', 'published', 'importable', 'deleted', 'hidden', 'tags', 'latest_workflow_uuid', 'annotations', 'url', 'owner', 'source_metadata', 'number_of_steps', 'show_in_tool_panel']),
 [('planemo run tutorial', '7cc90b8a12c8c643'),
  ('test_nextflow', '659ce0d877aab15e')])

In [23]:
# to select the planemo workflow
planemo_id = [k['id'] for k in workflows if 'planemo' in k['name']][0]  # going thourgh WFs and mathces on name string
wf = exp.gi.workflows.show_workflow(planemo_id)
wf['inputs']

{'0': {'label': 'Dataset 2',
  'value': '',
  'uuid': '004a6ce7-a8f3-4a54-b513-e1456a3695e8'},
 '1': {'label': 'Dataset 1',
  'value': '',
  'uuid': 'b42b2137-033e-4e7e-849e-21def05f4a70'},
 '2': {'label': 'Number of lines',
  'value': '',
  'uuid': '442ca31b-cdcf-46eb-815c-c23baf2c4dc5'}}

In [None]:
# invoke

# datamap constructed manually with the data ids
datamap = {'0': {'src':'hda', 'id':'4838ba20a6d867658edecd7071a62aef'},
           '1': {'src':'hda', 'id':'4838ba20a6d867654ef0f38728567a4a'},
           '2': {'value': '3'},
}
w = exp.gi.workflows.invoke_workflow(
    planemo_id,
    inputs=datamap,
    params=None,
    history_id=None,
    history_name=None,
    import_inputs_to_history=False,
    replacement_params=None,
    allow_tool_state_corrections=None,
    )



In [34]:
exp.gi.histories.show_history('7d721489019e2043')['state_ids']

{'new': ['4838ba20a6d867655cd296330f06d83a'],
 'upload': [],
 'queued': ['4838ba20a6d867655f7d33d07e6b9e78'],
 'running': [],
 'ok': [],
 'empty': [],
 'error': [],
 'paused': [],
 'setting_metadata': [],
 'failed_metadata': [],
 'deferred': [],
 'discarded': []}

## Use planemo from here

In [42]:
def run_planemo():
    # invoke

    # datamap constructed manually with the data ids
    os.system('planemo run tutorial.ga tutorial-job.yml \
        --engine external_galaxy \
        --galaxy_url https://earth-system.usegalaxy.eu/ \
        --galaxy_user_key 154f80453cb792de3ccb35f6be337475')

In [43]:
run_planemo()

Usage: planemo run [OPTIONS] RUNNABLE_PATH_OR_ID JOB_PATH
Try 'planemo run --help' for help.

Error: Invalid value for 'JOB_PATH': File 'tutorial-job.yml' does not exist.


## How to package nextflow file into the workflow

In [37]:
#jobs
jobs = exp.gi.jobs.get_jobs()
len(jobs), jobs[0].keys(), [(k['id'], k['state']) for k in jobs]

(9,
 dict_keys(['model_class', 'id', 'history_id', 'tool_id', 'state', 'exit_code', 'create_time', 'update_time', 'galaxy_version', 'external_id', 'handler', 'job_runner_name', 'command_line', 'user_email']),
 [('11ac94870d0bb33ad6fc7dbb812f1217', 'queued'),
  ('11ac94870d0bb33adedcde9f82791389', 'new'),
  ('11ac94870d0bb33a1219c0f07f0981f8', 'ok'),
  ('11ac94870d0bb33ad39f874d7ca97db4', 'ok'),
  ('11ac94870d0bb33a83cf798ca735de65', 'ok'),
  ('11ac94870d0bb33a072bd9a33dc8c14d', 'ok'),
  ('11ac94870d0bb33ac814be21ef6fbc2d', 'deleted'),
  ('11ac94870d0bb33a4a74056d2ffeb889', 'ok'),
  ('11ac94870d0bb33aea8210495860d0f4', 'ok')])

In [38]:
jobs[0]

{'model_class': 'Job',
 'id': '11ac94870d0bb33ad6fc7dbb812f1217',
 'history_id': '7d721489019e2043',
 'tool_id': 'cat1',
 'state': 'queued',
 'exit_code': None,
 'create_time': '2025-02-14T14:33:14.709536',
 'update_time': '2025-02-14T14:33:32.393319',
 'galaxy_version': '24.2',
 'external_id': None,
 'handler': None,
 'job_runner_name': None,
 'command_line': None,
 'user_email': None}

In [31]:
## tools
tools = exp.gi.tools.get_tools()
len(tools), tools[0].keys(), [(k['name'], k['id']) for k in tools]

(13585,
 dict_keys(['model_class', 'id', 'name', 'version', 'description', 'labels', 'edam_operations', 'edam_topics', 'hidden', 'is_workflow_compatible', 'xrefs', 'link', 'min_width', 'target', 'panel_section_id', 'panel_section_name', 'form_style']),
 [('Upload File', 'upload1'),
  ('SEEK test', 'ds_seek_test'),
  ('UCSC Main', 'ucsc_table_direct1'),
  ('UCSC Archaea', 'ucsc_table_direct_archaea1'),
  ('NCBI Datasets Genomes', 'ncbi_datasets_source'),
  ('downloads', 'lftp'),
  ('EBI SRA', 'ebi_sra_main'),
  ('SRA', 'sra_source'),
  ('Get Microbial Data', 'microbial_import1'),
  ('BioMart', 'biomart'),
  ('BioMart', 'biomart_test'),
  ('CBI Rice Mart', 'cbi_rice_mart'),
  ('GrameneMart', 'gramenemart'),
  ('modENCODE fly', 'modENCODEfly'),
  ('InterMine', 'intermine'),
  ('Flymine', 'flymine'),
  ('modENCODE modMine', 'modmine'),
  ('MouseMine', 'mousemine'),
  ('Ratmine', 'ratmine'),
  ('YeastMine', 'yeastmine'),
  ('metabolicMine', 'metabolicmine'),
  ('modENCODE worm', 'modENCODEw

In [33]:
t1 = exp.gi.tools.show_tool('cat1')
t1

{'model_class': 'Tool',
 'id': 'cat1',
 'name': 'Concatenate datasets',
 'version': '1.0.0',
 'description': 'tail-to-head',
 'labels': [],
 'edam_operations': ['operation_3436'],
 'edam_topics': [],
 'hidden': '',
 'is_workflow_compatible': True,
 'xrefs': [],
 'panel_section_id': 'text_manipulation',
 'panel_section_name': 'Text Manipulation',
 'form_style': 'regular'}

## This is quite unrelated
## Downloads of fastq files from ena are under heavy construction

In [None]:
r = requests.get(f"https://www.ebi.ac.uk/ena/portal/api/results?dataPortal=ena")
content = r.content.decode('utf-8')
print(content)

In [None]:
r = requests.get(f"https://www.ebi.ac.uk/ena/portal/api/searchFields?result=read_run")
content = r.content.decode('utf-8')
print(content)

In [None]:
# this actually works but it is not waht I want
search = "ERS1153743"
r = requests.get(f"https://www.ebi.ac.uk/ena/portal/api/search?result=read_run&{search}?dataPortal=ena")
content = r.content.decode('utf-8')
print(content)

In [None]:
r = requests.get(f"https://www.ebi.ac.uk/ena/browser/api/fasta/{search}")
content = r.content.decode('utf-8')
print(content)

## Loading local FASTQ files