AquaINFRA importer (#108)
* init data source tool for downloading aquainfra content

* refinements

* add data_source.py

* Update tools/aquainfra_importer/aquainfra_importer.xml

Co-authored-by: Wolfgang Maier <maierw@posteo.de>

* update importer outputs

* add new line at the end of file

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* Update tools/aquainfra_importer/aquainfra_importer.xml

Co-authored-by: Wolfgang Maier <maierw@posteo.de>

---------

Co-authored-by: Wolfgang Maier <maierw@posteo.de>
Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
3 people committed May 14, 2024
1 parent e03f4d2 commit 2b586af
Showing 3 changed files with 143 additions and 0 deletions.
11 changes: 11 additions & 0 deletions tools/aquainfra_importer/.shed.yml
@@ -0,0 +1,11 @@
categories:
- Ecology
owner: ecology
remote_repository_url: https://github.com/AquaINFRA/tools-ecology/tree/aquainfra_importer
homepage_url: https://github.com/AquaINFRA/galaxy
long_description: |
  A data source tool for downloading datasets via the AquaINFRA Interaction Platform.
type: unrestricted
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Data source tool for content provided in AquaINFRA: {{ tool_name }}."
13 changes: 13 additions & 0 deletions tools/aquainfra_importer/aquainfra_importer.xml
@@ -0,0 +1,13 @@
<tool id="aquainfra_importer" name="AquaINFRA Importer" tool_type="data_source" version="1.0" profile="20.09">
    <description>downloads content via the AquaINFRA interaction platform</description>
    <command><![CDATA[
        python '$__tool_directory__/data_source.py' '$output' $__app__.config.output_size_limit
    ]]></command>
    <inputs action="https://aquainfra.dev.52north.org/" check_values="false" method="get">
        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
        <param name="tool_id" type="hidden" value="aquainfra_importer" />
    </inputs>
    <outputs>
        <data name="output" format="auto" label="AquaINFRA Resource"/>
    </outputs>
</tool>
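
How the hand-off works: Galaxy sends the user to the `action` URL above, appending GALAXY_URL (the server's /tool_runner endpoint) and the hidden tool_id; the AquaINFRA platform is then expected to redirect the browser back to that endpoint with a URL parameter naming the resource to fetch, which Galaxy passes on to data_source.py below. A minimal sketch of such a callback, assuming the conventional Galaxy data_source redirect (the exact format is not part of this diff; the server and resource URLs are hypothetical):

    from urllib.parse import urlencode

    galaxy_url = "https://usegalaxy.example/tool_runner"  # hypothetical Galaxy server
    callback = galaxy_url + "?" + urlencode({
        "tool_id": "aquainfra_importer",  # matches the hidden param above
        "URL": "https://aquainfra.dev.52north.org/download/dataset.csv",  # hypothetical resource
        "URL_method": "get",  # consumed by data_source.py below
    })
    print(callback)
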
119 changes: 119 additions & 0 deletions tools/aquainfra_importer/data_source.py
@@ -0,0 +1,119 @@
#!/usr/bin/env python
# Retrieves data from external data source applications and
# stores in a dataset file.
#
# Data source application parameters are temporarily stored
# in the dataset file.
import json
import os
import sys
from urllib.parse import urlencode, urlparse
from urllib.request import urlopen

from galaxy.datatypes import sniff
from galaxy.datatypes.registry import Registry
from galaxy.util import (
DEFAULT_SOCKET_TIMEOUT,
get_charset_from_http_headers,
stream_to_open_named_file,
)

GALAXY_PARAM_PREFIX = "GALAXY"
GALAXY_ROOT_DIR = os.path.realpath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
GALAXY_DATATYPES_CONF_FILE = os.path.join(
    GALAXY_ROOT_DIR, "datatypes_conf.xml"
)


def main():
    if len(sys.argv) >= 3:
        max_file_size = int(sys.argv[2])
    else:
        max_file_size = 0
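    # Note: a limit of 0 disables the Content-Length check below.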

    with open(sys.argv[1]) as fh:
        params = json.load(fh)

    out_data_name = params["output_data"][0]["out_data_name"]

    URL = params["param_dict"].get("URL", None)
    URL_method = params["param_dict"].get("URL_method", "get")

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=params["job_config"]["GALAXY_ROOT_DIR"],
        config=params["job_config"]["GALAXY_DATATYPES_CONF_FILE"],
    )

    for data_dict in params["output_data"]:
        cur_filename = data_dict["file_name"]
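        # Per-output override: the remote application may send a
        # "GALAXY|<out_data_name>|URL" parameter; fall back to the plain URL.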
        cur_URL = params["param_dict"].get(
            "%s|%s|URL" % (GALAXY_PARAM_PREFIX, data_dict["out_data_name"]),
            URL,
        )
        if not cur_URL or urlparse(cur_URL).scheme not in (
            "http", "https", "ftp"
        ):
            with open(cur_filename, "w"):
                pass  # leave an empty output dataset behind
            sys.exit(
                "The remote data source application has not sent "
                "back a URL parameter in the request."
            )

        try:
            if URL_method == "get":
                page = urlopen(cur_URL, timeout=DEFAULT_SOCKET_TIMEOUT)
            elif URL_method == "post":
                param_dict = params["param_dict"]
                page = urlopen(
                    cur_URL,
                    urlencode(param_dict["incoming_request_params"]).encode(
                        "utf-8"
                    ),
                    timeout=DEFAULT_SOCKET_TIMEOUT,
                )
        except Exception as e:
            sys.exit(
                "The remote data source application may "
                "be offline, please try again later. Error: %s" % str(e)
            )
        if max_file_size:
            file_size = int(page.info().get("Content-Length", 0))
            if file_size > max_file_size:
                sys.exit(
                    "The requested data size (%d bytes) exceeds the maximum "
                    "allowed size (%d bytes) on this server."
                    % (file_size, max_file_size)
                )
        try:
            cur_filename = stream_to_open_named_file(
                page,
                os.open(
                    cur_filename,
                    os.O_WRONLY | os.O_TRUNC | os.O_CREAT
                ),
                cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers),
            )
        except Exception as e:
            sys.exit("Unable to fetch %s:\n%s" % (cur_URL, e))

        try:
            ext = sniff.handle_uploaded_dataset_file(
                cur_filename, datatypes_registry, ext=data_dict["ext"]
            )
        except Exception as e:
            sys.exit(str(e))

    tool_provided_metadata = {out_data_name: {"ext": ext}}

    with open(
        params["job_config"]["TOOL_PROVIDED_JOB_METADATA_FILE"], "w"
    ) as json_file:
        json.dump(tool_provided_metadata, json_file)


if __name__ == "__main__":
    main()
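
To exercise the script outside Galaxy, one can hand it a params file shaped like the one Galaxy writes for the job. A minimal sketch, assuming hypothetical local paths and a hypothetical download URL (the key names match exactly what main() reads above):

    import json

    params = {
        "output_data": [
            {"out_data_name": "output", "file_name": "/tmp/output.dat", "ext": "auto"}
        ],
        "param_dict": {"URL": "https://example.org/data.csv", "URL_method": "get"},
        "job_config": {
            "GALAXY_ROOT_DIR": "/srv/galaxy",  # hypothetical Galaxy checkout
            "GALAXY_DATATYPES_CONF_FILE": "/srv/galaxy/config/datatypes_conf.xml",
            "TOOL_PROVIDED_JOB_METADATA_FILE": "/tmp/galaxy.json",
        },
    }
    with open("/tmp/params.json", "w") as fh:
        json.dump(params, fh)
    # Then run: python data_source.py /tmp/params.json 1048576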
