AquaINFRA importer (#108)
* init data source tool for downloading aquainfra content

* refinements

* add data_source.py

* Update tools/aquainfra_importer/aquainfra_importer.xml

Co-authored-by: Wolfgang Maier <maierw@posteo.de>

* update importer outputs

* add new line at the end of file

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* some linting

* Update tools/aquainfra_importer/aquainfra_importer.xml

Co-authored-by: Wolfgang Maier <maierw@posteo.de>

---------

Co-authored-by: Wolfgang Maier <maierw@posteo.de>
Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
3 people committed May 14, 2024
1 parent e03f4d2 commit 2b586af
Showing 3 changed files with 143 additions and 0 deletions.
11 changes: 11 additions & 0 deletions tools/aquainfra_importer/.shed.yml
@@ -0,0 +1,11 @@
categories:
- Ecology
owner: ecology
remote_repository_url: https://github.com/AquaINFRA/tools-ecology/tree/aquainfra_importer
homepage_url: https://github.com/AquaINFRA/galaxy
long_description: |
  A data source tool for downloading datasets via the AquaINFRA Interaction Platform.
type: unrestricted
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Data source tool for content provided in AquaINFRA: {{ tool_name }}."
13 changes: 13 additions & 0 deletions tools/aquainfra_importer/aquainfra_importer.xml
@@ -0,0 +1,13 @@
<tool id="aquainfra_importer" name="AquaINFRA Importer" tool_type="data_source" version="1.0" profile="20.09">
    <description>downloads content via the AquaINFRA interaction platform</description>
    <command><![CDATA[
        python '$__tool_directory__/data_source.py' '$output' $__app__.config.output_size_limit
    ]]></command>
    <inputs action="https://aquainfra.dev.52north.org/" check_values="false" method="get">
        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
        <param name="tool_id" type="hidden" value="aquainfra_importer" />
    </inputs>
    <outputs>
        <data name="output" format="auto" label="AquaINFRA Resource"/>
    </outputs>
</tool>
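
How the hand-off works: Galaxy sends the user to the `action` URL above, appending GALAXY_URL (the server's /tool_runner endpoint) and the hidden tool_id; the AquaINFRA platform is then expected to redirect the browser back to that endpoint with a URL parameter naming the resource to fetch, which Galaxy passes on to data_source.py below. A minimal sketch of such a callback, assuming the conventional Galaxy data_source redirect (the exact format is not part of this diff; the server and resource URLs are hypothetical):

    from urllib.parse import urlencode

    galaxy_url = "https://usegalaxy.example/tool_runner"  # hypothetical Galaxy server
    callback = galaxy_url + "?" + urlencode({
        "tool_id": "aquainfra_importer",  # matches the hidden param above
        "URL": "https://aquainfra.dev.52north.org/download/dataset.csv",  # hypothetical resource
        "URL_method": "get",  # consumed by data_source.py below
    })
    print(callback)
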
119 changes: 119 additions & 0 deletions tools/aquainfra_importer/data_source.py
@@ -0,0 +1,119 @@
#!/usr/bin/env python
# Retrieves data from external data source applications and
# stores in a dataset file.
#
# Data source application parameters are temporarily stored
# in the dataset file.
import json
import os
import sys
from urllib.parse import urlencode, urlparse
from urllib.request import urlopen

from galaxy.datatypes import sniff
from galaxy.datatypes.registry import Registry
from galaxy.util import (
DEFAULT_SOCKET_TIMEOUT,
get_charset_from_http_headers,
stream_to_open_named_file,
)

GALAXY_PARAM_PREFIX = "GALAXY"
GALAXY_ROOT_DIR = os.path.realpath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
GALAXY_DATATYPES_CONF_FILE = os.path.join(
    GALAXY_ROOT_DIR, "datatypes_conf.xml"
)


def main():
    if len(sys.argv) >= 3:
        max_file_size = int(sys.argv[2])
    else:
        max_file_size = 0
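    # Note: a limit of 0 disables the Content-Length check below.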

    with open(sys.argv[1]) as fh:
        params = json.load(fh)

    out_data_name = params["output_data"][0]["out_data_name"]

    URL = params["param_dict"].get("URL", None)
    URL_method = params["param_dict"].get("URL_method", "get")

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=params["job_config"]["GALAXY_ROOT_DIR"],
        config=params["job_config"]["GALAXY_DATATYPES_CONF_FILE"],
    )

    for data_dict in params["output_data"]:
        cur_filename = data_dict["file_name"]
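        # Per-output override: the remote application may send a
        # "GALAXY|<out_data_name>|URL" parameter; fall back to the plain URL.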
        cur_URL = params["param_dict"].get(
            "%s|%s|URL" % (GALAXY_PARAM_PREFIX, data_dict["out_data_name"]),
            URL,
        )
        if not cur_URL or urlparse(cur_URL).scheme not in (
            "http", "https", "ftp"
        ):
            with open(cur_filename, "w"):
                pass  # leave an empty output dataset behind
            sys.exit(
                "The remote data source application has not sent "
                "back a URL parameter in the request."
            )

        try:
            if URL_method == "get":
                page = urlopen(cur_URL, timeout=DEFAULT_SOCKET_TIMEOUT)
            elif URL_method == "post":
                param_dict = params["param_dict"]
                page = urlopen(
                    cur_URL,
                    urlencode(param_dict["incoming_request_params"]).encode(
                        "utf-8"
                    ),
                    timeout=DEFAULT_SOCKET_TIMEOUT,
                )
        except Exception as e:
            sys.exit(
                "The remote data source application may "
                "be offline, please try again later. Error: %s" % str(e)
            )
        if max_file_size:
            file_size = int(page.info().get("Content-Length", 0))
            if file_size > max_file_size:
                sys.exit(
                    "The requested data size (%d bytes) exceeds the maximum "
                    "allowed size (%d bytes) on this server."
                    % (file_size, max_file_size)
                )
        try:
            cur_filename = stream_to_open_named_file(
                page,
                os.open(
                    cur_filename,
                    os.O_WRONLY | os.O_TRUNC | os.O_CREAT
                ),
                cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers),
            )
        except Exception as e:
            sys.exit("Unable to fetch %s:\n%s" % (cur_URL, e))

        try:
            ext = sniff.handle_uploaded_dataset_file(
                cur_filename, datatypes_registry, ext=data_dict["ext"]
            )
        except Exception as e:
            sys.exit(str(e))

    tool_provided_metadata = {out_data_name: {"ext": ext}}

    with open(
        params["job_config"]["TOOL_PROVIDED_JOB_METADATA_FILE"], "w"
    ) as json_file:
        json.dump(tool_provided_metadata, json_file)


if __name__ == "__main__":
    main()
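
To exercise the script outside Galaxy, one can hand it a params file shaped like the one Galaxy writes for the job. A minimal sketch, assuming hypothetical local paths and a hypothetical download URL (the key names match exactly what main() reads above):

    import json

    params = {
        "output_data": [
            {"out_data_name": "output", "file_name": "/tmp/output.dat", "ext": "auto"}
        ],
        "param_dict": {"URL": "https://example.org/data.csv", "URL_method": "get"},
        "job_config": {
            "GALAXY_ROOT_DIR": "/srv/galaxy",  # hypothetical Galaxy checkout
            "GALAXY_DATATYPES_CONF_FILE": "/srv/galaxy/config/datatypes_conf.xml",
            "TOOL_PROVIDED_JOB_METADATA_FILE": "/tmp/galaxy.json",
        },
    }
    with open("/tmp/params.json", "w") as fh:
        json.dump(params, fh)
    # Then run: python data_source.py /tmp/params.json 1048576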
