Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data manager for pangoLEARN #3617

Merged
merged 12 commits into from Apr 24, 2021
10 changes: 10 additions & 0 deletions data_managers/data_manager_pangolearn/.shed.yml
@@ -0,0 +1,10 @@
categories:
- Data Managers
description: Install pangoLEARN databases for pangolin tool
long_description: |
This data manager fetches models (from the pangoLEARN repository) for the pangolin
SARS-CoV-2 lineage typing tool and updates the pangolearn data table.
name: data_manager_pangolearn
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_pangolearn
type: unrestricted
154 changes: 154 additions & 0 deletions data_managers/data_manager_pangolearn/data_manager/pangolearn_dm.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

import argparse
import datetime
import json
import operator
import os
import shutil
import sys
import tarfile

import requests


def extract_date(tag_str):
    """Convert a pangoLEARN release tag into a sortable ``datetime``.

    A tag is either ``YYYY-MM-DD`` or ``YYYY-MM-DD_N``, where ``N`` is an
    integer revision. The revision is encoded as ``N`` minutes past midnight
    of the tag date, so several revisions of the same day order correctly.

    :param tag_str: release tag name, e.g. ``"2021-04-21"`` or ``"2021-04-21_2"``
    :return: ``datetime.datetime`` for the tag
    :raises ValueError: if the tag has more than two ``_``-separated parts,
        the date or revision cannot be parsed, or the revision does not fit
        in a single day (``0 <= N < 1440``)
    """
    parts = tag_str.split("_")
    # Explicit raise instead of assert: asserts are removed under ``python -O``.
    if len(parts) >= 3:
        raise ValueError("expected maximum of two parts, got " + str(parts))
    tag_date = datetime.datetime.strptime(parts[0], "%Y-%m-%d")
    if len(parts) == 2:
        version = int(parts[1])
        # Revisions are stored as minutes of the day, so anything outside
        # 0..1439 would spill into a neighbouring date.
        if not 0 <= version < 24 * 60:
            raise ValueError(
                "revision must be between 0 and 1439, got " + str(version)
            )
        tag_date += datetime.timedelta(minutes=version)
    return tag_date


def get_model_list(
    existing_release_tags,
    url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases",
):
    """Fetch the list of pangoLEARN releases that are not yet installed.

    :param existing_release_tags: collection of tag names to leave out of the
        result
    :param url: releases endpoint returning a JSON list of release objects
    :return: list of dicts with the keys ``tag_name``, ``name``, ``date``
        (as computed by ``extract_date``) and ``tarball_url``, in the order
        the endpoint returned them
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    response = requests.get(url)
    # Check the status up front so that no code path can fall through and
    # implicitly return None to callers that iterate or index the result.
    response.raise_for_status()
    release_list = response.json()
    # NOTE(review): only the first page of results is read here; if the
    # endpoint paginates, older releases are not seen - confirm.
    return [
        dict(
            tag_name=entry["tag_name"],
            name=entry["name"],
            date=extract_date(entry["tag_name"]),
            tarball_url=entry["tarball_url"],
        )
        for entry in release_list
        if entry["tag_name"] not in existing_release_tags
    ]


def filter_by_date(existing_release_tags, start_date=None, end_date=None):
    """Return the not-yet-installed releases whose date lies in a window.

    Both bounds are inclusive; a bound of ``None`` leaves that side open.

    :param existing_release_tags: tag names passed through to
        ``get_model_list`` to be excluded
    :param start_date: earliest accepted release date, or ``None``
    :param end_date: latest accepted release date, or ``None``
    :return: list of release dicts as produced by ``get_model_list``
    """

    def in_window(released):
        # Reject anything strictly outside either configured bound.
        if end_date is not None and released > end_date:
            return False
        if start_date is not None and released < start_date:
            return False
        return True

    selected = []
    for candidate in get_model_list(existing_release_tags):
        if in_window(candidate["date"]):
            selected.append(candidate)
    return selected


def download_and_unpack(url, output_directory):
    """Download a release tarball and install its ``pangoLEARN`` directory.

    The tarball at *url* is saved to a temporary file in the current working
    directory (named after the last path component of *url*), unpacked into
    *output_directory*, and the ``pangoLEARN`` sub-directory of the archive's
    first member is renamed to ``<output_directory>/<name>``. The remainder of
    the unpacked tree and the temporary tarball are removed.

    :param url: tarball URL; its last path component becomes the entry name
    :param output_directory: directory that receives the unpacked model
    :return: the last path component of *url*
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises ValueError: if the downloaded archive has no members
    """
    response = requests.get(url)
    # Check the status up front so no code path can implicitly return None.
    response.raise_for_status()

    tmp_filename = url.split("/")[-1]
    try:
        with open(tmp_filename, "wb") as tmpfile:
            tmpfile.write(response.content)
        with tarfile.open(tmp_filename) as tf:
            first_member = tf.next()
            if first_member is None:
                raise ValueError("downloaded archive is empty: " + url)
            # The first member's name is used as the archive's top-level folder.
            pl_path = first_member.name
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; acceptable only while the source is a trusted host.
            tf.extractall(output_directory)
    finally:
        # Always discard the temporary tarball, even if unpacking failed.
        if os.path.exists(tmp_filename):
            os.unlink(tmp_filename)

    unpacked_root = os.path.join(output_directory, pl_path)
    os.rename(
        os.path.join(unpacked_root, "pangoLEARN"),
        os.path.join(output_directory, tmp_filename),
    )
    shutil.rmtree(unpacked_root)
    return tmp_filename


def parse_date(d):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime`` at midnight.

    Used as the argparse ``type`` for the date command line options.
    """
    iso_day_format = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(d, iso_day_format)
    return parsed


if __name__ == "__main__":
    # Command line entry point: download the selected pangoLEARN releases and
    # record them in the data manager JSON file, which is read and then
    # rewritten in place.
    parser = argparse.ArgumentParser()
    parser.add_argument("--testmode", default=False, action="store_true")
    parser.add_argument("--latest", default=False, action="store_true")
    parser.add_argument("--start_date", type=parse_date)
    parser.add_argument("--end_date", type=parse_date)
    parser.add_argument("--overwrite", default=False, action="store_true")
    parser.add_argument('--pangolearn_format_version', default="1.0")
    parser.add_argument("datatable_name")
    parser.add_argument("galaxy_datamanager_filename")
    args = parser.parse_args()

    if args.testmode:
        # Test mode only lists matching releases. There is no data table to
        # consult, so an empty set of already-installed tags is passed
        # (filter_by_date requires that argument).
        releases = filter_by_date(
            set(), start_date=args.start_date, end_date=args.end_date
        )
        for release in releases:
            print(release["tag_name"], release["tarball_url"].split("/")[-1])
        sys.exit(0)

    with open(args.galaxy_datamanager_filename) as fh:
        config = json.load(fh)

    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)
    data_manager_dict = {}
    data_manager_dict["data_tables"] = config.get("data_tables", {})
    table_entries = data_manager_dict["data_tables"].setdefault(
        args.datatable_name, []
    )

    # NOTE: the data_manager_dict["data_tables"][args.datatable_name] is not actually populated with the
    # contents of the existing data table, so the "no-overwrite" logic and the
    # only-download-what-we-don't-have logic does not in fact work. It is left but unused for now.
    if not args.overwrite:
        existing_release_tags = {entry["value"] for entry in table_entries}
    else:
        existing_release_tags = set()

    if args.latest:
        # Take the first listed release (assumes the API lists newest first -
        # TODO confirm). Slicing instead of [0] keeps an empty listing from
        # raising IndexError; nothing is downloaded in that case.
        releases = get_model_list(existing_release_tags)[:1]
    else:
        releases = filter_by_date(
            existing_release_tags, start_date=args.start_date, end_date=args.end_date
        )

    releases_to_download = [
        release
        for release in releases
        if release["tag_name"] not in existing_release_tags
    ]
    for release in releases_to_download:
        tag = download_and_unpack(release["tarball_url"], output_directory)
        table_entries.append(
            dict(
                value=tag,
                description=release["name"],
                format_version=args.pangolearn_format_version,
                path=output_directory + "/" + tag,
            )
        )

    # Newest entries first: "value" starts with the release date, so a reverse
    # string sort is a reverse chronological sort.
    table_entries.sort(key=operator.itemgetter("value"), reverse=True)
    with open(args.galaxy_datamanager_filename, "w") as fh:
        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)
@@ -0,0 +1,74 @@
<tool id="data_manager_pangolearn" name="PANGOlearn data manager" version="0.0.1" tool_type="manage_data" profile="19.05">
<requirements>
<requirement type="package" version="3.8">python</requirement>
<requirement type="package" version="2.24.0">requests</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/pangolearn_dm.py'
#if $release.which == "latest"
--latest
#else if $release.which == "date_range"
#if str($release.start_date).strip() != ""
--start_date '$release.start_date'
#end if
#if str($release.end_date).strip() != ""
--end_date '$release.end_date'
#end if
#end if
'pangolearn'
'${output_file}'
]]></command>
<inputs>
<conditional name="release">
<param name="which" type="select" label="Select PANGOlearn release">
<option value="latest" selected="true">Latest</option>
<option value="date_range">Date range</option>
<!-- <option value="history">From history</option> -->
</param>
<when value="latest">
</when>
<when value="date_range">
<param name="start_date" type="text" label="Start date (YYYY-MM-DD)" help="Don't download models older than this date" optional="true">
<validator type="regex">\d{4}-\d{2}-\d{2}$</validator>
</param>
<param name="end_date" type="text" label="End date (YYYY-MM-DD)" help="Don't download models newer than this date" optional="true">
<validator type="regex">\d{4}-\d{2}-\d{2}$</validator>
</param>
</when>
</conditional>
</inputs>
<outputs>
<data name="output_file" format="data_manager_json"/>
</outputs>
<tests>
<test>
<conditional name="release">
<param name="which" value="date_range" />
<param name="start_date" value="2021-04-01" />
<param name="end_date" value="2021-04-01" />
</conditional>
<output name="output_file">
<assert_contents>
<has_text text="pangoLEARN data release 2021-04-01"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
This data manager fetches models (from the pangoLEARN_ repository) for the pangolin_
SARS-CoV-2 lineage typing tool and updates the pangolearn data table.

.. _pangoLEARN: https://github.com/cov-lineages/pangoLEARN
.. _pangolin: https://github.com/cov-lineages/pangolin
]]></help>
<citations>
<citation type="bibtex">
@unpublished{None,
author = {Aine O'Toole},
title = {pangoLEARN},
year = {2020},
eprint = {None},
url = {https://github.com/cov-lineages/pangoLEARN}
}</citation>
</citations>
</tool>
21 changes: 21 additions & 0 deletions data_managers/data_manager_pangolearn/data_manager_conf.xml
@@ -0,0 +1,21 @@
<?xml version="1.0"?>
<data_managers>
<data_manager tool_file="data_manager/pangolearn_dm.xml" id="data_manager_pangolearn">
<data_table name="pangolearn">
<output>
<column name="value" />
abretaud marked this conversation as resolved.
Show resolved Hide resolved
<column name="description" />
<column name="format_version" />
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
<column name="path" output_ref="output_file" >
<!-- note: 'value' is set by the Python script to the last path component of the release tarball URL -->
<move type="directory" relativize_symlinks="True">
<src>${path}</src>
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">pangolearn/</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/pangolearn/#echo str($value)#</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
</data_manager>
</data_managers>
@@ -0,0 +1,9 @@
# this is a tab separated file describing the location of pangoLEARN databases used for the
# pangolin SARS-CoV-2 lineage typing tool
#
# the columns are:
# value description format_version path
#
# for example
# 2021-04-14 pangoLEARN data release 2021-04-14 1.0 /tmp/database/pangolearn/pangolearn/2021-04-14
2021-04-21 pangoLEARN data release 2021-04-21 1.0 /home/pvh/Documents/code/SANBI/pvh-forks/galaxy/tool-data/pangolearn/2021-04-21
@@ -0,0 +1,8 @@
# this is a tab separated file describing the location of pangoLEARN databases used for the
# pangolin SARS-CoV-2 lineage typing tool
#
# the columns are:
# value description format_version path
#
# for example
# 2021-04-14 pangoLEARN data release 2021-04-14 1.0 /tmp/database/pangolearn/pangolearn/2021-04-14
@@ -0,0 +1,6 @@
<tables>
<table name="pangolearn" comment_char="#" allow_duplicate_entries="False">
<columns>value, description, format_version, path</columns>
<file path="tool-data/pangolearn.loc" />
</table>
</tables>
@@ -0,0 +1,6 @@
<tables>
<table name="pangolearn" comment_char="#" allow_duplicate_entries="False">
<columns>value, description, format_version, path</columns>
<file path="${__HERE__}/test-data/pangolearn.loc" />
</table>
</tables>
Expand Up @@ -3,7 +3,7 @@
<requirement type="package" version="2.24.0">requests</requirement>
</requirements>
<!-- fetch all the primers in one go -->
<command detect_errors="exit_code">
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/install_primer_scheme_bedfiles.py'
'${output_file}'
#if $input.input_type == "ARTIC"
Expand All @@ -13,7 +13,7 @@
--primer_name '${input.primer_name}'
--primer_description '${input.primer_description}'
#end if
</command>
]]></command>
<inputs>
<conditional name="input">
<param name="input_type" label="Choose the source for primer schemes" type="select">
Expand Down