Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow assignment cache to be saved to and loaded from datadir #444

Merged
11 changes: 10 additions & 1 deletion .github/workflows/pangolin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,13 @@ jobs:
run: pangolin --update-data 2>&1 | tee pangolin_update_data.log
- name: Run pangolin verbose mode
run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log

- name: Add assignment cache
run: pangolin --add-assignment-cache
- name: Test use-assignment-cache
run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
- name: remove assignment cache
run: pip uninstall -y pangolin-assignment
- name: Add assignment cache to datadir
run: mkdir ac && pangolin --add-assignment-cache --datadir ac
- name: Test use-assignment-cache with datadir
run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
17 changes: 10 additions & 7 deletions pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
from pangolin.utils import data_checks
try:
import pangolin_data
except:
except ImportError:
data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git")

try:
import scorpio
except:
except ImportError:
data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git")

try:
import constellations
except:
except ImportError:
data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git")

import os
Expand Down Expand Up @@ -81,6 +81,7 @@ def main(sysargs = sys.argv[1:]):
d_group.add_argument('--add-assignment-cache', action='store_true', dest="add_assignment_cache", default=False, help="Install the pangolin-assignment repository for use with --use-assignment-cache. This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences.")
d_group.add_argument('--use-assignment-cache', action='store_true', dest="use_assignment_cache", default=False, help="Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache.")
d_group.add_argument('-d', '--datadir', action='store',dest="datadir",help="Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package.")
d_group.add_argument('--use_old_datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False")
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
d_group.add_argument('--usher-tree', action='store', dest='usher_protobuf', help="UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir.")
d_group.add_argument('--assignment-cache', action='store', dest='assignment_cache', help="Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment.")

Expand All @@ -107,23 +108,25 @@ def main(sysargs = sys.argv[1:]):
if args.usher:
sys.stderr.write(cyan(f"--usher is a pangolin v3 option and is deprecated in pangolin v4. UShER is now the default analysis mode. Use --analysis-mode to explicitly set mode.\n"))

setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config)
setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config, args.use_old_datadir)

if args.add_assignment_cache:
update.install_pangolin_assignment()
update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir)

if args.update:
version_dictionary = {'pangolin': __version__,
'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION],
'scorpio': config[KEY_SCORPIO_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary)

if args.update_data:
version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary, args.datadir)

# install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
KEY_PANGOLIN_VERSION="pangolin_version"
KEY_CONSTELLATIONS_VERSION="constellation_version"
KEY_SCORPIO_VERSION="scorpio_version"
KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version"
KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path"

KEY_VERBOSE="verbose"
KEY_LOG_API = "log_api"
Expand Down
19 changes: 13 additions & 6 deletions pangolin/utils/data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,24 @@ def install_error(package, url):

def get_assignment_cache(cache_file, config):
cache = ""
try:
import pangolin_assignment
pangolin_assignment_dir = pangolin_assignment.__path__[0]
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH]
for r, d, f in os.walk(pangolin_assignment_dir):
for fn in f:
if fn == cache_file and cache == "":
cache = os.path.join(r, fn)
if not os.path.exists(cache):
sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'))
sys.exit(-1)
except:
else:
sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before '
'"pangolin --use-assignment-cache", in order to install optional '
'pangolin-assignment repository (that will make future data updates slower).\n'))
sys.exit(-1)

# Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION].lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]} '
f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
'Run "pangolin --update-data" to fetch latest versions of both.'))
sys.exit(-1)
Expand All @@ -115,5 +114,13 @@ def get_assignment_cache(cache_file, config):
sys.exit(-1)
return cache

def get_constellation_files(path):
    """Collect constellation JSON files found anywhere beneath ``path``.

    The tree rooted at ``path`` is walked with ``os.walk``. A file is kept
    when its name ends with ``.json`` and the directory that holds it ends
    with ``/constellations`` or ``/constellations/definitions``.

    Returns a list of joined ``directory/filename`` paths in walk order;
    the list is empty when nothing matches.
    """
    wanted_dir_suffixes = ('/constellations', '/constellations/definitions')
    found = []
    for root, _subdirs, filenames in os.walk(path):
        # Skip whole directories that are not constellation locations.
        if not root.endswith(wanted_dir_suffixes):
            continue
        found.extend(
            os.path.join(root, name)
            for name in filenames
            if name.endswith('.json')
        )
    return found

# config={}
# check_install()
105 changes: 40 additions & 65 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
from pangolin import __version__

import pangolin_data
class PangolinAssignmentWrapper:
    """Placeholder carrying the two module attributes read from pangolin_assignment.

    Both attributes hold "nothing" values so code that reads ``__version__``
    or ``__path__[0]`` gets ``None`` instead of raising.
    """
    # No version is known for the placeholder.
    __version__ = None
    # One-element list so that indexing ``__path__[0]`` works and yields None.
    __path__ = [None]
try:
import pangolin_assignment
except ImportError:
# if we can't import the module, replace it with a mock object that has suitable attributes
pangolin_assignment = PangolinAssignmentWrapper()
import scorpio
import constellations

Expand Down Expand Up @@ -54,6 +62,8 @@ def setup_config_dict(cwd):
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
KEY_SCORPIO_VERSION: scorpio.__version__,
KEY_CONSTELLATIONS_VERSION: constellations.__version__,
KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment.__version__,
KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment.__path__[0],

KEY_VERBOSE: False,
KEY_LOG_API: "",
Expand Down Expand Up @@ -118,67 +128,36 @@ def version_from_init(init_file):
break
return version

def setup_data(datadir_arg,analysis_mode, config):

def setup_data(datadir_arg, analysis_mode, config, use_old_data):
datadir = check_datadir(datadir_arg)

pangolin_data_dir = pangolin_data.__path__[0]
constellations_dir = constellations.__path__[0]
constellation_files = []

data_locations = [os.walk(constellations_dir)]

if datadir:
data_locations.append(os.walk(datadir))

# the logic of this is to search the "built-in" constellations
# path first and then, if a custom datadir is passed, follow up with those, so that
# any files found in the datadir supersede the "built-in" modules. The assumption
# here is that the datadir contains newer (user updated) data
for r, _, f in itertools.chain.from_iterable(data_locations):
if r.endswith('/constellations') or r.endswith('/constellations/definitions'):
constellation_files = [] # only collect the constellations from the last directory found
for fn in f:
if r.endswith('/constellations') and fn == '__init__.py':
constellations_version = version_from_init(os.path.join(r, fn))
elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
constellation_files.append(os.path.join(r, fn))

pangolin_data_version = pangolin_data.__version__
use_datadir = False
datadir_too_old = False
config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data.__version__
config[KEY_DATADIR] = pangolin_data.__path__[0]
config[KEY_CONSTELLATIONS_VERSION] = constellations.__version__
config[KEY_CONSTELLATION_FILES] = get_constellation_files(constellations.__path__[0])
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment.__version__
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment.__path__[0]

if datadir:
version = "Unknown"
for r,d,f in os.walk(datadir):
for fn in f:
# pangolin-data/__init__.py not constellations/__init__.py:
if r.endswith('data') and fn == "__init__.py":
# print("Found " + os.path.join(r, fn))
version = version_from_init(os.path.join(r, fn))
if not version:
continue

if LooseVersion(version) >= LooseVersion(pangolin_data.__version__):
# only use this if the version is >= than what we already have
pangolin_data_version = version
use_datadir = True
else:
datadir_too_old = True
sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))

if use_datadir == False:
# we haven't got a viable datadir from searching args.datadir
if datadir and not datadir_too_old:
sys.stderr.write(cyan(
f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))

pangolin_data_dir = pangolin_data.__path__[0]
datadir = os.path.join(pangolin_data_dir,"data")

config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
config[KEY_CONSTELLATIONS_VERSION] = constellations_version
config[KEY_DATADIR] = datadir
config[KEY_CONSTELLATION_FILES] = constellation_files
for module_name in ('constellations', 'pangolin_data', 'pangolin_assignment'):
for r, _, f in os.walk(datadir):
for fn in f:
if r.endswith('/' + module_name) and fn == '__init__.py':
version = version_from_init(os.path.join(r, fn))
# module_name has been imported so exists in global namespace
current_version = getattr(globals()[module_name], '__version__', '0')
if use_old_data or current_version is None or LooseVersion(version) >= LooseVersion(current_version):
if module_name == "pangolin_data":
config[KEY_PANGOLIN_DATA_VERSION] = version
config[KEY_DATADIR] = os.path.join(datadir, r)
elif module_name == "pangolin_assignment":
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = version
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = os.path.join(datadir, r)
elif module_name == "constellations":
config[KEY_CONSTELLATIONS_VERSION] = version
config[KEY_CONSTELLATION_FILES] = get_constellation_files(r)
else:
sys.stderr.write(cyan(f"Warning: Ignoring {module_name} in specified datadir {datadir} - it contains {module_name} with older ({version}) than those installed ({current_version})\n"))

def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

Expand Down Expand Up @@ -207,11 +186,10 @@ def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

print(green(f"Maximum ambiguity allowed is {config[KEY_MAXAMBIG]}.\n****"))


def print_ram_warning(analysis_mode):
    """Print a RAM-usage warning when ``analysis_mode`` is "pangolearn".

    Any other value prints nothing. Always returns None.
    """
    if analysis_mode != "pangolearn":
        return
    message = "Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system."
    print(cyan(message))

def print_alias_file_exit(alias_file):
with open(alias_file, 'r') as handle:
for line in handle:
Expand Down Expand Up @@ -242,11 +220,8 @@ def print_versions_exit(config):
f"constellations: {config[KEY_CONSTELLATIONS_VERSION]}\n"
f"scorpio: {config[KEY_SCORPIO_VERSION]}")
# Report pangolin_assignment version if it is installed, otherwise ignore
try:
import pangolin_assignment
print(f"pangolin-assignment: {pangolin_assignment.__version__}")
except:
pass
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}")
# Print versions of other important tools used by pangolin
print_conda_version(['usher', 'ucsc-fatovcf', 'gofasta', 'minimap2'])
sys.exit(0)
Expand Down
54 changes: 15 additions & 39 deletions pangolin/utils/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,41 +64,33 @@ def git_lfs_install():
sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n"))
sys.exit(-1)

def pip_install_dep(dependency, release, datadir=None):
    """
    Use pip install to install a cov-lineages repository with the specified release.

    dependency: repository name under https://github.com/cov-lineages/.
    release: git ref (release tag) appended to the URL after "@".
    datadir: optional directory; when given, it is passed to pip as
        ``--target`` so the package is installed there instead of into
        the current Python environment.

    Raises subprocess.CalledProcessError if pip exits with a non-zero
    status (``check=True``). pip's stdout and stderr are discarded.
    """
    url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}"
    pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade']
    if datadir is not None:
        pip_command.extend(['--target', datadir])
    pip_command.append(url)
    # The child process inherits the current environment (subprocess default).
    subprocess.run(pip_command,
                   check=True,
                   stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL)


def install_pangolin_assignment():
def install_pangolin_assignment(pangolin_assignment_version, datadir=None):
"""
If the pangolin-assignment repo has not been installed already then install the latest release.
"""
try:
import pangolin_assignment
print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr)

except:
if pangolin_assignment_version is not None:
print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr)
else:
git_lfs_install()
latest_release, tarball = get_latest_release('pangolin-assignment')
pip_install_dep('pangolin-assignment', latest_release)
print(f"pangolin-assignment installed with latest release ({latest_release})")
pvanheus marked this conversation as resolved.
Show resolved Hide resolved


def add_pangolin_assignment_if_installed(version_dictionary):
    """
    If pangolin_assignment has been installed then add it to version_dictionary, else ignore.

    version_dictionary is modified in place: the key "pangolin-assignment"
    is set to the installed package's ``__version__``. Nothing is returned.
    """
    try:
        import pangolin_assignment
    except ImportError:
        # Optional package is not installed: leave the dictionary untouched.
        return
    version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__
pip_install_dep('pangolin-assignment', latest_release, datadir)


def update(version_dictionary, data_dir=None):
Expand Down Expand Up @@ -154,23 +146,7 @@ def update(version_dictionary, data_dir=None):
version = LooseVersion(version)

if version < latest_release_tidied:
if data_dir is not None:
# this path only gets followed when the user has --update_data and they
# have also specified a --datadir
with TemporaryDirectory() as tempdir:
dependency_package = package_names.get(dependency, dependency)
tarball_path = os.path.join(tempdir, 'tarball.tgz')
open(tarball_path, 'wb').write(request.urlopen(latest_release_tarball).read())
tf = tarfile.open(tarball_path)
extracted_dir = tf.next().name
tf.extractall(path=tempdir)
tf.close()
destination_directory = os.path.join(data_dir, dependency_package)
if os.path.isdir(destination_directory):
shutil.rmtree(destination_directory)
shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory)
else:
pip_install_dep(dependency, latest_release)
pip_install_dep(dependency, latest_release, data_dir)
print(f"{dependency} updated to {latest_release}", file=sys.stderr)
elif version > latest_release_tidied:
print(f"{dependency} ({version}) is newer than latest stable "
Expand Down