From d7bb1c142ea4eaf5422203e65a6775f1f63244d5 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 2 May 2024 11:07:15 +0300 Subject: [PATCH 01/34] Make dataset cache lifetime configurable --- cvat/apps/dataset_manager/apps.py | 18 ++++++++++++++++++ cvat/apps/dataset_manager/default_settings.py | 8 ++++++++ cvat/apps/dataset_manager/views.py | 5 +++-- 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 cvat/apps/dataset_manager/apps.py create mode 100644 cvat/apps/dataset_manager/default_settings.py diff --git a/cvat/apps/dataset_manager/apps.py b/cvat/apps/dataset_manager/apps.py new file mode 100644 index 00000000000..3e62d078171 --- /dev/null +++ b/cvat/apps/dataset_manager/apps.py @@ -0,0 +1,18 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +from django.apps import AppConfig + + +class DatasetManagerConfig(AppConfig): + name = "cvat.apps.dataset_manager" + + def ready(self) -> None: + from django.conf import settings + + from . import default_settings + + for key in dir(default_settings): + if key.isupper() and not hasattr(settings, key): + setattr(settings, key, getattr(default_settings, key)) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py new file mode 100644 index 00000000000..0f168b0ab07 --- /dev/null +++ b/cvat/apps/dataset_manager/default_settings.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +import os + +DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 10)) +"Base lifetime for cached exported datasets, in seconds" diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 41bc6ae1b67..9c16c29ee0c 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -1,5 +1,5 @@ # Copyright (C) 2019-2022 Intel Corporation -# Copyright (C) 2023 CVAT.ai Corporation +# Copyright (C) 2023-2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT @@ -41,11 +41,12 @@ def get_export_cache_dir(db_instance): else: raise FileNotFoundError('{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir)) -DEFAULT_CACHE_TTL = timedelta(hours=10) +DEFAULT_CACHE_TTL = timedelta(seconds=settings.DATASET_CACHE_TTL) TASK_CACHE_TTL = DEFAULT_CACHE_TTL PROJECT_CACHE_TTL = DEFAULT_CACHE_TTL / 3 JOB_CACHE_TTL = DEFAULT_CACHE_TTL + def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=None, save_images=False): try: if task_id is not None: From 7f151b107cf8c4b4f159991d062690b06e1e779f Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Fri, 3 May 2024 15:46:45 +0300 Subject: [PATCH 02/34] Use flock to lock export cache --- cvat/apps/dataset_manager/default_settings.py | 3 + cvat/apps/dataset_manager/util.py | 167 +++++++++++++++++- cvat/apps/dataset_manager/views.py | 77 ++++---- cvat/apps/engine/views.py | 47 +++-- 4 files changed, 241 insertions(+), 53 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 0f168b0ab07..9aa97bb5cb0 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -6,3 +6,6 @@ DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 10)) "Base lifetime for cached exported datasets, in seconds" + +DATASET_CACHE_LOCKING_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCKING_TIMEOUT", 10)) +"Timeout for cache lock acquiring, in seconds" \ No newline at end of file diff 
--git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 387b74d2177..431f1ee19ab 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -1,13 +1,20 @@ - # Copyright (C) 2019-2022 Intel Corporation +# Copyright (C) 2023-2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT from copy import deepcopy -from typing import Sequence +from datetime import datetime, timedelta +import random +from time import sleep +from typing import IO, Any, Optional, Protocol, Sequence, Union +from enum import IntEnum, auto import inspect import os, os.path as osp import zipfile +import fcntl +import os +import errno from django.conf import settings from django.db import models @@ -80,3 +87,159 @@ def deepcopy_simple(v): return v else: return deepcopy(v) + + + +class LockError(Exception): + pass + +class LockTimeoutError(LockError): + pass + +class LockNotAvailableError(LockError): + pass + +EXPORT_CACHE_DIR_LOCK_FILENAME = "dir.lock" + +class LockMode(IntEnum): + shared = auto() + exclusive = auto() + +class HasFileno(Protocol): + def fileno(self) -> int: ... + +FileDescriptorLike = Union[int, HasFileno] + + +def _lock_file( + f: FileDescriptorLike, + mode: LockMode = LockMode.exclusive, + block: bool = False, +): + """ + Creates an advisory, filesystem-level lock using BSD Flock. + """ + + flags = 0 + + if mode == LockMode.exclusive: + flags |= fcntl.LOCK_EX + elif mode == LockMode.shared: + flags |= fcntl.LOCK_SH + else: + assert False + + if not block: + flags |= fcntl.LOCK_NB + + try: + # Flock call details: + # https://manpages.debian.org/bookworm/manpages-dev/flock.2.en.html + fcntl.flock(f, flags) + except OSError as e: + # Interrupts (errno.EINTR) should be forwarded + if e.errno in [errno.EWOULDBLOCK, errno.ENOLCK]: + raise LockNotAvailableError + else: + raise + +def _unlock_file(f: FileDescriptorLike): + # Flock call details: + # https://manpages.debian.org/bookworm/manpages-dev/flock.2.en.html + fcntl.flock(f, fcntl.LOCK_UN) + + +class AtomicOpen: + """ + Class for ensuring that all file operations are atomic, treat + initialization like a standard call to 'open' that happens to be atomic. + This file opener *must* be used in a "with" block. + """ + + def __init__( + self, + path: str, + *args, + lock_mode: LockMode = LockMode.exclusive, + block: bool = True, + **kwargs, + ): + # Open the file to obtain a file descriptor for locking + self.file: IO[Any] = open(path, *args, **kwargs) + + # Lock the opened file + _lock_file(self.file, mode=lock_mode, block=block) + + # Return the opened file object (knowing a lock has been obtained). + def __enter__(self, *args, **kwargs): + return self.file + + # Unlock the file and close the file object. + def __exit__(self, exc_type=None, exc_value=None, traceback=None): + # Flush to make sure all buffered contents are written to file. + self.file.flush() + os.fsync(self.file.fileno()) + + # Release the lock on the file. + _unlock_file(self.file) + + self.file.close() + + # Handle exceptions that may have come up during execution, by + # default any exceptions are raised to the user. 
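+        # Returning False from __exit__ lets Python re-raise the in-progress exception.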
+        if exc_type is not None:
+            return False
+        else:
+            return True
+
+
+def get_file_lock(
+    file_path: os.PathLike[str],
+    *open_args,
+    lock_mode: LockMode = LockMode.exclusive,
+    block: bool = False,
+    timeout: Optional[int | timedelta] = None,
+    **open_kwargs
+) -> AtomicOpen:
+    if block and timeout:
+        raise NotImplementedError("Can't use timeout with blocking mode")
+
+    time_start = None
+    if timeout:
+        time_start = datetime.now()
+
+    if isinstance(timeout, int):
+        timeout = timedelta(seconds=timeout)
+
+    while True:
+        if timeout and time_start + timeout < datetime.now():
+            raise LockTimeoutError
+
+        try:
+            return AtomicOpen(
+                file_path, *open_args, block=block, lock_mode=lock_mode, **open_kwargs
+            )
+        except LockNotAvailableError as e:
+            if timeout:
+                # retry after a small randomized delay to avoid lock-step polling
+                delay = 0.1 + random.random()
+                if time_start + timeout < datetime.now() + timedelta(seconds=delay):
+                    raise LockTimeoutError from e
+
+                sleep(delay)
+            else:
+                raise
+
+    assert False, "Unreachable"
+
+
+def get_dataset_cache_lock(
+    export_dir: os.PathLike[str],
+    *,
+    mode: LockMode = LockMode.exclusive,
+    block: bool = False,
+    timeout: Optional[int | timedelta] = None,
+) -> AtomicOpen:
+    # Lock the directory using a file inside the directory
+    lock_file_path = osp.join(export_dir, EXPORT_CACHE_DIR_LOCK_FILENAME)
+
+    return get_file_lock(lock_file_path, mode='a', lock_mode=mode, block=block, timeout=timeout)
diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index 9c16c29ee0c..b148f6b1ac4 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -20,7 +20,7 @@
 from cvat.apps.engine.models import Project, Task, Job
 
 from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS
-from .util import current_function_name
+from .util import current_function_name, get_dataset_cache_lock, LockMode
 
 slogger = ServerLogManager(__name__)
 
@@ -32,12 +32,13 @@ def log_exception(logger=None, exc_info=True):
         (_MODULE_NAME, current_function_name(2)),
         exc_info=exc_info)
 
+EXPORT_CACHE_DIR_NAME = 'export_cache'
 
 def get_export_cache_dir(db_instance):
     base_dir = osp.abspath(db_instance.get_dirname())
 
     if osp.isdir(base_dir):
-        return osp.join(base_dir, 'export_cache')
+        return osp.join(base_dir, EXPORT_CACHE_DIR_NAME)
     else:
         raise FileNotFoundError('{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir))
 
@@ -46,6 +47,8 @@ def get_export_cache_dir(db_instance):
 PROJECT_CACHE_TTL = DEFAULT_CACHE_TTL / 3
 JOB_CACHE_TTL = DEFAULT_CACHE_TTL
 
+EXPORT_CACHE_LOCKING_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCKING_TIMEOUT)
+
 
 def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=None, save_images=False):
     try:
         if task_id is not None:
@@ -78,31 +81,35 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No
         tasks_update = list(map(lambda db_task: timezone.localtime(
             db_task.updated_date).timestamp(), db_instance.tasks.all()))
         instance_time = max(tasks_update + [instance_time])
-        if not (osp.exists(output_path) and \
-                instance_time <= osp.getmtime(output_path)):
-            os.makedirs(cache_dir, exist_ok=True)
-            with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir:
-                temp_file = osp.join(temp_dir, 'result')
-                export_fn(db_instance.id, temp_file, dst_format,
-                    server_url=server_url, save_images=save_images)
-                os.replace(temp_file, output_path)
-
-            archive_ctime = osp.getctime(output_path)
-            scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value)
-            cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl,
-                func=clear_export_cache,
-                file_path=output_path,
- file_ctime=archive_ctime, - logger=logger) - logger.info( - "The {} '{}' is exported as '{}' at '{}' " - "and available for downloading for the next {}. " - "Export cache cleaning job is enqueued, id '{}'".format( - db_instance.__class__.__name__.lower(), - db_instance.name if isinstance(db_instance, (Project, Task)) else db_instance.id, - dst_format, output_path, cache_ttl, - cleaning_job.id - )) + + os.makedirs(cache_dir, exist_ok=True) + with get_dataset_cache_lock( + cache_dir, mode=LockMode.exclusive, block=False, timeout=EXPORT_CACHE_LOCKING_TIMEOUT + ): + if not (osp.exists(output_path) and instance_time <= osp.getmtime(output_path)): + with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: + temp_file = osp.join(temp_dir, 'result') + export_fn(db_instance.id, temp_file, dst_format, + server_url=server_url, save_images=save_images) + os.replace(temp_file, output_path) + + archive_ctime = osp.getctime(output_path) + scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) + cleaning_job = scheduler.enqueue_in( + time_delta=cache_ttl, + func=clear_export_cache, + file_path=output_path, + file_ctime=archive_ctime, + logger=logger) + logger.info( + "The {} '{}' is exported as '{}' at '{}' " + "and available for downloading for the next {}. " + "Export cache cleaning job is enqueued, id '{}'".format( + db_instance.__class__.__name__.lower(), + db_instance.name if isinstance(db_instance, (Project, Task)) else db_instance.id, + dst_format, output_path, cache_ttl, + cleaning_job.id + )) return output_path except Exception: @@ -130,12 +137,16 @@ def export_project_annotations(project_id, dst_format=None, server_url=None): def clear_export_cache(file_path, file_ctime, logger): try: - if osp.exists(file_path) and osp.getctime(file_path) == file_ctime: - os.remove(file_path) - - logger.info( - "Export cache file '{}' successfully removed" \ - .format(file_path)) + cache_dir = osp.dirname(file_path) + with get_dataset_cache_lock( + cache_dir, mode=LockMode.exclusive, block=False, timeout=EXPORT_CACHE_LOCKING_TIMEOUT + ): + if osp.getctime(file_path) == file_ctime: + os.remove(file_path) + + logger.info("Export cache file '{}' successfully removed".format(file_path)) + except FileNotFoundError: + return except Exception: log_exception(logger) raise diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index d1ceaf24714..c68b1ef94ad 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -1,5 +1,5 @@ # Copyright (C) 2018-2022 Intel Corporation -# Copyright (C) 2022-2023 CVAT.ai Corporation +# Copyright (C) 2022-2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT @@ -2986,24 +2986,35 @@ def _export_annotations( if not file_path: return Response('A result for exporting job was not found for finished RQ job', status=status.HTTP_500_INTERNAL_SERVER_ERROR) - elif not osp.exists(file_path): - return Response('The result file does not exist in export cache', status=status.HTTP_500_INTERNAL_SERVER_ERROR) - - if action == "download": - filename = filename or \ - build_annotations_file_name( - class_name=db_instance.__class__.__name__, - identifier=db_instance.name if isinstance(db_instance, (Task, Project)) else db_instance.id, - timestamp=timestamp, - format_name=format_name, - is_annotation_file=is_annotation_file, - extension=osp.splitext(file_path)[1] - ) - - rq_job.delete() - return sendfile(request, file_path, attachment=True, attachment_filename=filename) - return Response(status=status.HTTP_201_CREATED) + cache_dir = 
osp.dirname(file_path) + with dm.util.get_dataset_cache_lock( + cache_dir, + mode=dm.util.LockMode.shared, + block=False, + timeout=dm.views.EXPORT_CACHE_LOCKING_TIMEOUT + ): + if action == "download": + if not osp.exists(file_path): + return Response('The result file does not exist in export cache', status=status.HTTP_500_INTERNAL_SERVER_ERROR) + + filename = filename or \ + build_annotations_file_name( + class_name=db_instance.__class__.__name__, + identifier=db_instance.name if isinstance(db_instance, (Task, Project)) else db_instance.id, + timestamp=timestamp, + format_name=format_name, + is_annotation_file=is_annotation_file, + extension=osp.splitext(file_path)[1] + ) + + rq_job.delete() + return sendfile(request, file_path, attachment=True, attachment_filename=filename) + else: + if osp.exists(file_path): + return Response(status=status.HTTP_201_CREATED) + else: + rq_job.delete() else: raise NotImplementedError(f"Export to {location} location is not implemented yet") elif rq_job.is_failed: From f48e807e450cafcd4f55718c7ffe8d7f16a3b6f3 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 7 May 2024 19:54:38 +0300 Subject: [PATCH 03/34] - use redlock for export cache locking - prolong export cache lifetime on cache requests - change 500 to 404 on missing cache file during downloading - remove rq job removal after result retrieval - make export filenames more distinct - fix possible infinite deferring of dependent rq jobs --- cvat/apps/dataset_manager/default_settings.py | 2 +- cvat/apps/dataset_manager/util.py | 187 +++++------------- cvat/apps/dataset_manager/views.py | 138 ++++++++++--- cvat/apps/engine/views.py | 79 +++++--- 4 files changed, 201 insertions(+), 205 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 9aa97bb5cb0..fe1530a5b4f 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -7,5 +7,5 @@ DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 10)) "Base lifetime for cached exported datasets, in seconds" -DATASET_CACHE_LOCKING_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCKING_TIMEOUT", 10)) +DATASET_CACHE_LOCK_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT", 10)) "Timeout for cache lock acquiring, in seconds" \ No newline at end of file diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 431f1ee19ab..3de1e391dea 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -3,21 +3,21 @@ # # SPDX-License-Identifier: MIT +from contextlib import contextmanager, suppress from copy import deepcopy -from datetime import datetime, timedelta -import random -from time import sleep -from typing import IO, Any, Optional, Protocol, Sequence, Union +from datetime import timedelta +from threading import Lock +from typing import Any, Generator, Optional, Sequence from enum import IntEnum, auto import inspect -import os, os.path as osp -import zipfile -import fcntl import os -import errno +import os.path as osp +import zipfile +import django_rq from django.conf import settings from django.db import models +from pottery import Redlock, ReleaseUnlockedLock def current_function_name(depth=1): @@ -99,147 +99,48 @@ class LockTimeoutError(LockError): class LockNotAvailableError(LockError): pass -EXPORT_CACHE_DIR_LOCK_FILENAME = "dir.lock" - class LockMode(IntEnum): shared = auto() exclusive = auto() -class HasFileno(Protocol): - def fileno(self) -> int: ... 
- -FileDescriptorLike = Union[int, HasFileno] - - -def _lock_file( - f: FileDescriptorLike, - mode: LockMode = LockMode.exclusive, - block: bool = False, -): - """ - Creates an advisory, filesystem-level lock using BSD Flock. - """ - - flags = 0 - - if mode == LockMode.exclusive: - flags |= fcntl.LOCK_EX - elif mode == LockMode.shared: - flags |= fcntl.LOCK_SH - else: - assert False - - if not block: - flags |= fcntl.LOCK_NB +@contextmanager +def get_dataset_cache_lock( + export_path: os.PathLike[str], + *, + ttl: int | timedelta, + block: bool = True, + acquire_timeout: Optional[int | timedelta] = None, +) -> Generator[Lock, Any, Any]: + if isinstance(acquire_timeout, timedelta): + acquire_timeout = acquire_timeout.total_seconds() + + if acquire_timeout is None: + acquire_timeout = -1 + + if isinstance(ttl, timedelta): + ttl = ttl.total_seconds() + elif not ttl: + raise ValueError("max_ttl must be a positive number") + + lock = Redlock( + key=export_path, + masters={ + django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value) + }, + auto_release_time=ttl, + raise_on_redis_errors=True, + ) try: - # Flock call details: - # https://manpages.debian.org/bookworm/manpages-dev/flock.2.en.html - fcntl.flock(f, flags) - except OSError as e: - # Interrupts (errno.EINTR) should be forwarded - if e.errno in [errno.EWOULDBLOCK, errno.ENOLCK]: - raise LockNotAvailableError - else: - raise - -def _unlock_file(f: FileDescriptorLike): - # Flock call details: - # https://manpages.debian.org/bookworm/manpages-dev/flock.2.en.html - fcntl.flock(f, fcntl.LOCK_UN) - - -class AtomicOpen: - """ - Class for ensuring that all file operations are atomic, treat - initialization like a standard call to 'open' that happens to be atomic. - This file opener *must* be used in a "with" block. - """ - - def __init__( - self, - path: str, - *args, - lock_mode: LockMode = LockMode.exclusive, - block: bool = True, - **kwargs, - ): - # Open the file to obtain a file descriptor for locking - self.file: IO[Any] = open(path, *args, **kwargs) - - # Lock the opened file - _lock_file(self.file, mode=lock_mode, block=block) - - # Return the opened file object (knowing a lock has been obtained). - def __enter__(self, *args, **kwargs): - return self.file - - # Unlock the file and close the file object. - def __exit__(self, exc_type=None, exc_value=None, traceback=None): - # Flush to make sure all buffered contents are written to file. - self.file.flush() - os.fsync(self.file.fileno()) - - # Release the lock on the file. - _unlock_file(self.file) - - self.file.close() - - # Handle exceptions that may have come up during execution, by - # default any exceptions are raised to the user. 
- if (exc_type != None): - return False + acquired = lock.acquire(blocking=block, timeout=acquire_timeout) + if acquired: + yield lock else: - return True - - -def get_file_lock( - file_path: os.PathLike[str], - *open_args, - lock_mode: LockMode = LockMode.exclusive, - block: bool = False, - timeout: Optional[int | timedelta] = None, - **open_kwargs -) -> AtomicOpen: - if block and timeout: - raise NotImplementedError("Can't use timeout with blocking mode") - - time_start = None - if timeout: - time_start = datetime.now() - - if isinstance(timeout, int): - timeout = timedelta(seconds=timeout) - - while True: - if timeout and time_start + timeout < datetime.now(): - raise LockTimeoutError - - try: - return AtomicOpen( - file_path, *open_args, block=block, lock_mode=lock_mode, **open_kwargs - ) - except LockNotAvailableError as e: - if timeout: - delay = 0.1 + random.random(1) - if time_start + timeout < datetime.now() + delay: - raise LockTimeoutError from e - - sleep(delay) + if acquire_timeout > 0: + raise LockTimeoutError else: - raise - - assert False, "Unreachable" + raise LockNotAvailableError - -def get_dataset_cache_lock( - export_dir: os.PathLike[str], - *, - mode: LockMode = LockMode.exclusive, - block: bool = False, - timeout: Optional[int | timedelta] = None, -) -> AtomicOpen: - # Lock the directory using a file inside the directory - lock_file_path = osp.join(export_dir, EXPORT_CACHE_DIR_LOCK_FILENAME) - - return get_file_lock(lock_file_path, mode='a', lock_mode=mode, block=block, timeout=timeout) + finally: + with suppress(ReleaseUnlockedLock): + lock.release() diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index b148f6b1ac4..3837719b0f0 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -3,12 +3,16 @@ # # SPDX-License-Identifier: MIT +from contextlib import suppress +import logging import os import os.path as osp +import re import tempfile from datetime import timedelta import django_rq +import rq from datumaro.util.os_util import make_file_name from datumaro.util import to_snake_case from django.utils import timezone @@ -20,7 +24,7 @@ from cvat.apps.engine.models import Project, Task, Job from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS -from .util import current_function_name, get_dataset_cache_lock, LockMode +from .util import current_function_name, get_dataset_cache_lock, LockMode, LockError slogger = ServerLogManager(__name__) @@ -40,67 +44,99 @@ def get_export_cache_dir(db_instance): if osp.isdir(base_dir): return osp.join(base_dir, EXPORT_CACHE_DIR_NAME) else: - raise FileNotFoundError('{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir)) + raise FileNotFoundError( + '{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir) + ) DEFAULT_CACHE_TTL = timedelta(seconds=settings.DATASET_CACHE_TTL) -TASK_CACHE_TTL = DEFAULT_CACHE_TTL PROJECT_CACHE_TTL = DEFAULT_CACHE_TTL / 3 +TASK_CACHE_TTL = DEFAULT_CACHE_TTL JOB_CACHE_TTL = DEFAULT_CACHE_TTL -EXPORT_CACHE_LOCKING_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCKING_TIMEOUT) +EXPORT_CACHE_LOCK_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_TIMEOUT) + +def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> int: + TTL_CONSTS = { + 'project': PROJECT_CACHE_TTL, + 'task': TASK_CACHE_TTL, + 'job': JOB_CACHE_TTL, + } + + if not isinstance(db_instance, str): + db_instance = db_instance.__class__.__name__.lower() + + return TTL_CONSTS[db_instance].total_seconds() + +def 
get_file_instance_timestamp(file_path: str) -> float: + match = re.search(r'instance(\d+\.\d+)', osp.basename(file_path)) + return float(match.group(1)) def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=None, save_images=False): try: if task_id is not None: logger = slogger.task[task_id] - cache_ttl = TASK_CACHE_TTL export_fn = task.export_task db_instance = Task.objects.get(pk=task_id) elif project_id is not None: logger = slogger.project[project_id] - cache_ttl = PROJECT_CACHE_TTL export_fn = project.export_project db_instance = Project.objects.get(pk=project_id) else: logger = slogger.job[job_id] - cache_ttl = JOB_CACHE_TTL export_fn = task.export_job db_instance = Job.objects.get(pk=job_id) - cache_dir = get_export_cache_dir(db_instance) + cache_ttl = timedelta(seconds=get_export_cache_ttl(db_instance)) + cache_dir = get_export_cache_dir(db_instance) exporter = EXPORT_FORMATS[dst_format] - output_base = '%s_%s' % ('dataset' if save_images else 'annotations', - make_file_name(to_snake_case(dst_format))) - output_path = '%s.%s' % (output_base, exporter.EXT) - output_path = osp.join(cache_dir, output_path) - instance_time = timezone.localtime(db_instance.updated_date).timestamp() + instance_update_time = timezone.localtime(db_instance.updated_date) if isinstance(db_instance, Project): - tasks_update = list(map(lambda db_task: timezone.localtime( - db_task.updated_date).timestamp(), db_instance.tasks.all())) - instance_time = max(tasks_update + [instance_time]) + tasks_update = list(map( + lambda db_task: timezone.localtime(db_task.updated_date), + db_instance.tasks.all() + )) + instance_update_time = max(tasks_update + [instance_update_time]) + + output_path = '%s-instance%s_%s.%s' % ( + 'dataset' if save_images else 'annotations', + # store the instance timestamp in the file name to reliably get this information + # ctime / mtime do not return file creation time on linux + # mtime is used for file usage checks + instance_update_time.timestamp(), + make_file_name(to_snake_case(dst_format)), + exporter.EXT, + ) + output_path = osp.join(cache_dir, output_path) os.makedirs(cache_dir, exist_ok=True) + with get_dataset_cache_lock( - cache_dir, mode=LockMode.exclusive, block=False, timeout=EXPORT_CACHE_LOCKING_TIMEOUT + output_path, + block=True, + acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, + ttl=rq.get_current_job().timeout, ): - if not (osp.exists(output_path) and instance_time <= osp.getmtime(output_path)): + if not ( + osp.exists(output_path) + and instance_update_time.timestamp() <= get_file_instance_timestamp(output_path) + ): with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: temp_file = osp.join(temp_dir, 'result') export_fn(db_instance.id, temp_file, dst_format, server_url=server_url, save_images=save_images) os.replace(temp_file, output_path) - archive_ctime = osp.getctime(output_path) scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) cleaning_job = scheduler.enqueue_in( time_delta=cache_ttl, func=clear_export_cache, file_path=output_path, - file_ctime=archive_ctime, - logger=logger) + file_ctime=instance_update_time.timestamp(), + logger=logger + ) logger.info( "The {} '{}' is exported as '{}' at '{}' " "and available for downloading for the next {}. 
" @@ -109,9 +145,14 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No db_instance.name if isinstance(db_instance, (Project, Task)) else db_instance.id, dst_format, output_path, cache_ttl, cleaning_job.id - )) + ) + ) return output_path + except LockError: + rq_job = rq.get_current_job() + rq_job.retry(django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value)) + return except Exception: log_exception(logger) raise @@ -135,17 +176,54 @@ def export_project_annotations(project_id, dst_format=None, server_url=None): return export(dst_format, project_id=project_id, server_url=server_url, save_images=False) -def clear_export_cache(file_path, file_ctime, logger): +def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger) -> None: try: - cache_dir = osp.dirname(file_path) with get_dataset_cache_lock( - cache_dir, mode=LockMode.exclusive, block=False, timeout=EXPORT_CACHE_LOCKING_TIMEOUT + file_path, + block=True, + acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, + ttl=rq.get_current_job().timeout, ): - if osp.getctime(file_path) == file_ctime: - os.remove(file_path) - - logger.info("Export cache file '{}' successfully removed".format(file_path)) - except FileNotFoundError: + if 'job' in file_path: + instance_type = 'job' + elif 'task' in file_path: + instance_type = 'task' + elif 'project' in file_path: + instance_type = 'project' + else: + assert False + + cache_ttl = get_export_cache_ttl(instance_type) + + instance_timestamp = None + with suppress(AttributeError): # backward compatibility + instance_timestamp = get_file_instance_timestamp(file_path) + if instance_timestamp and instance_timestamp != file_ctime: + logger.info("Export cache file '{}' has changed, skipping".format(file_path)) + return + + if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl: + # Need to retry later, the export is in use + rq_job = rq.get_current_job() + rq_job.retries_left = 1 + rq_job.retry_intervals = [cache_ttl] + rq_job.retry( + django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value), pipeline=None + ) + logger.info( + "Export cache file '{}' is recently accessed, will retry in {}".format( + file_path, timedelta(seconds=cache_ttl) + ) + ) + return + + # TODO: maybe remove all outdated exports + os.remove(file_path) + logger.info("Export cache file '{}' successfully removed".format(file_path)) + except LockError: + # Need to retry later if the lock was not available + rq_job = rq.get_current_job() + rq_job.retry(django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value), pipeline=None) return except Exception: log_exception(logger) diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index c68b1ef94ad..6751065b5b3 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -2959,72 +2959,95 @@ def _export_annotations( f"Unexpected location {location} specified for the request" ) - last_instance_update_time = timezone.localtime(db_instance.updated_date) + cache_ttl = dm.views.get_export_cache_ttl(db_instance) + instance_update_time = timezone.localtime(db_instance.updated_date) if isinstance(db_instance, Project): tasks_update = list(map(lambda db_task: timezone.localtime(db_task.updated_date), db_instance.tasks.all())) - last_instance_update_time = max(tasks_update + [last_instance_update_time]) + instance_update_time = max(tasks_update + [instance_update_time]) - timestamp = datetime.strftime(last_instance_update_time, "%Y_%m_%d_%H_%M_%S") + instance_timestamp = datetime.strftime(instance_update_time, 
"%Y_%m_%d_%H_%M_%S") is_annotation_file = rq_id.startswith('export:annotations') if rq_job: rq_request = rq_job.meta.get('request', None) request_time = rq_request.get('timestamp', None) if rq_request else None - if request_time is None or request_time < last_instance_update_time: - # in case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER - # we have to enqueue dependent jobs after canceling one + if request_time is None or request_time < instance_update_time: + # The result is outdated, need to restart the export. + # Cancel the current job. + # The new attempt will be made after the last existing job. + # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER + # we have to enqueue dependent jobs after canceling one. rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER) - rq_job.delete() else: if rq_job.is_finished: if location == Location.CLOUD_STORAGE: - rq_job.delete() return Response(status=status.HTTP_200_OK) elif location == Location.LOCAL: file_path = rq_job.return_value() if not file_path: - return Response('A result for exporting job was not found for finished RQ job', status=status.HTTP_500_INTERNAL_SERVER_ERROR) + return Response( + 'A result for exporting job was not found for finished RQ job', + status=status.HTTP_500_INTERNAL_SERVER_ERROR + ) - cache_dir = osp.dirname(file_path) with dm.util.get_dataset_cache_lock( - cache_dir, - mode=dm.util.LockMode.shared, - block=False, - timeout=dm.views.EXPORT_CACHE_LOCKING_TIMEOUT + file_path, ttl=60, # request timeout ): if action == "download": if not osp.exists(file_path): - return Response('The result file does not exist in export cache', status=status.HTTP_500_INTERNAL_SERVER_ERROR) + return Response( + "The exported file has expired, please retry exporting", + status=status.HTTP_404_NOT_FOUND + ) filename = filename or \ build_annotations_file_name( class_name=db_instance.__class__.__name__, identifier=db_instance.name if isinstance(db_instance, (Task, Project)) else db_instance.id, - timestamp=timestamp, + timestamp=instance_timestamp, format_name=format_name, is_annotation_file=is_annotation_file, extension=osp.splitext(file_path)[1] ) - rq_job.delete() return sendfile(request, file_path, attachment=True, attachment_filename=filename) else: if osp.exists(file_path): + # Update last update time to prolong the export lifetime + # as the last access time is not available on every filesystem + os.utime(file_path, None) + return Response(status=status.HTTP_201_CREATED) else: - rq_job.delete() + # Cancel and reenqueue the job. + # The new attempt will be made after the last existing job. + # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER + # we have to enqueue dependent jobs after canceling one. + rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER) else: raise NotImplementedError(f"Export to {location} location is not implemented yet") elif rq_job.is_failed: exc_info = rq_job.meta.get('formatted_exception', str(rq_job.exc_info)) - rq_job.delete() - return Response(exc_info, - status=status.HTTP_500_INTERNAL_SERVER_ERROR) + return Response(exc_info, status=status.HTTP_500_INTERNAL_SERVER_ERROR) + elif rq_job.is_deferred and rq_id not in queue.deferred_job_registry.get_job_ids(): + # Sometimes jobs can depend on outdated jobs in the deferred jobs registry. + # They can be fetched by their specific ids, but are not listed by get_job_ids(). 
+ # Supposedly, this can happen because of the server restarts + # (potentially, because the redis used for the queue is inmemory). + # Another potential reason is canceling without enqueueing dependents. + # Such dependencies are never removed or finished, + # as there is no TTL for deferred jobs, + # so the current job can be blocked indefinitely. + + # Cancel the current job and then reenqueue it, considering the current situation. + # The new attempt will be made after the last existing job. + # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER + # we have to enqueue dependent jobs after canceling one. + rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER) else: return Response(status=status.HTTP_202_ACCEPTED) - try: if request.scheme: server_address = request.scheme + '://' @@ -3032,12 +3055,6 @@ def _export_annotations( except Exception: server_address = None - TTL_CONSTS = { - 'project': dm.views.PROJECT_CACHE_TTL, - 'task': dm.views.TASK_CACHE_TTL, - 'job': dm.views.JOB_CACHE_TTL, - } - ttl = TTL_CONSTS[db_instance.__class__.__name__.lower()].total_seconds() user_id = request.user.id func = callback if location == Location.LOCAL else export_resource_to_cloud_storage @@ -3057,7 +3074,7 @@ def _export_annotations( filename_pattern = build_annotations_file_name( class_name=db_instance.__class__.__name__, identifier=db_instance.name if isinstance(db_instance, (Task, Project)) else db_instance.id, - timestamp=timestamp, + timestamp=instance_timestamp, format_name=format_name, is_annotation_file=is_annotation_file, ) @@ -3072,8 +3089,8 @@ def _export_annotations( job_id=rq_id, meta=get_rq_job_meta(request=request, db_obj=db_instance), depends_on=define_dependent_job(queue, user_id, rq_id=rq_id), - result_ttl=ttl, - failure_ttl=ttl, + result_ttl=cache_ttl, + failure_ttl=cache_ttl, ) handle_dataset_export(db_instance, From 4c58bf56d8e68d636496072eb841268ba6447bc1 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 7 May 2024 19:55:25 +0300 Subject: [PATCH 04/34] update server deps --- cvat/requirements/base.in | 1 + cvat/requirements/base.txt | 36 ++++++++++++++----------- cvat/requirements/development.txt | 6 ++--- cvat/requirements/production.txt | 2 +- utils/dataset_manifest/requirements.txt | 4 +-- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in index 063eb8e4a2e..490b0b0b26e 100644 --- a/cvat/requirements/base.in +++ b/cvat/requirements/base.in @@ -37,6 +37,7 @@ patool==1.12 pdf2image==1.14.0 Pillow>=10.3.0 +pottery==3.0.0 psutil==5.9.4 psycopg2-binary==2.9.5 python-ldap==3.4.3 diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt index 949cc17a341..6e8d4e6a436 100644 --- a/cvat/requirements/base.txt +++ b/cvat/requirements/base.txt @@ -1,4 +1,4 @@ -# SHA1:324d0ff72d2920a7d7c735fd480ddb30424c06ec +# SHA1:8394699e81766ae7a750bc3f6bf8726886ca0217 # # This file is autogenerated by pip-compile-multi # To update, run: @@ -50,7 +50,7 @@ coreschema==0.0.4 # via coreapi crontab==1.0.1 # via rq-scheduler -cryptography==42.0.5 +cryptography==42.0.6 # via # azure-storage-blob # pyjwt @@ -99,7 +99,7 @@ django-crum==0.7.9 # via -r cvat/requirements/base.in django-filter==2.4.0 # via -r cvat/requirements/base.in -django-health-check==3.18.1 +django-health-check==3.18.2 # via -r cvat/requirements/base.in django-rq==2.8.1 # via -r cvat/requirements/base.in @@ -116,13 +116,13 @@ easyprocess==1.1 # via pyunpack entrypoint2==1.1 # via pyunpack -fonttools==4.50.0 
+fonttools==4.51.0 # via matplotlib -freezegun==1.4.0 +freezegun==1.5.0 # via rq-scheduler furl==2.1.0 # via -r cvat/requirements/base.in -google-api-core==2.18.0 +google-api-core==2.19.0 # via # google-cloud-core # google-cloud-storage @@ -141,9 +141,9 @@ google-resumable-media==2.7.0 # via google-cloud-storage googleapis-common-protos==1.63.0 # via google-api-core -h5py==3.10.0 +h5py==3.11.0 # via datumaro -idna==3.6 +idna==3.7 # via requests importlib-metadata==7.1.0 # via clickhouse-connect @@ -155,7 +155,7 @@ isodate==0.6.1 # via msrest itypes==1.2.0 # via coreapi -jinja2==3.1.3 +jinja2==3.1.4 # via coreschema jmespath==0.10.0 # via @@ -165,7 +165,7 @@ jsonschema==4.17.3 # via drf-spectacular kiwisolver==1.4.5 # via matplotlib -limits==3.10.1 +limits==3.11.0 # via python-logstash-async lxml==5.2.1 # via datumaro @@ -177,9 +177,11 @@ matplotlib==3.8.4 # via # datumaro # pycocotools +mmh3==4.1.0 + # via pottery msrest==0.7.1 # via azure-storage-blob -networkx==3.2.1 +networkx==3.3 # via datumaro nibabel==5.2.1 # via datumaro @@ -187,7 +189,7 @@ oauthlib==3.2.2 # via requests-oauthlib orderedmultidict==1.0.1 # via furl -orjson==3.10.0 +orjson==3.10.3 # via datumaro packaging==24.0 # via @@ -195,12 +197,14 @@ packaging==24.0 # matplotlib # nibabel # tensorboardx -pandas==2.2.1 +pandas==2.2.2 # via datumaro patool==1.12 # via -r cvat/requirements/base.in pdf2image==1.14.0 # via -r cvat/requirements/base.in +pottery==3.0.0 + # via -r cvat/requirements/base.in proto-plus==1.23.0 # via google-api-core protobuf==4.25.3 @@ -266,6 +270,7 @@ redis==4.5.4 # via # -r cvat/requirements/base.in # django-rq + # pottery # rq requests==2.31.0 # via @@ -313,16 +318,17 @@ six==1.16.0 # isodate # orderedmultidict # python-dateutil -sqlparse==0.4.4 +sqlparse==0.5.0 # via django tensorboardx==2.6.2.2 # via datumaro -typing-extensions==4.10.0 +typing-extensions==4.11.0 # via # asgiref # azure-core # datumaro # limits + # pottery tzdata==2024.1 # via pandas uritemplate==4.1.1 diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt index 2c9f1bcc0a3..459f8ccf154 100644 --- a/cvat/requirements/development.txt +++ b/cvat/requirements/development.txt @@ -10,7 +10,7 @@ astroid==2.11.7 # via pylint autopep8==2.1.0 # via django-silk -black==24.3.0 +black==24.4.2 # via -r cvat/requirements/development.in dill==0.3.8 # via pylint @@ -30,7 +30,7 @@ mypy-extensions==1.0.0 # via black pathspec==0.12.1 # via black -platformdirs==4.2.0 +platformdirs==4.2.1 # via # black # pylint @@ -62,5 +62,5 @@ tornado==6.4 # via snakeviz # The following packages are considered to be unsafe in a requirements file: -setuptools==69.2.0 +setuptools==69.5.1 # via astroid diff --git a/cvat/requirements/production.txt b/cvat/requirements/production.txt index 1f4b6e44f27..a3eddfbd44e 100644 --- a/cvat/requirements/production.txt +++ b/cvat/requirements/production.txt @@ -10,7 +10,7 @@ anyio==4.3.0 # via watchfiles coverage==7.2.3 # via -r cvat/requirements/production.in -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio h11==0.14.0 # via uvicorn diff --git a/utils/dataset_manifest/requirements.txt b/utils/dataset_manifest/requirements.txt index 0edc970086d..c69cb304507 100644 --- a/utils/dataset_manifest/requirements.txt +++ b/utils/dataset_manifest/requirements.txt @@ -1,4 +1,4 @@ -# SHA1:c60d1ed19f53b618c5528cd4b4fc708bc7ba404f +# SHA1:3671835f743ca6c6c8d49b36eda2bb7e0763fa0b # # This file is autogenerated by pip-compile-multi # To update, run: @@ -15,5 +15,5 @@ opencv-python-headless==4.9.0.80 # via -r 
utils/dataset_manifest/requirements.in pillow==10.3.0 # via -r utils/dataset_manifest/requirements.in -tqdm==4.66.2 +tqdm==4.66.4 # via -r utils/dataset_manifest/requirements.in From 5c22d284f1b8d9c3a0d4a5b62012a283741171be Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 8 May 2024 13:58:49 +0300 Subject: [PATCH 05/34] Fix deps after merge --- cvat/requirements/base.txt | 17 ++++++----------- cvat/requirements/development.txt | 3 +++ cvat/requirements/testing.txt | 3 +++ utils/dataset_manifest/requirements.txt | 3 +++ 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt index 218eab91f1f..f216a3cf2a7 100644 --- a/cvat/requirements/base.txt +++ b/cvat/requirements/base.txt @@ -1,8 +1,4 @@ -<<<<<<< HEAD -# SHA1:8394699e81766ae7a750bc3f6bf8726886ca0217 -======= -# SHA1:dd0cf6b5e35e6d492749b679dccd43a87410571e ->>>>>>> develop +# SHA1:e887f730916864158338619867ddf1eed8f4d223 # # This file is autogenerated by pip-compile-multi # To update, run: @@ -10,6 +6,9 @@ # pip-compile-multi # -r ../../utils/dataset_manifest/requirements.txt +--no-binary lxml +--no-binary xmlsec + asgiref==3.8.1 # via django async-timeout==4.0.3 @@ -54,7 +53,7 @@ coreschema==0.0.4 # via coreapi crontab==1.0.1 # via rq-scheduler -cryptography==42.0.6 +cryptography==42.0.7 # via # azure-storage-blob # pyjwt @@ -72,7 +71,7 @@ dj-pagination==2.5.0 # via -r cvat/requirements/base.in dj-rest-auth[with_social]==5.0.2 # via -r cvat/requirements/base.in -django==4.2.11 +django==4.2.13 # via # -r cvat/requirements/base.in # dj-rest-auth @@ -199,11 +198,7 @@ oauthlib==3.2.2 # via requests-oauthlib orderedmultidict==1.0.1 # via furl -<<<<<<< HEAD orjson==3.10.3 -======= -orjson==3.10.2 ->>>>>>> develop # via datumaro packaging==24.0 # via diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt index 459f8ccf154..625afe4f3c0 100644 --- a/cvat/requirements/development.txt +++ b/cvat/requirements/development.txt @@ -6,6 +6,9 @@ # pip-compile-multi # -r base.txt +--no-binary lxml +--no-binary xmlsec + astroid==2.11.7 # via pylint autopep8==2.1.0 diff --git a/cvat/requirements/testing.txt b/cvat/requirements/testing.txt index f746e59ede4..2575e19731b 100644 --- a/cvat/requirements/testing.txt +++ b/cvat/requirements/testing.txt @@ -6,6 +6,9 @@ # pip-compile-multi # -r development.txt +--no-binary lxml +--no-binary xmlsec + coverage==7.2.3 # via -r cvat/requirements/testing.in fakeredis==2.10.3 diff --git a/utils/dataset_manifest/requirements.txt b/utils/dataset_manifest/requirements.txt index c69cb304507..d9272cb8552 100644 --- a/utils/dataset_manifest/requirements.txt +++ b/utils/dataset_manifest/requirements.txt @@ -5,6 +5,9 @@ # # pip-compile-multi # +--no-binary lxml +--no-binary xmlsec + av==9.2.0 # via -r utils/dataset_manifest/requirements.in natsort==8.0.0 From 49baadfc1d9ccf87e3db974bad330b082d8ae747 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 8 May 2024 13:59:07 +0300 Subject: [PATCH 06/34] Refactor some code --- cvat/apps/dataset_manager/util.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 3de1e391dea..7a911763557 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -8,7 +8,6 @@ from datetime import timedelta from threading import Lock from typing import Any, Generator, Optional, Sequence -from enum import IntEnum, auto import inspect import os import 
os.path as osp @@ -93,16 +92,14 @@ def deepcopy_simple(v): class LockError(Exception): pass + class LockTimeoutError(LockError): pass + class LockNotAvailableError(LockError): pass -class LockMode(IntEnum): - shared = auto() - exclusive = auto() - @contextmanager def get_dataset_cache_lock( @@ -114,29 +111,27 @@ def get_dataset_cache_lock( ) -> Generator[Lock, Any, Any]: if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() - - if acquire_timeout is None: + elif acquire_timeout and acquire_timeout < 0: + raise ValueError("acquire_timeout must be a positive number") + elif acquire_timeout is None: acquire_timeout = -1 if isinstance(ttl, timedelta): ttl = ttl.total_seconds() - elif not ttl: - raise ValueError("max_ttl must be a positive number") + elif not ttl or ttl < 0: + raise ValueError("ttl must be a positive number") lock = Redlock( key=export_path, - masters={ - django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value) - }, + masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)}, auto_release_time=ttl, - raise_on_redis_errors=True, ) try: acquired = lock.acquire(blocking=block, timeout=acquire_timeout) if acquired: yield lock else: - if acquire_timeout > 0: + if 0 < acquire_timeout: raise LockTimeoutError else: raise LockNotAvailableError From de73fe6b422e2df6bc92b64430cb83eca81bf5c6 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 8 May 2024 14:22:02 +0300 Subject: [PATCH 07/34] fix import --- cvat/apps/dataset_manager/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 3837719b0f0..2d4b852199d 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -24,7 +24,7 @@ from cvat.apps.engine.models import Project, Task, Job from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS -from .util import current_function_name, get_dataset_cache_lock, LockMode, LockError +from .util import current_function_name, get_dataset_cache_lock, LockError slogger = ServerLogManager(__name__) From 35ce1f6e58a147a11069e9b49da961f3d1a64362 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 8 May 2024 14:27:56 +0300 Subject: [PATCH 08/34] Fix linter --- cvat/apps/dataset_manager/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index fe1530a5b4f..60a1f60a40d 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -8,4 +8,4 @@ "Base lifetime for cached exported datasets, in seconds" DATASET_CACHE_LOCK_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT", 10)) -"Timeout for cache lock acquiring, in seconds" \ No newline at end of file +"Timeout for cache lock acquiring, in seconds" From 7b671f5c77c2ebbcb8fbf4ac4245171d0b32712a Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 8 May 2024 14:44:40 +0300 Subject: [PATCH 09/34] Update changelog --- .../20240508_144202_mzhiltso_fix_dataset_downloading.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelog.d/20240508_144202_mzhiltso_fix_dataset_downloading.md diff --git a/changelog.d/20240508_144202_mzhiltso_fix_dataset_downloading.md b/changelog.d/20240508_144202_mzhiltso_fix_dataset_downloading.md new file mode 100644 index 00000000000..a33c56df635 --- /dev/null +++ b/changelog.d/20240508_144202_mzhiltso_fix_dataset_downloading.md @@ 
-0,0 +1,5 @@ +### Fixed + +- The 500 / "The result file does not exist in export cache" error + on dataset export request + () From 4a4b5d0e303d74d8bcf4bbab2017ec097fabef7b Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 9 May 2024 13:13:01 +0300 Subject: [PATCH 10/34] Fix failing cloud storage tests --- tests/python/rest_api/test_resource_import_export.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/python/rest_api/test_resource_import_export.py b/tests/python/rest_api/test_resource_import_export.py index 78b1d0cf1b2..833661fcfab 100644 --- a/tests/python/rest_api/test_resource_import_export.py +++ b/tests/python/rest_api/test_resource_import_export.py @@ -1,5 +1,5 @@ # Copyright (C) 2021-2022 Intel Corporation -# Copyright (C) 2022-2023 CVAT.ai Corporation +# Copyright (C) 2022-2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT @@ -29,6 +29,7 @@ def _make_client(): @pytest.mark.usefixtures("restore_db_per_class") class TestExportResourceToS3(_S3ResourceTest): + @pytest.mark.usefixtures("restore_redis_inmem_per_function") @pytest.mark.parametrize("cloud_storage_id", [3]) @pytest.mark.parametrize( "obj_id, obj, resource", @@ -53,6 +54,7 @@ def test_save_resource_to_cloud_storage_with_specific_location( self._export_resource(cloud_storage, obj_id, obj, resource, **kwargs) + @pytest.mark.usefixtures("restore_redis_inmem_per_function") @pytest.mark.parametrize("user_type", ["admin", "assigned_supervisor_org_member"]) @pytest.mark.parametrize( "obj_id, obj, resource", @@ -177,6 +179,7 @@ def test_user_cannot_export_to_cloud_storage_with_specific_location_without_acce @pytest.mark.usefixtures("restore_db_per_function") @pytest.mark.usefixtures("restore_cvat_data") class TestImportResourceFromS3(_S3ResourceTest): + @pytest.mark.usefixtures("restore_redis_inmem_per_function") @pytest.mark.parametrize("cloud_storage_id", [3]) @pytest.mark.parametrize( "obj_id, obj, resource", @@ -201,6 +204,7 @@ def test_import_resource_from_cloud_storage_with_specific_location( self._export_resource(cloud_storage, obj_id, obj, resource, **export_kwargs) self._import_resource(cloud_storage, resource, obj_id, obj, **kwargs) + @pytest.mark.usefixtures("restore_redis_inmem_per_function") @pytest.mark.parametrize( "user_type", ["admin", "assigned_supervisor_org_member"], From 2c4be731b1cd0b1528bcf211bc14fd7990f7cefd Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 10:51:31 +0300 Subject: [PATCH 11/34] Add rough test --- .../tests/test_rest_api_formats.py | 219 +++++++++++++++++- 1 file changed, 217 insertions(+), 2 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 7c30344d2ba..7efe327e5d4 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1,20 +1,26 @@ # Copyright (C) 2021-2022 Intel Corporation -# Copyright (C) 2022-2023 CVAT.ai Corporation +# Copyright (C) 2022-2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT import copy +import itertools import json import os.path as osp import os +import multiprocessing import av import numpy as np import random import xml.etree.ElementTree as ET import zipfile +from contextlib import ExitStack, contextmanager, suppress +from functools import partial from io import BytesIO -import itertools +from typing import Any +from unittest.mock import MagicMock, patch, DEFAULT as MOCK_DEFAULT +from attr import define, field from 
datumaro.components.dataset import Dataset from datumaro.util.test_utils import compare_datasets, TestDir from django.contrib.auth.models import Group, User @@ -25,6 +31,7 @@ import cvat.apps.dataset_manager as dm from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData from cvat.apps.dataset_manager.task import TaskAnnotation +from cvat.apps.dataset_manager.views import clear_export_cache, export from cvat.apps.engine.models import Task from cvat.apps.engine.tests.utils import get_paginated_collection @@ -1265,6 +1272,214 @@ def test_api_v2_check_attribute_import_in_tracks(self): data_from_task_after_upload = self._get_data_from_task(task_id, include_images) compare_datasets(self, data_from_task_before_upload, data_from_task_after_upload) + def test_concurrent_export_and_cleanup(self): + @define + class SharedBool: + value: multiprocessing.Value = field( + factory=partial(multiprocessing.Value, 'i', 0) + ) + condition: multiprocessing.Condition = field(factory=multiprocessing.Condition) + + def set(self, value: bool = True): + self.value.value = int(value) + + def get(self): + return bool(self.value.value) + + class _LockTimeoutError(Exception): + pass + + export_checked_the_file = SharedBool() + export_created_the_file = SharedBool() + clear_removed_the_file = SharedBool() + + def _set_condition(var: SharedBool): + with var.condition: + var.set() + var.condition.notify() + + def _wait_condition(var: SharedBool, timeout: int = 5): + with var.condition: + if not var.condition.wait(timeout): + raise _LockTimeoutError + + def _side_effect(f, *args, **kwargs): + """ + Wraps the passed function to be executed with the given parameters + and return the regular mock output + """ + + def wrapped(*_, **__): + f(*args, **kwargs) + return MOCK_DEFAULT + + return wrapped + + def _chain_side_effects(*calls): + """ + Makes a callable that calls all the passed functions sequentially, + and returns the last call result + """ + + def wrapped(*args, **kwargs): + result = MOCK_DEFAULT + + for f in calls: + new_result = f(*args, **kwargs) + if new_result is not MOCK_DEFAULT: + result = new_result + + return result + + return wrapped + + @contextmanager + def process_closing(process: multiprocessing.Process): + try: + yield process + finally: + if process.is_alive(): + process.terminate() + + process.join(timeout=10) + process.close() + + + format_name = "CVAT for images 1.1" + + def _export(*_, task_id: int): + from os.path import exists as original_exists + from os import replace as original_replace + from cvat.apps.dataset_manager.views import log_exception as original_log_exception + import sys + + def patched_log_exception(logger=None, exc_info=True): + cur_exc_info = sys.exc_info() if exc_info is True else exc_info + if cur_exc_info and cur_exc_info[1] and isinstance(cur_exc_info[1], _LockTimeoutError): + return # don't spam in logs with expected errors + + original_log_exception(logger, exc_info) + + with ( + patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, + patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch('cvat.apps.dataset_manager.views.log_exception', new=patched_log_exception), + ): + mock_osp_exists.side_effect = _chain_side_effects( + original_exists, + _side_effect(_set_condition, export_checked_the_file), + ) + + mock_os_replace.side_effect = 
_chain_side_effects( + original_replace, + _side_effect(_set_condition, export_created_the_file), + _side_effect(_wait_condition, clear_removed_the_file), + ) + + mock_rq_job = MagicMock() + mock_rq_job.timeout = 5 + mock_rq_get_current_job.return_value = mock_rq_job + + with suppress(_LockTimeoutError): + export(dst_format=format_name, task_id=task_id) + + + def _clear(*_, file_path: str, file_ctime: str): + from os import remove as original_remove + + with ( + patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_os_remove.side_effect = _chain_side_effects( + _side_effect(_wait_condition, export_created_the_file), + original_remove, + _side_effect(_set_condition, clear_removed_the_file), + ) + + mock_rq_job = MagicMock() + mock_rq_job.timeout = 5 + mock_rq_get_current_job.return_value = mock_rq_job + + clear_export_cache(file_path=file_path, file_ctime=file_ctime, logger=MagicMock()) + + + # The problem checked is TOCTOU / race condition for file existence check and + # further file creation / removal. There are several possible variants of the problem. + # An example: + # 1. export checks the file exists, but outdated + # 2. clear checks the file exists, and matches the creation timestamp + # 3. export creates the new export file + # 4. remove removes the new export file (instead of the one that it checked) + # Thus, we have no exported file after the successful export. + # + # Other variants can be variations on the intermediate calls, such as getmtime: + # - export: exists() + # - clear: remove() + # - export: getmtime() -> an exception + # etc. + + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_rq_job = MagicMock() + mock_rq_job.timeout = 5 + mock_rq_get_current_job.return_value = mock_rq_job + + export_path = export(dst_format=format_name, task_id=task_id) + + export_birth_time = osp.getmtime(export_path) + + self._create_annotations(task, f'{format_name} many jobs', "default") + + with ExitStack() as es: + # Run both operations concurrently + # Threads could be faster, but they can't be terminated + export_process = es.enter_context(process_closing(multiprocessing.Process( + target=_export, + args=(export_checked_the_file, export_created_the_file), + kwargs=dict(task_id=task_id), + ))) + clear_process = es.enter_context(process_closing(multiprocessing.Process( + target=_clear, + args=(export_checked_the_file, export_created_the_file), + kwargs=dict(file_path=export_path, file_ctime=export_birth_time), + ))) + + export_process.start() + + _wait_condition(export_checked_the_file) # ensure the expected execution order + clear_process.start() + + # A deadlock (interrupted by a timeout error) is the positive outcome in this test, + # if the problem is fixed. + # export() must wait for the clear() file existence check and fail because of timeout + export_process.join(timeout=10) + + # clear() must wait for the export cache lock release (acquired by export()). 
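+            # (export() itself is still blocked waiting for clear_removed_the_file at this point)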
+ # It must be finished by a timeout, as export() holds it + clear_process.join(timeout=10) + + self.assertFalse(export_process.is_alive()) + self.assertFalse(clear_process.is_alive()) + + # terminate() may make locks broken, don't try to acquire + self.assertTrue(export_checked_the_file.get()) + self.assertTrue(export_created_the_file.get()) + self.assertFalse(clear_removed_the_file.get()) + + self.assertTrue(osp.isfile(export_path)) + + class ProjectDumpUpload(_DbTestBase): def test_api_v2_export_import_dataset(self): test_name = self._testMethodName From 5dc3e56b0d192ff7802c0c179ea17a88f02bdaf0 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 15:47:57 +0300 Subject: [PATCH 12/34] Refactor code --- cvat/apps/dataset_manager/default_settings.py | 3 + cvat/apps/dataset_manager/util.py | 126 +++++++++++++--- cvat/apps/dataset_manager/views.py | 137 +++++++----------- cvat/apps/engine/views.py | 4 +- 4 files changed, 164 insertions(+), 106 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 60a1f60a40d..1499bd2857c 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -9,3 +9,6 @@ DATASET_CACHE_LOCK_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT", 10)) "Timeout for cache lock acquiring, in seconds" + +DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) +"Retry interval for cases the export cache lock was unavailable, in seconds" diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 7a911763557..335fcfe2b5f 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -3,21 +3,27 @@ # # SPDX-License-Identifier: MIT -from contextlib import contextmanager, suppress -from copy import deepcopy -from datetime import timedelta -from threading import Lock -from typing import Any, Generator, Optional, Sequence import inspect import os import os.path as osp +import re import zipfile +from contextlib import contextmanager, suppress +from copy import deepcopy +from datetime import datetime, timedelta +from threading import Lock +from typing import Any, Generator, Optional, Sequence +import attrs import django_rq +from datumaro.util import to_snake_case +from datumaro.util.os_util import make_file_name from django.conf import settings from django.db import models from pottery import Redlock, ReleaseUnlockedLock +from cvat.apps.engine.models import Job, Project, Task + def current_function_name(depth=1): return inspect.getouterframes(inspect.currentframe())[depth].function @@ -88,18 +94,12 @@ def deepcopy_simple(v): return deepcopy(v) - -class LockError(Exception): +class LockNotAvailableError(Exception): pass -class LockTimeoutError(LockError): - pass - - -class LockNotAvailableError(LockError): - pass - +def make_export_lock_key(filename: os.PathLike[str]) -> str: + return f"export_lock:{filename}" @contextmanager def get_dataset_cache_lock( @@ -111,18 +111,18 @@ def get_dataset_cache_lock( ) -> Generator[Lock, Any, Any]: if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() - elif acquire_timeout and acquire_timeout < 0: + if acquire_timeout and acquire_timeout < 0: raise ValueError("acquire_timeout must be a positive number") elif acquire_timeout is None: acquire_timeout = -1 if isinstance(ttl, timedelta): ttl = ttl.total_seconds() - elif not ttl or ttl < 0: + if not ttl or ttl < 0: raise 
ValueError("ttl must be a positive number")
 
     lock = Redlock(
-        key=export_path,
+        key=make_export_lock_key(export_path),
         masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)},
         auto_release_time=ttl,
     )
@@ -131,11 +131,95 @@ def get_dataset_cache_lock(
         if acquired:
             yield lock
         else:
-            if 0 < acquire_timeout:
-                raise LockTimeoutError
-            else:
-                raise LockNotAvailableError
+            raise LockNotAvailableError
     finally:
         with suppress(ReleaseUnlockedLock):
             lock.release()
+
+
+EXPORT_CACHE_DIR_NAME = 'export_cache'
+
+def get_export_cache_dir(db_instance: Project | Task | Job) -> str:
+    base_dir = osp.abspath(db_instance.get_dirname())
+
+    if osp.isdir(base_dir):
+        return osp.join(base_dir, EXPORT_CACHE_DIR_NAME)
+    else:
+        raise FileNotFoundError(
+            '{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir)
+        )
+
+
+def make_export_filename(
+    dst_dir: str,
+    save_images: bool,
+    instance_update_time: datetime,
+    format_name: str,
+) -> str:
+    from .formats.registry import EXPORT_FORMATS
+    file_ext = EXPORT_FORMATS[format_name].EXT
+
+    filename = '%s-instance%s-%s.%s' % (
+        'dataset' if save_images else 'annotations',
+        # store the instance timestamp in the file name to reliably get this information
+        # ctime / mtime do not return file creation time on linux
+        # mtime is used for file usage checks
+        instance_update_time.timestamp(),
+        make_file_name(to_snake_case(format_name)),
+        file_ext,
+    )
+    return osp.join(dst_dir, filename)
+
+
+@attrs.define
+class ParsedExportFilename:
+    instance_type: str
+    has_images: bool
+    instance_timestamp: Optional[float]
+    format_repr: str
+    file_ext: str
+
+
+def parse_export_filename(file_path: os.PathLike[str]) -> ParsedExportFilename:
+    file_path = osp.normpath(file_path)
+    dirname, basename = osp.split(file_path)
+
+    basename_match = re.match(
+        (
+            r'(?P<export_mode>dataset|annotations)'
+            r'(?:-instance(?P<instance_timestamp>\d+\.\d+))?'  # optional for backward compatibility
+            r'-(?P<format_tag>.+)'
+            r'\.(?P<file_ext>.+)'
+        ),
+        basename
+    )
+    if not basename_match:
+        raise ValueError(f"Couldn't parse filename components in '{basename}'")
+
+    dirname_match = re.search(rf'/(jobs|tasks|projects)/\d+/{EXPORT_CACHE_DIR_NAME}$', dirname)
+    if not dirname_match:
+        raise ValueError(f"Couldn't parse instance type in '{dirname}'")
+
+    match dirname_match.group(1):
+        case 'jobs':
+            instance_type_name = 'job'
+        case 'tasks':
+            instance_type_name = 'task'
+        case 'projects':
+            instance_type_name = 'project'
+        case _:
+            assert False
+
+    if basename_match.group('instance_timestamp') is not None:
+        instance_timestamp = float(basename_match.group('instance_timestamp'))
+    else:
+        instance_timestamp = None
+
+    return ParsedExportFilename(
+        instance_type=instance_type_name,
+        has_images=basename_match.group('export_mode') == 'dataset',
+        instance_timestamp=instance_timestamp,
+        format_repr=basename_match.group('format_tag'),
+        file_ext=basename_match.group('file_ext'),
+    )
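+
+# Illustration of the naming scheme (placeholder values, not produced verbatim
+# by the code): for an instance updated at 2024-05-01 00:00:00 UTC (timestamp
+# 1714521600.0), the helpers above produce and parse names shaped like
+#   dataset-instance1714521600.0-<format_tag>.<EXT>      (save_images=True)
+#   annotations-instance1714521600.0-<format_tag>.<EXT>  (save_images=False)
+# where <format_tag> stands for make_file_name(to_snake_case(format_name)) and
+# <EXT> for the format's extension in EXPORT_FORMATS.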
diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index 2d4b852199d..e2e6f43da92 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -3,28 +3,30 @@
-from contextlib import suppress
 import logging
 import os
 import os.path as osp
-import re
 import tempfile
 from datetime import timedelta
 
 import django_rq
 import rq
-from datumaro.util.os_util import make_file_name
-from datumaro.util import to_snake_case
-from django.utils import timezone
 from django.conf import settings
+from django.utils import timezone
 
-import cvat.apps.dataset_manager.task as task
 import cvat.apps.dataset_manager.project as project
+import cvat.apps.dataset_manager.task as task
 from cvat.apps.engine.log import ServerLogManager
-from cvat.apps.engine.models import Project, Task, Job
+from cvat.apps.engine.models import Job, Project, Task
 
 from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS
-from .util import current_function_name, get_dataset_cache_lock, LockError
+from .util import (
+    LockNotAvailableError,
+    current_function_name, get_dataset_cache_lock,
+    get_export_cache_dir, make_export_filename,
+    parse_export_filename
+)
+from .util import EXPORT_CACHE_DIR_NAME  # pylint: disable=unused-import
 
 slogger = ServerLogManager(__name__)
 
@@ -36,40 +38,25 @@ def log_exception(logger=None, exc_info=True):
         (_MODULE_NAME, current_function_name(2)), exc_info=exc_info)
 
-EXPORT_CACHE_DIR_NAME = 'export_cache'
-
-def get_export_cache_dir(db_instance):
-    base_dir = osp.abspath(db_instance.get_dirname())
-
-    if osp.isdir(base_dir):
-        return osp.join(base_dir, EXPORT_CACHE_DIR_NAME)
-    else:
-        raise FileNotFoundError(
-            '{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir)
-        )
-
 DEFAULT_CACHE_TTL = timedelta(seconds=settings.DATASET_CACHE_TTL)
 PROJECT_CACHE_TTL = DEFAULT_CACHE_TTL / 3
 TASK_CACHE_TTL = DEFAULT_CACHE_TTL
 JOB_CACHE_TTL = DEFAULT_CACHE_TTL
+TTL_CONSTS = {
+    'project': PROJECT_CACHE_TTL,
+    'task': TASK_CACHE_TTL,
+    'job': JOB_CACHE_TTL,
+}
 
 EXPORT_CACHE_LOCK_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_TIMEOUT)
+EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL)
 
-def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> int:
-    TTL_CONSTS = {
-        'project': PROJECT_CACHE_TTL,
-        'task': TASK_CACHE_TTL,
-        'job': JOB_CACHE_TTL,
-    }
-    if not isinstance(db_instance, str):
-        db_instance = db_instance.__class__.__name__.lower()
+def 
get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: + if isinstance(db_instance, (Project, Task, Job)): + db_instance = db_instance.__class__.__name__ - return TTL_CONSTS[db_instance].total_seconds() - -def get_file_instance_timestamp(file_path: str) -> float: - match = re.search(r'instance(\d+\.\d+)', osp.basename(file_path)) - return float(match.group(1)) + return TTL_CONSTS[db_instance.lower()] def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=None, save_images=False): @@ -87,11 +74,14 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No export_fn = task.export_job db_instance = Job.objects.get(pk=job_id) - cache_ttl = timedelta(seconds=get_export_cache_ttl(db_instance)) + cache_ttl = get_export_cache_ttl(db_instance) cache_dir = get_export_cache_dir(db_instance) - exporter = EXPORT_FORMATS[dst_format] + # As we're not locking the db object here, it can be updated by the time of actual export. + # The file will be saved with older timestamp. + # When it's time to download the file, it will be handled - the export will be restarted. + # The situation is considered rare, so no locking is used. instance_update_time = timezone.localtime(db_instance.updated_date) if isinstance(db_instance, Project): tasks_update = list(map( @@ -100,16 +90,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No )) instance_update_time = max(tasks_update + [instance_update_time]) - output_path = '%s-instance%s_%s.%s' % ( - 'dataset' if save_images else 'annotations', - # store the instance timestamp in the file name to reliably get this information - # ctime / mtime do not return file creation time on linux - # mtime is used for file usage checks - instance_update_time.timestamp(), - make_file_name(to_snake_case(dst_format)), - exporter.EXT, - ) - output_path = osp.join(cache_dir, output_path) + output_path = make_export_filename(cache_dir, save_images, instance_update_time, dst_format) os.makedirs(cache_dir, exist_ok=True) @@ -119,10 +100,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, ttl=rq.get_current_job().timeout, ): - if not ( - osp.exists(output_path) - and instance_update_time.timestamp() <= get_file_instance_timestamp(output_path) - ): + if not osp.exists(output_path): with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: temp_file = osp.join(temp_dir, 'result') export_fn(db_instance.id, temp_file, dst_format, @@ -149,10 +127,12 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No ) return output_path - except LockError: - rq_job = rq.get_current_job() - rq_job.retry(django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value)) - return + except LockNotAvailableError: + # Need to retry later if the lock was not available + rq_job = rq.get_current_job() # the worker references the same object + rq_job.retries_left = 1 + rq_job.retry_intervals = [EXPORT_LOCKED_RETRY_INTERVAL.total_seconds()] + raise # should be handled by the worker except Exception: log_exception(logger) raise @@ -176,7 +156,12 @@ def export_project_annotations(project_id, dst_format=None, server_url=None): return export(dst_format, project_id=project_id, server_url=server_url, save_images=False) +class FileIsBeingUsedError(Exception): + pass + def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger) -> None: + # file_ctime is for backward compatibility with older RQ jobs, not needed 
now + try: with get_dataset_cache_lock( file_path, @@ -184,47 +169,33 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, ttl=rq.get_current_job().timeout, ): - if 'job' in file_path: - instance_type = 'job' - elif 'task' in file_path: - instance_type = 'task' - elif 'project' in file_path: - instance_type = 'project' - else: - assert False - - cache_ttl = get_export_cache_ttl(instance_type) - - instance_timestamp = None - with suppress(AttributeError): # backward compatibility - instance_timestamp = get_file_instance_timestamp(file_path) - if instance_timestamp and instance_timestamp != file_ctime: - logger.info("Export cache file '{}' has changed, skipping".format(file_path)) - return - - if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl: + if not osp.exists(file_path): + raise FileNotFoundError("Export cache file '{}' doesn't exist".format(file_path)) + + parsed_filename = parse_export_filename(file_path) + cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) + + if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): # Need to retry later, the export is in use - rq_job = rq.get_current_job() + rq_job = rq.get_current_job() # the worker references the same object rq_job.retries_left = 1 - rq_job.retry_intervals = [cache_ttl] - rq_job.retry( - django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value), pipeline=None - ) + rq_job.retry_intervals = [cache_ttl.total_seconds()] logger.info( "Export cache file '{}' is recently accessed, will retry in {}".format( - file_path, timedelta(seconds=cache_ttl) + file_path, cache_ttl ) ) - return + raise FileIsBeingUsedError # should be handled by the worker # TODO: maybe remove all outdated exports os.remove(file_path) logger.info("Export cache file '{}' successfully removed".format(file_path)) - except LockError: + except LockNotAvailableError: # Need to retry later if the lock was not available - rq_job = rq.get_current_job() - rq_job.retry(django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value), pipeline=None) - return + rq_job = rq.get_current_job() # the worker references the same object + rq_job.retries_left = 1 + rq_job.retry_intervals = [EXPORT_LOCKED_RETRY_INTERVAL.total_seconds()] + raise # should be handled by the worker except Exception: log_exception(logger) raise diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index 9e1326896b3..63f88421dd0 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -3090,8 +3090,8 @@ def _export_annotations( job_id=rq_id, meta=get_rq_job_meta(request=request, db_obj=db_instance), depends_on=define_dependent_job(queue, user_id, rq_id=rq_id), - result_ttl=cache_ttl, - failure_ttl=cache_ttl, + result_ttl=cache_ttl.total_seconds(), + failure_ttl=cache_ttl.total_seconds(), ) handle_dataset_export(db_instance, From 4c24e10f7770c7cbf6c28a5f655f79fc3f792dbb Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 15:48:17 +0300 Subject: [PATCH 13/34] Refactor test --- .../tests/test_rest_api_formats.py | 239 ++++++++++++------ 1 file changed, 167 insertions(+), 72 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 7efe327e5d4..7ce60a4bd5f 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -15,9 +15,10 @@ import xml.etree.ElementTree as ET import 
zipfile from contextlib import ExitStack, contextmanager, suppress +from datetime import timedelta from functools import partial from io import BytesIO -from typing import Any +from typing import Any, Callable, ClassVar, Optional, overload from unittest.mock import MagicMock, patch, DEFAULT as MOCK_DEFAULT from attr import define, field @@ -31,7 +32,7 @@ import cvat.apps.dataset_manager as dm from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData from cvat.apps.dataset_manager.task import TaskAnnotation -from cvat.apps.dataset_manager.views import clear_export_cache, export +from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_filename from cvat.apps.engine.models import Task from cvat.apps.engine.tests.utils import get_paginated_collection @@ -1272,87 +1273,158 @@ def test_api_v2_check_attribute_import_in_tracks(self): data_from_task_after_upload = self._get_data_from_task(task_id, include_images) compare_datasets(self, data_from_task_before_upload, data_from_task_after_upload) - def test_concurrent_export_and_cleanup(self): - @define - class SharedBool: - value: multiprocessing.Value = field( - factory=partial(multiprocessing.Value, 'i', 0) - ) - condition: multiprocessing.Condition = field(factory=multiprocessing.Condition) +class ExportConcurrencyTest(_DbTestBase): + @define + class SharedBase: + condition: multiprocessing.Condition = field(factory=multiprocessing.Condition, init=False) - def set(self, value: bool = True): - self.value.value = int(value) + @define + class SharedBool(SharedBase): + value: multiprocessing.Value = field( + factory=partial(multiprocessing.Value, 'i', 0), init=False + ) - def get(self): - return bool(self.value.value) + def set(self, value: bool = True): + self.value.value = int(value) - class _LockTimeoutError(Exception): - pass + def get(self) -> bool: + return bool(self.value.value) - export_checked_the_file = SharedBool() - export_created_the_file = SharedBool() - clear_removed_the_file = SharedBool() + @define + class SharedString(SharedBase): + MAX_LEN: ClassVar[int] = 2048 - def _set_condition(var: SharedBool): - with var.condition: - var.set() - var.condition.notify() + value: multiprocessing.Value = field( + factory=partial(multiprocessing.Array, 'c', MAX_LEN), init=False + ) - def _wait_condition(var: SharedBool, timeout: int = 5): - with var.condition: - if not var.condition.wait(timeout): - raise _LockTimeoutError + def set(self, value: str): + self.value.get_obj().value = value.encode()[ : self.MAX_LEN - 1] - def _side_effect(f, *args, **kwargs): - """ - Wraps the passed function to be executed with the given parameters - and return the regular mock output - """ + def get(self) -> str: + return self.value.get_obj().value.decode() - def wrapped(*_, **__): - f(*args, **kwargs) - return MOCK_DEFAULT + class _LockTimeoutError(Exception): + pass - return wrapped + @overload + @classmethod + def set_condition(cls, var: SharedBool, value: bool = True): ... - def _chain_side_effects(*calls): - """ - Makes a callable that calls all the passed functions sequentially, - and returns the last call result - """ + @overload + @classmethod + def set_condition(cls, var: SharedBase, value: Any): ... 
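+    # `_not_set` (defined just below) is a sentinel object: it lets
+    # set_condition() tell an omitted argument apart from legitimate falsy
+    # values such as False or 0, so the SharedBool overload can default to True.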
- def wrapped(*args, **kwargs): - result = MOCK_DEFAULT + _not_set = object() - for f in calls: - new_result = f(*args, **kwargs) - if new_result is not MOCK_DEFAULT: - result = new_result + @classmethod + def set_condition(cls, var: SharedBase, value: Any = _not_set): + if isinstance(var, cls.SharedBool) and value is cls._not_set: + value = True - return result + with var.condition: + var.set(value) + var.condition.notify() - return wrapped + @classmethod + def wait_condition(cls, var: SharedBase, timeout: Optional[int] = 5): + with var.condition: + if not var.condition.wait(timeout): + raise cls._LockTimeoutError - @contextmanager - def process_closing(process: multiprocessing.Process): - try: - yield process - finally: - if process.is_alive(): - process.terminate() + @staticmethod + def side_effect(f: Callable, *args, **kwargs) -> Callable: + """ + Wraps the passed function to be executed with the given parameters + and return the regular mock output + """ - process.join(timeout=10) - process.close() + def wrapped(*_, **__): + f(*args, **kwargs) + return MOCK_DEFAULT + return wrapped + + @staticmethod + def chain_side_effects(*calls: Callable) -> Callable: + """ + Makes a callable that calls all the passed functions sequentially, + and returns the last call result + """ + + def wrapped(*args, **kwargs): + result = MOCK_DEFAULT + + for f in calls: + new_result = f(*args, **kwargs) + if new_result is not MOCK_DEFAULT: + result = new_result + + return result + + return wrapped + + @staticmethod + @contextmanager + def process_closing(process: multiprocessing.Process, *, timeout: Optional[int] = 10): + try: + yield process + finally: + if process.is_alive(): + process.terminate() + + process.join(timeout=timeout) + process.close() + + def test_concurrent_export_and_cleanup(self): + side_effect = self.side_effect + chain_side_effects = self.chain_side_effects + set_condition = self.set_condition + wait_condition = self.wait_condition + _LockTimeoutError = self._LockTimeoutError + process_closing = self.process_closing format_name = "CVAT for images 1.1" + export_cache_lock = multiprocessing.Lock() + + export_checked_the_file = self.SharedBool() + export_created_the_file = self.SharedBool() + export_file_path = self.SharedString() + clear_removed_the_file = self.SharedBool() + + @contextmanager + def patched_get_dataset_cache_lock(export_path, *, ttl, block=True, acquire_timeout=None): + # fakeredis lock acquired in a subprocess won't be visible to the other processes + # just implement the lock here + from cvat.apps.dataset_manager.util import LockNotAvailableError + + if isinstance(acquire_timeout, timedelta): + acquire_timeout = acquire_timeout.total_seconds() + + acquired = export_cache_lock.acquire( + block=block, + timeout=acquire_timeout if acquire_timeout > -1 else None + ) + + if not acquired: + raise LockNotAvailableError + + try: + yield + finally: + export_cache_lock.release() + def _export(*_, task_id: int): from os.path import exists as original_exists from os import replace as original_replace from cvat.apps.dataset_manager.views import log_exception as original_log_exception import sys + def replace_dst_recorder(_: str, dst: str): + set_condition(export_file_path, dst) + return MOCK_DEFAULT + def patched_log_exception(logger=None, exc_info=True): cur_exc_info = sys.exc_info() if exc_info is True else exc_info if cur_exc_info and cur_exc_info[1] and isinstance(cur_exc_info[1], _LockTimeoutError): @@ -1361,21 +1433,23 @@ def patched_log_exception(logger=None, exc_info=True): 
original_log_exception(logger, exc_info) with ( + patch('cvat.apps.dataset_manager.views.get_dataset_cache_lock', new=patched_get_dataset_cache_lock), patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), patch('cvat.apps.dataset_manager.views.log_exception', new=patched_log_exception), ): - mock_osp_exists.side_effect = _chain_side_effects( + mock_osp_exists.side_effect = chain_side_effects( original_exists, - _side_effect(_set_condition, export_checked_the_file), + side_effect(set_condition, export_checked_the_file), ) - mock_os_replace.side_effect = _chain_side_effects( + mock_os_replace.side_effect = chain_side_effects( original_replace, - _side_effect(_set_condition, export_created_the_file), - _side_effect(_wait_condition, clear_removed_the_file), + replace_dst_recorder, + side_effect(set_condition, export_created_the_file), + side_effect(wait_condition, clear_removed_the_file), ) mock_rq_job = MagicMock() @@ -1385,26 +1459,33 @@ def patched_log_exception(logger=None, exc_info=True): with suppress(_LockTimeoutError): export(dst_format=format_name, task_id=task_id) + mock_os_replace.assert_called_once() + def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove with ( + patch('cvat.apps.dataset_manager.views.get_dataset_cache_lock', new=patched_get_dataset_cache_lock), patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): - mock_os_remove.side_effect = _chain_side_effects( - _side_effect(_wait_condition, export_created_the_file), + mock_os_remove.side_effect = chain_side_effects( + side_effect(wait_condition, export_created_the_file), original_remove, - _side_effect(_set_condition, clear_removed_the_file), + side_effect(set_condition, clear_removed_the_file), ) mock_rq_job = MagicMock() mock_rq_job.timeout = 5 mock_rq_get_current_job.return_value = mock_rq_job - clear_export_cache(file_path=file_path, file_ctime=file_ctime, logger=MagicMock()) + with suppress(_LockTimeoutError): + clear_export_cache( + file_path=file_path, file_ctime=file_ctime, logger=MagicMock() + ) # The problem checked is TOCTOU / race condition for file existence check and @@ -1435,12 +1516,13 @@ def _clear(*_, file_path: str, file_ctime: str): mock_rq_job.timeout = 5 mock_rq_get_current_job.return_value = mock_rq_job - export_path = export(dst_format=format_name, task_id=task_id) + first_export_path = export(dst_format=format_name, task_id=task_id) - export_birth_time = osp.getmtime(export_path) + export_instance_time = parse_export_filename(first_export_path).instance_timestamp self._create_annotations(task, f'{format_name} many jobs', "default") + processes_finished_correctly = False with ExitStack() as es: # Run both operations concurrently # Threads could be faster, but they can't be terminated @@ -1452,12 +1534,12 @@ def _clear(*_, file_path: str, file_ctime: str): clear_process = es.enter_context(process_closing(multiprocessing.Process( target=_clear, args=(export_checked_the_file, export_created_the_file), - kwargs=dict(file_path=export_path, file_ctime=export_birth_time), + 
kwargs=dict(file_path=first_export_path, file_ctime=export_instance_time), ))) export_process.start() - _wait_condition(export_checked_the_file) # ensure the expected execution order + wait_condition(export_checked_the_file) # ensure the expected execution order clear_process.start() # A deadlock (interrupted by a timeout error) is the positive outcome in this test, @@ -1472,12 +1554,25 @@ def _clear(*_, file_path: str, file_ctime: str): self.assertFalse(export_process.is_alive()) self.assertFalse(clear_process.is_alive()) - # terminate() may make locks broken, don't try to acquire + # All the expected exceptions should be handled in the process callbacks. + # This is to avoid passing the test with unexpected errors + self.assertEqual(export_process.exitcode, 0) + self.assertEqual(clear_process.exitcode, 0) + + processes_finished_correctly = True + + self.assertTrue(processes_finished_correctly) + + # terminate() may break the locks, don't try to acquire + # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Process.terminate self.assertTrue(export_checked_the_file.get()) self.assertTrue(export_created_the_file.get()) + self.assertFalse(clear_removed_the_file.get()) - self.assertTrue(osp.isfile(export_path)) + new_export_path = export_file_path.get() + self.assertGreater(len(new_export_path), 0) + self.assertTrue(osp.isfile(new_export_path)) class ProjectDumpUpload(_DbTestBase): From 963f657d1797520b788cc9029814c1d3ab756182 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 16:02:12 +0300 Subject: [PATCH 14/34] Refactor some code --- cvat/apps/dataset_manager/util.py | 9 ++++++--- cvat/requirements/base.in | 2 +- cvat/requirements/testing.in | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 335fcfe2b5f..621a066deb3 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -111,16 +111,19 @@ def get_dataset_cache_lock( ) -> Generator[Lock, Any, Any]: if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() - if acquire_timeout and acquire_timeout < 0: - raise ValueError("acquire_timeout must be a positive number") + if acquire_timeout is not None and acquire_timeout < 0: + raise ValueError("acquire_timeout must be a non-negative number") elif acquire_timeout is None: acquire_timeout = -1 if isinstance(ttl, timedelta): ttl = ttl.total_seconds() if not ttl or ttl < 0: - raise ValueError("ttl must be a positive number") + raise ValueError("ttl must be a non-negative number") + # The lock is exclusive, so it may potentially reduce performance in some cases, + # where parallel access is potentially possible and valid, + # e.g. dataset downloading could use a shared lock instead. 
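+    # (Note: pottery's Redlock is exclusive-only, so for now readers and
+    # writers of a cache entry all serialize on the same lock key.)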
lock = Redlock( key=make_export_lock_key(export_path), masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)}, diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in index 55e32ecbb95..e9829c3fdd6 100644 --- a/cvat/requirements/base.in +++ b/cvat/requirements/base.in @@ -38,7 +38,7 @@ patool==1.12 pdf2image==1.14.0 Pillow>=10.3.0 -pottery==3.0.0 +pottery~=3.0.0 psutil==5.9.4 psycopg2-binary==2.9.5 python-ldap==3.4.3 diff --git a/cvat/requirements/testing.in b/cvat/requirements/testing.in index 52aef23f3a9..4398c9efeda 100644 --- a/cvat/requirements/testing.in +++ b/cvat/requirements/testing.in @@ -1,4 +1,4 @@ -r development.in coverage==7.2.3 -fakeredis==2.10.3 +fakeredis[lua]==2.10.3 From 2bf19db14356004289422691e8c106c364b256dd Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 16:13:49 +0300 Subject: [PATCH 15/34] Update deps --- cvat/requirements/base.txt | 8 ++++---- cvat/requirements/development.txt | 2 +- cvat/requirements/testing.txt | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt index f216a3cf2a7..96efef89268 100644 --- a/cvat/requirements/base.txt +++ b/cvat/requirements/base.txt @@ -1,4 +1,4 @@ -# SHA1:e887f730916864158338619867ddf1eed8f4d223 +# SHA1:b61b838ad51f0cd9ded9ce9a1a3cdafff2adf74a # # This file is autogenerated by pip-compile-multi # To update, run: @@ -121,7 +121,7 @@ entrypoint2==1.1 # via pyunpack fonttools==4.51.0 # via matplotlib -freezegun==1.5.0 +freezegun==1.5.1 # via rq-scheduler furl==2.1.0 # via -r cvat/requirements/base.in @@ -170,9 +170,9 @@ jsonschema==4.17.3 # via drf-spectacular kiwisolver==1.4.5 # via matplotlib -limits==3.11.0 +limits==3.12.0 # via python-logstash-async -lxml==5.2.1 +lxml==5.2.2 # via # -r cvat/requirements/base.in # datumaro diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt index 625afe4f3c0..0fad8f2ba5f 100644 --- a/cvat/requirements/development.txt +++ b/cvat/requirements/development.txt @@ -59,7 +59,7 @@ tomli==2.0.1 # autopep8 # black # pylint -tomlkit==0.12.4 +tomlkit==0.12.5 # via pylint tornado==6.4 # via snakeviz diff --git a/cvat/requirements/testing.txt b/cvat/requirements/testing.txt index 2575e19731b..b378a14eaae 100644 --- a/cvat/requirements/testing.txt +++ b/cvat/requirements/testing.txt @@ -1,4 +1,4 @@ -# SHA1:429cfd9ce2f6b66fbb7c898a5c6279d9d8a61335 +# SHA1:6ed6047b6ebd6295b732838ffe76f05505bfc34a # # This file is autogenerated by pip-compile-multi # To update, run: @@ -11,8 +11,10 @@ coverage==7.2.3 # via -r cvat/requirements/testing.in -fakeredis==2.10.3 +fakeredis[lua]==2.10.3 # via -r cvat/requirements/testing.in +lupa==1.14.1 + # via fakeredis sortedcontainers==2.4.0 # via fakeredis From c0769f06d36003f3c991277c39d2d01b34b69fae Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 16:17:36 +0300 Subject: [PATCH 16/34] Add redlock docs link --- cvat/apps/dataset_manager/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 621a066deb3..176d5de7520 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -121,6 +121,7 @@ def get_dataset_cache_lock( if not ttl or ttl < 0: raise ValueError("ttl must be a non-negative number") + # https://redis.io/docs/latest/develop/use/patterns/distributed-locks/ # The lock is exclusive, so it may potentially reduce performance in some cases, # where parallel access is potentially possible and 
valid, # e.g. dataset downloading could use a shared lock instead. From 087ac42ca48f20c40ab7d29abcee33d6cdb290db Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 18:07:31 +0300 Subject: [PATCH 17/34] Add more tests --- .../tests/test_rest_api_formats.py | 171 +++++++++++++++++- 1 file changed, 162 insertions(+), 9 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 7ce60a4bd5f..2a336976992 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1395,7 +1395,7 @@ def test_concurrent_export_and_cleanup(self): @contextmanager def patched_get_dataset_cache_lock(export_path, *, ttl, block=True, acquire_timeout=None): - # fakeredis lock acquired in a subprocess won't be visible to the other processes + # fakeredis lock acquired in a subprocess won't be visible to other processes # just implement the lock here from cvat.apps.dataset_manager.util import LockNotAvailableError @@ -1421,7 +1421,7 @@ def _export(*_, task_id: int): from cvat.apps.dataset_manager.views import log_exception as original_log_exception import sys - def replace_dst_recorder(_: str, dst: str): + def os_replace_dst_recorder(_: str, dst: str): set_condition(export_file_path, dst) return MOCK_DEFAULT @@ -1447,13 +1447,12 @@ def patched_log_exception(logger=None, exc_info=True): mock_os_replace.side_effect = chain_side_effects( original_replace, - replace_dst_recorder, + os_replace_dst_recorder, side_effect(set_condition, export_created_the_file), side_effect(wait_condition, clear_removed_the_file), ) - mock_rq_job = MagicMock() - mock_rq_job.timeout = 5 + mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job with suppress(_LockTimeoutError): @@ -1478,8 +1477,7 @@ def _clear(*_, file_path: str, file_ctime: str): side_effect(set_condition, clear_removed_the_file), ) - mock_rq_job = MagicMock() - mock_rq_job.timeout = 5 + mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job with suppress(_LockTimeoutError): @@ -1512,8 +1510,7 @@ def _clear(*_, file_path: str, file_ctime: str): patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), ): - mock_rq_job = MagicMock() - mock_rq_job.timeout = 5 + mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job first_export_path = export(dst_format=format_name, task_id=task_id) @@ -1574,6 +1571,162 @@ def _clear(*_, file_path: str, file_ctime: str): self.assertGreater(len(new_export_path), 0) self.assertTrue(osp.isfile(new_export_path)) + def test_export_can_create_file_and_cleanup_job(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler') as mock_rq_get_scheduler, + patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + ): + mock_rq_job = MagicMock(timeout=5) + mock_rq_get_current_job.return_value = mock_rq_job + + mock_rq_scheduler = MagicMock() + mock_rq_get_scheduler.return_value = mock_rq_scheduler + + export_path = 
export(dst_format=format_name, task_id=task_id) + + self.assertTrue(osp.isfile(export_path)) + mock_rq_scheduler.enqueue_in.assert_called_once() + + def test_export_can_request_retry_on_locking_failure(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + from cvat.apps.dataset_manager.util import LockNotAvailableError + with ( + patch( + 'cvat.apps.dataset_manager.views.get_dataset_cache_lock', + side_effect=LockNotAvailableError + ) as mock_get_dataset_cache_lock, + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + self.assertRaises(LockNotAvailableError), + ): + mock_rq_job = MagicMock(timeout=5) + mock_rq_get_current_job.return_value = mock_rq_job + + export(dst_format=format_name, task_id=task_id) + + mock_get_dataset_cache_lock.assert_called() + self.assertEqual(mock_rq_job.retries_left, 1) + self.assertEqual(len(mock_rq_job.retry_intervals), 1) + + def test_cleanup_can_remove_file(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + export_path = export(dst_format=format_name, task_id=task_id) + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + export_path = export(dst_format=format_name, task_id=task_id) + file_ctime = parse_export_filename(export_path).instance_timestamp + clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock()) + + self.assertFalse(osp.isfile(export_path)) + + def test_cleanup_can_request_retry_on_locking_failure(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + from cvat.apps.dataset_manager.util import LockNotAvailableError + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + export_path = export(dst_format=format_name, task_id=task_id) + + with ( + patch( + 'cvat.apps.dataset_manager.views.get_dataset_cache_lock', + side_effect=LockNotAvailableError + ) as mock_get_dataset_cache_lock, + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + self.assertRaises(LockNotAvailableError), + ): + mock_rq_job = MagicMock(timeout=5) + mock_rq_get_current_job.return_value = mock_rq_job + + file_ctime = parse_export_filename(export_path).instance_timestamp + clear_export_cache(file_path=export_path, 
file_ctime=file_ctime, logger=MagicMock()) + + mock_get_dataset_cache_lock.assert_called() + self.assertEqual(mock_rq_job.retries_left, 1) + self.assertEqual(len(mock_rq_job.retry_intervals), 1) + self.assertTrue(osp.isfile(export_path)) + + def test_cleanup_can_fail_if_no_file(self): + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + self.assertRaises(FileNotFoundError), + ): + mock_rq_job = MagicMock(timeout=5) + mock_rq_get_current_job.return_value = mock_rq_job + + clear_export_cache(file_path="non existent file path", file_ctime=0, logger=MagicMock()) + + def test_cleanup_can_defer_removal_if_file_is_used_recently(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + export_path = export(dst_format=format_name, task_id=task_id) + + from cvat.apps.dataset_manager.views import FileIsBeingUsedError + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(hours=1)}), + self.assertRaises(FileIsBeingUsedError), + ): + mock_rq_job = MagicMock(timeout=5) + mock_rq_get_current_job.return_value = mock_rq_job + + export_path = export(dst_format=format_name, task_id=task_id) + file_ctime = parse_export_filename(export_path).instance_timestamp + clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock()) + + self.assertEqual(mock_rq_job.retries_left, 1) + self.assertEqual(len(mock_rq_job.retry_intervals), 1) + self.assertTrue(osp.isfile(export_path)) + class ProjectDumpUpload(_DbTestBase): def test_api_v2_export_import_dataset(self): From 0d75d4ab273b1a9f5e4a6c91f3ee4a943bcc068c Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 18:20:34 +0300 Subject: [PATCH 18/34] Add more tests --- .../tests/test_rest_api_formats.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 2a336976992..ac9f31e7666 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1620,6 +1620,36 @@ def test_export_can_request_retry_on_locking_failure(self): self.assertEqual(mock_rq_job.retries_left, 1) self.assertEqual(len(mock_rq_job.retry_intervals), 1) + def test_export_can_reuse_older_file_if_still_relevant(self): + format_name = "CVAT for images 1.1" + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + with ( + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + first_export_path = export(dst_format=format_name, task_id=task_id) + + from os.path import exists as original_exists + with ( + 
patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch('cvat.apps.dataset_manager.views.osp.exists', side_effect=original_exists) as mock_osp_exists, + patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, + ): + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + second_export_path = export(dst_format=format_name, task_id=task_id) + + self.assertEqual(first_export_path, second_export_path) + mock_osp_exists.assert_called_with(first_export_path) + mock_os_replace.assert_not_called() + def test_cleanup_can_remove_file(self): format_name = "CVAT for images 1.1" images = self._generate_task_images(3) From ea71c7d7c979665607910654c6d0d905acdac673 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 14 May 2024 20:03:08 +0300 Subject: [PATCH 19/34] Add more tests, update export test --- .../tests/test_rest_api_formats.py | 260 ++++++++++++++++-- 1 file changed, 244 insertions(+), 16 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index ac9f31e7666..53ad22ad2f6 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -14,10 +14,11 @@ import random import xml.etree.ElementTree as ET import zipfile -from contextlib import ExitStack, contextmanager, suppress +from contextlib import ExitStack, contextmanager from datetime import timedelta from functools import partial from io import BytesIO +from tempfile import TemporaryDirectory from typing import Any, Callable, ClassVar, Optional, overload from unittest.mock import MagicMock, patch, DEFAULT as MOCK_DEFAULT @@ -1273,7 +1274,7 @@ def test_api_v2_check_attribute_import_in_tracks(self): data_from_task_after_upload = self._get_data_from_task(task_id, include_images) compare_datasets(self, data_from_task_before_upload, data_from_task_after_upload) -class ExportConcurrencyTest(_DbTestBase): +class ExportBehaviorTest(_DbTestBase): @define class SharedBase: condition: multiprocessing.Condition = field(factory=multiprocessing.Condition, init=False) @@ -1401,6 +1402,8 @@ def patched_get_dataset_cache_lock(export_path, *, ttl, block=True, acquire_time if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() + if acquire_timeout is None: + acquire_timeout = -1 acquired = export_cache_lock.acquire( block=block, @@ -1433,7 +1436,11 @@ def patched_log_exception(logger=None, exc_info=True): original_log_exception(logger, exc_info) with ( - patch('cvat.apps.dataset_manager.views.get_dataset_cache_lock', new=patched_get_dataset_cache_lock), + patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch( + 'cvat.apps.dataset_manager.views.get_dataset_cache_lock', + new=patched_get_dataset_cache_lock + ), patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, @@ -1452,20 +1459,29 @@ def patched_log_exception(logger=None, exc_info=True): side_effect(wait_condition, clear_removed_the_file), ) - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job + mock_rq_get_current_job.return_value = MagicMock(timeout=5) - with suppress(_LockTimeoutError): + exited_by_timeout = False + try: 
export(dst_format=format_name, task_id=task_id) + except _LockTimeoutError: + # should come from waiting for clear_removed_the_file + exited_by_timeout = True + assert exited_by_timeout mock_os_replace.assert_called_once() def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove + from cvat.apps.dataset_manager.util import LockNotAvailableError with ( - patch('cvat.apps.dataset_manager.views.get_dataset_cache_lock', new=patched_get_dataset_cache_lock), + patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch( + 'cvat.apps.dataset_manager.views.get_dataset_cache_lock', + new=patched_get_dataset_cache_lock + ), patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), @@ -1477,13 +1493,18 @@ def _clear(*_, file_path: str, file_ctime: str): side_effect(set_condition, clear_removed_the_file), ) - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job + mock_rq_get_current_job.return_value = MagicMock(timeout=5) - with suppress(_LockTimeoutError): + exited_by_timeout = False + try: clear_export_cache( file_path=file_path, file_ctime=file_ctime, logger=MagicMock() ) + except LockNotAvailableError: + # should come from waiting for get_dataset_cache_lock + exited_by_timeout = True + + assert exited_by_timeout # The problem checked is TOCTOU / race condition for file existence check and @@ -1525,12 +1546,20 @@ def _clear(*_, file_path: str, file_ctime: str): # Threads could be faster, but they can't be terminated export_process = es.enter_context(process_closing(multiprocessing.Process( target=_export, - args=(export_checked_the_file, export_created_the_file), + args=( + export_cache_lock, + export_checked_the_file, export_created_the_file, + export_file_path, clear_removed_the_file, + ), kwargs=dict(task_id=task_id), ))) clear_process = es.enter_context(process_closing(multiprocessing.Process( target=_clear, - args=(export_checked_the_file, export_created_the_file), + args=( + export_cache_lock, + export_checked_the_file, export_created_the_file, + export_file_path, clear_removed_the_file, + ), kwargs=dict(file_path=first_export_path, file_ctime=export_instance_time), ))) @@ -1541,13 +1570,13 @@ def _clear(*_, file_path: str, file_ctime: str): # A deadlock (interrupted by a timeout error) is the positive outcome in this test, # if the problem is fixed. - # export() must wait for the clear() file existence check and fail because of timeout - export_process.join(timeout=10) - # clear() must wait for the export cache lock release (acquired by export()). 
- # It must be finished by a timeout, as export() holds it + # It must be finished by a timeout, as export() holds it, waiting clear_process.join(timeout=10) + # export() must wait for the clear() file existence check and fail because of timeout + export_process.join(timeout=10) + self.assertFalse(export_process.is_alive()) self.assertFalse(clear_process.is_alive()) @@ -1571,6 +1600,205 @@ def _clear(*_, file_path: str, file_ctime: str): self.assertGreater(len(new_export_path), 0) self.assertTrue(osp.isfile(new_export_path)) + def test_concurrent_download_and_cleanup(self): + side_effect = self.side_effect + chain_side_effects = self.chain_side_effects + set_condition = self.set_condition + wait_condition = self.wait_condition + _LockTimeoutError = self._LockTimeoutError + process_closing = self.process_closing + + format_name = "CVAT for images 1.1" + + export_cache_lock = multiprocessing.Lock() + + download_checked_the_file = self.SharedBool() + clear_removed_the_file = self.SharedBool() + + images = self._generate_task_images(3) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, f'{format_name} many jobs', "default") + task_id = task["id"] + + download_url = self._generate_url_dump_tasks_annotations(task_id) + download_params = { + "format": format_name, + "action": "download", + } + + @contextmanager + def patched_get_dataset_cache_lock(export_path, *, ttl, block=True, acquire_timeout=None): + # fakeredis lock acquired in a subprocess won't be visible to other processes + # just implement the lock here + from cvat.apps.dataset_manager.util import LockNotAvailableError + + if isinstance(acquire_timeout, timedelta): + acquire_timeout = acquire_timeout.total_seconds() + if acquire_timeout is None: + acquire_timeout = -1 + + acquired = export_cache_lock.acquire( + block=block, + timeout=acquire_timeout if acquire_timeout > -1 else None + ) + + if not acquired: + raise LockNotAvailableError + + try: + yield + finally: + export_cache_lock.release() + + def _download(*_, task_id: int, export_path: str): + from os.path import exists as original_exists + + def patched_osp_exists(path: str): + result = original_exists(path) + + if path == export_path: + set_condition(download_checked_the_file) + wait_condition( + clear_removed_the_file, timeout=20 + ) # wait more than the process timeout + + return result + + with ( + patch( + 'cvat.apps.engine.views.dm.util.get_dataset_cache_lock', + new=patched_get_dataset_cache_lock + ), + patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, + TemporaryDirectory() as temp_dir, + ): + mock_osp_exists.side_effect = patched_osp_exists + + self._download_file( + download_url, download_params, self.admin, osp.join(temp_dir, "export.zip") + ) + + mock_osp_exists.assert_called() + + def _clear(*_, file_path: str, file_ctime: str): + from os import remove as original_remove + from cvat.apps.dataset_manager.util import LockNotAvailableError + + with ( + patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch( + 'cvat.apps.dataset_manager.views.get_dataset_cache_lock', + new=patched_get_dataset_cache_lock + ), + patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, + patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, + patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + ): + mock_os_remove.side_effect = chain_side_effects( + 
original_remove, + side_effect(set_condition, clear_removed_the_file), + ) + + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + + exited_by_timeout = False + try: + clear_export_cache( + file_path=file_path, file_ctime=file_ctime, logger=MagicMock() + ) + except LockNotAvailableError: + # should come from waiting for get_dataset_cache_lock + exited_by_timeout = True + + assert exited_by_timeout + + + # The problem checked is TOCTOU / race condition for file existence check and + # further file reading / removal. There are several possible variants of the problem. + # An example: + # 1. download exports the file + # 2. download checks the export is still relevant + # 3. clear checks the file exists + # 4. clear removes the export file + # 5. download checks if the file exists -> an exception + # + # There can be variations on the intermediate calls, such as: + # - download: exists() + # - clear: remove() + # - download: open() -> an exception + # etc. + + export_path = None + + def patched_export(*args, **kwargs): + nonlocal export_path + + result = export(*args, **kwargs) + export_path = result + + return result + + with ( + patch('cvat.apps.dataset_manager.views.export', new=patched_export), + TemporaryDirectory() as temp_dir, + ): + self._download_file( + download_url, download_params, self.admin, osp.join(temp_dir, "export.zip") + ) + + export_instance_time = parse_export_filename(export_path).instance_timestamp + + processes_finished_correctly = False + with ExitStack() as es: + # Run both operations concurrently + # Threads could be faster, but they can't be terminated + download_process = es.enter_context(process_closing(multiprocessing.Process( + target=_download, + args=(download_checked_the_file, clear_removed_the_file, export_cache_lock), + kwargs=dict(task_id=task_id, export_path=export_path), + ))) + clear_process = es.enter_context(process_closing(multiprocessing.Process( + target=_clear, + args=(download_checked_the_file, clear_removed_the_file, export_cache_lock), + kwargs=dict(file_path=export_path, file_ctime=export_instance_time), + ))) + + download_process.start() + + wait_condition(download_checked_the_file) # ensure the expected execution order + clear_process.start() + + # A deadlock (interrupted by a timeout error) is the positive outcome in this test, + # if the problem is fixed. + # clear() must wait for the export cache lock release (acquired by download()). + # It must be finished by a timeout, as download() holds it, waiting + clear_process.join(timeout=5) + + # download() must wait for the clear() file existence check and fail because of timeout + download_process.join(timeout=5) + + self.assertTrue(download_process.is_alive()) + self.assertFalse(clear_process.is_alive()) + + download_process.terminate() + download_process.join(timeout=5) + + # All the expected exceptions should be handled in the process callbacks. 
From 708507678a0aad4bf1259f28888176829afd225f Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 14:55:53 +0300
Subject: [PATCH 20/34] Update deps

---
 cvat/requirements/base.in         | 2 +-
 cvat/requirements/base.txt        | 4 ++--
 cvat/requirements/development.txt | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in
index e9829c3fdd6..8c652d6f604 100644
--- a/cvat/requirements/base.in
+++ b/cvat/requirements/base.in
@@ -38,7 +38,7 @@ patool==1.12
 pdf2image==1.14.0
 Pillow>=10.3.0
-pottery~=3.0.0
+pottery~=3.0
 psutil==5.9.4
 psycopg2-binary==2.9.5
 python-ldap==3.4.3
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index 96efef89268..6629be5b038 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -1,4 +1,4 @@
-# SHA1:b61b838ad51f0cd9ded9ce9a1a3cdafff2adf74a
+# SHA1:72d30d3a78182797ba879af4b7b50f17ecaf2e8f
 #
 # This file is autogenerated by pip-compile-multi
 # To update, run:
@@ -357,7 +357,7 @@ xmlsec==1.3.14
     # via
    #   -r cvat/requirements/base.in
    #   python3-saml
-zipp==3.18.1
+zipp==3.18.2
     # via importlib-metadata
 zstandard==0.22.0
     # via clickhouse-connect
diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt
index 0fad8f2ba5f..f5af9280496 100644
--- a/cvat/requirements/development.txt
+++ b/cvat/requirements/development.txt
@@ -33,7 +33,7 @@ mypy-extensions==1.0.0
     # via black
 pathspec==0.12.1
     # via black
-platformdirs==4.2.1
+platformdirs==4.2.2
     # via
    #   black
    #   pylint
From f8d142fd3ff2ab76af47a26da8de24b9a3c755a9 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 14:57:33 +0300
Subject: [PATCH 21/34] Add rq cleanup to tests with multiple iterations

---
 .../tests/test_rest_api_formats.py         | 36 +++++--------
 cvat/apps/engine/tests/test_rest_api_3D.py | 27 +++-------
 cvat/apps/engine/tests/utils.py            | 53 ++++++++++++++++++-
 3 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
index 53ad22ad2f6..3fe259e7615 100644
--- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
+++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
@@ -28,14 +28,13 @@
 from django.contrib.auth.models import Group, User
 from PIL import Image
 from rest_framework import status
-from rest_framework.test import APIClient, APITestCase

 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData
 from cvat.apps.dataset_manager.task import TaskAnnotation
 from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_filename
 from cvat.apps.engine.models import Task
-from cvat.apps.engine.tests.utils import get_paginated_collection
+from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin

 projects_path = osp.join(osp.dirname(__file__), 'assets', 'projects.json')
 with open(projects_path) as file:
@@ -95,26 +94,7 @@ def generate_video_file(filename, width=1280, height=720, duration=1, fps=25, co
     return [(width, height)] * total_frames, f


-class ForceLogin:
-    def __init__(self, user, client):
-        self.user = user
-        self.client = client
-
-    def __enter__(self):
-        if self.user:
-            self.client.force_login(self.user,
-                backend='django.contrib.auth.backends.ModelBackend')
-
-        return self
-
-    def __exit__(self, exception_type, exception_value, traceback):
-        if self.user:
-            self.client.logout()
-
-class _DbTestBase(APITestCase):
-    def setUp(self):
-        self.client = APIClient()
-
+class _DbTestBase(ApiTestBase):
     @classmethod
     def setUpTestData(cls):
         cls.create_db_users()
@@ -436,6 +416,8 @@ def test_api_v2_dump_and_upload_annotations_with_objects_type_is_shape(self):
             url = self._generate_url_dump_tasks_annotations(task_id)

             for user, edata in list(expected.items()):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 user_name = edata['name']
                 file_zip_name = osp.join(test_dir, f'{test_name}_{user_name}_{dump_format_name}.zip')
                 data = {
@@ -541,6 +523,8 @@ def test_api_v2_dump_annotations_with_objects_type_is_track(self):
             url = self._generate_url_dump_tasks_annotations(task_id)

             for user, edata in list(expected.items()):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 user_name = edata['name']
                 file_zip_name = osp.join(test_dir, f'{test_name}_{user_name}_{dump_format_name}.zip')
                 data = {
@@ -626,6 +610,8 @@ def test_api_v2_dump_tag_annotations(self):
         for user, edata in list(expected.items()):
             with self.subTest(format=f"{edata['name']}"):
                 with TestDir() as test_dir:
+                    self._clear_rq_jobs() # clean up from previous tests and iterations
+
                     user_name = edata['name']
                     url = self._generate_url_dump_tasks_annotations(task_id)

@@ -874,6 +860,8 @@ def test_api_v2_export_dataset(self):
             # dump annotations
             url = self._generate_url_dump_task_dataset(task_id)
             for user, edata in list(expected.items()):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 user_name = edata['name']
                 file_zip_name = osp.join(test_dir, f'{test_name}_{user_name}_{dump_format_name}.zip')
                 data = {
@@ -2032,6 +2020,8 @@ def test_api_v2_export_import_dataset(self):
             self._create_annotations(task, dump_format_name, "random")

             for user, edata in list(expected.items()):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 user_name = edata['name']
                 file_zip_name = osp.join(test_dir, f'{test_name}_{user_name}_{dump_format_name}.zip')
                 data = {
@@ -2112,6 +2102,8 @@ def test_api_v2_export_annotations(self):
             url = self._generate_url_dump_project_annotations(project['id'], dump_format_name)

             for user, edata in list(expected.items()):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 user_name = edata['name']
                 file_zip_name = osp.join(test_dir, f'{test_name}_{user_name}_{dump_format_name}.zip')
                 data = {
diff --git a/cvat/apps/engine/tests/test_rest_api_3D.py b/cvat/apps/engine/tests/test_rest_api_3D.py
index 84247b84d48..e90d54b3c30 100644
--- a/cvat/apps/engine/tests/test_rest_api_3D.py
+++ b/cvat/apps/engine/tests/test_rest_api_3D.py
@@ -25,33 +25,14 @@
 from cvat.apps.dataset_manager.task import TaskAnnotation
 from datumaro.util.test_utils import TestDir

-from cvat.apps.engine.tests.utils import get_paginated_collection
+from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin

 CREATE_ACTION = "create"
 UPDATE_ACTION = "update"
 DELETE_ACTION = "delete"

-class ForceLogin:
-    def __init__(self, user, client):
-        self.user = user
-        self.client = client
-
-    def __enter__(self):
-        if self.user:
-            self.client.force_login(self.user,
-                backend='django.contrib.auth.backends.ModelBackend')
-
-        return self
-
-    def __exit__(self, exception_type, exception_value, traceback):
-        if self.user:
-            self.client.logout()
-
-class _DbTestBase(APITestCase):
-    def setUp(self):
-        self.client = APIClient()
-
+class _DbTestBase(ApiTestBase):
     @classmethod
     def setUpTestData(cls):
         cls.create_db_users()
@@ -547,6 +528,8 @@ def test_api_v2_dump_and_upload_annotation(self):
         for user, edata in list(self.expected_dump_upload.items()):
             with self.subTest(format=f"{format_name}_{edata['name']}_dump"):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 url = self._generate_url_dump_tasks_annotations(task_id)
                 file_name = osp.join(test_dir, f"{format_name}_{edata['name']}.zip")

@@ -740,6 +723,8 @@ def test_api_v2_export_dataset(self):
         for user, edata in list(self.expected_dump_upload.items()):
             with self.subTest(format=f"{format_name}_{edata['name']}_export"):
+                self._clear_rq_jobs() # clean up from previous tests and iterations
+
                 url = self._generate_url_dump_dataset(task_id)
                 file_name = osp.join(test_dir, f"{format_name}_{edata['name']}.zip")

diff --git a/cvat/apps/engine/tests/utils.py b/cvat/apps/engine/tests/utils.py
index f0b71463a20..b884b3e9b4c 100644
--- a/cvat/apps/engine/tests/utils.py
+++ b/cvat/apps/engine/tests/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 CVAT.ai Corporation
+# Copyright (C) 2023-2024 CVAT.ai Corporation
 #
 # SPDX-License-Identifier: MIT

@@ -9,11 +9,13 @@
 import logging
 import os

+from django.conf import settings
 from django.core.cache import caches
 from django.http.response import HttpResponse
 from PIL import Image
 from rest_framework.test import APIClient, APITestCase
 import av
+import django_rq
 import numpy as np

 T = TypeVar('T')
@@ -46,7 +48,53 @@ def __exit__(self, exception_type, exception_value, traceback):
             self.client.logout()


+def clear_rq_jobs():
+    for queue_name in settings.RQ_QUEUES:
+        queue = django_rq.get_queue(queue_name)
+
+        # Remove actual jobs
+        queue.empty()
+
+        # Clean up the registries
+        for registry in [
+            queue.failed_job_registry,
+            queue.finished_job_registry,
+            queue.started_job_registry,
+            queue.scheduled_job_registry,
+        ]:
+            for job_id in registry.get_job_ids():
+                registry.remove(job_id)
+
+        # Remove orphaned jobs that can't be normally reported by DjangoRQ
+        # https://github.com/rq/django-rq/issues/73
+        for key in queue.connection.keys('rq:job:*'):
+            job_id = key.decode().split('rq:job:', maxsplit=1)[1]
+            job = queue.fetch_job(job_id)
+            if not job:
+                # The job can belong to a different queue, using the same connection
+                continue
+
+            job.delete()
+
+        # Clean up the scheduler, if any
+        try:
+            scheduler = django_rq.get_scheduler(queue_name, queue)
+        except ImportError:
+            # If the scheduler is not enabled, an exception is thrown
+            continue
+
+        try:
+            scheduler.acquire_lock()
+            for job in scheduler.get_jobs():
+                scheduler.cancel(job)
+        finally:
+            scheduler.remove_lock()
+
+
 class ApiTestBase(APITestCase):
+    def _clear_rq_jobs(self):
+        clear_rq_jobs()
+
     def setUp(self):
         super().setUp()
         self.client = APIClient()
@@ -61,6 +109,9 @@ def tearDown(self):
         for cache in caches.all(initialized_only=True):
             cache.clear()

+        # Clear any remaining RQ jobs produced by the tests executed
+        self._clear_rq_jobs()
+
         return super().tearDown()

From 947988867c4999f70259dc51a1704cfb75a7ade3 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 14:57:44 +0300
Subject: [PATCH 22/34] Update comment

---
 cvat/apps/dataset_manager/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index 41bc6ae1b67..9c16c29ee0c 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -79,7 +79,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No
     cache_dir = get_export_cache_dir(db_instance)

     # As we're not locking the db object here, it can be updated by the time of actual export.
-    # The file will be saved with older timestamp.
+    # The file will be saved with the older timestamp.
     # When it's time to download the file, it will be handled - the export will be restarted.
     # The situation is considered rare, so no locking is used.
     instance_update_time = timezone.localtime(db_instance.updated_date)

From da069035d4cd4960aaa532cb86714222284ad215 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 14:59:26 +0300
Subject: [PATCH 23/34] Remove unused imports

---
 cvat/apps/engine/tests/test_rest_api_3D.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cvat/apps/engine/tests/test_rest_api_3D.py b/cvat/apps/engine/tests/test_rest_api_3D.py
index e90d54b3c30..fc8f8e95c26 100644
--- a/cvat/apps/engine/tests/test_rest_api_3D.py
+++ b/cvat/apps/engine/tests/test_rest_api_3D.py
@@ -19,7 +19,6 @@
 from django.contrib.auth.models import Group, User

 from rest_framework import status
-from rest_framework.test import APIClient, APITestCase

 from cvat.apps.engine.media_extractors import ValidateDimension
 from cvat.apps.dataset_manager.task import TaskAnnotation

From 700bf2cc2aaa715edea001957bbab03251ce038b Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 19:36:06 +0300
Subject: [PATCH 24/34] Refactor lock release

---
 cvat/apps/dataset_manager/util.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index 176d5de7520..5aab55b4830 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -8,7 +8,7 @@
 import os.path as osp
 import re
 import zipfile
-from contextlib import contextmanager, suppress
+from contextlib import contextmanager
 from copy import deepcopy
 from datetime import datetime, timedelta
 from threading import Lock
@@ -20,7 +20,7 @@
 from datumaro.util.os_util import make_file_name
 from django.conf import settings
 from django.db import models
-from pottery import Redlock, ReleaseUnlockedLock
+from pottery import Redlock

 from cvat.apps.engine.models import Job, Project, Task
@@ -138,7 +138,7 @@ def get_dataset_cache_lock(
         raise LockNotAvailableError

     finally:
-        with suppress(ReleaseUnlockedLock):
+        if lock.locked():
             lock.release()

From 9ccafc5182a14ca0ec990a1f359d12ec5ccdce7c Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Thu, 16 May 2024 19:37:53 +0300
Subject: [PATCH 25/34] Rename lock function

---
 cvat/apps/dataset_manager/util.py  | 8 +++++---
 cvat/apps/dataset_manager/views.py | 6 +++---
 cvat/apps/engine/views.py          | 2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index 5aab55b4830..1be5dd109b8 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -98,11 +98,12 @@ class LockNotAvailableError(Exception):
     pass

-def make_export_lock_key(filename: os.PathLike[str]) -> str:
+def make_export_cache_lock_key(filename: os.PathLike[str]) -> str:
     return f"export_lock:{filename}"

+
 @contextmanager
-def get_dataset_cache_lock(
+def get_export_cache_lock(
     export_path: os.PathLike[str],
     *,
     ttl: int | timedelta,
@@ -126,7 +127,7 @@
     # where parallel access is potentially possible and valid,
     # e.g. dataset downloading could use a shared lock instead.
     lock = Redlock(
-        key=make_export_lock_key(export_path),
+        key=make_export_cache_lock_key(export_path),
         masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)},
         auto_release_time=ttl,
     )
@@ -144,6 +145,7 @@

 EXPORT_CACHE_DIR_NAME = 'export_cache'

+
 def get_export_cache_dir(db_instance: Project | Task | Job) -> str:
     base_dir = osp.abspath(db_instance.get_dirname())

diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index 0e5430227b9..e21ffed3595 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -22,7 +22,7 @@
 from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS
 from .util import (
     LockNotAvailableError,
-    current_function_name, get_dataset_cache_lock,
+    current_function_name, get_export_cache_lock,
     get_export_cache_dir, make_export_filename,
     parse_export_filename
 )
@@ -94,7 +94,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No

     os.makedirs(cache_dir, exist_ok=True)

-    with get_dataset_cache_lock(
+    with get_export_cache_lock(
         output_path,
         block=True,
         acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT,
@@ -163,7 +163,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger
     # file_ctime is for backward compatibility with older RQ jobs, not needed now

     try:
-        with get_dataset_cache_lock(
+        with get_export_cache_lock(
             file_path,
             block=True,
             acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT,
diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py
index 63f88421dd0..aff80f0753f 100644
--- a/cvat/apps/engine/views.py
+++ b/cvat/apps/engine/views.py
@@ -2993,7 +2993,7 @@ def _export_annotations(
                     status=status.HTTP_500_INTERNAL_SERVER_ERROR
                 )

-            with dm.util.get_dataset_cache_lock(
+            with dm.util.get_export_cache_lock(
                 file_path, ttl=60, # request timeout
             ):
                 if action == "download":
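Under its new name, the helper keeps the same contract. A condensed usage sketch; the ttl and timeout values here are arbitrary examples, not the production settings:

    from cvat.apps.dataset_manager.util import LockNotAvailableError, get_export_cache_lock

    try:
        with get_export_cache_lock(
            '/path/to/export.zip',  # a made-up path
            ttl=30,                 # auto-release guard in case the holder dies
            block=True,
            acquire_timeout=10,     # give up instead of waiting forever
        ):
            pass  # read, write, or remove the cached export file here
    except LockNotAvailableError:
        pass  # the callers in this series reschedule the RQ job and retry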
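The compatibility concern is concrete: a job enqueued before the upgrade still sits in Redis and invokes the cleanup with the old keyword set. Roughly what such a stale call looks like after the server update; the path and timestamp are placeholders:

    import logging

    clear_export_cache(
        file_path='/data/tasks/1/export_cache/annotations-instance1716800000.000000-fmt.zip',
        file_ctime=1716800000.0,
        logger=logging.getLogger('cvat'),
    )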
        self.assertEqual(mock_rq_job.retries_left, 1)
         self.assertEqual(len(mock_rq_job.retry_intervals), 1)
         self.assertTrue(osp.isfile(export_path))

From 4471aa999d639a02b033e8e22a67a25c3eb9598d Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Fri, 17 May 2024 13:40:59 +0300
Subject: [PATCH 28/34] Restore old job removal behavior

---
 cvat/apps/engine/views.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py
index aff80f0753f..2ef73e29c6b 100644
--- a/cvat/apps/engine/views.py
+++ b/cvat/apps/engine/views.py
@@ -2979,9 +2979,11 @@ def _export_annotations(
                 # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER
                 # we have to enqueue dependent jobs after canceling one.
                 rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER)
+                rq_job.delete()
             else:
                 if rq_job.is_finished:
                     if location == Location.CLOUD_STORAGE:
+                        rq_job.delete()
                         return Response(status=status.HTTP_200_OK)

                     elif location == Location.LOCAL:
@@ -3013,6 +3015,7 @@ def _export_annotations(
                                     extension=osp.splitext(file_path)[1]
                                 )

+                            rq_job.delete()
                             return sendfile(request, file_path, attachment=True, attachment_filename=filename)
                         else:
                             if osp.exists(file_path):
@@ -3027,10 +3030,12 @@ def _export_annotations(
                             # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER
                             # we have to enqueue dependent jobs after canceling one.
                             rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER)
+                            rq_job.delete()
                     else:
                         raise NotImplementedError(f"Export to {location} location is not implemented yet")
                 elif rq_job.is_failed:
                     exc_info = rq_job.meta.get('formatted_exception', str(rq_job.exc_info))
+                    rq_job.delete()
                     return Response(exc_info, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
                 elif rq_job.is_deferred and rq_id not in queue.deferred_job_registry.get_job_ids():
                     # Sometimes jobs can depend on outdated jobs in the deferred jobs registry.
@@ -3047,6 +3052,7 @@ def _export_annotations(
                     # In the case the server is configured with ONE_RUNNING_JOB_IN_QUEUE_PER_USER
                     # we have to enqueue dependent jobs after canceling one.
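The restored delete calls matter because a finished or failed RQ job would otherwise linger and shadow the next export request. Reduced to its core, the handler's pattern is the following (a simplified paraphrase of the diff, not a verbatim excerpt):

    if rq_job.is_finished:
        rq_job.delete()  # let the next request enqueue a fresh export
        return sendfile(request, file_path, attachment=True)
    elif rq_job.is_failed:
        exc_info = rq_job.meta.get('formatted_exception', str(rq_job.exc_info))
        rq_job.delete()
        return Response(exc_info, status=status.HTTP_500_INTERNAL_SERVER_ERROR)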
                    rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER)
+                    rq_job.delete()
                 else:
                     return Response(status=status.HTTP_202_ACCEPTED)
     try:

From e76382d2de7a5ccd80054374b402cc461d253bb7 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Fri, 17 May 2024 15:06:27 +0300
Subject: [PATCH 29/34] Update downloading test

---
 .../tests/test_rest_api_formats.py | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
index 4e3f59c387f..97ac83f99d7 100644
--- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
+++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
@@ -1610,7 +1610,6 @@ def test_concurrent_download_and_cleanup(self):
         download_url = self._generate_url_dump_tasks_annotations(task_id)
         download_params = {
             "format": format_name,
-            "action": "download",
         }

@@ -1661,9 +1660,12 @@ def patched_osp_exists(path: str):
         ):
             mock_osp_exists.side_effect = patched_osp_exists

-            self._download_file(
-                download_url, download_params, self.admin, osp.join(temp_dir, "export.zip")
-            )
+            response = self._get_request_with_data(download_url, download_params, self.admin)
+            self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+            content = BytesIO(b"".join(response.streaming_content))
+            with open(osp.join(temp_dir, "export.zip"), "wb") as f:
+                f.write(content.getvalue())

             mock_osp_exists.assert_called()

@@ -1726,16 +1728,17 @@ def patched_export(*args, **kwargs):

             return result

-        with (
-            patch('cvat.apps.dataset_manager.views.export', new=patched_export),
-            TemporaryDirectory() as temp_dir,
-        ):
-            self._download_file(
-                download_url, download_params, self.admin, osp.join(temp_dir, "export.zip")
-            )
+        with patch('cvat.apps.dataset_manager.views.export', new=patched_export):
+            response = self._get_request_with_data(download_url, download_params, self.admin)
+            self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+            response = self._get_request_with_data(download_url, download_params, self.admin)
+            self.assertEqual(response.status_code, status.HTTP_201_CREATED)

         export_instance_time = parse_export_filename(export_path).instance_timestamp

+        download_params["action"] = "download"
+
         processes_finished_correctly = False
         with ExitStack() as es:
             # Run both operations concurrently
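The updated test now follows the two-phase client flow: poll until the export is prepared, then request the file explicitly. Condensed from the test above (helpers and status codes as used there):

    response = self._get_request_with_data(download_url, {"format": format_name}, self.admin)
    assert response.status_code == status.HTTP_202_ACCEPTED  # export has been enqueued

    response = self._get_request_with_data(download_url, {"format": format_name}, self.admin)
    assert response.status_code == status.HTTP_201_CREATED   # export file is ready

    response = self._get_request_with_data(
        download_url, {"format": format_name, "action": "download"}, self.admin
    )
    assert response.status_code == status.HTTP_200_OK        # file contents are streamed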
From 2af76a8418761c06ee5f1a9ea7744d5df423a766 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Mon, 27 May 2024 16:57:12 +0300
Subject: [PATCH 30/34] Fix comments

---
 .../tests/test_rest_api_formats.py | 16 ++++++++--------
 cvat/apps/dataset_manager/util.py  | 12 ++++++------
 cvat/apps/dataset_manager/views.py |  8 +++++---
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
index 97ac83f99d7..3623e0d66a5 100644
--- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
+++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
@@ -32,7 +32,7 @@
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData
 from cvat.apps.dataset_manager.task import TaskAnnotation
-from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_filename
+from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_file_path
 from cvat.apps.engine.models import Task
 from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin
@@ -1524,7 +1524,7 @@ def _clear(*_, file_path: str, file_ctime: str):

         first_export_path = export(dst_format=format_name, task_id=task_id)

-        export_instance_time = parse_export_filename(first_export_path).instance_timestamp
+        export_instance_timestamp = parse_export_file_path(first_export_path).instance_timestamp

         self._create_annotations(task, f'{format_name} many jobs', "default")
@@ -1548,7 +1548,7 @@ def _clear(*_, file_path: str, file_ctime: str):
                     export_checked_the_file, export_created_the_file,
                     export_file_path, clear_removed_the_file,
                 ),
-                kwargs=dict(file_path=first_export_path, file_ctime=export_instance_time),
+                kwargs=dict(file_path=first_export_path, file_ctime=export_instance_timestamp),
             )))

             export_process.start()
@@ -1735,7 +1735,7 @@ def patched_export(*args, **kwargs):
             response = self._get_request_with_data(download_url, download_params, self.admin)
             self.assertEqual(response.status_code, status.HTTP_201_CREATED)

-        export_instance_time = parse_export_filename(export_path).instance_timestamp
+        export_instance_time = parse_export_file_path(export_path).instance_timestamp

         download_params["action"] = "download"
@@ -1891,7 +1891,7 @@ def test_cleanup_can_remove_file(self):
             mock_rq_get_current_job.return_value = MagicMock(timeout=5)

             export_path = export(dst_format=format_name, task_id=task_id)
-            file_ctime = parse_export_filename(export_path).instance_timestamp
+            file_ctime = parse_export_file_path(export_path).instance_timestamp
             clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock())

         self.assertFalse(osp.isfile(export_path))
@@ -1924,7 +1924,7 @@ def test_cleanup_can_request_retry_on_locking_failure(self):
             mock_rq_job = MagicMock(timeout=5)
             mock_rq_get_current_job.return_value = mock_rq_job

-            file_ctime = parse_export_filename(export_path).instance_timestamp
+            file_ctime = parse_export_file_path(export_path).instance_timestamp
             clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock())

         mock_get_export_cache_lock.assert_called()
@@ -1968,7 +1968,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self):
             mock_rq_get_current_job.return_value = mock_rq_job

             export_path = export(dst_format=format_name, task_id=task_id)
-            file_ctime = parse_export_filename(export_path).instance_timestamp
+            file_ctime = parse_export_file_path(export_path).instance_timestamp
             clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock())

         self.assertEqual(mock_rq_job.retries_left, 1)
@@ -1994,7 +1994,7 @@ def test_cleanup_can_be_called_with_old_signature(self):

             export_path = export(dst_format=format_name, task_id=task_id)

-        file_ctime = parse_export_filename(export_path).instance_timestamp
+        file_ctime = parse_export_file_path(export_path).instance_timestamp
         old_kwargs = {
             'file_path': export_path,
             'file_ctime': file_ctime,
             'logger': MagicMock(),
         }
diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index 1be5dd109b8..e910245a2db 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -99,7 +99,7 @@ class LockNotAvailableError(Exception):

 def make_export_cache_lock_key(filename: os.PathLike[str]) -> str:
-    return f"export_lock:{filename}"
+    return f"export_lock:{os.fspath(filename)}"

@@ -160,18 +160,18 @@ def get_export_cache_dir(db_instance: Project | Task | Job) -> str:
 def make_export_filename(
     dst_dir: str,
     save_images: bool,
-    instance_update_time: datetime,
+    instance_timestamp: float,
     format_name: str,
 ) -> str:
     from .formats.registry import EXPORT_FORMATS
     file_ext = EXPORT_FORMATS[format_name].EXT

-    filename = '%s-instance%s-%s.%s' % (
+    filename = '%s-instance%f-%s.%s' % (
         'dataset' if save_images else 'annotations',
         # store the instance timestamp in the file name to reliably get this information
         # ctime / mtime do not return file creation time on linux
         # mtime is used for file usage checks
-        instance_update_time.timestamp(),
+        instance_timestamp,
         make_file_name(to_snake_case(format_name)),
         file_ext,
     )
@@ -187,11 +187,11 @@ class ParsedExportFilename:
     file_ext: str


-def parse_export_filename(file_path: os.PathLike[str]) -> ParsedExportFilename:
+def parse_export_file_path(file_path: os.PathLike[str]) -> ParsedExportFilename:
     file_path = osp.normpath(file_path)
     dirname, basename = osp.split(file_path)

-    basename_match = re.match(
+    basename_match = re.fullmatch(
         (
             r'(?P<export_mode>dataset|annotations)'
             r'(?:-instance(?P<instance_timestamp>\d+\.\d+))?' # optional for backward compatibility
diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index e21ffed3595..53cbdd5c03b 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -24,7 +24,7 @@
     LockNotAvailableError,
     current_function_name, get_export_cache_lock,
     get_export_cache_dir, make_export_filename,
-    parse_export_filename
+    parse_export_file_path
 )
 from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import
@@ -90,7 +90,9 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No
         ))
         instance_update_time = max(tasks_update + [instance_update_time])

-    output_path = make_export_filename(cache_dir, save_images, instance_update_time, dst_format)
+    output_path = make_export_filename(
+        cache_dir, save_images, instance_update_time.timestamp(), dst_format
+    )

     os.makedirs(cache_dir, exist_ok=True)
@@ -172,7 +174,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger
     if not osp.exists(file_path):
         raise FileNotFoundError("Export cache file '{}' doesn't exist".format(file_path))

-    parsed_filename = parse_export_filename(file_path)
+    parsed_filename = parse_export_file_path(file_path)
     cache_ttl = get_export_cache_ttl(parsed_filename.instance_type)

     if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds():
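A quick round trip over the renamed helpers, assuming the regex group names export_mode and instance_timestamp shown above; the cache directory is a made-up example, and %f keeps six decimal places, which the \d+\.\d+ group reads back:

    from cvat.apps.dataset_manager.util import make_export_filename, parse_export_file_path

    ts = 1716800000.0  # an instance update time as a float timestamp
    path = make_export_filename('/data/export_cache', False, ts, 'CVAT for images 1.1')
    # e.g. '/data/export_cache/annotations-instance1716800000.000000-<format tag>.zip'

    parsed = parse_export_file_path(path)
    assert float(parsed.instance_timestamp) == ts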
From aafc36930cbb7388996b3485b7b3ce3b1d183672 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Mon, 27 May 2024 17:04:23 +0300
Subject: [PATCH 31/34] Remove unused import

---
 cvat/apps/dataset_manager/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index e910245a2db..b46e67ba2ad 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -10,7 +10,7 @@
 import zipfile
 from contextlib import contextmanager
 from copy import deepcopy
-from datetime import datetime, timedelta
+from datetime import timedelta
 from threading import Lock
 from typing import Any, Generator, Optional, Sequence

From 275a2747078e6ed85e0ea68b1d7a71d4b4b8506b Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Tue, 28 May 2024 11:17:38 +0300
Subject: [PATCH 32/34] Raise an exception on unlocking expired lock

---
 .../dataset_manager/tests/test_rest_api_formats.py | 10 ++++++++++
 cvat/apps/dataset_manager/util.py                  |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
index 3623e0d66a5..b8be48b173b 100644
--- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
+++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py
@@ -19,6 +19,7 @@
 from functools import partial
 from io import BytesIO
 from tempfile import TemporaryDirectory
+from time import sleep
 from typing import Any, Callable, ClassVar, Optional, overload
 from unittest.mock import MagicMock, patch, DEFAULT as MOCK_DEFAULT
@@ -32,6 +33,7 @@
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData
 from cvat.apps.dataset_manager.task import TaskAnnotation
+from cvat.apps.dataset_manager.util import get_export_cache_lock
 from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_file_path
 from cvat.apps.engine.models import Task
 from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin
@@ -1812,6 +1814,14 @@ def test_export_can_create_file_and_cleanup_job(self):
         self.assertTrue(osp.isfile(export_path))
         mock_rq_scheduler.enqueue_in.assert_called_once()

+    def test_export_cache_lock_can_raise_on_releasing_expired_lock(self):
+        from pottery import ReleaseUnlockedLock
+
+        with self.assertRaises(ReleaseUnlockedLock):
+            lock_time = 2
+            with get_export_cache_lock('test_export_path', ttl=lock_time, acquire_timeout=5):
+                sleep(lock_time + 1)
+
     def test_export_can_request_retry_on_locking_failure(self):
         format_name = "CVAT for images 1.1"
         images = self._generate_task_images(3)
diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index b46e67ba2ad..064303f84f8 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -131,15 +131,15 @@ def get_export_cache_lock(
         masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)},
         auto_release_time=ttl,
     )
+    acquired = lock.acquire(blocking=block, timeout=acquire_timeout)
     try:
-        acquired = lock.acquire(blocking=block, timeout=acquire_timeout)
         if acquired:
             yield lock
         else:
             raise LockNotAvailableError

     finally:
-        if lock.locked():
+        if acquired:
             lock.release()

From 297d6866a307e21baaa2147364f85e3a643f8be5 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Tue, 28 May 2024 11:22:23 +0300
Subject: [PATCH 33/34] Remove unexpected requirements changes

---
 cvat/requirements/base.txt        | 2 --
 cvat/requirements/development.txt | 2 --
 cvat/requirements/testing.txt     | 2 --
 3 files changed, 6 deletions(-)

diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index 6629be5b038..fbfbd4dd3a3 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -6,8 +6,6 @@
 #    pip-compile-multi
 #
 -r ../../utils/dataset_manifest/requirements.txt
---no-binary lxml
---no-binary xmlsec

 asgiref==3.8.1
     # via django
diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt
index f5af9280496..52a87e9fb59 100644
--- a/cvat/requirements/development.txt
+++ b/cvat/requirements/development.txt
@@ -6,8 +6,6 @@
 #    pip-compile-multi
 #
 -r base.txt
---no-binary lxml
---no-binary xmlsec

 astroid==2.11.7
     # via pylint
diff --git a/cvat/requirements/testing.txt b/cvat/requirements/testing.txt
index b378a14eaae..cf7f07c185d 100644
--- a/cvat/requirements/testing.txt
+++ b/cvat/requirements/testing.txt
@@ -6,8 +6,6 @@
 #    pip-compile-multi
 #
 -r development.txt
---no-binary lxml
---no-binary xmlsec

 coverage==7.2.3
     # via -r cvat/requirements/testing.in
From 664ed2895e150b247023dafafc91fa3d01062ba0 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov
Date: Tue, 28 May 2024 17:30:19 +0300
Subject: [PATCH 34/34] Remove extra requirements

---
 utils/dataset_manifest/requirements.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/utils/dataset_manifest/requirements.txt b/utils/dataset_manifest/requirements.txt
index d9272cb8552..c69cb304507 100644
--- a/utils/dataset_manifest/requirements.txt
+++ b/utils/dataset_manifest/requirements.txt
@@ -5,9 +5,6 @@
 #
 #    pip-compile-multi
 #
---no-binary lxml
---no-binary xmlsec
-
 av==9.2.0
     # via -r utils/dataset_manifest/requirements.in
 natsort==8.0.0
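A practical consequence of patch 32 is that the lock's ttl must now outlive the whole critical section. A sketch of the failure mode the new test pins down, mirroring test_export_cache_lock_can_raise_on_releasing_expired_lock above (the path is a dummy value, as in the test):

    from time import sleep

    from pottery import ReleaseUnlockedLock

    from cvat.apps.dataset_manager.util import get_export_cache_lock

    try:
        with get_export_cache_lock('test_export_path', ttl=2, acquire_timeout=5):
            sleep(3)  # outlive the ttl: Redis drops the key mid-section
    except ReleaseUnlockedLock:
        # The release in the context manager now surfaces the expiry instead of
        # silently ignoring it, signalling that the section ran unprotected.
        ...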