From f00d00e3b0d9f630416e4f062dca1fa4deccc247 Mon Sep 17 00:00:00 2001 From: Tommaso Comparin <3862206+tcompa@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:32:24 +0100 Subject: [PATCH 1/3] New data-retrieval logic for tasks page --- .github/workflows/task_list.yaml | 31 +++ tasks/data_retrieval/.gitignore | 3 + tasks/data_retrieval/create_tasks_data.py | 244 ++++++++++++++++++ tasks/data_retrieval/requirements.txt | 2 + tasks/data_retrieval/sources.txt | 16 ++ .../data_retrieval/create_tasks_data.py | 244 ++++++++++++++++++ 6 files changed, 540 insertions(+) create mode 100644 .github/workflows/task_list.yaml create mode 100644 tasks/data_retrieval/.gitignore create mode 100644 tasks/data_retrieval/create_tasks_data.py create mode 100644 tasks/data_retrieval/requirements.txt create mode 100644 tasks/data_retrieval/sources.txt create mode 100644 tasks_page/data_retrieval/create_tasks_data.py diff --git a/.github/workflows/task_list.yaml b/.github/workflows/task_list.yaml new file mode 100644 index 0000000..069d379 --- /dev/null +++ b/.github/workflows/task_list.yaml @@ -0,0 +1,31 @@ +name: Task-list page + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + workflow_dispatch: + + +jobs: + retrieve-tasks-data: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - uses: actions/setup-python@v5 + with: + python-version: 3.11 + cache: pip + + - name: Install requirements + run: python3 -m pip install -r tasks/data_retrieval/requirements.txt + + - name: Fetch tasks data + run: python3 -u tasks/data_retrieval/create_tasks_data.py + + - run: cat tasks/data_retrieval/tasks_data.json \ No newline at end of file diff --git a/tasks/data_retrieval/.gitignore b/tasks/data_retrieval/.gitignore new file mode 100644 index 0000000..3d4072f --- /dev/null +++ b/tasks/data_retrieval/.gitignore @@ -0,0 +1,3 @@ +downloads +venv +tasks_data.json diff --git a/tasks/data_retrieval/create_tasks_data.py b/tasks/data_retrieval/create_tasks_data.py new file mode 100644 index 0000000..ec85595 --- /dev/null +++ b/tasks/data_retrieval/create_tasks_data.py @@ -0,0 +1,244 @@ +import time +from pathlib import Path +import requests +import json +from os import environ +from zipfile import ZipFile +from typing import Any +from pydantic import BaseModel +from typing import Optional, Literal + + +DOWNLOAD_FOLDER = Path(__file__).parent / "downloads" +DOWNLOAD_FOLDER.mkdir(exist_ok=True) + + +class TaskReadV2(BaseModel): + """ + Based on + https://github.com/fractal-analytics-platform/fractal-server/blob/main/fractal_server/app/schemas/v2/task.py + """ + + name: str + type: Literal["parallel", "non_parallel", "compound"] + source: Optional[str] = None + version: Optional[str] = None + docs_info: Optional[str] = None + docs_link: Optional[str] = None + input_types: dict[str, bool] + output_types: dict[str, bool] + category: Optional[str] = None + modality: Optional[str] = None + authors: Optional[str] = None + tags: list[str] + + class Config: + extra = "forbid" + + +def parse_wheel_filename(wheel_path: str) -> dict[str, str]: + """ + Given a wheel-file name or path, extract distribution and version. + """ + wheel_filename = wheel_path.split("/")[-1] + parts = wheel_filename.split("-") + return dict(name=parts[0], version=parts[1]) + + +def load_manifest_from_zip(wheel_path: str) -> dict[str, Any]: + """ + Given a wheel file on-disk, extract the Fractal manifest. + """ + with ZipFile(wheel_path) as wheel: + namelist = wheel.namelist() + try: + manifest = next( + name for name in namelist if "__FRACTAL_MANIFEST__.json" in name + ) + except StopIteration: + msg = f"{wheel_path} does not include __FRACTAL_MANIFEST__.json" + raise ValueError(msg) + with wheel.open(manifest) as manifest_fd: + manifest_dict = json.load(manifest_fd) + return manifest_dict + + +def download_file(url: str) -> str: + file_name = url.split("/")[-1] + response = requests.get(url, stream=True) + file_path = (DOWNLOAD_FOLDER / file_name).as_posix() + with open(file_path, "wb") as f: + for data in response.iter_content(): + f.write(data) + return file_path + + +def handle_pypi_project(pypi_project_url: str) -> dict[str, Any]: + """ + Example: https://pypi.org/project/fractal-tasks-core + """ + + # Extract project_name + parts = pypi_project_url.split("/") + if parts[:4] != ["https:", "", "pypi.org", "project"]: + raise ValueError( + f"Invalid {pypi_project_url=}.\n" + "Valid example: https://pypi.org/project/fractal-tasks-core" + ) + project_name = parts[4] + + # Fetch and parse PyPI information + pypi_api_url = f"https://pypi.org/pypi/{project_name}/json" + res = requests.get(pypi_api_url) + response_data = res.json() + if not res.status_code == 200: + raise RuntimeError(f"Invalid response from {pypi_api_url}: {res}") + latest_version = response_data["info"]["version"] + releases = response_data["releases"] + latest_release = releases[latest_version] + latest_release_wheel_assets = [ + item for item in latest_release if item["filename"].endswith(".whl") + ] + if len(latest_release_wheel_assets) > 1: + raise ValueError( + f"Found more than one wheel asset in release {latest_version}." + ) + latest_release_wheel_asset = latest_release_wheel_assets[0] + latest_release_wheel_asset_url = latest_release_wheel_asset["url"] + + # Download wheel and parse manifest + wheel_path = download_file(latest_release_wheel_asset_url) + info = parse_wheel_filename(wheel_path) + manifest = load_manifest_from_zip(wheel_path) + Path(wheel_path).unlink() + + return dict(manifest=manifest, **info) + + +def handle_github_repository(github_url: str) -> dict[str, Any]: + """ + Example: + https://github.com/fractal-analytics-platform/fractal-lif-converters/ + """ + + # Extract owner and repository + parts = github_url.split("/") + if parts[:3] != ["https:", "", "github.com"]: + print(parts) + raise ValueError( + f"Invalid {github_url=}.\n" + "Valid example: https://github.com/fractal-analytics-platform/fractal-lif-converters" + ) + owner, repository = parts[3:5] + + # Fetch and parse GitHub information + github_api_url = ( + f"https://api.github.com/repos/{owner}/{repository}/releases/latest" + ) + headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + res = requests.get(github_api_url, headers=headers) + if not res.status_code == 200: + raise RuntimeError(f"Invalid response from {github_api_url}: {res}") + assets = res.json()["assets"] + wheel_assets = [asset for asset in assets if asset["name"].endswith(".whl")] + if len(wheel_assets) > 1: + raise ValueError("Found more than one wheel asset in latest GitHub release.") + wheel_asset = wheel_assets[0] + wheel_asset_browser_download_url = wheel_asset["browser_download_url"] + + # Download wheel and parse manifest + wheel_path = download_file(wheel_asset_browser_download_url) + info = parse_wheel_filename(wheel_path) + manifest = load_manifest_from_zip(wheel_path) + Path(wheel_path).unlink() + + return dict(manifest=manifest, **info) + + +def get_package_info(source: str) -> dict[str, Any]: + if source.startswith("https://github.com"): + return handle_github_repository(source) + elif source.startswith("https://pypi.org"): + return handle_pypi_project(source) + else: + raise ValueError(f"Invalid {source=}.") + + +def _get_task_type( + task: dict[str, Any], +) -> Literal["parallel", "non_parallel", "compound"]: + np = task.get("executable_non_parallel", None) + p = task.get("executable_parallel", None) + if p and np: + return "compound" + elif p and not np: + return "parallel" + elif np and not p: + return "non_parallel" + else: + raise ValueError(f"Invalid task with {p=} and {np=}.") + + +COLUMN_NAMES = [ + "version", + "name", + "category", + "modality", + "tags", + "input_types", + "output_types", + "docs_link", +] +COLUMN_DEFAULTS = { + "input_types": {}, + "output_types": {}, + "tags": [], +} +COLUMN_TITLES = list(map(str.title, COLUMN_NAMES)) + + +# Read and filter list of sources +sources_file = Path(__file__).parent / "sources.txt" +with sources_file.open("r") as f: + sources = f.read().splitlines() +sources = [ + source + for source in sources + if not (source.startswith("#") or source == "") +] + +TASKS = [] +for source in sources: + t_start = time.perf_counter() + print(f"START processing {source=}") + try: + new_tasks = [] + data = get_package_info(source) + pkg_name = data["name"] + pkg_version = data.get("version") + pkg_task_list = data["manifest"]["task_list"] + for task in pkg_task_list: + new_task = dict() + for column_name in COLUMN_NAMES: + new_task[column_name] = task.get( + column_name, COLUMN_DEFAULTS.get(column_name, None) + ) + new_task["version"] = pkg_version + new_task["type"] = _get_task_type(task) + TaskReadV2(**new_task) + new_tasks.append(new_task) + except Exception as e: + print(f"ERROR, skip.\nOriginal error:\n{str(e)}") + TASKS.extend(new_tasks) + t_end = time.perf_counter() + print(f"END processing {source=} - elapsed {t_end-t_start:.3f} s.") + print() + +output_file = Path(__file__).parent / "tasks_data.json" +with output_file.open("w") as f: + json.dump(TASKS, f, indent=2) + +DOWNLOAD_FOLDER.rmdir() diff --git a/tasks/data_retrieval/requirements.txt b/tasks/data_retrieval/requirements.txt new file mode 100644 index 0000000..e614a88 --- /dev/null +++ b/tasks/data_retrieval/requirements.txt @@ -0,0 +1,2 @@ +requests +pydantic diff --git a/tasks/data_retrieval/sources.txt b/tasks/data_retrieval/sources.txt new file mode 100644 index 0000000..a7281bf --- /dev/null +++ b/tasks/data_retrieval/sources.txt @@ -0,0 +1,16 @@ +# PyPI +https://pypi.org/project/fractal-tasks-core/ +https://pypi.org/project/fractal-faim-ipa +https://pypi.org/project/fractal-lif-converters +https://pypi.org/project/operetta-compose + +# GitHub releases with wheels +https://github.com/fractal-analytics-platform/fractal-lif-converters/ + + +# https://github.com/fractal-analytics-platform/fractal-helper-tasks +# https://github.com/fmi-basel/gliberal-scMultipleX +# https://github.com/Apricot-Therapeutics/APx_fractal_task_collection +# https://github.com/fractal-analytics-platform/fractal-plantseg-tasks +# https://github.com/m-albert/fractal-ome-zarr-hcs-stitching/archive +# https://github.com/fractal-analytics-platform/fractal-ilastik-tasksC/archive/refs/tags/0.1.1.zip diff --git a/tasks_page/data_retrieval/create_tasks_data.py b/tasks_page/data_retrieval/create_tasks_data.py new file mode 100644 index 0000000..ec85595 --- /dev/null +++ b/tasks_page/data_retrieval/create_tasks_data.py @@ -0,0 +1,244 @@ +import time +from pathlib import Path +import requests +import json +from os import environ +from zipfile import ZipFile +from typing import Any +from pydantic import BaseModel +from typing import Optional, Literal + + +DOWNLOAD_FOLDER = Path(__file__).parent / "downloads" +DOWNLOAD_FOLDER.mkdir(exist_ok=True) + + +class TaskReadV2(BaseModel): + """ + Based on + https://github.com/fractal-analytics-platform/fractal-server/blob/main/fractal_server/app/schemas/v2/task.py + """ + + name: str + type: Literal["parallel", "non_parallel", "compound"] + source: Optional[str] = None + version: Optional[str] = None + docs_info: Optional[str] = None + docs_link: Optional[str] = None + input_types: dict[str, bool] + output_types: dict[str, bool] + category: Optional[str] = None + modality: Optional[str] = None + authors: Optional[str] = None + tags: list[str] + + class Config: + extra = "forbid" + + +def parse_wheel_filename(wheel_path: str) -> dict[str, str]: + """ + Given a wheel-file name or path, extract distribution and version. + """ + wheel_filename = wheel_path.split("/")[-1] + parts = wheel_filename.split("-") + return dict(name=parts[0], version=parts[1]) + + +def load_manifest_from_zip(wheel_path: str) -> dict[str, Any]: + """ + Given a wheel file on-disk, extract the Fractal manifest. + """ + with ZipFile(wheel_path) as wheel: + namelist = wheel.namelist() + try: + manifest = next( + name for name in namelist if "__FRACTAL_MANIFEST__.json" in name + ) + except StopIteration: + msg = f"{wheel_path} does not include __FRACTAL_MANIFEST__.json" + raise ValueError(msg) + with wheel.open(manifest) as manifest_fd: + manifest_dict = json.load(manifest_fd) + return manifest_dict + + +def download_file(url: str) -> str: + file_name = url.split("/")[-1] + response = requests.get(url, stream=True) + file_path = (DOWNLOAD_FOLDER / file_name).as_posix() + with open(file_path, "wb") as f: + for data in response.iter_content(): + f.write(data) + return file_path + + +def handle_pypi_project(pypi_project_url: str) -> dict[str, Any]: + """ + Example: https://pypi.org/project/fractal-tasks-core + """ + + # Extract project_name + parts = pypi_project_url.split("/") + if parts[:4] != ["https:", "", "pypi.org", "project"]: + raise ValueError( + f"Invalid {pypi_project_url=}.\n" + "Valid example: https://pypi.org/project/fractal-tasks-core" + ) + project_name = parts[4] + + # Fetch and parse PyPI information + pypi_api_url = f"https://pypi.org/pypi/{project_name}/json" + res = requests.get(pypi_api_url) + response_data = res.json() + if not res.status_code == 200: + raise RuntimeError(f"Invalid response from {pypi_api_url}: {res}") + latest_version = response_data["info"]["version"] + releases = response_data["releases"] + latest_release = releases[latest_version] + latest_release_wheel_assets = [ + item for item in latest_release if item["filename"].endswith(".whl") + ] + if len(latest_release_wheel_assets) > 1: + raise ValueError( + f"Found more than one wheel asset in release {latest_version}." + ) + latest_release_wheel_asset = latest_release_wheel_assets[0] + latest_release_wheel_asset_url = latest_release_wheel_asset["url"] + + # Download wheel and parse manifest + wheel_path = download_file(latest_release_wheel_asset_url) + info = parse_wheel_filename(wheel_path) + manifest = load_manifest_from_zip(wheel_path) + Path(wheel_path).unlink() + + return dict(manifest=manifest, **info) + + +def handle_github_repository(github_url: str) -> dict[str, Any]: + """ + Example: + https://github.com/fractal-analytics-platform/fractal-lif-converters/ + """ + + # Extract owner and repository + parts = github_url.split("/") + if parts[:3] != ["https:", "", "github.com"]: + print(parts) + raise ValueError( + f"Invalid {github_url=}.\n" + "Valid example: https://github.com/fractal-analytics-platform/fractal-lif-converters" + ) + owner, repository = parts[3:5] + + # Fetch and parse GitHub information + github_api_url = ( + f"https://api.github.com/repos/{owner}/{repository}/releases/latest" + ) + headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + res = requests.get(github_api_url, headers=headers) + if not res.status_code == 200: + raise RuntimeError(f"Invalid response from {github_api_url}: {res}") + assets = res.json()["assets"] + wheel_assets = [asset for asset in assets if asset["name"].endswith(".whl")] + if len(wheel_assets) > 1: + raise ValueError("Found more than one wheel asset in latest GitHub release.") + wheel_asset = wheel_assets[0] + wheel_asset_browser_download_url = wheel_asset["browser_download_url"] + + # Download wheel and parse manifest + wheel_path = download_file(wheel_asset_browser_download_url) + info = parse_wheel_filename(wheel_path) + manifest = load_manifest_from_zip(wheel_path) + Path(wheel_path).unlink() + + return dict(manifest=manifest, **info) + + +def get_package_info(source: str) -> dict[str, Any]: + if source.startswith("https://github.com"): + return handle_github_repository(source) + elif source.startswith("https://pypi.org"): + return handle_pypi_project(source) + else: + raise ValueError(f"Invalid {source=}.") + + +def _get_task_type( + task: dict[str, Any], +) -> Literal["parallel", "non_parallel", "compound"]: + np = task.get("executable_non_parallel", None) + p = task.get("executable_parallel", None) + if p and np: + return "compound" + elif p and not np: + return "parallel" + elif np and not p: + return "non_parallel" + else: + raise ValueError(f"Invalid task with {p=} and {np=}.") + + +COLUMN_NAMES = [ + "version", + "name", + "category", + "modality", + "tags", + "input_types", + "output_types", + "docs_link", +] +COLUMN_DEFAULTS = { + "input_types": {}, + "output_types": {}, + "tags": [], +} +COLUMN_TITLES = list(map(str.title, COLUMN_NAMES)) + + +# Read and filter list of sources +sources_file = Path(__file__).parent / "sources.txt" +with sources_file.open("r") as f: + sources = f.read().splitlines() +sources = [ + source + for source in sources + if not (source.startswith("#") or source == "") +] + +TASKS = [] +for source in sources: + t_start = time.perf_counter() + print(f"START processing {source=}") + try: + new_tasks = [] + data = get_package_info(source) + pkg_name = data["name"] + pkg_version = data.get("version") + pkg_task_list = data["manifest"]["task_list"] + for task in pkg_task_list: + new_task = dict() + for column_name in COLUMN_NAMES: + new_task[column_name] = task.get( + column_name, COLUMN_DEFAULTS.get(column_name, None) + ) + new_task["version"] = pkg_version + new_task["type"] = _get_task_type(task) + TaskReadV2(**new_task) + new_tasks.append(new_task) + except Exception as e: + print(f"ERROR, skip.\nOriginal error:\n{str(e)}") + TASKS.extend(new_tasks) + t_end = time.perf_counter() + print(f"END processing {source=} - elapsed {t_end-t_start:.3f} s.") + print() + +output_file = Path(__file__).parent / "tasks_data.json" +with output_file.open("w") as f: + json.dump(TASKS, f, indent=2) + +DOWNLOAD_FOLDER.rmdir() From 5f6720dd9ff38869a605798199673d1b6dd14229 Mon Sep 17 00:00:00 2001 From: Tommaso Comparin <3862206+tcompa@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:38:00 +0100 Subject: [PATCH 2/3] remove obsolete file --- .../data_retrieval/create_tasks_data.py | 244 ------------------ 1 file changed, 244 deletions(-) delete mode 100644 tasks_page/data_retrieval/create_tasks_data.py diff --git a/tasks_page/data_retrieval/create_tasks_data.py b/tasks_page/data_retrieval/create_tasks_data.py deleted file mode 100644 index ec85595..0000000 --- a/tasks_page/data_retrieval/create_tasks_data.py +++ /dev/null @@ -1,244 +0,0 @@ -import time -from pathlib import Path -import requests -import json -from os import environ -from zipfile import ZipFile -from typing import Any -from pydantic import BaseModel -from typing import Optional, Literal - - -DOWNLOAD_FOLDER = Path(__file__).parent / "downloads" -DOWNLOAD_FOLDER.mkdir(exist_ok=True) - - -class TaskReadV2(BaseModel): - """ - Based on - https://github.com/fractal-analytics-platform/fractal-server/blob/main/fractal_server/app/schemas/v2/task.py - """ - - name: str - type: Literal["parallel", "non_parallel", "compound"] - source: Optional[str] = None - version: Optional[str] = None - docs_info: Optional[str] = None - docs_link: Optional[str] = None - input_types: dict[str, bool] - output_types: dict[str, bool] - category: Optional[str] = None - modality: Optional[str] = None - authors: Optional[str] = None - tags: list[str] - - class Config: - extra = "forbid" - - -def parse_wheel_filename(wheel_path: str) -> dict[str, str]: - """ - Given a wheel-file name or path, extract distribution and version. - """ - wheel_filename = wheel_path.split("/")[-1] - parts = wheel_filename.split("-") - return dict(name=parts[0], version=parts[1]) - - -def load_manifest_from_zip(wheel_path: str) -> dict[str, Any]: - """ - Given a wheel file on-disk, extract the Fractal manifest. - """ - with ZipFile(wheel_path) as wheel: - namelist = wheel.namelist() - try: - manifest = next( - name for name in namelist if "__FRACTAL_MANIFEST__.json" in name - ) - except StopIteration: - msg = f"{wheel_path} does not include __FRACTAL_MANIFEST__.json" - raise ValueError(msg) - with wheel.open(manifest) as manifest_fd: - manifest_dict = json.load(manifest_fd) - return manifest_dict - - -def download_file(url: str) -> str: - file_name = url.split("/")[-1] - response = requests.get(url, stream=True) - file_path = (DOWNLOAD_FOLDER / file_name).as_posix() - with open(file_path, "wb") as f: - for data in response.iter_content(): - f.write(data) - return file_path - - -def handle_pypi_project(pypi_project_url: str) -> dict[str, Any]: - """ - Example: https://pypi.org/project/fractal-tasks-core - """ - - # Extract project_name - parts = pypi_project_url.split("/") - if parts[:4] != ["https:", "", "pypi.org", "project"]: - raise ValueError( - f"Invalid {pypi_project_url=}.\n" - "Valid example: https://pypi.org/project/fractal-tasks-core" - ) - project_name = parts[4] - - # Fetch and parse PyPI information - pypi_api_url = f"https://pypi.org/pypi/{project_name}/json" - res = requests.get(pypi_api_url) - response_data = res.json() - if not res.status_code == 200: - raise RuntimeError(f"Invalid response from {pypi_api_url}: {res}") - latest_version = response_data["info"]["version"] - releases = response_data["releases"] - latest_release = releases[latest_version] - latest_release_wheel_assets = [ - item for item in latest_release if item["filename"].endswith(".whl") - ] - if len(latest_release_wheel_assets) > 1: - raise ValueError( - f"Found more than one wheel asset in release {latest_version}." - ) - latest_release_wheel_asset = latest_release_wheel_assets[0] - latest_release_wheel_asset_url = latest_release_wheel_asset["url"] - - # Download wheel and parse manifest - wheel_path = download_file(latest_release_wheel_asset_url) - info = parse_wheel_filename(wheel_path) - manifest = load_manifest_from_zip(wheel_path) - Path(wheel_path).unlink() - - return dict(manifest=manifest, **info) - - -def handle_github_repository(github_url: str) -> dict[str, Any]: - """ - Example: - https://github.com/fractal-analytics-platform/fractal-lif-converters/ - """ - - # Extract owner and repository - parts = github_url.split("/") - if parts[:3] != ["https:", "", "github.com"]: - print(parts) - raise ValueError( - f"Invalid {github_url=}.\n" - "Valid example: https://github.com/fractal-analytics-platform/fractal-lif-converters" - ) - owner, repository = parts[3:5] - - # Fetch and parse GitHub information - github_api_url = ( - f"https://api.github.com/repos/{owner}/{repository}/releases/latest" - ) - headers = { - "Accept": "application/vnd.github+json", - "X-GitHub-Api-Version": "2022-11-28", - } - res = requests.get(github_api_url, headers=headers) - if not res.status_code == 200: - raise RuntimeError(f"Invalid response from {github_api_url}: {res}") - assets = res.json()["assets"] - wheel_assets = [asset for asset in assets if asset["name"].endswith(".whl")] - if len(wheel_assets) > 1: - raise ValueError("Found more than one wheel asset in latest GitHub release.") - wheel_asset = wheel_assets[0] - wheel_asset_browser_download_url = wheel_asset["browser_download_url"] - - # Download wheel and parse manifest - wheel_path = download_file(wheel_asset_browser_download_url) - info = parse_wheel_filename(wheel_path) - manifest = load_manifest_from_zip(wheel_path) - Path(wheel_path).unlink() - - return dict(manifest=manifest, **info) - - -def get_package_info(source: str) -> dict[str, Any]: - if source.startswith("https://github.com"): - return handle_github_repository(source) - elif source.startswith("https://pypi.org"): - return handle_pypi_project(source) - else: - raise ValueError(f"Invalid {source=}.") - - -def _get_task_type( - task: dict[str, Any], -) -> Literal["parallel", "non_parallel", "compound"]: - np = task.get("executable_non_parallel", None) - p = task.get("executable_parallel", None) - if p and np: - return "compound" - elif p and not np: - return "parallel" - elif np and not p: - return "non_parallel" - else: - raise ValueError(f"Invalid task with {p=} and {np=}.") - - -COLUMN_NAMES = [ - "version", - "name", - "category", - "modality", - "tags", - "input_types", - "output_types", - "docs_link", -] -COLUMN_DEFAULTS = { - "input_types": {}, - "output_types": {}, - "tags": [], -} -COLUMN_TITLES = list(map(str.title, COLUMN_NAMES)) - - -# Read and filter list of sources -sources_file = Path(__file__).parent / "sources.txt" -with sources_file.open("r") as f: - sources = f.read().splitlines() -sources = [ - source - for source in sources - if not (source.startswith("#") or source == "") -] - -TASKS = [] -for source in sources: - t_start = time.perf_counter() - print(f"START processing {source=}") - try: - new_tasks = [] - data = get_package_info(source) - pkg_name = data["name"] - pkg_version = data.get("version") - pkg_task_list = data["manifest"]["task_list"] - for task in pkg_task_list: - new_task = dict() - for column_name in COLUMN_NAMES: - new_task[column_name] = task.get( - column_name, COLUMN_DEFAULTS.get(column_name, None) - ) - new_task["version"] = pkg_version - new_task["type"] = _get_task_type(task) - TaskReadV2(**new_task) - new_tasks.append(new_task) - except Exception as e: - print(f"ERROR, skip.\nOriginal error:\n{str(e)}") - TASKS.extend(new_tasks) - t_end = time.perf_counter() - print(f"END processing {source=} - elapsed {t_end-t_start:.3f} s.") - print() - -output_file = Path(__file__).parent / "tasks_data.json" -with output_file.open("w") as f: - json.dump(TASKS, f, indent=2) - -DOWNLOAD_FOLDER.rmdir() From 7379d138b0a4ae9369f7afa61f2cd7c8b52c72ba Mon Sep 17 00:00:00 2001 From: Tommaso Comparin <3862206+tcompa@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:40:25 +0100 Subject: [PATCH 3/3] Make output comply with task-group endpoint --- tasks/data_retrieval/create_tasks_data.py | 27 ++++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tasks/data_retrieval/create_tasks_data.py b/tasks/data_retrieval/create_tasks_data.py index ec85595..9b8821c 100644 --- a/tasks/data_retrieval/create_tasks_data.py +++ b/tasks/data_retrieval/create_tasks_data.py @@ -2,7 +2,6 @@ from pathlib import Path import requests import json -from os import environ from zipfile import ZipFile from typing import Any from pydantic import BaseModel @@ -36,6 +35,12 @@ class Config: extra = "forbid" +class TaskGroupReadV2(BaseModel): + pkg_name: str + version: Optional[str] = None + task_list: list[TaskReadV2] + + def parse_wheel_filename(wheel_path: str) -> dict[str, str]: """ Given a wheel-file name or path, extract distribution and version. @@ -210,12 +215,12 @@ def _get_task_type( if not (source.startswith("#") or source == "") ] -TASKS = [] +TASK_GROUPS = [] for source in sources: t_start = time.perf_counter() print(f"START processing {source=}") try: - new_tasks = [] + task_list = [] data = get_package_info(source) pkg_name = data["name"] pkg_version = data.get("version") @@ -229,16 +234,26 @@ def _get_task_type( new_task["version"] = pkg_version new_task["type"] = _get_task_type(task) TaskReadV2(**new_task) - new_tasks.append(new_task) + task_list.append(new_task) + + task_group = dict( + pkg_name=pkg_name, + version=pkg_version, + task_list=task_list, + ) except Exception as e: print(f"ERROR, skip.\nOriginal error:\n{str(e)}") - TASKS.extend(new_tasks) + + TaskGroupReadV2(**task_group) + + TASK_GROUPS.append(task_group) + t_end = time.perf_counter() print(f"END processing {source=} - elapsed {t_end-t_start:.3f} s.") print() output_file = Path(__file__).parent / "tasks_data.json" with output_file.open("w") as f: - json.dump(TASKS, f, indent=2) + json.dump(TASK_GROUPS, f, indent=2) DOWNLOAD_FOLDER.rmdir()