diff --git a/README.md b/README.md
index 6873234..0b65641 100644
--- a/README.md
+++ b/README.md
@@ -163,13 +163,25 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER
 databusclient deploy --help
 ```
 ```
-Usage: databusclient deploy [OPTIONS] DISTRIBUTIONS...
+Usage: databusclient deploy [OPTIONS] [DISTRIBUTIONS]...
 
-Arguments:
-  DISTRIBUTIONS...  distributions in the form of List[URL|CV|fileext|compression|sha256sum:contentlength] where URL is the
-                    download URL and CV the key=value pairs (_ separted)
-                    content variants of a distribution, fileExt and Compression can be set, if not they are inferred from the path [required]
+  Flexible deploy to databus command:
+
+  - Classic dataset deployment
+
+  - Metadata-based deployment
+
+  - Upload & deploy via Nextcloud
+
+Arguments:
+  DISTRIBUTIONS...  Depending on mode:
+                    - Classic mode: List of distributions in the form
+                      URL|CV|fileext|compression|sha256sum:contentlength
+                      (where URL is the download URL and CV the key=value pairs,
+                      separated by underscores)
+                    - Upload mode: List of local file or folder paths (must exist)
+                    - Metadata mode: None
 
 Options:
   --version-id TEXT  Target databus version/dataset identifier of the form
                      $DATABUS_BASE/$ACCOUNT/$GROUP/$ARTIFACT/$VERSION
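
For orientation, here is a minimal sketch of what classic mode boils down to in Python, using `create_dataset` and `deploy` from `databusclient/client.py` exactly as the CLI calls them below. The import path, URL, content variant, checksum, length, and identifiers are placeholder assumptions, not values from this changeset.

```python
# Sketch of classic-mode deployment (all concrete values are placeholders).
from databusclient import client  # assumes the package layout in this repo

# One distribution string in the documented form:
# URL|CV|fileext|compression|sha256sum:contentlength
distribution = (
    "https://example.org/data/cities.csv"  # hypothetical download URL
    "|lang=en"                             # content-variant key=value pairs ("_"-separated)
    "|csv|none"                            # file extension and compression (placeholders)
    "|" + "0" * 64 + ":12345"              # sha256 hex digest : content length (placeholders)
)

dataid = client.create_dataset(
    version_id="https://databus.dbpedia.org/account/group/artifact/2024.01.01",
    title="Example dataset",
    abstract="One-line summary of the dataset.",
    description="Longer description; Markdown supported.",
    license_url="https://creativecommons.org/licenses/by/4.0/",
    distributions=[distribution],
)
client.deploy(dataid=dataid, api_key="YOUR_API_KEY")
```
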
diff --git a/databusclient/cli.py b/databusclient/cli.py
--- a/databusclient/cli.py
+++ b/databusclient/cli.py
@@ ... @@ def deploy(
+    """
+    Flexible deploy to databus command:
+
+    - Classic dataset deployment
+
+    - Metadata-based deployment (--metadata <file>)
+
+    - Upload & deploy via Nextcloud (--webdav-url, --remote, --path)
+    """
-    click.echo(f"Deploying dataset version: {version_id}")
-    dataid = client.create_dataset(version_id, title, abstract, description, license_url, distributions)
-    client.deploy(dataid=dataid, api_key=apikey)
+
+    # Sanity checks for conflicting options
+    if metadata_file and any([distributions, webdav_url, remote, path]):
+        raise click.UsageError("Invalid combination: when using --metadata, do not provide --webdav-url, --remote, --path, or distributions.")
+    if any([webdav_url, remote, path]) and not all([webdav_url, remote, path]):
+        raise click.UsageError("Invalid combination: when using WebDAV/Nextcloud mode, please provide --webdav-url, --remote, and --path together.")
+
+    # === Mode 1: Classic Deploy ===
+    if distributions and not (metadata_file or webdav_url or remote or path):
+        click.echo("[MODE] Classic deploy with distributions")
+        click.echo(f"Deploying dataset version: {version_id}")
+
+        dataid = client.create_dataset(version_id, title, abstract, description, license_url, distributions)
+        client.deploy(dataid=dataid, api_key=apikey)
+        return
+
+    # === Mode 2: Metadata File ===
+    if metadata_file:
+        click.echo(f"[MODE] Deploy from metadata file: {metadata_file}")
+        with open(metadata_file, 'r') as f:
+            metadata = json.load(f)
+        client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey)
+        return
+
+    # === Mode 3: Upload & Deploy (Nextcloud) ===
+    if webdav_url and remote and path:
+        if not distributions:
+            raise click.UsageError("Please provide files to upload when using WebDAV/Nextcloud mode.")
+
+        # Check that all given paths exist and are files or directories
+        invalid = [f for f in distributions if not os.path.exists(f)]
+        if invalid:
+            raise click.UsageError(f"The following input files or folders do not exist: {', '.join(invalid)}")
+
+        click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud")
+        click.echo(f"→ Uploading to: {remote}:{path}")
+        metadata = upload.upload_to_nextcloud(distributions, remote, path, webdav_url)
+        client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey)
+        return
+
+    raise click.UsageError(
+        "No valid input provided. Please use one of the following modes:\n"
+        " - Classic deploy: pass distributions as arguments\n"
+        " - Metadata deploy: use --metadata <file>\n"
+        " - Upload & deploy: use --webdav-url, --remote, --path, and file arguments"
+    )
 
 
 @app.command()
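
Since Mode 2 above passes `json.load(f)` straight to `deploy_from_metadata`, the `--metadata` file is simply a JSON list of entries with the keys validated by `create_distributions_from_metadata` (next hunk). A sketch of producing such a file; the file name, URL, and checksum are invented for illustration:

```python
# Write a hypothetical metadata.json for: databusclient deploy --metadata metadata.json ...
import json

metadata = [
    {
        "checksum": "a" * 64,                         # SHA-256 hex digest (placeholder)
        "size": 2048,                                 # file size in bytes (placeholder)
        "url": "https://example.org/data/part1.csv",  # download URL (placeholder)
        "file_format": "csv",                         # optional; "compression" is optional too
    },
]

with open("metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
```
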
diff --git a/databusclient/client.py b/databusclient/client.py
index 764bf6b..358f1a6 100644
--- a/databusclient/client.py
+++ b/databusclient/client.py
@@ -7,7 +7,6 @@
 from SPARQLWrapper import SPARQLWrapper, JSON
 from hashlib import sha256
 import os
-import re
 
 __debug = False
@@ -205,6 +204,56 @@ def create_distribution(
     return f"{url}|{meta_string}"
 
 
+def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]]]) -> List[str]:
+    """
+    Create distributions from metadata entries.
+
+    Parameters
+    ----------
+    metadata : List[Dict[str, Union[str, int]]]
+        List of metadata entries, each containing:
+        - checksum: str - SHA-256 hex digest (64 characters)
+        - size: int - File size in bytes (positive integer)
+        - url: str - Download URL for the file
+        - file_format: str - File format of the file [optional]
+        - compression: str - Compression format of the file [optional]
+
+    Returns
+    -------
+    List[str]
+        List of distribution identifier strings for use with create_dataset
+    """
+    distributions = []
+    counter = 0
+
+    for entry in metadata:
+        # Validate required keys
+        required_keys = ["checksum", "size", "url"]
+        missing_keys = [key for key in required_keys if key not in entry]
+        if missing_keys:
+            raise ValueError(f"Metadata entry missing required keys: {missing_keys}")
+
+        checksum = entry["checksum"]
+        size = entry["size"]
+        url = entry["url"]
+        if not isinstance(size, int) or size <= 0:
+            raise ValueError(f"Invalid size for {url}: expected positive integer, got {size}")
+        # Validate SHA-256 hex digest (64 hex chars)
+        if not isinstance(checksum, str) or len(checksum) != 64 or not all(
+                c in '0123456789abcdefABCDEF' for c in checksum):
+            raise ValueError(f"Invalid checksum for {url}")
+
+        distributions.append(
+            create_distribution(
+                url=url,
+                cvs={"count": f"{counter}"},
+                file_format=entry.get("file_format"),
+                compression=entry.get("compression"),
+                sha256_length_tuple=(checksum, size)
+            )
+        )
+        counter += 1
+
+    return distributions
+
+
 def create_dataset(
     version_id: str,
@@ -393,6 +442,55 @@ def deploy(
     print(resp.text)
 
 
+def deploy_from_metadata(
+    metadata: List[Dict[str, Union[str, int]]],
+    version_id: str,
+    title: str,
+    abstract: str,
+    description: str,
+    license_url: str,
+    apikey: str
+) -> None:
+    """
+    Deploy a dataset from metadata entries.
+
+    Parameters
+    ----------
+    metadata : List[Dict[str, Union[str, int]]]
+        List of file metadata entries (see create_distributions_from_metadata)
+    version_id : str
+        Dataset version ID in the form $DATABUS_BASE/$ACCOUNT/$GROUP/$ARTIFACT/$VERSION
+    title : str
+        Dataset title
+    abstract : str
+        Short description of the dataset
+    description : str
+        Long description (Markdown supported)
+    license_url : str
+        License URI
+    apikey : str
+        API key for authentication
+    """
+    distributions = create_distributions_from_metadata(metadata)
+
+    dataset = create_dataset(
+        version_id=version_id,
+        title=title,
+        abstract=abstract,
+        description=description,
+        license_url=license_url,
+        distributions=distributions
+    )
+
+    print(f"Deploying dataset version: {version_id}")
+    deploy(dataset, apikey)
+
+    print(f"Successfully deployed to {version_id}")
+    print(f"Deployed {len(metadata)} file(s):")
+    for entry in metadata:
+        print(f" - {entry['url']}")
+
+
 def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None:
     """
     Download a file from the internet with a progress bar using tqdm.
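
Taken together, the two new helpers can also be driven directly from Python. A minimal sketch, where the version ID, URL, checksum, and API key are placeholders:

```python
# Sketch: build distribution strings from metadata, or deploy them in one call.
from databusclient.client import create_distributions_from_metadata, deploy_from_metadata

metadata = [
    {"checksum": "b" * 64, "size": 1024, "url": "https://example.org/data/graph.ttl"},
]

# Option A: get distribution strings for use with create_dataset(...)
distributions = create_distributions_from_metadata(metadata)

# Option B: create and deploy the dataset version in one step
deploy_from_metadata(
    metadata,
    version_id="https://databus.dbpedia.org/account/group/artifact/2024.01.01",
    title="Example dataset",
    abstract="One-line summary.",
    description="Longer description.",
    license_url="https://creativecommons.org/licenses/by/4.0/",
    apikey="YOUR_API_KEY",
)
```
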
diff --git a/nextcloudclient/__init__.py b/nextcloudclient/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py
new file mode 100644
index 0000000..f0d3328
--- /dev/null
+++ b/nextcloudclient/upload.py
@@ -0,0 +1,82 @@
+import hashlib
+import os
+import subprocess
+import posixpath
+from urllib.parse import urljoin, quote
+
+
+def compute_sha256_and_length(filepath):
+    sha256 = hashlib.sha256()
+    total_length = 0
+    with open(filepath, 'rb') as f:
+        while True:
+            chunk = f.read(4096)
+            if not chunk:
+                break
+            sha256.update(chunk)
+            total_length += len(chunk)
+    return sha256.hexdigest(), total_length
+
+
+def get_all_files(path):
+    if os.path.isfile(path):
+        return [path]
+    files = []
+    for root, _, filenames in os.walk(path):
+        for name in filenames:
+            files.append(os.path.join(root, name))
+    return files
+
+
+def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: str, webdav_url: str):
+    result = []
+    for path in source_paths:
+        if not os.path.exists(path):
+            print(f"Path not found: {path}")
+            continue
+
+        abs_path = os.path.abspath(path)
+        basename = os.path.basename(abs_path)
+        files = get_all_files(abs_path)
+
+        tmp_results = []
+
+        for file in files:
+            checksum, size = compute_sha256_and_length(file)
+
+            if os.path.isdir(path):
+                rel_file = os.path.relpath(file, abs_path)
+                # Normalize to POSIX for WebDAV/URLs
+                rel_file = rel_file.replace(os.sep, "/")
+                remote_webdav_path = posixpath.join(remote_path, basename, rel_file)
+            else:
+                remote_webdav_path = posixpath.join(remote_path, os.path.basename(file))
+
+            # Preserve scheme/host and percent-encode path segments
+            url = urljoin(webdav_url.rstrip("/") + "/", quote(remote_webdav_path.lstrip("/"), safe="/"))
+
+            filename = os.path.basename(file)
+            tmp_results.append({
+                "filename": filename,
+                "checksum": checksum,
+                "size": size,
+                "url": url,
+            })
+
+        dest_subpath = posixpath.join(remote_path.lstrip("/"), basename)
+        if os.path.isdir(path):
+            destination = f"{remote_name}:{dest_subpath}"
+            command = ["rclone", "copy", abs_path, destination, "--progress"]
+        else:
+            destination = f"{remote_name}:{dest_subpath}"
+            command = ["rclone", "copyto", abs_path, destination, "--progress"]
+
+        print(f"Upload: {path} → {destination}")
+        try:
+            subprocess.run(command, check=True)
+            result.extend(tmp_results)
+            print("✅ Uploaded successfully.\n")
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Error uploading {path}: {e}\n")
+        except FileNotFoundError:
+            print("❌ rclone not found on PATH. Install rclone and retry.")
+
+    return result
diff --git a/poetry.lock b/poetry.lock
index c5b6e69..b4b80af 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
 
 [[package]]
 name = "black"
@@ -442,7 +442,7 @@ description = "Backported and Experimental Type Hints for Python 3.8+"
 optional = false
 python-versions = ">=3.8"
 groups = ["dev"]
-markers = "python_version == \"3.9\""
+markers = "python_version < \"3.10\""
 files = [
     {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"},
     {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"},
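
Finally, an end-to-end sketch of the new upload-and-deploy path: `upload_to_nextcloud` hashes each file, uploads it with rclone, and returns the metadata entries that `deploy_from_metadata` consumes. The remote name, WebDAV URL, paths, and credentials are placeholder assumptions; an already-configured rclone remote is required.

```python
# Sketch of Mode 3 (upload & deploy) driven from Python; all values are placeholders.
from nextcloudclient.upload import upload_to_nextcloud
from databusclient.client import deploy_from_metadata

metadata = upload_to_nextcloud(
    source_paths=["./data"],                                   # local files/folders to upload
    remote_name="nextcloud",                                   # rclone remote (assumed configured)
    remote_path="datasets/mydata",                             # target folder on the remote
    webdav_url="https://cloud.example.org/remote.php/webdav",  # WebDAV base URL (placeholder)
)

deploy_from_metadata(
    metadata,
    version_id="https://databus.dbpedia.org/account/group/artifact/2024.01.01",
    title="Example dataset",
    abstract="One-line summary.",
    description="Longer description.",
    license_url="https://creativecommons.org/licenses/by/4.0/",
    apikey="YOUR_API_KEY",
)
```
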