From 3f73738d16ee2292324568757dad094030c17d4c Mon Sep 17 00:00:00 2001 From: Theo Date: Thu, 3 Jul 2025 15:34:45 +0200 Subject: [PATCH 01/39] added deploy script with uploading to given rclone remote --- README.md | 21 +++++++++ databusclient/client.py | 2 +- databusclient/deploy.py | 92 +++++++++++++++++++++++++++++++++++++++ nextcloudclient/upload.py | 69 +++++++++++++++++++++++++++++ poetry.lock | 52 ++++++++++++++++++++-- pyproject.toml | 1 + test.sh | 8 ++-- 7 files changed, 236 insertions(+), 9 deletions(-) create mode 100644 databusclient/deploy.py create mode 100644 nextcloudclient/upload.py diff --git a/README.md b/README.md index cac4401..9fb58e0 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,27 @@ python3 -m pip install databusclient ``` +## Upload to Nextcloud and Deploy to Databus +Please add databus API_KEY to .env file + +The script uploads all given files and all files in the given folders to the given remote. +Then registers them on the databus. +### Example Call +```bash +cd databusclient + +python deploy.py \ +--remote scads-nextcloud \ +--path test \ +--version-id https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-03 \ +--title "Test Dataset" \ +--abstract "This is a short abstract of the test dataset." \ +--description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ +--license https://dalicc.net/licenselibrary/Apache-2.0 \ +/home/theo/Work/SCADS.AI/Projects/CSVTest/newtestoutputfolder \ +/home/theo/Work/SCADS.AI/Projects/CSVTest/output.csv.bz2 + +``` ## CLI Usage ```bash databusclient --help diff --git a/databusclient/client.py b/databusclient/client.py index 5cb5061..e135672 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -342,7 +342,7 @@ def append_to_dataset_graph_if_existent(add_key: str, add_value: str): graphs.append(dataset_graph) dataset = { - "@context": "https://downloads.dbpedia.org/databus/context.jsonld", + "@context": "https://databus.dbpedia.org/res/context.jsonld", "@graph": graphs, } return dataset diff --git a/databusclient/deploy.py b/databusclient/deploy.py new file mode 100644 index 0000000..75fd3d0 --- /dev/null +++ b/databusclient/deploy.py @@ -0,0 +1,92 @@ +import os +import sys +import argparse + +from databusclient import create_distribution, create_dataset, deploy +from dotenv import load_dotenv + +from nextcloudclient.upload import upload_to_nextcloud + + +def deploy_to_databus( + metadata, + version_id, + title, + abstract, + description, + license_url +): + + load_dotenv() + api_key = os.getenv("API_KEY") + if not api_key: + raise ValueError("API_KEY not found in .env") + + distributions = [] + counter=0 + for filename, checksum, size, url in metadata: + + parts = filename.split(".") + if len(parts) == 1: + file_format = "none" + compression = "none" + elif len(parts) == 2: + file_format = parts[-1] + compression = "none" + else: + file_format = parts[-2] + compression = parts[-1] + + distributions.append( + create_distribution( + url=url, + cvs={"count":f"{counter}"}, + file_format=file_format, + compression=compression, + sha256_length_tuple=(checksum, size) + ) + ) + counter+=1 + + dataset = create_dataset( + version_id=version_id, + title=title, + abstract=abstract, + description=description, + license_url=license_url, + distributions=distributions + ) + + deploy(dataset, api_key) + metadata_string = ",\n".join([entry[-1] for entry in metadata]) + + print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + +def parse_args(): + parser 
= argparse.ArgumentParser(description="Upload files to Nextcloud and deploy to DBpedia Databus.") + + parser.add_argument("files", nargs="+", help="Path(s) to file(s) or folder(s) to upload") + parser.add_argument("--remote", required=True, help="rclone remote name (e.g., 'nextcloud')") + parser.add_argument("--path", required=True, help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") + parser.add_argument("--version-id", required=True, help="Databus version URI") + parser.add_argument("--title", required=True, help="Title of the dataset") + parser.add_argument("--abstract", required=True, help="Short abstract of the dataset") + parser.add_argument("--description", required=True, help="Detailed description of the dataset") + parser.add_argument("--license", required=True, help="License URL (e.g., https://dalicc.net/licenselibrary/Apache-2.0)") + + return parser.parse_args() + +if __name__ == '__main__': + + args = parse_args() + + metadata = upload_to_nextcloud(args.files, args.remote, args.path) + + deploy_to_databus( + metadata, + version_id=args.version_id, + title=args.title, + abstract=args.abstract, + description=args.description, + license_url=args.license + ) \ No newline at end of file diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py new file mode 100644 index 0000000..b8ac989 --- /dev/null +++ b/nextcloudclient/upload.py @@ -0,0 +1,69 @@ +import hashlib +import os +import subprocess +import posixpath + +BASE_URL = "https://cloud.scadsai.uni-leipzig.de" + +def compute_sha256_and_length(filepath): + sha256 = hashlib.sha256() + total_length = 0 + with open(filepath, 'rb') as f: + while True: + chunk = f.read(4096) + if not chunk: + break + sha256.update(chunk) + total_length += len(chunk) + return sha256.hexdigest(), total_length + +def get_all_files(path): + if os.path.isfile(path): + return [path] + files = [] + for root, _, filenames in os.walk(path): + for name in filenames: + files.append(os.path.join(root, name)) + return files + +def upload_to_nextcloud(source_paths: str, remote_name: str, remote_path: str): + result = [] + for path in source_paths: + if not os.path.exists(path): + print(f"Path not found: {path}") + continue + + abs_path = os.path.abspath(path) + basename = os.path.basename(abs_path) + files = get_all_files(abs_path) + + for file in files: + checksum,size = compute_sha256_and_length(file) + + if os.path.isdir(path): + rel_file = os.path.relpath(file, abs_path) + remote_webdav_path = posixpath.join(remote_path, basename, rel_file) + else: + remote_webdav_path = posixpath.join(remote_path, os.path.basename(file)) + + url = f"{BASE_URL}/remote.php/webdav/{remote_webdav_path}" + + filename = file.split("/")[-1] + result.append((filename, checksum, size, url)) + + if os.path.isdir(path): + destination = f"{remote_name}:{remote_path}/{basename}" + command = ["rclone", "copy", abs_path, destination, "--progress"] + else: + destination = f"{remote_name}:{remote_path}/{basename}" + command = ["rclone", "copyto", abs_path, destination, "--progress"] + + print(f"Upload: {path} → {destination}") + try: + subprocess.run(command, check=True) + print("✅ Uploaded successfully.\n") + except subprocess.CalledProcessError as e: + print(f"❌ Error uploading {path}: {e}\n") + + + return result diff --git a/poetry.lock b/poetry.lock index 6add7d4..2f1b811 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "black" @@ -6,6 +6,7 @@ version = "22.12.0" description = "The uncompromising code formatter." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "black-22.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d"}, {file = "black-22.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351"}, @@ -41,6 +42,7 @@ version = "2024.2.2" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, @@ -52,6 +54,7 @@ version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" +groups = ["main"] files = [ {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, @@ -151,6 +154,7 @@ version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, @@ -165,10 +169,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "exceptiongroup" @@ -176,6 +182,8 @@ version = "1.2.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, @@ -190,6 +198,7 @@ version = "3.6" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, @@ -201,6 +210,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -212,6 +222,7 @@ version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, @@ -226,6 +237,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -237,6 +249,7 @@ version = "23.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, @@ -248,6 +261,7 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -259,6 +273,7 @@ version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, @@ -274,6 +289,7 @@ version = "1.4.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, @@ -289,6 +305,7 @@ version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, @@ -303,6 +320,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -319,12 +337,28 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dotenv" +version = "1.1.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc"}, + {file = "python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "rdflib" version = "7.0.0" description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." optional = false python-versions = ">=3.8.1,<4.0.0" +groups = ["main"] files = [ {file = "rdflib-7.0.0-py3-none-any.whl", hash = "sha256:0438920912a642c866a513de6fe8a0001bd86ef975057d6962c79ce4771687cd"}, {file = "rdflib-7.0.0.tar.gz", hash = "sha256:9995eb8569428059b8c1affd26b25eac510d64f5043d9ce8c84e0d0036e995ae"}, @@ -346,6 +380,7 @@ version = "2.31.0" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, @@ -367,6 +402,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -378,6 +414,7 @@ version = "2.0.0" description = "SPARQL Endpoint interface to Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "SPARQLWrapper-2.0.0-py3-none-any.whl", hash = "sha256:c99a7204fff676ee28e6acef327dc1ff8451c6f7217dcd8d49e8872f324a8a20"}, {file = "SPARQLWrapper-2.0.0.tar.gz", hash = "sha256:3fed3ebcc77617a4a74d2644b86fd88e0f32e7f7003ac7b2b334c026201731f1"}, @@ -398,6 +435,8 @@ version = "2.0.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_full_version < \"3.11.0a7\"" files = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, @@ -409,6 +448,7 @@ version = "4.66.2" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, @@ -429,6 +469,7 @@ version = "0.6.1" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"}, {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, @@ -449,6 +490,8 @@ version = "4.9.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.10\"" files = [ {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, @@ -460,18 +503,19 @@ version = "2.2.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"}, {file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.9" -content-hash = "6380be6ddc03d3f38ddd1f923f18a24ca9fb385753e0f47ef4549ed8dc933f3e" +content-hash = "d39aaed4a4d4c65370e6ae93f46350e193e2eb31caeea80a0b7da92f16339f09" diff --git a/pyproject.toml b/pyproject.toml index 016518e..1a6802c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ typer = "^0.6.1" requests = "^2.28.1" tqdm = "^4.42.1" SPARQLWrapper = "^2.0.0" +python-dotenv = "^1.1.1" [tool.poetry.dev-dependencies] diff --git a/test.sh b/test.sh index 0a4c096..47ceb8e 100755 --- a/test.sh +++ b/test.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash databusclient deploy \ - --versionid "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ + --version-id "https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-02" \ --title "Test Title" \ --abstract "Test Abstract" \ --description "Test Description" \ - --license "http://dalicc.net/licenselibrary/AdaptivePublicLicense10" \ - --apikey "$1" \ - "https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger" + --license-uri "https://dalicc.net/licenselibrary/Apache-2.0" \ + --apikey "f67f582e-bb33-4e82-ba1a-cbaa750be278" \ + "https://raw.githubusercontent.com/dbpedia/databus/68f976e29e2db15472f1b664a6fd5807b88d1370/README.md" From 9edc0dc03131594c3fa9e7223cff06f6e3da2584 Mon Sep 17 00:00:00 2001 From: Theo Date: Fri, 4 Jul 2025 11:55:52 +0200 Subject: [PATCH 02/39] added webdav-url argument --- README.md | 1 + databusclient/deploy.py | 4 ++-- nextcloudclient/upload.py | 6 ++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9fb58e0..bd91dbc 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Then registers them on the databus. 
cd databusclient python deploy.py \ +--webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ --remote scads-nextcloud \ --path test \ --version-id https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-03 \ diff --git a/databusclient/deploy.py b/databusclient/deploy.py index 75fd3d0..474c5ed 100644 --- a/databusclient/deploy.py +++ b/databusclient/deploy.py @@ -64,8 +64,8 @@ def deploy_to_databus( def parse_args(): parser = argparse.ArgumentParser(description="Upload files to Nextcloud and deploy to DBpedia Databus.") - parser.add_argument("files", nargs="+", help="Path(s) to file(s) or folder(s) to upload") + parser.add_argument("--webdav-url", required=True, help="URL to webdav cloud(e.g., 'https://cloud.scadsai.uni-leipzig.de/remote.php/webdav')") parser.add_argument("--remote", required=True, help="rclone remote name (e.g., 'nextcloud')") parser.add_argument("--path", required=True, help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") parser.add_argument("--version-id", required=True, help="Databus version URI") @@ -80,7 +80,7 @@ def parse_args(): args = parse_args() - metadata = upload_to_nextcloud(args.files, args.remote, args.path) + metadata = upload_to_nextcloud(args.files, args.remote, args.path, args.webdav_url) deploy_to_databus( metadata, diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index b8ac989..18fa8b4 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -3,8 +3,6 @@ import subprocess import posixpath -BASE_URL = "https://cloud.scadsai.uni-leipzig.de" - def compute_sha256_and_length(filepath): sha256 = hashlib.sha256() total_length = 0 @@ -26,7 +24,7 @@ def get_all_files(path): files.append(os.path.join(root, name)) return files -def upload_to_nextcloud(source_paths: str, remote_name: str, remote_path: str): +def upload_to_nextcloud(source_paths: str, remote_name: str, remote_path: str, webdav_url: str): result = [] for path in source_paths: if not os.path.exists(path): @@ -46,7 +44,7 @@ def upload_to_nextcloud(source_paths: str, remote_name: str, remote_path: str): else: remote_webdav_path = posixpath.join(remote_path, os.path.basename(file)) - url = f"{BASE_URL}/remote.php/webdav/{remote_webdav_path}" + url = posixpath.join(webdav_url,remote_webdav_path) filename = file.split("/")[-1] result.append((filename, checksum, size, url)) From a56f01d04a4715bd429776c09f16f28c8d05d47d Mon Sep 17 00:00:00 2001 From: Theo Date: Fri, 25 Jul 2025 12:20:18 +0200 Subject: [PATCH 03/39] added deploying to the databus without upload to nextcloud --- README.md | 20 +++++++++++++++++++ databusclient/client.py | 4 +++- databusclient/deploy.py | 40 ++++++++++++++++++++++++++----------- databusclient/metadata.json | 14 +++++++++++++ 4 files changed, 65 insertions(+), 13 deletions(-) create mode 100644 databusclient/metadata.json diff --git a/README.md b/README.md index bd91dbc..8622442 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,26 @@ python3 -m pip install databusclient ``` +## Deploy to Databus +Please add databus API_KEY to .env file +Use metadata.json file to list all files which should be added to the databus + +The script registers all files on the databus. +### Example Call +```bash +cd databusclient + +python deploy.py \ + --no-upload \ + --metadata ./metadata.json \ + --version-id https://databus.org/user/dataset/version/1.0 \ + --title "Test Dataset" \ + --abstract "This is a short abstract of the test dataset." 
\ + --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ + --license https://dalicc.net/licenselibrary/Apache-2.0 + +``` + ## Upload to Nextcloud and Deploy to Databus Please add databus API_KEY to .env file diff --git a/databusclient/client.py b/databusclient/client.py index e135672..d425bee 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -480,7 +480,9 @@ def download( query = __handle_databus_collection__(endpoint,databusURI) res = __handle__databus_file_query__(endpoint, query) else: - print("dataId not supported yet") #TODO add support for other DatabusIds here (artifact, group, etc.) + query = __handle_databus_collection__(endpoint,databusURI) + res = __handle__databus_file_query__(endpoint, query) + __download_list__(res,localDir) # query in local file elif databusURI.startswith("file://"): print("query in file not supported yet") diff --git a/databusclient/deploy.py b/databusclient/deploy.py index 474c5ed..a77a671 100644 --- a/databusclient/deploy.py +++ b/databusclient/deploy.py @@ -1,6 +1,7 @@ import os import sys import argparse +import json from databusclient import create_distribution, create_dataset, deploy from dotenv import load_dotenv @@ -16,16 +17,14 @@ def deploy_to_databus( description, license_url ): - load_dotenv() api_key = os.getenv("API_KEY") if not api_key: raise ValueError("API_KEY not found in .env") distributions = [] - counter=0 + counter = 0 for filename, checksum, size, url in metadata: - parts = filename.split(".") if len(parts) == 1: file_format = "none" @@ -40,13 +39,13 @@ def deploy_to_databus( distributions.append( create_distribution( url=url, - cvs={"count":f"{counter}"}, + cvs={"count": f"{counter}"}, file_format=file_format, compression=compression, sha256_length_tuple=(checksum, size) ) ) - counter+=1 + counter += 1 dataset = create_dataset( version_id=version_id, @@ -62,12 +61,16 @@ def deploy_to_databus( print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + def parse_args(): parser = argparse.ArgumentParser(description="Upload files to Nextcloud and deploy to DBpedia Databus.") - parser.add_argument("files", nargs="+", help="Path(s) to file(s) or folder(s) to upload") - parser.add_argument("--webdav-url", required=True, help="URL to webdav cloud(e.g., 'https://cloud.scadsai.uni-leipzig.de/remote.php/webdav')") - parser.add_argument("--remote", required=True, help="rclone remote name (e.g., 'nextcloud')") - parser.add_argument("--path", required=True, help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") + parser.add_argument("files", nargs="*", help="Path(s) to file(s) or folder(s) to upload") + parser.add_argument("--webdav-url", help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)") + parser.add_argument("--remote", help="rclone remote name (e.g., 'nextcloud')") + parser.add_argument("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") + parser.add_argument("--no-upload", action="store_true", help="Skip file upload and use existing metadata") + parser.add_argument("--metadata", help="Path to metadata JSON file (required if --no-upload is used)") + parser.add_argument("--version-id", required=True, help="Databus version URI") parser.add_argument("--title", required=True, help="Title of the dataset") parser.add_argument("--abstract", required=True, help="Short abstract of the dataset") @@ -76,11 +79,24 @@ def parse_args(): return parser.parse_args() -if __name__ == '__main__': +if __name__ == '__main__': args = 
parse_args() - metadata = upload_to_nextcloud(args.files, args.remote, args.path, args.webdav_url) + if args.no_upload: + if not args.metadata: + print("Error: --metadata is required when using --no-upload") + sys.exit(1) + if not os.path.isfile(args.metadata): + print(f"Error: Metadata file not found: {args.metadata}") + sys.exit(1) + with open(args.metadata, 'r') as f: + metadata = json.load(f) + else: + if not (args.webdav_url and args.remote and args.path): + print("Error: --webdav-url, --remote, and --path are required unless --no-upload is used") + sys.exit(1) + metadata = upload_to_nextcloud(args.files, args.remote, args.path, args.webdav_url) deploy_to_databus( metadata, @@ -89,4 +105,4 @@ def parse_args(): abstract=args.abstract, description=args.description, license_url=args.license - ) \ No newline at end of file + ) diff --git a/databusclient/metadata.json b/databusclient/metadata.json new file mode 100644 index 0000000..a52193c --- /dev/null +++ b/databusclient/metadata.json @@ -0,0 +1,14 @@ +[ + [ + "example.ttl", + "6e340b9cffb37a989ca544e6bb780a2c7e5d7dcb", + 12345, + "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" + ], + [ + "example.csv.gz", + "3f786850e387550fdab836ed7e6dc881de23001b", + 54321, + "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" + ] +] From 800256c111de064632c904abae8181a87d3f5c4c Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 21 Oct 2025 15:21:32 +0200 Subject: [PATCH 04/39] updated pyproject.toml and content-hash --- poetry.lock | 41 ++++------------------------------------- pyproject.toml | 6 +----- 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/poetry.lock b/poetry.lock index 33f8346..f6dc15b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -222,7 +222,7 @@ version = "0.7.2" description = "An ISO 8601 date/time/duration parser and formatter" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version < \"3.11\"" files = [ {file = "isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15"}, @@ -303,7 +303,7 @@ version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, @@ -356,7 +356,7 @@ version = "7.2.1" description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." 
optional = false python-versions = ">=3.8.1" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "rdflib-7.2.1-py3-none-any.whl", hash = "sha256:1a175bc1386a167a42fbfaba003bfa05c164a2a3ca3cb9c0c97f9c9638ca6ac2"}, {file = "rdflib-7.2.1.tar.gz", hash = "sha256:cf9b7fa25234e8925da8b1fb09700f8349b5f0f100e785fb4260e737308292ac"}, @@ -395,18 +395,6 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -groups = ["main"] -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - [[package]] name = "sparqlwrapper" version = "2.0.0" @@ -462,27 +450,6 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] -[[package]] -name = "typer" -version = "0.6.1" -description = "Typer, build great CLIs. Easy to code. Based on Python type hints." -optional = false -python-versions = ">=3.6" -groups = ["main"] -files = [ - {file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"}, - {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, -] - -[package.dependencies] -click = ">=7.1.1,<9.0.0" - -[package.extras] -all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] -dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] -test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] - [[package]] name = "typing-extensions" version = "4.9.0" @@ -517,4 +484,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "d39aaed4a4d4c65370e6ae93f46350e193e2eb31caeea80a0b7da92f16339f09" +content-hash = "b73a850bbe9a7813562d0578d6446707bb86cf863f8f7f36a9060cd6ca017784" diff --git a/pyproject.toml b/pyproject.toml index 7451ca7..24ef4db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,14 +14,10 @@ tqdm = "^4.42.1" SPARQLWrapper = "^2.0.0" python-dotenv = "^1.1.1" - -[tool.poetry.dev-dependencies] -black = "^22.6.0" -rdflib = "^7.2.1" - [tool.poetry.group.dev.dependencies] black = "^22.6.0" pytest = "^7.1.3" +rdflib = "^7.2.1" [tool.poetry.scripts] databusclient = "databusclient.cli:app" From b179f903db030b63ad47b7f393b570735a5c33b0 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 10:59:26 +0100 Subject: [PATCH 05/39] updated README.md --- README.md | 4 ++-- poetry.lock | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2673502..f0f5b2e 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,8 @@ python deploy.py \ --abstract "This is a short abstract of the test dataset." \ --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." 
\ --license https://dalicc.net/licenselibrary/Apache-2.0 \ -/home/theo/Work/SCADS.AI/Projects/CSVTest/newtestoutputfolder \ -/home/theo/Work/SCADS.AI/Projects/CSVTest/output.csv.bz2 +/home/CSVTest/newtestoutputfolder \ +/home/CSVTest/output.csv.bz2 ``` ## CLI Usage diff --git a/poetry.lock b/poetry.lock index c5b6e69..7890f41 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "black" @@ -335,6 +335,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dotenv" +version = "1.2.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61"}, + {file = "python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "rdflib" version = "7.2.1" @@ -442,7 +457,7 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version == \"3.9\"" +markers = "python_version < \"3.10\"" files = [ {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, @@ -469,4 +484,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "6f798ca5bc7629dc0668179934c9889c0d971743c1b162ae1387bd0c5a349d94" +content-hash = "a0d37ff89c254a897734e20b8910f18c2a41f4f54336cd92eec6299e1a3fa787" From 0ce0c24051bdd5d1897d5e2c29f825574478bc10 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:00:21 +0100 Subject: [PATCH 06/39] added checksum validation --- databusclient/deploy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/databusclient/deploy.py b/databusclient/deploy.py index a77a671..95c7e91 100644 --- a/databusclient/deploy.py +++ b/databusclient/deploy.py @@ -8,7 +8,6 @@ from nextcloudclient.upload import upload_to_nextcloud - def deploy_to_databus( metadata, version_id, @@ -25,6 +24,9 @@ def deploy_to_databus( distributions = [] counter = 0 for filename, checksum, size, url in metadata: + # Expect a SHA-256 hex digest (64 chars). Reject others. 
+ if not isinstance(checksum, str) or len(checksum) != 64: + raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") parts = filename.split(".") if len(parts) == 1: file_format = "none" From 6596cbcd62a1e78c46f143e595c1f3e25461e6e0 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:02:21 +0100 Subject: [PATCH 07/39] updated upload_to_nextcloud function to accept list of source_paths --- nextcloudclient/upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index 18fa8b4..a55eddf 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -24,7 +24,7 @@ def get_all_files(path): files.append(os.path.join(root, name)) return files -def upload_to_nextcloud(source_paths: str, remote_name: str, remote_path: str, webdav_url: str): +def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: str, webdav_url: str): result = [] for path in source_paths: if not os.path.exists(path): From b9f98544ba649799f0373adad3bc86f6bc9aa0d1 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:07:38 +0100 Subject: [PATCH 08/39] only add result if upload successful --- nextcloudclient/upload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index a55eddf..72db5ad 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -35,6 +35,8 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: basename = os.path.basename(abs_path) files = get_all_files(abs_path) + tmp_results = [] + for file in files: checksum,size = compute_sha256_and_length(file) @@ -47,7 +49,7 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: url = posixpath.join(webdav_url,remote_webdav_path) filename = file.split("/")[-1] - result.append((filename, checksum, size, url)) + tmp_results.append((filename, checksum, size, url)) if os.path.isdir(path): destination = f"{remote_name}:{remote_path}/{basename}" @@ -59,6 +61,7 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: print(f"Upload: {path} → {destination}") try: subprocess.run(command, check=True) + result.append(tmp_results) print("✅ Uploaded successfully.\n") except subprocess.CalledProcessError as e: print(f"❌ Error uploading {path}: {e}\n") From 2f8493db74ca5f6bba91c5c983a5de295f0c0ebb Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:09:15 +0100 Subject: [PATCH 09/39] use os.path.basename instead of .split("/")[-1] --- nextcloudclient/upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index 72db5ad..9313766 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -48,7 +48,7 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: url = posixpath.join(webdav_url,remote_webdav_path) - filename = file.split("/")[-1] + filename = os.path.basename(file) tmp_results.append((filename, checksum, size, url)) if os.path.isdir(path): From 07359cc8001c5ce260756b84b71b996de3c6f17c Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:21:35 +0100 Subject: [PATCH 10/39] added __init__.py and updated README.md --- README.md | 8 ++------ nextcloudclient/__init__.py | 0 2 files changed, 2 insertions(+), 6 deletions(-) create mode 100644 nextcloudclient/__init__.py diff --git a/README.md b/README.md index 
f0f5b2e..82645fc 100644 --- a/README.md +++ b/README.md @@ -71,9 +71,7 @@ Use metadata.json file to list all files which should be added to the databus The script registers all files on the databus. ### Example Call ```bash -cd databusclient - -python deploy.py \ +python -m databusclient.deploy \ --no-upload \ --metadata ./metadata.json \ --version-id https://databus.org/user/dataset/version/1.0 \ @@ -91,9 +89,7 @@ The script uploads all given files and all files in the given folders to the giv Then registers them on the databus. ### Example Call ```bash -cd databusclient - -python deploy.py \ +python -m databusclient.deploy \ --webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ --remote scads-nextcloud \ --path test \ diff --git a/nextcloudclient/__init__.py b/nextcloudclient/__init__.py new file mode 100644 index 0000000..e69de29 From 80479682a315fe7f631c43a25ad68c315ede905a Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:33:19 +0100 Subject: [PATCH 11/39] changed append to extend (no nested list) --- nextcloudclient/upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index 9313766..dd442d1 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -61,7 +61,7 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: print(f"Upload: {path} → {destination}") try: subprocess.run(command, check=True) - result.append(tmp_results) + result.extend(tmp_results) print("✅ Uploaded successfully.\n") except subprocess.CalledProcessError as e: print(f"❌ Error uploading {path}: {e}\n") From 01724500a742b05ad3321d51cd9e8cad07025d25 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 28 Oct 2025 16:48:05 +0100 Subject: [PATCH 12/39] fixed windows separators and added rclone error message --- nextcloudclient/upload.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index dd442d1..4ec7849 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -2,6 +2,8 @@ import os import subprocess import posixpath +from urllib.parse import urljoin, quote + def compute_sha256_and_length(filepath): sha256 = hashlib.sha256() @@ -42,20 +44,24 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: if os.path.isdir(path): rel_file = os.path.relpath(file, abs_path) + # Normalize to POSIX for WebDAV/URLs + rel_file = rel_file.replace(os.sep, "/") remote_webdav_path = posixpath.join(remote_path, basename, rel_file) else: remote_webdav_path = posixpath.join(remote_path, os.path.basename(file)) - url = posixpath.join(webdav_url,remote_webdav_path) + # Preserve scheme/host and percent-encode path segments + url = urljoin(webdav_url.rstrip("/") + "/", quote(remote_webdav_path.lstrip("/"), safe="/")) filename = os.path.basename(file) tmp_results.append((filename, checksum, size, url)) + dest_subpath = posixpath.join(remote_path.lstrip("/"), basename) if os.path.isdir(path): - destination = f"{remote_name}:{remote_path}/{basename}" + destination = f"{remote_name}:{dest_subpath}" command = ["rclone", "copy", abs_path, destination, "--progress"] else: - destination = f"{remote_name}:{remote_path}/{basename}" + destination = f"{remote_name}:{dest_subpath}" command = ["rclone", "copyto", abs_path, destination, "--progress"] print(f"Upload: {path} → {destination}") @@ -65,6 +71,7 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: 
print("✅ Uploaded successfully.\n") except subprocess.CalledProcessError as e: print(f"❌ Error uploading {path}: {e}\n") - + except FileNotFoundError: + print("❌ rclone not found on PATH. Install rclone and retry.") return result From f957512aa36aee0b861f60cc2c4024e514b76680 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 13:26:18 +0100 Subject: [PATCH 13/39] moved deploy.py to cli upload_and_deploy --- databusclient/cli.py | 111 ++++++++++++++++++++++++++++++++++++++++ databusclient/deploy.py | 110 --------------------------------------- 2 files changed, 111 insertions(+), 110 deletions(-) delete mode 100644 databusclient/deploy.py diff --git a/databusclient/cli.py b/databusclient/cli.py index 8fc3e02..ac42432 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -3,6 +3,7 @@ from typing import List from databusclient import client +from nextcloudclient import upload @click.group() def app(): @@ -57,5 +58,115 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid ) +@app.command(help="Upload files to Nextcloud and deploy to DBpedia Databus.") +@click.option( + "--webdav-url", "webdav_url", + help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)", +) +@click.option( + "--remote", + help="rclone remote name (e.g., 'nextcloud')", +) +@click.option( + "--path", + help="Remote path on Nextcloud (e.g., 'datasets/mydataset')", +) +@click.option( + "--no-upload", "no_upload", + is_flag=True, + help="Skip file upload and use existing metadata", +) +@click.option( + "--metadata", + type=click.Path(exists=True), + help="Path to metadata JSON file (required if --no-upload is used)", +) + +@click.option( + "--version-id", "version_id", + required=True, + help="Target databus version/dataset identifier of the form " + "", +) +@click.option("--title", required=True, help="Dataset title") +@click.option("--abstract", required=True, help="Dataset abstract max 200 chars") +@click.option("--description", required=True, help="Dataset description") +@click.option("--license", "license_url", required=True, help="License (see dalicc.net)") +@click.option("--apikey", required=True, help="API key") + +@click.argument( + "files", + nargs=-1, + type=click.Path(exists=True), +) +def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, title, abstract, description, license_url, apikey, files: List[str]): + """ + Deploy a dataset version with the provided metadata and distributions. + """ + + if no_upload: + if not metadata: + click.echo(click.style("Error: --metadata is required when using --no-upload", fg="red")) + sys.exit(1) + if not os.path.isfile(metadata): + click.echo(click.style(f"Error: Metadata file not found: {metadata}", fg="red")) + sys.exit(1) + with open(metadata, 'r') as f: + metadata = json.load(f) + else: + if not (webdav_url and remote and path): + click.echo(click.style("Error: --webdav-url, --remote, and --path are required unless --no-upload is used", fg="red")) + sys.exit(1) + + click.echo(f"Uploading data to nextcloud: {remote}") + metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) + + + click.echo(f"Creating {len(metadata)} distributions") + distributions = [] + counter = 0 + for filename, checksum, size, url in metadata: + # Expect a SHA-256 hex digest (64 chars). Reject others. 
+ if not isinstance(checksum, str) or len(checksum) != 64: + raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") + parts = filename.split(".") + if len(parts) == 1: + file_format = "none" + compression = "none" + elif len(parts) == 2: + file_format = parts[-1] + compression = "none" + else: + file_format = parts[-2] + compression = parts[-1] + + distributions.append( + create_distribution( + url=url, + cvs={"count": f"{counter}"}, + file_format=file_format, + compression=compression, + sha256_length_tuple=(checksum, size) + ) + ) + counter += 1 + + dataset = create_dataset( + version_id=version_id, + title=title, + abstract=abstract, + description=description, + license_url=license_url, + distributions=distributions + ) + + click.echo(f"Deploying dataset version: {version_id}") + + deploy(dataset, api_key) + metadata_string = ",\n".join([entry[-1] for entry in metadata]) + + click.echo(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + + if __name__ == "__main__": app() diff --git a/databusclient/deploy.py b/databusclient/deploy.py deleted file mode 100644 index 95c7e91..0000000 --- a/databusclient/deploy.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import sys -import argparse -import json - -from databusclient import create_distribution, create_dataset, deploy -from dotenv import load_dotenv - -from nextcloudclient.upload import upload_to_nextcloud - -def deploy_to_databus( - metadata, - version_id, - title, - abstract, - description, - license_url -): - load_dotenv() - api_key = os.getenv("API_KEY") - if not api_key: - raise ValueError("API_KEY not found in .env") - - distributions = [] - counter = 0 - for filename, checksum, size, url in metadata: - # Expect a SHA-256 hex digest (64 chars). Reject others. 
- if not isinstance(checksum, str) or len(checksum) != 64: - raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") - parts = filename.split(".") - if len(parts) == 1: - file_format = "none" - compression = "none" - elif len(parts) == 2: - file_format = parts[-1] - compression = "none" - else: - file_format = parts[-2] - compression = parts[-1] - - distributions.append( - create_distribution( - url=url, - cvs={"count": f"{counter}"}, - file_format=file_format, - compression=compression, - sha256_length_tuple=(checksum, size) - ) - ) - counter += 1 - - dataset = create_dataset( - version_id=version_id, - title=title, - abstract=abstract, - description=description, - license_url=license_url, - distributions=distributions - ) - - deploy(dataset, api_key) - metadata_string = ",\n".join([entry[-1] for entry in metadata]) - - print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") - - -def parse_args(): - parser = argparse.ArgumentParser(description="Upload files to Nextcloud and deploy to DBpedia Databus.") - parser.add_argument("files", nargs="*", help="Path(s) to file(s) or folder(s) to upload") - parser.add_argument("--webdav-url", help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)") - parser.add_argument("--remote", help="rclone remote name (e.g., 'nextcloud')") - parser.add_argument("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") - parser.add_argument("--no-upload", action="store_true", help="Skip file upload and use existing metadata") - parser.add_argument("--metadata", help="Path to metadata JSON file (required if --no-upload is used)") - - parser.add_argument("--version-id", required=True, help="Databus version URI") - parser.add_argument("--title", required=True, help="Title of the dataset") - parser.add_argument("--abstract", required=True, help="Short abstract of the dataset") - parser.add_argument("--description", required=True, help="Detailed description of the dataset") - parser.add_argument("--license", required=True, help="License URL (e.g., https://dalicc.net/licenselibrary/Apache-2.0)") - - return parser.parse_args() - - -if __name__ == '__main__': - args = parse_args() - - if args.no_upload: - if not args.metadata: - print("Error: --metadata is required when using --no-upload") - sys.exit(1) - if not os.path.isfile(args.metadata): - print(f"Error: Metadata file not found: {args.metadata}") - sys.exit(1) - with open(args.metadata, 'r') as f: - metadata = json.load(f) - else: - if not (args.webdav_url and args.remote and args.path): - print("Error: --webdav-url, --remote, and --path are required unless --no-upload is used") - sys.exit(1) - metadata = upload_to_nextcloud(args.files, args.remote, args.path, args.webdav_url) - - deploy_to_databus( - metadata, - version_id=args.version_id, - title=args.title, - abstract=args.abstract, - description=args.description, - license_url=args.license - ) From 607f527d5ffc799f570d23ff43cc3e98966e60a8 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 13:28:33 +0100 Subject: [PATCH 14/39] changed metadata to dict list --- databusclient/cli.py | 6 +++++- databusclient/metadata.json | 24 ++++++++++++------------ nextcloudclient/upload.py | 7 ++++++- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index ac42432..2f0a7ef 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -125,7 +125,11 @@ def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, 
version_id, click.echo(f"Creating {len(metadata)} distributions") distributions = [] counter = 0 - for filename, checksum, size, url in metadata: + for entry in metadata: + filename = entry["filename"] + checksum = entry["checksum"] + size = entry["size"] + url = entry["url"] # Expect a SHA-256 hex digest (64 chars). Reject others. if not isinstance(checksum, str) or len(checksum) != 64: raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") diff --git a/databusclient/metadata.json b/databusclient/metadata.json index a52193c..8c7a5d8 100644 --- a/databusclient/metadata.json +++ b/databusclient/metadata.json @@ -1,14 +1,14 @@ [ - [ - "example.ttl", - "6e340b9cffb37a989ca544e6bb780a2c7e5d7dcb", - 12345, - "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" - ], - [ - "example.csv.gz", - "3f786850e387550fdab836ed7e6dc881de23001b", - 54321, - "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" - ] + { + "filename": "example.ttl", + "checksum": "6e340b9cffb37a989ca544e6bb780a2c7e5d7dcb", + "size": 12345, + "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" + }, + { + "filename": "example.csv.gz", + "checksum": "3f786850e387550fdab836ed7e6dc881de23001b", + "size": 54321, + "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" + } ] diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py index 4ec7849..f0d3328 100644 --- a/nextcloudclient/upload.py +++ b/nextcloudclient/upload.py @@ -54,7 +54,12 @@ def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: url = urljoin(webdav_url.rstrip("/") + "/", quote(remote_webdav_path.lstrip("/"), safe="/")) filename = os.path.basename(file) - tmp_results.append((filename, checksum, size, url)) + tmp_results.append({ + "filename": filename, + "checksum": checksum, + "size": size, + "url": url, + }) dest_subpath = posixpath.join(remote_path.lstrip("/"), basename) if os.path.isdir(path): From 6cb7e11d8728b75ce097c39b7ddc3b89945b1c8c Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 13:31:53 +0100 Subject: [PATCH 15/39] removed python-dotenv --- poetry.lock | 17 +---------------- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7890f41..b4b80af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -335,21 +335,6 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] -[[package]] -name = "python-dotenv" -version = "1.2.1" -description = "Read key-value pairs from a .env file and set them as environment variables" -optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [ - {file = "python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61"}, - {file = "python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6"}, -] - -[package.extras] -cli = ["click (>=5.0)"] - [[package]] name = "rdflib" version = "7.2.1" @@ -484,4 +469,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "a0d37ff89c254a897734e20b8910f18c2a41f4f54336cd92eec6299e1a3fa787" +content-hash = "6f798ca5bc7629dc0668179934c9889c0d971743c1b162ae1387bd0c5a349d94" diff --git 
a/pyproject.toml b/pyproject.toml index a168b2a..0d32ee1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ click = "^8.0.4" requests = "^2.28.1" tqdm = "^4.42.1" SPARQLWrapper = "^2.0.0" -python-dotenv = "^1.1.1" rdflib = "^7.2.1" [tool.poetry.group.dev.dependencies] From 7651c31b266ef308db16e1ba3edd36875698fb50 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 13:40:53 +0100 Subject: [PATCH 16/39] small updates --- databusclient/cli.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 2f0a7ef..856e562 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +import json +import os + import click from typing import List from databusclient import client @@ -58,7 +61,7 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid ) -@app.command(help="Upload files to Nextcloud and deploy to DBpedia Databus.") +@app.command() @click.option( "--webdav-url", "webdav_url", help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)", @@ -101,22 +104,19 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid ) def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, title, abstract, description, license_url, apikey, files: List[str]): """ - Deploy a dataset version with the provided metadata and distributions. + Upload files to Nextcloud and deploy to DBpedia Databus. """ if no_upload: if not metadata: - click.echo(click.style("Error: --metadata is required when using --no-upload", fg="red")) - sys.exit(1) + raise click.ClickException("--metadata is required when using --no-upload") if not os.path.isfile(metadata): - click.echo(click.style(f"Error: Metadata file not found: {metadata}", fg="red")) - sys.exit(1) + raise click.ClickException(f"Error: Metadata file not found: {metadata}") with open(metadata, 'r') as f: metadata = json.load(f) else: if not (webdav_url and remote and path): - click.echo(click.style("Error: --webdav-url, --remote, and --path are required unless --no-upload is used", fg="red")) - sys.exit(1) + raise click.ClickException("Error: --webdav-url, --remote, and --path are required unless --no-upload is used") click.echo(f"Uploading data to nextcloud: {remote}") metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) @@ -145,7 +145,7 @@ def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, compression = parts[-1] distributions.append( - create_distribution( + client.create_distribution( url=url, cvs={"count": f"{counter}"}, file_format=file_format, @@ -155,7 +155,7 @@ def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, ) counter += 1 - dataset = create_dataset( + dataset = client.create_dataset( version_id=version_id, title=title, abstract=abstract, @@ -166,7 +166,7 @@ def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, click.echo(f"Deploying dataset version: {version_id}") - deploy(dataset, api_key) + deploy(dataset, apikey) metadata_string = ",\n".join([entry[-1] for entry in metadata]) click.echo(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") From df17a7c395d6c2c59b0cb1791eb81b5159f7031f Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 14:19:20 +0100 Subject: [PATCH 17/39] refactored upload_and_deploy function --- databusclient/cli.py | 142 +++++++++++++++------------------------- databusclient/client.py | 
53 +++++++++++++++ 2 files changed, 104 insertions(+), 91 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 856e562..77e1bca 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import json -import os import click from typing import List @@ -41,50 +40,50 @@ def deploy(version_id, title, abstract, description, license_url, apikey, distri @app.command() -@click.argument("databusuris", nargs=-1, required=True) -@click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") -@click.option("--databus", help="Databus URL (if not given, inferred from databusuri, e.g. https://databus.dbpedia.org/sparql)") -@click.option("--token", help="Path to Vault refresh token file") -@click.option("--authurl", default="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", show_default=True, help="Keycloak token endpoint URL") -@click.option("--clientid", default="vault-token-exchange", show_default=True, help="Client ID for token exchange") -def download(databusuris: List[str], localdir, databus, token, authurl, clientid): +@click.option( + "--metadata", "metadata_file", + required=True, + type=click.Path(exists=True), + help="Path to metadata JSON file", +) +@click.option( + "--version-id", "version_id", + required=True, + help="Target databus version/dataset identifier of the form " + "", +) +@click.option("--title", required=True, help="Dataset title") +@click.option("--abstract", required=True, help="Dataset abstract max 200 chars") +@click.option("--description", required=True, help="Dataset description") +@click.option("--license", "license_url", required=True, help="License (see dalicc.net)") +@click.option("--apikey", required=True, help="API key") +def deploy_with_metadata(metadata_file, version_id, title, abstract, description, license_url, apikey): """ - Download datasets from databus, optionally using vault access if vault options are provided. + Deploy to DBpedia Databus using metadata json file. 
""" - client.download( - localDir=localdir, - endpoint=databus, - databusURIs=databusuris, - token=token, - auth_url=authurl, - client_id=clientid, - ) + + with open(metadata_file, 'r') as f: + metadata = json.load(f) + + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) @app.command() @click.option( "--webdav-url", "webdav_url", + required=True, help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)", ) @click.option( "--remote", + required=True, help="rclone remote name (e.g., 'nextcloud')", ) @click.option( "--path", + required=True, help="Remote path on Nextcloud (e.g., 'datasets/mydataset')", ) -@click.option( - "--no-upload", "no_upload", - is_flag=True, - help="Skip file upload and use existing metadata", -) -@click.option( - "--metadata", - type=click.Path(exists=True), - help="Path to metadata JSON file (required if --no-upload is used)", -) - @click.option( "--version-id", "version_id", required=True, @@ -96,80 +95,41 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid @click.option("--description", required=True, help="Dataset description") @click.option("--license", "license_url", required=True, help="License (see dalicc.net)") @click.option("--apikey", required=True, help="API key") - @click.argument( "files", nargs=-1, type=click.Path(exists=True), ) -def upload_and_deploy(webdav_url, remote, path, no_upload, metadata, version_id, title, abstract, description, license_url, apikey, files: List[str]): +def upload_and_deploy(webdav_url, remote, path, version_id, title, abstract, description, license_url, apikey, + files: List[str]): """ Upload files to Nextcloud and deploy to DBpedia Databus. """ - if no_upload: - if not metadata: - raise click.ClickException("--metadata is required when using --no-upload") - if not os.path.isfile(metadata): - raise click.ClickException(f"Error: Metadata file not found: {metadata}") - with open(metadata, 'r') as f: - metadata = json.load(f) - else: - if not (webdav_url and remote and path): - raise click.ClickException("Error: --webdav-url, --remote, and --path are required unless --no-upload is used") - - click.echo(f"Uploading data to nextcloud: {remote}") - metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) - - - click.echo(f"Creating {len(metadata)} distributions") - distributions = [] - counter = 0 - for entry in metadata: - filename = entry["filename"] - checksum = entry["checksum"] - size = entry["size"] - url = entry["url"] - # Expect a SHA-256 hex digest (64 chars). Reject others. 
- if not isinstance(checksum, str) or len(checksum) != 64: - raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") - parts = filename.split(".") - if len(parts) == 1: - file_format = "none" - compression = "none" - elif len(parts) == 2: - file_format = parts[-1] - compression = "none" - else: - file_format = parts[-2] - compression = parts[-1] - - distributions.append( - client.create_distribution( - url=url, - cvs={"count": f"{counter}"}, - file_format=file_format, - compression=compression, - sha256_length_tuple=(checksum, size) - ) - ) - counter += 1 - - dataset = client.create_dataset( - version_id=version_id, - title=title, - abstract=abstract, - description=description, - license_url=license_url, - distributions=distributions - ) - - click.echo(f"Deploying dataset version: {version_id}") + click.echo(f"Uploading data to nextcloud: {remote}") + metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) - deploy(dataset, apikey) - metadata_string = ",\n".join([entry[-1] for entry in metadata]) - click.echo(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") +@app.command() +@click.argument("databusuris", nargs=-1, required=True) +@click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") +@click.option("--databus", help="Databus URL (if not given, inferred from databusuri, e.g. https://databus.dbpedia.org/sparql)") +@click.option("--token", help="Path to Vault refresh token file") +@click.option("--authurl", default="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", show_default=True, help="Keycloak token endpoint URL") +@click.option("--clientid", default="vault-token-exchange", show_default=True, help="Client ID for token exchange") +def download(databusuris: List[str], localdir, databus, token, authurl, clientid): + """ + Download datasets from databus, optionally using vault access if vault options are provided. + """ + client.download( + localDir=localdir, + endpoint=databus, + databusURIs=databusuris, + token=token, + auth_url=authurl, + client_id=clientid, + ) if __name__ == "__main__": diff --git a/databusclient/client.py b/databusclient/client.py index bb663ec..8bc5955 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -205,6 +205,40 @@ def create_distribution( return f"{url}|{meta_string}" +def create_distributions_from_metadata(metadata): + distributions = [] + counter = 0 + for entry in metadata: + filename = entry["filename"] + checksum = entry["checksum"] + size = entry["size"] + url = entry["url"] + # Expect a SHA-256 hex digest (64 chars). Reject others. 
+ if not isinstance(checksum, str) or len(checksum) != 64: + raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") + parts = filename.split(".") + if len(parts) == 1: + file_format = "none" + compression = "none" + elif len(parts) == 2: + file_format = parts[-1] + compression = "none" + else: + file_format = parts[-2] + compression = parts[-1] + + distributions.append( + create_distribution( + url=url, + cvs={"count": f"{counter}"}, + file_format=file_format, + compression=compression, + sha256_length_tuple=(checksum, size) + ) + ) + counter += 1 + return distributions + def create_dataset( version_id: str, @@ -393,6 +427,25 @@ def deploy( print(resp.text) +def deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey): + distributions = create_distributions_from_metadata(metadata) + + dataset = create_dataset( + version_id=version_id, + title=title, + abstract=abstract, + description=description, + license_url=license_url, + distributions=distributions + ) + + print(f"Deploying dataset version: {version_id}") + deploy(dataset, apikey) + + metadata_string = ",\n".join([entry[-1] for entry in metadata]) + print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + + def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: """ Download a file from the internet with a progress bar using tqdm. From 7492531d864564fb82a88d44877a91e8399b73bf Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 14:20:32 +0100 Subject: [PATCH 18/39] updated README.md --- README.md | 130 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 82645fc..f9c3197 100644 --- a/README.md +++ b/README.md @@ -64,44 +64,6 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://d A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). See [download section](#usage-of-docker-image) for details. -## Deploy to Databus -Please add databus API_KEY to .env file -Use metadata.json file to list all files which should be added to the databus - -The script registers all files on the databus. -### Example Call -```bash -python -m databusclient.deploy \ - --no-upload \ - --metadata ./metadata.json \ - --version-id https://databus.org/user/dataset/version/1.0 \ - --title "Test Dataset" \ - --abstract "This is a short abstract of the test dataset." \ - --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ - --license https://dalicc.net/licenselibrary/Apache-2.0 - -``` - -## Upload to Nextcloud and Deploy to Databus -Please add databus API_KEY to .env file - -The script uploads all given files and all files in the given folders to the given remote. -Then registers them on the databus. -### Example Call -```bash -python -m databusclient.deploy \ ---webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ ---remote scads-nextcloud \ ---path test \ ---version-id https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-03 \ ---title "Test Dataset" \ ---abstract "This is a short abstract of the test dataset." \ ---description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." 
\ ---license https://dalicc.net/licenselibrary/Apache-2.0 \ -/home/CSVTest/newtestoutputfolder \ -/home/CSVTest/output.csv.bz2 - -``` ## CLI Usage **Installation** @@ -259,6 +221,98 @@ If using vault authentication, make sure the token file is available in the cont docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-snapshots/fusion/2025-08-23/fusion_props=all_subjectns=commons-wikimedia-org_vocab=all.ttl.gz --token vault-token.dat ``` +### Upload-and-deploy command +``` +databusclient upload-and-deploy --help +``` +``` +Usage: databusclient upload-and-deploy [OPTIONS] [FILES]... + + Upload files to Nextcloud and deploy to DBpedia Databus. + +Arguments: + FILES... files in the form of List[path], where every path must exist locally, which will be uploaded and deployed + +Options: + --webdav-url TEXT WebDAV URL (e.g., + https://cloud.example.com/remote.php/webdav) + --remote TEXT rclone remote name (e.g., 'nextcloud') + --path TEXT Remote path on Nextcloud (e.g., 'datasets/mydataset') + --no-upload Skip file upload and use existing metadata + --metadata PATH Path to metadata JSON file (required if --no-upload is + used) + --version-id TEXT Target databus version/dataset identifier of the form [required] + --title TEXT Dataset title [required] + --abstract TEXT Dataset abstract max 200 chars [required] + --description TEXT Dataset description [required] + --license TEXT License (see dalicc.net) [required] + --apikey TEXT API key [required] + --help Show this message and exit. +``` +The script uploads all given files and all files in the given folders to the given remote. +Then registers them on the databus. + + +#### Example of using upload-and-deploy command + +```bash +databusclient upload-and-deploy \ +--webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ +--remote scads-nextcloud \ +--path test \ +--version-id https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-03 \ +--title "Test Dataset" \ +--abstract "This is a short abstract of the test dataset." \ +--description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ +--license https://dalicc.net/licenselibrary/Apache-2.0 \ +--api-key "API-KEY" +/home/CSVTest/newtestoutputfolder \ +/home/CSVTest/output.csv.bz2 +``` + + +### deploy command with metadata +``` +databusclient deploy-with-metadata --help +``` +``` +Usage: databusclient deploy-with-metadata [OPTIONS] + + Deploy to DBpedia Databus using metadata json file. + +Options: + --metadata PATH Path to metadata JSON file [required] + --version-id TEXT Target databus version/dataset identifier of the form [required] + --title TEXT Dataset title [required] + --abstract TEXT Dataset abstract max 200 chars [required] + --description TEXT Dataset description [required] + --license TEXT License (see dalicc.net) [required] + --apikey TEXT API key [required] + --help Show this message and exit. +``` + +Use the metadata.json file to list all files which should be added to the databus. +The script registers all files on the databus. + + +#### Examples of using deploy command + +```bash +databusclient upload-with-metadata \ + --metadata ./metadata.json \ + --version-id https://databus.org/user/dataset/version/1.0 \ + --title "Test Dataset" \ + --abstract "This is a short abstract of the test dataset." \ + --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." 
\ + --license https://dalicc.net/licenselibrary/Apache-2.0 + +``` + + ## Module Usage ### Step 1: Create lists of distributions for the dataset From c985603f27e6d84969b0d00595f7686afcb8243c Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 14:38:21 +0100 Subject: [PATCH 19/39] updated metadata_string for new metadata format --- databusclient/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databusclient/client.py b/databusclient/client.py index 8bc5955..9ce45d5 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -442,7 +442,7 @@ def deploy_from_metadata(metadata, version_id, title, abstract, description, lic print(f"Deploying dataset version: {version_id}") deploy(dataset, apikey) - metadata_string = ",\n".join([entry[-1] for entry in metadata]) + metadata_string = ",\n".join(entry["url"] for entry in metadata) print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") From 62a3611f05a7a54df9ec3d229740b218ad63a712 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 14:40:34 +0100 Subject: [PATCH 20/39] updated README.md --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f9c3197..1f3ca17 100644 --- a/README.md +++ b/README.md @@ -262,14 +262,14 @@ databusclient upload-and-deploy \ --webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ --remote scads-nextcloud \ --path test \ ---version-id https://databus.dbpedia.org/gg46ixav/test_group/test_artifact/2023-07-03 \ +--version-id https://databus.org/user/dataset/version/1.0 \ --title "Test Dataset" \ --abstract "This is a short abstract of the test dataset." \ --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ --license https://dalicc.net/licenselibrary/Apache-2.0 \ ---api-key "API-KEY" -/home/CSVTest/newtestoutputfolder \ -/home/CSVTest/output.csv.bz2 +--apikey "API-KEY" \ +/home/test \ +/home/test_folder/test ``` @@ -295,20 +295,21 @@ Options: --help Show this message and exit. ``` -Use the metadata.json file to list all files which should be added to the databus. +Use the metadata.json file (see [databusclient/metadata.json](databusclient/metadata.json)) to list all files which should be added to the databus. The script registers all files on the databus. #### Examples of using deploy command ```bash -databusclient upload-with-metadata \ - --metadata ./metadata.json \ +databusclient deploy-with-metadata \ + --metadata /home/metadata.json \ --version-id https://databus.org/user/dataset/version/1.0 \ --title "Test Dataset" \ --abstract "This is a short abstract of the test dataset." \ --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." 
\ - --license https://dalicc.net/licenselibrary/Apache-2.0 + --license https://dalicc.net/licenselibrary/Apache-2.0 \ + --apikey "API-KEY" ``` From 22ac02f5510fac43d1a596ed462d46c9264bf3f3 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 14:45:45 +0100 Subject: [PATCH 21/39] updated README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1f3ca17..7305fe5 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,7 @@ If using vault authentication, make sure the token file is available in the cont docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-snapshots/fusion/2025-08-23/fusion_props=all_subjectns=commons-wikimedia-org_vocab=all.ttl.gz --token vault-token.dat ``` + ### Upload-and-deploy command ``` databusclient upload-and-deploy --help @@ -273,7 +274,7 @@ databusclient upload-and-deploy \ ``` -### deploy command with metadata +### deploy-with-metadata command ``` databusclient deploy-with-metadata --help ``` @@ -310,12 +311,10 @@ databusclient deploy-with-metadata \ --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ --license https://dalicc.net/licenselibrary/Apache-2.0 \ --apikey "API-KEY" - ``` ## Module Usage - ### Step 1: Create lists of distributions for the dataset ```python From 3faaf4d02ebad96572f80a55bf708b2e06c39c02 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:03:49 +0100 Subject: [PATCH 22/39] Changed context url back --- databusclient/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databusclient/client.py b/databusclient/client.py index 9ce45d5..72d131c 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -377,7 +377,7 @@ def append_to_dataset_graph_if_existent(add_key: str, add_value: str): graphs.append(dataset_graph) dataset = { - "@context": "https://databus.dbpedia.org/res/context.jsonld", + "@context": "https://downloads.dbpedia.org/databus/context.jsonld", "@graph": graphs, } return dataset From 5dfebe508fa228ff6ed56fcf1482721daa755340 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:04:22 +0100 Subject: [PATCH 23/39] added check for known compressions --- databusclient/client.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index 72d131c..976c283 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -216,6 +216,9 @@ def create_distributions_from_metadata(metadata): # Expect a SHA-256 hex digest (64 chars). Reject others. 
if not isinstance(checksum, str) or len(checksum) != 64: raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") + # Known compression extensions + COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} + parts = filename.split(".") if len(parts) == 1: file_format = "none" @@ -224,8 +227,18 @@ def create_distributions_from_metadata(metadata): file_format = parts[-1] compression = "none" else: - file_format = parts[-2] - compression = parts[-1] + # Check if last part is a known compression + + if parts[-1] in COMPRESSION_EXTS: + compression = parts[-1] + # Handle compound extensions like .tar.gz + if len(parts) > 2 and parts[-2] in COMPRESSION_EXTS: + file_format = parts[-3] if len(parts) > 3 else "file" + else: + file_format = parts[-2] + else: + file_format = parts[-1] + compression = "none" distributions.append( create_distribution( @@ -688,7 +701,7 @@ def __download_list__(urls: List[str], def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: uri = uri.removeprefix("https://").removeprefix("http://") parts = uri.strip("/").split("/") - parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts + parts += [None] * (6 - len(parts)) # pad fwith None if less than 6 parts return tuple(parts[:6]) # return only the first 6 parts From f9367c0194d53992fa55817ab446c3a99a6271a4 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:05:24 +0100 Subject: [PATCH 24/39] updated checksum to sha256 --- databusclient/metadata.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databusclient/metadata.json b/databusclient/metadata.json index 8c7a5d8..64363d2 100644 --- a/databusclient/metadata.json +++ b/databusclient/metadata.json @@ -1,13 +1,13 @@ [ { "filename": "example.ttl", - "checksum": "6e340b9cffb37a989ca544e6bb780a2c7e5d7dcb", + "checksum": "0929436d44bba110fc7578c138ed770ae9f548e195d19c2f00d813cca24b9f39", "size": 12345, "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" }, { "filename": "example.csv.gz", - "checksum": "3f786850e387550fdab836ed7e6dc881de23001b", + "checksum": "2238acdd7cf6bc8d9c9963a9f6014051c754bf8a04aacc5cb10448e2da72c537", "size": 54321, "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" } From 5d474dbfdefd71772b766528f19aa6bfbaa533bc Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:07:37 +0100 Subject: [PATCH 25/39] updated README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7305fe5..d7df5f5 100644 --- a/README.md +++ b/README.md @@ -223,10 +223,10 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://d ### Upload-and-deploy command -``` +```bash databusclient upload-and-deploy --help ``` -``` +```text Usage: databusclient upload-and-deploy [OPTIONS] [FILES]... Upload files to Nextcloud and deploy to DBpedia Databus. @@ -275,10 +275,10 @@ databusclient upload-and-deploy \ ### deploy-with-metadata command -``` +```bash databusclient deploy-with-metadata --help ``` -``` +```text Usage: databusclient deploy-with-metadata [OPTIONS] Deploy to DBpedia Databus using metadata json file. @@ -300,7 +300,7 @@ Use the metadata.json file (see [databusclient/metadata.json](databusclient/meta The script registers all files on the databus. 
-#### Examples of using deploy command +#### Examples of using deploy-with-metadata command ```bash databusclient deploy-with-metadata \ From bef78eff2455f2936c8c3f4bf30a1c3d0d4157a6 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:15:24 +0100 Subject: [PATCH 26/39] size check --- databusclient/client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/databusclient/client.py b/databusclient/client.py index 976c283..a9e3033 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -212,6 +212,8 @@ def create_distributions_from_metadata(metadata): filename = entry["filename"] checksum = entry["checksum"] size = entry["size"] + if not isinstance(size, int) or size <= 0: + raise ValueError(f"Invalid size for {filename}: expected positive integer, got {size}") url = entry["url"] # Expect a SHA-256 hex digest (64 chars). Reject others. if not isinstance(checksum, str) or len(checksum) != 64: From 529f2aeee0dd3df40f48c4f3c030291e7bc6c292 Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:17:16 +0100 Subject: [PATCH 27/39] updated checksum validation --- databusclient/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index a9e3033..79e19ab 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -215,9 +215,10 @@ def create_distributions_from_metadata(metadata): if not isinstance(size, int) or size <= 0: raise ValueError(f"Invalid size for {filename}: expected positive integer, got {size}") url = entry["url"] - # Expect a SHA-256 hex digest (64 chars). Reject others. - if not isinstance(checksum, str) or len(checksum) != 64: - raise ValueError(f"Invalid checksum for {filename}: expected SHA-256 hex (64 chars), got '{checksum}'") + # Validate SHA-256 hex digest (64 hex chars) + if not isinstance(checksum, str) or len(checksum) != 64 or not all( + c in '0123456789abcdefABCDEF' for c in checksum): + raise ValueError(f"Invalid checksum for {filename}") # Known compression extensions COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} From 77dca5a80e1907d2f75ab029713b27f71e843f2d Mon Sep 17 00:00:00 2001 From: Theo Date: Mon, 3 Nov 2025 15:28:02 +0100 Subject: [PATCH 28/39] added doc --- databusclient/client.py | 61 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index 79e19ab..003341b 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -205,10 +205,33 @@ def create_distribution( return f"{url}|{meta_string}" -def create_distributions_from_metadata(metadata): +def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]]]) -> List[str]: + """ + Create distributions from metadata entries. 
+ + Parameters + ---------- + metadata : List[Dict[str, Union[str, int]]] + List of metadata entries, each containing: + - filename: str - Name of the file + - checksum: str - SHA-256 hex digest (64 characters) + - size: int - File size in bytes (positive integer) + - url: str - Download URL for the file + + Returns + ------- + List[str] + List of distribution identifier strings for use with create_dataset + """ distributions = [] counter = 0 for entry in metadata: + # Validate required keys + required_keys = ["filename", "checksum", "size", "url"] + missing_keys = [key for key in required_keys if key not in entry] + if missing_keys: + raise ValueError(f"Metadata entry missing required keys: {missing_keys}") + filename = entry["filename"] checksum = entry["checksum"] size = entry["size"] @@ -443,7 +466,35 @@ def deploy( print(resp.text) -def deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey): +def deploy_from_metadata( + metadata: List[Dict[str, Union[str, int]]], + version_id: str, + title: str, + abstract: str, + description: str, + license_url: str, + apikey: str +) -> None: + """ + Deploy a dataset from metadata entries. + + Parameters + ---------- + metadata : List[Dict[str, Union[str, int]]] + List of file metadata entries (see create_distributions_from_metadata) + version_id : str + Dataset version ID in the form $DATABUS_BASE/$ACCOUNT/$GROUP/$ARTIFACT/$VERSION + title : str + Dataset title + abstract : str + Short description of the dataset + description : str + Long description (Markdown supported) + license_url : str + License URI + apikey : str + API key for authentication + """ distributions = create_distributions_from_metadata(metadata) dataset = create_dataset( @@ -458,8 +509,10 @@ def deploy_from_metadata(metadata, version_id, title, abstract, description, lic print(f"Deploying dataset version: {version_id}") deploy(dataset, apikey) - metadata_string = ",\n".join(entry["url"] for entry in metadata) - print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + print(f"Successfully deployed to {version_id}") + print(f"Deployed {len(metadata)} file(s):") + for entry in metadata: + print(f" - {entry['filename']}") def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: From 02b1873cd6b5954dec3827c4ae3e0f433b8844c5 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:35:52 +0100 Subject: [PATCH 29/39] - refactored deploy, upload_and_deploy and deploy_with_metadata to one single deploy command - added simple validate_distributions function --- databusclient/cli.py | 138 +++++++++++++++++----------------------- databusclient/client.py | 19 ++++++ 2 files changed, 78 insertions(+), 79 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 77e1bca..a8e2e14 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import json +import os +import re import click from typing import List @@ -25,90 +27,68 @@ def app(): @click.option("--description", required=True, help="Dataset description") @click.option("--license", "license_url", required=True, help="License (see dalicc.net)") @click.option("--apikey", required=True, help="API key") -@click.argument( - "distributions", - nargs=-1, - required=True, -) -def deploy(version_id, title, abstract, description, license_url, apikey, distributions: List[str]): - """ - Deploy a dataset version with the provided metadata and distributions. 
- """ - click.echo(f"Deploying dataset version: {version_id}") - dataid = client.create_dataset(version_id, title, abstract, description, license_url, distributions) - client.deploy(dataid=dataid, api_key=apikey) - - -@app.command() -@click.option( - "--metadata", "metadata_file", - required=True, - type=click.Path(exists=True), - help="Path to metadata JSON file", -) -@click.option( - "--version-id", "version_id", - required=True, - help="Target databus version/dataset identifier of the form " - "", -) -@click.option("--title", required=True, help="Dataset title") -@click.option("--abstract", required=True, help="Dataset abstract max 200 chars") -@click.option("--description", required=True, help="Dataset description") -@click.option("--license", "license_url", required=True, help="License (see dalicc.net)") -@click.option("--apikey", required=True, help="API key") -def deploy_with_metadata(metadata_file, version_id, title, abstract, description, license_url, apikey): - """ - Deploy to DBpedia Databus using metadata json file. - """ - - with open(metadata_file, 'r') as f: - metadata = json.load(f) - client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) +@click.option("--metadata", "metadata_file", type=click.Path(exists=True), + help="Path to metadata JSON file (for metadata mode)") +@click.option("--webdav-url", "webdav_url", help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)") +@click.option("--remote", help="rclone remote name (e.g., 'nextcloud')") +@click.option("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") - -@app.command() -@click.option( - "--webdav-url", "webdav_url", - required=True, - help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)", -) -@click.option( - "--remote", - required=True, - help="rclone remote name (e.g., 'nextcloud')", -) -@click.option( - "--path", - required=True, - help="Remote path on Nextcloud (e.g., 'datasets/mydataset')", -) -@click.option( - "--version-id", "version_id", - required=True, - help="Target databus version/dataset identifier of the form " - "", -) -@click.option("--title", required=True, help="Dataset title") -@click.option("--abstract", required=True, help="Dataset abstract max 200 chars") -@click.option("--description", required=True, help="Dataset description") -@click.option("--license", "license_url", required=True, help="License (see dalicc.net)") -@click.option("--apikey", required=True, help="API key") -@click.argument( - "files", - nargs=-1, - type=click.Path(exists=True), -) -def upload_and_deploy(webdav_url, remote, path, version_id, title, abstract, description, license_url, apikey, - files: List[str]): +@click.argument("inputs", nargs=-1) +def deploy(version_id, title, abstract, description, license_url, apikey, + metadata_file, webdav_url, remote, path, inputs: List[str]): """ - Upload files to Nextcloud and deploy to DBpedia Databus. 
+ Flexible deploy to databus command:\n + - Classic dataset deployment\n + - Metadata-based deployment\n + - Upload & deploy via Nextcloud """ - click.echo(f"Uploading data to nextcloud: {remote}") - metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) - client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) + # === Mode 1: Upload & Deploy (Nextcloud) === + if webdav_url and remote and path: + if not inputs: + raise click.UsageError("Please provide files to upload when using WebDAV/Nextcloud mode.") + + #Check that all given paths exist and are files or directories.# + invalid = [f for f in inputs if not os.path.exists(f)] + if invalid: + raise click.UsageError(f"The following input files or folders do not exist: {', '.join(invalid)}") + + click.echo(f"[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") + click.echo(f"→ Uploading to: {remote}:{path}") + metadata = upload.upload_to_nextcloud(inputs, remote, path, webdav_url) + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) + return + + # === Mode 2: Metadata File === + if metadata_file: + click.echo(f"[MODE] Deploy from metadata file: {metadata_file}") + with open(metadata_file, 'r') as f: + metadata = json.load(f) + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) + return + + # === Mode 3: Classic Deploy === + if inputs: + invalid = client.validate_distributions(inputs) + if invalid: + raise click.UsageError( + f"The following distributions are not in a valid format:\n" + + "\n".join(invalid) + + "\nExpected format example:\n" + "https://example.com/file.ttl|format=ttl|gzip|abcdef123456789:12345" + ) + click.echo(f"[MODE] Classic deploy with distributions") + dataid = client.create_dataset(version_id, title, abstract, description, license_url, inputs) + client.deploy(dataid=dataid, api_key=apikey) + return + + raise click.UsageError( + "No valid input provided. 
Please use one of the following modes:\n" + " - Classic deploy: pass distributions as arguments\n" + " - Metadata deploy: use --metadata \n" + " - Upload & deploy: use --webdav-url, --remote, --path, and file arguments" + ) @app.command() diff --git a/databusclient/client.py b/databusclient/client.py index 003341b..b4aa28c 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -279,6 +279,25 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] return distributions +def validate_distributions(distros: List[str]) -> List[str]: + """ + Check that all distributions follow the pattern: + url|key=value|[format]|[compression]|[sha256:len] + + Parameters + ---------- + List[str] + List of distribution identifiers to validate + + Returns + ------- + List[str] + List of invalid distribution identifier strings + """ + pattern = re.compile(r"^https?://[^|]+\|.+$") + return [d for d in distros if not pattern.match(d)] + + def create_dataset( version_id: str, title: str, From 04c0b6ea1e0e10ac241a4c6ec187b335f7f6cf36 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:35:59 +0100 Subject: [PATCH 30/39] updated README.md --- README.md | 179 ++++++++++++++++++++++++------------------------------ 1 file changed, 80 insertions(+), 99 deletions(-) diff --git a/README.md b/README.md index d7df5f5..a44e2c5 100644 --- a/README.md +++ b/README.md @@ -163,13 +163,25 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER databusclient deploy --help ``` ``` -Usage: databusclient deploy [OPTIONS] DISTRIBUTIONS... +Usage: databusclient deploy [OPTIONS] [INPUTS]... -Arguments: - DISTRIBUTIONS... distributions in the form of List[URL|CV|fileext|compression|sha256sum:contentlength] where URL is the - download URL and CV the key=value pairs (_ separted) - content variants of a distribution, fileExt and Compression can be set, if not they are inferred from the path [required] + Flexible deploy to databus command: + + - Classic dataset deployment + + - Metadata-based deployment + - Upload & deploy via Nextcloud + +Arguments: + INPUTS... Depending on mode: + - Classic mode: List of distributions in the form + URL|CV|fileext|compression|sha256sum:contentlength + (where URL is the download URL and CV the key=value pairs, + separated by underscores) + - Upload mode: List of local file or folder paths (must exist) + - Metdata mode: None + Options: --version-id TEXT Target databus version/dataset identifier of the form [required] - --title TEXT Dataset title [required] - --abstract TEXT Dataset abstract max 200 chars [required] - --description TEXT Dataset description [required] - --license TEXT License (see dalicc.net) [required] - --apikey TEXT API key [required] - --help Show this message and exit. -``` -The script uploads all given files and all files in the given folders to the given remote. -Then registers them on the databus. - - -#### Example of using upload-and-deploy command - -```bash -databusclient upload-and-deploy \ ---webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \ ---remote scads-nextcloud \ ---path test \ ---version-id https://databus.org/user/dataset/version/1.0 \ ---title "Test Dataset" \ ---abstract "This is a short abstract of the test dataset." \ ---description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." 
\ ---license https://dalicc.net/licenselibrary/Apache-2.0 \ ---apikey "API-KEY" \ -/home/test \ -/home/test_folder/test -``` - - -### deploy-with-metadata command -```bash -databusclient deploy-with-metadata --help -``` -```text -Usage: databusclient deploy-with-metadata [OPTIONS] - - Deploy to DBpedia Databus using metadata json file. - -Options: - --metadata PATH Path to metadata JSON file [required] - --version-id TEXT Target databus version/dataset identifier of the form [required] - --title TEXT Dataset title [required] - --abstract TEXT Dataset abstract max 200 chars [required] - --description TEXT Dataset description [required] - --license TEXT License (see dalicc.net) [required] - --apikey TEXT API key [required] - --help Show this message and exit. -``` - -Use the metadata.json file (see [databusclient/metadata.json](databusclient/metadata.json)) to list all files which should be added to the databus. -The script registers all files on the databus. - - -#### Examples of using deploy-with-metadata command - -```bash -databusclient deploy-with-metadata \ - --metadata /home/metadata.json \ - --version-id https://databus.org/user/dataset/version/1.0 \ - --title "Test Dataset" \ - --abstract "This is a short abstract of the test dataset." \ - --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \ - --license https://dalicc.net/licenselibrary/Apache-2.0 \ - --apikey "API-KEY" -``` - - ## Module Usage ### Step 1: Create lists of distributions for the dataset From fb93bc97d023dfee0ae5ec44fa5462a4eed9558b Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:49:46 +0100 Subject: [PATCH 31/39] fixed docstring --- databusclient/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databusclient/client.py b/databusclient/client.py index b4aa28c..f153f91 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -286,7 +286,7 @@ def validate_distributions(distros: List[str]) -> List[str]: Parameters ---------- - List[str] + distros: List[str] List of distribution identifiers to validate Returns From 8e6167b7c74b46bebfe058d19a0aee6711171fd7 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:49:58 +0100 Subject: [PATCH 32/39] removed metadata.json --- databusclient/metadata.json | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 databusclient/metadata.json diff --git a/databusclient/metadata.json b/databusclient/metadata.json deleted file mode 100644 index 64363d2..0000000 --- a/databusclient/metadata.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "filename": "example.ttl", - "checksum": "0929436d44bba110fc7578c138ed770ae9f548e195d19c2f00d813cca24b9f39", - "size": 12345, - "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" - }, - { - "filename": "example.csv.gz", - "checksum": "2238acdd7cf6bc8d9c9963a9f6014051c754bf8a04aacc5cb10448e2da72c537", - "size": 54321, - "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" - } -] From 943e30bdf6262e08f2c2fa1014c51509b68d2d35 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:52:02 +0100 Subject: [PATCH 33/39] moved COMPRESSION_EXTS out of loop --- databusclient/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index f153f91..c840814 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -225,6 +225,10 @@ def create_distributions_from_metadata(metadata: List[Dict[str, 
Union[str, int]] """ distributions = [] counter = 0 + + # Known compression extensions + COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} + for entry in metadata: # Validate required keys required_keys = ["filename", "checksum", "size", "url"] @@ -242,8 +246,6 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] if not isinstance(checksum, str) or len(checksum) != 64 or not all( c in '0123456789abcdefABCDEF' for c in checksum): raise ValueError(f"Invalid checksum for {filename}") - # Known compression extensions - COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} parts = filename.split(".") if len(parts) == 1: @@ -254,7 +256,6 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] compression = "none" else: # Check if last part is a known compression - if parts[-1] in COMPRESSION_EXTS: compression = parts[-1] # Handle compound extensions like .tar.gz From 1274cbcf63a7fdfd5de6a37f5f3c0d069fad3a83 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 14:53:11 +0100 Subject: [PATCH 34/39] removed unnecessary f-strings --- databusclient/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index a8e2e14..8a014d1 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -54,7 +54,7 @@ def deploy(version_id, title, abstract, description, license_url, apikey, if invalid: raise click.UsageError(f"The following input files or folders do not exist: {', '.join(invalid)}") - click.echo(f"[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") + click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") click.echo(f"→ Uploading to: {remote}:{path}") metadata = upload.upload_to_nextcloud(inputs, remote, path, webdav_url) client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) @@ -78,7 +78,7 @@ def deploy(version_id, title, abstract, description, license_url, apikey, + "\nExpected format example:\n" "https://example.com/file.ttl|format=ttl|gzip|abcdef123456789:12345" ) - click.echo(f"[MODE] Classic deploy with distributions") + click.echo("[MODE] Classic deploy with distributions") dataid = client.create_dataset(version_id, title, abstract, description, license_url, inputs) client.deploy(dataid=dataid, api_key=apikey) return From 02481b31d37991e773c36df57fa9538f6b02f4cf Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 15:21:28 +0100 Subject: [PATCH 35/39] set file_format and compression to None --- databusclient/client.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index c840814..b724e87 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -226,9 +226,6 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] distributions = [] counter = 0 - # Known compression extensions - COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} - for entry in metadata: # Validate required keys required_keys = ["filename", "checksum", "size", "url"] @@ -247,32 +244,12 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] c in '0123456789abcdefABCDEF' for c in checksum): raise ValueError(f"Invalid checksum for {filename}") - parts = filename.split(".") - if len(parts) == 1: - file_format = "none" - compression = "none" - elif len(parts) == 2: - file_format = parts[-1] - compression = "none" 
- else: - # Check if last part is a known compression - if parts[-1] in COMPRESSION_EXTS: - compression = parts[-1] - # Handle compound extensions like .tar.gz - if len(parts) > 2 and parts[-2] in COMPRESSION_EXTS: - file_format = parts[-3] if len(parts) > 3 else "file" - else: - file_format = parts[-2] - else: - file_format = parts[-1] - compression = "none" - distributions.append( create_distribution( url=url, cvs={"count": f"{counter}"}, - file_format=file_format, - compression=compression, + file_format=None, + compression=None, sha256_length_tuple=(checksum, size) ) ) From a5ec24d65e329be9b614b8a6c8fc552b9cce8773 Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 15:28:41 +0100 Subject: [PATCH 36/39] get file_format and compression from metadata file --- databusclient/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index b724e87..d8f5330 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -248,8 +248,8 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] create_distribution( url=url, cvs={"count": f"{counter}"}, - file_format=None, - compression=None, + file_format=entry.get("file_format"), + compression=entry.get("compression"), sha256_length_tuple=(checksum, size) ) ) From f95155f08ab273fe3525abe415173d659cac39de Mon Sep 17 00:00:00 2001 From: Theo Date: Tue, 4 Nov 2025 15:29:17 +0100 Subject: [PATCH 37/39] updated README.md --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a44e2c5..12af04b 100644 --- a/README.md +++ b/README.md @@ -230,20 +230,21 @@ databusclient deploy \ --license https://dalicc.net/licenselibrary/Apache-2.0 \ --apikey "API-KEY" ``` -Metadata file structure: +Metadata file structure (file_format and compression are optional): ```json [ { - "filename": "example.ttl", "checksum": "0929436d44bba110fc7578c138ed770ae9f548e195d19c2f00d813cca24b9f39", "size": 12345, - "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl" + "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl", + "file_format": "ttl" }, { - "filename": "example.csv.gz", "checksum": "2238acdd7cf6bc8d9c9963a9f6014051c754bf8a04aacc5cb10448e2da72c537", "size": 54321, - "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz" + "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz", + "file_format": "csv", + "compression": "gz" } ] From 274f252cad28c908dd15f9eee9e06a897c54a5fd Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Wed, 5 Nov 2025 11:12:05 +0100 Subject: [PATCH 38/39] chores --- README.md | 10 +++---- databusclient/cli.py | 61 ++++++++++++++++++++--------------------- databusclient/client.py | 23 +--------------- 3 files changed, 36 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 12af04b..0b65641 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER databusclient deploy --help ``` ``` -Usage: databusclient deploy [OPTIONS] [INPUTS]... +Usage: databusclient deploy [OPTIONS] [DISTRIBUTIONS]... Flexible deploy to databus command: @@ -174,7 +174,7 @@ Usage: databusclient deploy [OPTIONS] [INPUTS]... - Upload & deploy via Nextcloud Arguments: - INPUTS... Depending on mode: + DISTRIBUTIONS... 
Depending on mode: - Classic mode: List of distributions in the form URL|CV|fileext|compression|sha256sum:contentlength (where URL is the download URL and CV the key=value pairs, @@ -200,7 +200,7 @@ Options: ``` #### Examples of using deploy command -Mode 1: Classic Deploy (Distributions) +##### Mode 1: Classic Deploy (Distributions) ``` databusclient deploy --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' ``` @@ -215,7 +215,7 @@ A few more notes for CLI usage: * If other parameters are used, you need to leave them empty like `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml||yml|7a751b6dd5eb8d73d97793c3c564c71ab7b565fa4ba619e4a8fd05a6f80ff653:367116` -Mode 2: Deploy with Metadata File +##### Mode 2: Deploy with Metadata File Use a JSON metadata file to define all distributions. The metadata.json should list all distributions and their metadata. @@ -251,7 +251,7 @@ Metadata file structure (file_format and compression are optional): ``` -Mode 3: Upload & Deploy via Nextcloud +##### Mode 3: Upload & Deploy via Nextcloud Upload local files or folders to a WebDAV/Nextcloud instance and automatically deploy to DBpedia Databus. Rclone is required. diff --git a/databusclient/cli.py b/databusclient/cli.py index 8a014d1..0e57db3 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import json import os -import re import click from typing import List @@ -34,30 +33,29 @@ def app(): @click.option("--remote", help="rclone remote name (e.g., 'nextcloud')") @click.option("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") -@click.argument("inputs", nargs=-1) +@click.argument("distributions", nargs=-1) def deploy(version_id, title, abstract, description, license_url, apikey, - metadata_file, webdav_url, remote, path, inputs: List[str]): + metadata_file, webdav_url, remote, path, distributions: List[str]): """ - Flexible deploy to databus command:\n - - Classic dataset deployment\n - - Metadata-based deployment\n - - Upload & deploy via Nextcloud + Flexible deploy to Databus command supporting three modes:\n + - Classic deploy (distributions as arguments)\n + - Metadata-based deploy (--metadata )\n + - Upload & deploy via Nextcloud (--webdav-url, --remote, --path) """ - # === Mode 1: Upload & Deploy (Nextcloud) === - if webdav_url and remote and path: - if not inputs: - raise click.UsageError("Please provide files to upload when using WebDAV/Nextcloud mode.") + # Sanity checks for conflicting options + if metadata_file and any([distributions, webdav_url, remote, path]): + raise click.UsageError("Invalid combination: when using --metadata, do not provide --webdav-url, --remote, --path, or distributions.") + if any([webdav_url, remote, path]) and not all([webdav_url, remote, path]): + raise click.UsageError("Invalid combination: when using WebDAV/Nextcloud mode, please provide --webdav-url, --remote, and --path together.") - #Check that all given paths exist and are files or directories.# - invalid = [f for f in inputs if not os.path.exists(f)] - if invalid: - raise click.UsageError(f"The following input files or folders do not exist: {', '.join(invalid)}") + # === Mode 1: Classic Deploy === + if distributions and not (metadata_file or 
webdav_url or remote or path): + click.echo("[MODE] Classic deploy with distributions") + click.echo(f"Deploying dataset version: {version_id}") - click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") - click.echo(f"→ Uploading to: {remote}:{path}") - metadata = upload.upload_to_nextcloud(inputs, remote, path, webdav_url) - client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) + dataid = client.create_dataset(version_id, title, abstract, description, license_url, distributions) + client.deploy(dataid=dataid, api_key=apikey) return # === Mode 2: Metadata File === @@ -67,20 +65,21 @@ def deploy(version_id, title, abstract, description, license_url, apikey, metadata = json.load(f) client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) return + + # === Mode 3: Upload & Deploy (Nextcloud) === + if webdav_url and remote and path: + if not distributions: + raise click.UsageError("Please provide files to upload when using WebDAV/Nextcloud mode.") - # === Mode 3: Classic Deploy === - if inputs: - invalid = client.validate_distributions(inputs) + #Check that all given paths exist and are files or directories.# + invalid = [f for f in distributions if not os.path.exists(f)] if invalid: - raise click.UsageError( - f"The following distributions are not in a valid format:\n" - + "\n".join(invalid) - + "\nExpected format example:\n" - "https://example.com/file.ttl|format=ttl|gzip|abcdef123456789:12345" - ) - click.echo("[MODE] Classic deploy with distributions") - dataid = client.create_dataset(version_id, title, abstract, description, license_url, inputs) - client.deploy(dataid=dataid, api_key=apikey) + raise click.UsageError(f"The following input files or folders do not exist: {', '.join(invalid)}") + + click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") + click.echo(f"→ Uploading to: {remote}:{path}") + metadata = upload.upload_to_nextcloud(distributions, remote, path, webdav_url) + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) return raise click.UsageError( diff --git a/databusclient/client.py b/databusclient/client.py index d8f5330..5fde766 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -7,7 +7,6 @@ from SPARQLWrapper import SPARQLWrapper, JSON from hashlib import sha256 import os -import re __debug = False @@ -256,26 +255,6 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] counter += 1 return distributions - -def validate_distributions(distros: List[str]) -> List[str]: - """ - Check that all distributions follow the pattern: - url|key=value|[format]|[compression]|[sha256:len] - - Parameters - ---------- - distros: List[str] - List of distribution identifiers to validate - - Returns - ------- - List[str] - List of invalid distribution identifier strings - """ - pattern = re.compile(r"^https?://[^|]+\|.+$") - return [d for d in distros if not pattern.match(d)] - - def create_dataset( version_id: str, title: str, @@ -754,7 +733,7 @@ def __download_list__(urls: List[str], def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: uri = uri.removeprefix("https://").removeprefix("http://") parts = uri.strip("/").split("/") - parts += [None] * (6 - len(parts)) # pad fwith None if less than 6 parts + parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts return 
tuple(parts[:6]) # return only the first 6 parts From f22c71d6c1798ba28f72d48635f97d4786fa688f Mon Sep 17 00:00:00 2001 From: Theo Date: Wed, 5 Nov 2025 12:48:13 +0100 Subject: [PATCH 39/39] updated metadata format (removed filename - used url instead) --- databusclient/client.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/databusclient/client.py b/databusclient/client.py index 5fde766..358f1a6 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -212,10 +212,11 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] ---------- metadata : List[Dict[str, Union[str, int]]] List of metadata entries, each containing: - - filename: str - Name of the file - checksum: str - SHA-256 hex digest (64 characters) - size: int - File size in bytes (positive integer) - url: str - Download URL for the file + - file_format: str - File format of the file [optional] + - compression: str - Compression format of the file [optional] Returns ------- @@ -227,21 +228,20 @@ def create_distributions_from_metadata(metadata: List[Dict[str, Union[str, int]] for entry in metadata: # Validate required keys - required_keys = ["filename", "checksum", "size", "url"] + required_keys = ["checksum", "size", "url"] missing_keys = [key for key in required_keys if key not in entry] if missing_keys: raise ValueError(f"Metadata entry missing required keys: {missing_keys}") - filename = entry["filename"] checksum = entry["checksum"] size = entry["size"] - if not isinstance(size, int) or size <= 0: - raise ValueError(f"Invalid size for {filename}: expected positive integer, got {size}") url = entry["url"] + if not isinstance(size, int) or size <= 0: + raise ValueError(f"Invalid size for {url}: expected positive integer, got {size}") # Validate SHA-256 hex digest (64 hex chars) if not isinstance(checksum, str) or len(checksum) != 64 or not all( c in '0123456789abcdefABCDEF' for c in checksum): - raise ValueError(f"Invalid checksum for {filename}") + raise ValueError(f"Invalid checksum for {url}") distributions.append( create_distribution( @@ -488,7 +488,7 @@ def deploy_from_metadata( print(f"Successfully deployed to {version_id}") print(f"Deployed {len(metadata)} file(s):") for entry in metadata: - print(f" - {entry['filename']}") + print(f" - {entry['url']}") def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None:
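
A minimal end-to-end sketch of the metadata-based deploy path as it stands after PATCH 39/39, assuming `databusclient.client` is importable from the patched working tree; the URL, checksum, version ID, and API key below are placeholders, not values taken from the patches.

```python
# Sketch only: exercises deploy_from_metadata() as defined at the end of this
# series (PATCH 39/39). Every concrete value below is a placeholder.
from databusclient import client

metadata = [
    {
        # Required keys after PATCH 39: checksum (SHA-256 hex digest, 64 chars),
        # size (positive integer, bytes), and url; "filename" is no longer used.
        "checksum": "0929436d44bba110fc7578c138ed770ae9f548e195d19c2f00d813cca24b9f39",
        "size": 12345,
        "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl",
        "file_format": "ttl",   # optional
        # "compression": "gz",  # optional
    },
]

client.deploy_from_metadata(
    metadata,
    version_id="https://databus.dbpedia.org/user/group/artifact/2025-11-05",
    title="Test Dataset",
    abstract="Short abstract of the test dataset.",
    description="Longer description of the test dataset.",
    license_url="https://dalicc.net/licenselibrary/Apache-2.0",
    apikey="API-KEY",
)
```

The same `deploy_from_metadata` path also backs both the `--metadata` mode and the WebDAV/Nextcloud upload mode of the consolidated `deploy` CLI command introduced in PATCH 29/39.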