Skip to content

Commit

Permalink
refactor: modularize xmlupload method (DEV-2836) (#574)
Browse files Browse the repository at this point in the history
  • Loading branch information
BalduinLandolt committed Oct 18, 2023
1 parent a41b174 commit b9588c3
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 233 deletions.
35 changes: 16 additions & 19 deletions pyproject.toml
Expand Up @@ -96,28 +96,25 @@ addopts = ["--import-mode=importlib"]


[tool.mypy]
ignore_missing_imports = true # TODO: deactivate this
ignore_missing_imports = true # TODO: deactivate this
show_column_numbers = true
strict = true
exclude = [
"src/dsp_tools/import_scripts", # TODO: activate this
"src/dsp_tools/models/exceptions.py", # TODO: activate this
"src/dsp_tools/models/group.py", # TODO: activate this
"src/dsp_tools/models/helpers.py ", # TODO: activate this
"src/dsp_tools/models/langstring.py", # TODO: activate this
"src/dsp_tools/models/listnode.py", # TODO: activate this
"src/dsp_tools/models/model.py", # TODO: activate this
"src/dsp_tools/models/ontology.py", # TODO: activate this
"src/dsp_tools/models/permission.py", # TODO: activate this
"src/dsp_tools/models/project.py", # TODO: activate this
"src/dsp_tools/models/projectContext.py", # TODO: activate this
"src/dsp_tools/models/propertyclass.py", # TODO: activate this
"src/dsp_tools/models/propertyelement.py", # TODO: activate this
"src/dsp_tools/models/resource.py", # TODO: activate this
"src/dsp_tools/models/resourceclass.py", # TODO: activate this
"src/dsp_tools/models/user.py", # TODO: activate this
"src/dsp_tools/models/value.py", # TODO: activate this
"src/dsp_tools/models/xmlresource.py", # TODO: activate this
"src/dsp_tools/import_scripts", # TODO: activate this
"src/dsp_tools/models/group.py", # TODO: activate this
"src/dsp_tools/models/helpers.py ", # TODO: activate this
"src/dsp_tools/models/langstring.py", # TODO: activate this
"src/dsp_tools/models/listnode.py", # TODO: activate this
"src/dsp_tools/models/ontology.py", # TODO: activate this
"src/dsp_tools/models/permission.py", # TODO: activate this
"src/dsp_tools/models/project.py", # TODO: activate this
"src/dsp_tools/models/projectContext.py", # TODO: activate this
"src/dsp_tools/models/propertyclass.py", # TODO: activate this
"src/dsp_tools/models/resource.py", # TODO: activate this
"src/dsp_tools/models/resourceclass.py", # TODO: activate this
"src/dsp_tools/models/user.py", # TODO: activate this
"src/dsp_tools/models/value.py", # TODO: activate this
"src/dsp_tools/models/xmlresource.py", # TODO: activate this
]


Expand Down
6 changes: 1 addition & 5 deletions src/dsp_tools/cli.py
Expand Up @@ -465,11 +465,7 @@ def _call_requested_action(args: argparse.Namespace) -> bool:
password=args.password,
imgdir=args.imgdir,
sipi=args.sipi_url,
config=UploadConfig(
verbose=args.verbose,
dump=args.dump,
save_metrics=args.metrics,
),
config=UploadConfig(verbose=args.verbose, dump=args.dump),
)
elif args.action == "process-files":
success = process_files(
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/models/xmlresource.py
Expand Up @@ -174,7 +174,7 @@ def get_propvals(
prop_data[prop.name] = vals if len(vals) > 1 else vals[0]
return prop_data

def get_bitstream_information_from_sipi(
def get_bitstream_information(
self, internal_file_name_bitstream: str, permissions_lookup: dict[str, Permissions]
) -> Optional[dict[str, Union[str, Permissions]]]:
"""
Expand Down
126 changes: 49 additions & 77 deletions src/dsp_tools/utils/xmlupload/resource_multimedia.py
@@ -1,117 +1,89 @@
from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Any, Optional

from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.permission import Permissions
from dsp_tools.models.sipi import Sipi
from dsp_tools.models.xmlbitstream import XMLBitstream
from dsp_tools.models.xmlresource import XMLResource
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.shared import try_network_action
from dsp_tools.utils.xmlupload.write_diagnostic_info import MetricRecord

logger = get_logger(__name__)


def calculate_multimedia_file_size(
    resources: list[XMLResource],
    imgdir: str,
    preprocessing_done: bool,
) -> tuple[list[float], float | int]:
    """
    Determine the size of the bitstream file of every resource, and the total size of all these files.

    A resource without a bitstream contributes 0.0.
    If preprocessing_done is True, no file is inspected at all, and every entry is 0.0.
    If the total is greater than 0, it is printed and logged.

    Args:
        resources: resources whose bitstream files should be measured
        imgdir: directory that the bitstream paths are joined onto
        preprocessing_done: if True, the files are not measured

    Returns:
        A list with one size per resource (file size in bytes divided by 1000000), in input order
        The total of these sizes, rounded to one decimal place (0.0 if the total is not greater than 0)
    """
    sizes_mb: list[float] = []
    for res in resources:
        # nothing to measure: the resource has no file, or the files are not inspected at all
        if not res.bitstream or preprocessing_done:
            sizes_mb.append(0.0)
            continue
        file_path = Path(imgdir) / Path(res.bitstream.value)
        sizes_mb.append(file_path.stat().st_size / 1000000)

    total_mb = sum(sizes_mb)
    if not total_mb > 0:
        # always hand back a float, so that the return type stays predictable
        return sizes_mb, 0.0

    rounded_total_mb = round(total_mb, 1)
    size_msg = f"This xmlupload contains multimedia files with a total size of {rounded_total_mb} MB."
    print(size_msg)
    logger.info(size_msg)
    return sizes_mb, rounded_total_mb


def get_sipi_multimedia_information(
def _upload_bitstream(
resource: XMLResource,
sipi_server: Sipi,
imgdir: str,
filesize: float,
permissions_lookup: dict[str, Permissions],
metrics: list[MetricRecord],
preprocessing_done: bool,
) -> dict[str, str | Permissions] | None:
"""
This function takes a resource with a corresponding bitstream filepath.
If the pre-processing is not done, it retrieves the file from the directory and uploads it to sipi.
If pre-processing is done it retrieves the bitstream information from sipi.
This function uploads a specified bitstream file to SIPI and then returns the file information from SIPI.
Args:
resource: resource that has a bitstream
sipi_server: server to upload
imgdir: directory of the file
filesize: size of the file
permissions_lookup: dictionary that contains the permission name as string and the corresponding Python object
metrics: to store metric information in
preprocessing_done: If True, then no upload is necessary
Returns:
The information from sipi which is needed to establish a link from the resource
"""
if preprocessing_done:
resource_bitstream = resource.get_bitstream_information_from_sipi(
internal_file_name_bitstream=resource.bitstream.value, # type: ignore[union-attr]
permissions_lookup=permissions_lookup,
)
else:
resource_bitstream = _upload_multimedia_to_sipi(
resource=resource,
sipi_server=sipi_server,
imgdir=imgdir,
filesize=filesize,
permissions_lookup=permissions_lookup,
metrics=metrics,
)
return resource_bitstream


def _upload_multimedia_to_sipi(
resource: XMLResource,
sipi_server: Sipi,
imgdir: str,
filesize: float,
permissions_lookup: dict[str, Permissions],
metrics: list[MetricRecord],
) -> dict[str, str | Permissions] | None:
pth = resource.bitstream.value # type: ignore[union-attr]
bitstream_start = datetime.now()
filetype = Path(pth).suffix[1:]
img: Optional[dict[Any, Any]] = try_network_action(
sipi_server.upload_bitstream,
filepath=str(Path(imgdir) / Path(pth)),
)
bitstream_duration = datetime.now() - bitstream_start
bitstream_duration_ms = bitstream_duration.seconds * 1000 + int(bitstream_duration.microseconds / 1000)
mb_per_sec = round((filesize / bitstream_duration_ms) * 1000, 1)
metrics.append(MetricRecord(resource.id, filetype, filesize, "bitstream upload", bitstream_duration_ms, mb_per_sec))
internal_file_name_bitstream = img["uploadedFiles"][0]["internalFilename"] # type: ignore[index]
resource_bitstream = resource.get_bitstream_information_from_sipi(
resource_bitstream = resource.get_bitstream_information(
internal_file_name_bitstream=internal_file_name_bitstream,
permissions_lookup=permissions_lookup,
)
return resource_bitstream


def handle_bitstream(
    resource: XMLResource,
    bitstream: XMLBitstream,
    preprocessing_done: bool,
    permissions_lookup: dict[str, Permissions],
    sipi_server: Sipi,
    imgdir: str,
) -> dict[str, Any] | None:
    """
    Obtain the bitstream information of a resource, uploading the file first if necessary.

    If preprocessing_done is True, the information is retrieved directly from the resource,
    using the value of the bitstream as internal file name.
    Otherwise, the work is delegated to _upload_bitstream().
    A BaseError raised on the way is not propagated:
    a warning is printed and logged, and None is returned.

    Args:
        resource: resource holding the bitstream
        bitstream: the bitstream object
        preprocessing_done: whether the preprocessing is done already
        permissions_lookup: dictionary that contains the permission name as string and the corresponding Python object
        sipi_server: server to upload
        imgdir: directory of the file

    Returns:
        The bitstream information, or None if a BaseError occurred
    """
    try:
        if not preprocessing_done:
            bitstream_info = _upload_bitstream(
                resource=resource,
                sipi_server=sipi_server,
                imgdir=imgdir,
                permissions_lookup=permissions_lookup,
            )
        else:
            bitstream_info = resource.get_bitstream_information(bitstream.value, permissions_lookup)
        success_msg = f"Uploaded file '{bitstream.value}'"
        print(success_msg)
        logger.info(success_msg)
        return bitstream_info
    except BaseError as err:
        # prefer the original message of the API, if there is one
        api_or_local_msg = err.orig_err_msg_from_api or err.message
        failure_msg = f"Unable to upload file '{bitstream.value}' of resource '{resource.label}' ({resource.id})"
        print(f"WARNING: {failure_msg}: {api_or_local_msg}")
        logger.warning(failure_msg, exc_info=True)
        return None
1 change: 0 additions & 1 deletion src/dsp_tools/utils/xmlupload/upload_config.py
Expand Up @@ -40,7 +40,6 @@ class UploadConfig:

verbose: bool = False
dump: bool = False
save_metrics: bool = False
preprocessing_done: bool = False
server_as_foldername: str = field(default="unknown")
save_location: Path = field(default=Path.home() / ".dsp-tools" / "xmluploads")
Expand Down
28 changes: 0 additions & 28 deletions src/dsp_tools/utils/xmlupload/write_diagnostic_info.py
@@ -1,19 +1,12 @@
from __future__ import annotations

import json
import os
from collections import namedtuple
from pathlib import Path
from typing import Any

import pandas as pd
from lxml import etree

from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.xmlupload.upload_config import UploadConfig

MetricRecord = namedtuple("MetricRecord", ["res_id", "filetype", "filesize_mb", "event", "duration_ms", "mb_per_sec"])


logger = get_logger(__name__)

Expand All @@ -33,24 +26,3 @@ def write_id2iri_mapping(
json.dump(id2iri_mapping, f, ensure_ascii=False, indent=4)
print(f"The mapping of internal IDs to IRIs was written to {id2iri_filename}")
logger.info(f"The mapping of internal IDs to IRIs was written to {id2iri_filename}")


def write_metrics(
    metrics: list[MetricRecord],
    input_file: str | Path | etree._ElementTree[Any],
    config: UploadConfig,
) -> None:
    """
    Write the metrics as CSV file into the folder "metrics" (created if necessary),
    and print the sum of all recorded durations.

    The file name consists of the timestamp and the server name of the config.
    If the input file is given as string or Path, its stem is appended to the file name.

    Args:
        metrics: the records to write, one row per record
        input_file: the input of the xmlupload, only used to build the file name
        config: provides timestamp_str and server_as_foldername for the file name
    """
    name_base = f"{config.timestamp_str}_metrics_{config.server_as_foldername}"
    if isinstance(input_file, (str, Path)):
        metrics_filename = f"{name_base}_{Path(input_file).stem}.csv"
    else:
        # an already parsed tree carries no file name that could be used
        metrics_filename = f"{name_base}.csv"

    # write files and print info
    os.makedirs("metrics", exist_ok=True)
    target = f"metrics/{metrics_filename}"
    pd.DataFrame(metrics).to_csv(target)
    total_ms = sum(int(record.duration_ms) for record in metrics)
    print(f"Total time of xmlupload: {total_ms / 1000:.1f} seconds")

0 comments on commit b9588c3

Please sign in to comment.