-
Notifications
You must be signed in to change notification settings - Fork 4
/
apply_ingest_id.py
76 lines (61 loc) · 2.87 KB
/
apply_ingest_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from __future__ import annotations
from copy import deepcopy
from pathlib import Path
from typing import cast
import pandas as pd
from lxml import etree
from dsp_tools.commands.ingest_xmlupload.user_information import IngestInformation
from dsp_tools.models.exceptions import InputError
from dsp_tools.utils.create_logger import get_logger
# Module-level logger, named after this module via __name__.
logger = get_logger(__name__)
def get_mapping_dict_from_file(shortcode: str) -> dict[str, str]:
    """
    Load the mapping CSV of a project and turn it into a lookup dictionary.

    The file is looked up under the relative path "mapping-<shortcode>.csv".
    Its columns "original" and "derivative" are paired row by row.
    A short notice naming the file is printed and logged.

    Args:
        shortcode: Shortcode of the project

    Returns:
        dictionary mapping each value of the "original" column to the value of the "derivative" column
        (the identifier from dsp-ingest) of the same row

    Raises:
        InputError: if the mapping CSV file does not exist
    """
    mapping_csv = Path(f"mapping-{shortcode}.csv")
    if mapping_csv.is_file():
        mapping_df = pd.read_csv(mapping_csv)
    else:
        raise InputError(f"No mapping CSV file was found at {mapping_csv}.")

    # The same notice goes to the terminal and to the log file.
    notice = f"The file '{mapping_csv}' is used to map the internal original filepaths to the internal SIPI image IDs."
    print(notice)
    logger.info(notice)

    originals = mapping_df["original"].tolist()
    derivatives = mapping_df["derivative"].tolist()
    return dict(zip(originals, derivatives))
def replace_filepath_with_sipi_id(
    xml_tree: etree._ElementTree[etree._Element],
    orig_path_2_id_filename: dict[str, str],
) -> tuple[etree._ElementTree[etree._Element], IngestInformation]:
    """
    Replace the original filepaths in the <bitstream> tags by the id filenames of the uploaded files.

    The input tree is not modified; all replacements are made on a deep copy.
    Each filepath is looked up in the mapping as written, then with a lower-cased suffix,
    then with an upper-cased suffix.
    Absolute paths are made relative to the current working directory before the lookup.
    An absolute path that is not located below the current working directory is looked up unchanged,
    so that it is reported as "no id found" instead of aborting the whole replacement.

    Args:
        xml_tree: The parsed original XML tree
        orig_path_2_id_filename: Mapping from original filenames to id filenames from the mapping.csv

    Returns:
        A copy of the XML tree, with the replaced filepaths.
        Information about the mapping entries that were not referenced by any <bitstream> tag,
        and about the files for which no id filename was found
        (as tuples of the "id" attribute of the parent element and the filepath as written in the XML).
    """
    no_id_found: list[tuple[str, str]] = []
    # Only membership is tested on this collection, so a set keeps the final filtering linear.
    used_media_file_paths: set[str] = set()
    new_tree = deepcopy(xml_tree)
    # Restrict the iteration to real elements: comments and processing instructions
    # have no string tag, and etree.QName() raises a ValueError for them.
    for elem in new_tree.iter(tag=etree.Element):
        if not etree.QName(elem).localname.endswith("bitstream") or not elem.text:
            continue
        img_path = Path(elem.text)
        if img_path.is_absolute():
            try:
                img_path = img_path.relative_to(Path.cwd())
            except ValueError:
                # The file lies outside of the current working directory:
                # keep the absolute path, it is reported below if the mapping does not contain it.
                pass
        img_path_str = str(img_path)
        if img_path_str not in orig_path_2_id_filename:
            img_path_str = str(img_path.with_suffix(img_path.suffix.lower()))
        if img_path_str not in orig_path_2_id_filename:
            img_path_str = str(img_path.with_suffix(img_path.suffix.upper()))
        if img_path_str in orig_path_2_id_filename:
            elem.text = orig_path_2_id_filename[img_path_str]
            used_media_file_paths.add(img_path_str)
        else:
            no_id_found.append((cast("etree._Element", elem.getparent()).attrib["id"], str(elem.text)))
    # Iterating over the mapping keeps the order of the mapping file in the report.
    unused_media_paths = [x for x in orig_path_2_id_filename if x not in used_media_file_paths]
    return new_tree, IngestInformation(unused_mediafiles=unused_media_paths, mediafiles_no_id=no_id_found)