-
Notifications
You must be signed in to change notification settings - Fork 0
/
find-download-image.py
160 lines (141 loc) · 4.92 KB
/
find-download-image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import argparse
import csv
import json
import sys
from contextlib import nullcontext
from pprint import pformat
from PIL import Image
from lxml import etree
from pathlib import Path
import logging
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Prefix map passed to the XPath calls in this file: binds the prefix "f"
# to the namespace URI below.
ns = {"f": "http://www.faustedition.net/ns"}
def download_config(archives_xml="../data/xml/archives.xml"):
    """Extract the per-repository download rules from archives.xml.

    Args:
        archives_xml: Path or http(s) URL of the archives.xml document.

    Returns:
        A dict mapping the ``id`` attribute of every ``f:archive`` element
        to a plain dict with the attributes of its first ``f:facsimile``
        child. Archives without such a child are mapped to
        ``{"downloadable": "unknown"}``.
    """
    # Function-scope import: only needed when the source is a URL.
    from urllib.request import urlopen

    source = str(archives_xml)
    if source.startswith(("http://", "https://")):
        # Fetch remote documents ourselves instead of handing the URL to
        # lxml: libxml2's built-in loader does not handle https, and the
        # command line default for this argument is an https URL.
        with urlopen(source) as response:
            archives = etree.parse(response)
    else:
        archives = etree.parse(archives_xml)

    result = {}
    for archive in archives.xpath("//f:archive", namespaces=ns):
        archive_id = archive.get("id")
        facsimiles = archive.xpath("f:facsimile", namespaces=ns)
        if facsimiles:
            # Copy into a plain dict so the rules do not keep a reference
            # to the parsed tree and match the shape of the fallback below.
            result[archive_id] = dict(facsimiles[0].attrib)
        else:
            # Logged rather than printed: stdout may carry the CSV output.
            logger.warning("No facsimile element for %s", archive_id)
            result[archive_id] = {"downloadable": "unknown"}
    return result
def find_allowed_facsimile(root: Path, path: str, rules: dict):
    """Pick the facsimile file of one page that the rules allow for download.

    Args:
        root: Directory against which the image file names are resolved.
        path: Image path without the ``_<variant>.jpg`` suffix.
        rules: Download rules of the page's repository (keys read here and
            in ``is_allowed``: ``downloadable``, ``resolution``,
            ``max-width``, ``max-dpi``).

    Returns:
        The file name (``path`` plus suffix, not joined with ``root``) of the
        first acceptable image, or ``None`` if downloading is not permitted,
        no variant satisfies the rules, or an image could not be read.
    """
    if rules.get("downloadable") != "yes":
        logger.debug('downloadable != yes for %s (rules: %s)', path, rules)
        return None
    if rules.get("resolution") == "reduced":
        # The reduced variant is returned without touching the file system.
        logger.debug('reduced resulution for %s', path)
        return f"{path}_2.jpg"

    logger.debug('path=%s, root=%s', path, root)
    try:
        with Image.open(root / f"{path}_0.jpg") as img:
            orig_width, _ = img.size
            # NOTE(review): Pillow's JPEG reader reports pixel density under
            # info["dpi"]; this lookup presumably always falls back to 300
            # for .jpg files -- confirm that this is intended.
            orig_dpi = img.info.get("resolution", 300)
        if is_allowed(orig_width, orig_dpi, rules):
            return f"{path}_0.jpg"

        # Variants 1..8 are tried in ascending order and the first match
        # wins (presumably they get progressively smaller -- confirm).
        for variant in range(1, 9):
            filename = f"{path}_{variant}.jpg"
            with Image.open(root / filename) as img:
                width, _ = img.size
            # Effective resolution scales with the width relative to _0.
            dpi = int(orig_dpi * (width / orig_width))
            if is_allowed(width, dpi, rules):
                return filename
        logger.debug('no variant of %s satisfies the rules', path)
    except IOError:
        # Best effort: a missing or unreadable image yields no download.
        logger.exception('Failed to read image: path=%s, root=%s', path, root)
    return None
def is_allowed(width: int, resolution: int, rules: dict[str, str]) -> bool:
    """Tell whether an image of the given size may be offered for download.

    Args:
        width: Image width in pixels.
        resolution: Image resolution in dpi.
        rules: Download rules; ``downloadable`` must equal ``"yes"``, while
            ``max-width`` and ``max-dpi`` are optional inclusive upper
            bounds given as integer strings.

    Returns:
        ``True`` if downloading is permitted and no bound is exceeded.

    Raises:
        ValueError: If a bound that gets evaluated is not an integer string.
    """
    if rules.get("downloadable") != "yes":
        return False
    if "max-width" in rules and width > int(rules["max-width"]):
        return False
    if "max-dpi" in rules and resolution > int(rules["max-dpi"]):
        return False
    return True
def per_documents_data(metadata_json="../build/www/data/document_metadata.json"):
    """Flatten the document metadata into one record per facsimile image.

    Args:
        metadata_json: Path to the metadata file. For a ``.js`` file the
            first line is skipped before the rest is parsed as JSON
            (presumably a JavaScript wrapper line -- confirm).

    Returns:
        A list of dicts with the keys ``repo``, ``sigil``, ``base``,
        ``page`` (1-based page number) and ``img``, one per entry in the
        ``img`` list of each page's first ``doc`` item.
    """
    md_path = Path(metadata_json)
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with md_path.open(encoding="utf-8") as f:
        if md_path.suffix == ".js":
            f.readline()
        metadata = json.load(f)

    pages = []
    for ms in metadata["metadata"]:
        sigil_t = ms["sigil"]
        repo_id = ms["sigils"]["repository"]
        # Drop the leading path component of the base.
        base = Path(*Path(ms["base"]).parts[1:])
        for page_number, page in enumerate(ms["page"], start=1):
            doc = page["doc"]
            if len(doc) > 1:
                # Only the first doc is used below. Logged rather than
                # printed: stdout may carry the CSV output.
                logger.warning("%6s %s: %s docs", sigil_t, page_number, len(doc))
            if doc and doc[0]:
                pages.extend(
                    {
                        "repo": repo_id,
                        "sigil": sigil_t,
                        "base": base,
                        "page": page_number,
                        "img": img,
                    }
                    for img in doc[0]["img"]
                )
    return pages
def getargparser():
    """Build the command line parser of this script.

    Returns:
        An ``argparse.ArgumentParser`` providing ``archives``,
        ``document_metadata`` (required), ``image_root`` and ``output``.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "-a",
        "--archives",
        metavar="XML",
        help="URL for archives.xml",
        default="https://raw.githubusercontent.com/faustedition/faust-xml/master/xml/archives.xml",
    )
    p.add_argument(
        "-d",
        "--document-metadata",
        metavar="JSON",
        help="Path to document_metadata.js[on]",
        required=True,
    )
    p.add_argument(
        "-i",
        "--image-root",
        metavar="PATH",
        type=Path,
        # Without a default the option is None when omitted, and joining
        # None with an image file name raises TypeError.
        default=Path("."),
        help="root folder for scaled (jpg) facsimiles (default: current directory)",
    )
    p.add_argument(
        "-o",
        "--output",
        metavar="CSV",
        type=argparse.FileType("wt"),
        # The default is sys.stdout wrapped in nullcontext: entering it in a
        # `with` statement yields stdout and leaves it open on exit.
        default=nullcontext(sys.stdout),
        help="CSV file to write (default: standard output)",
    )
    return p
def main():
    """Run the script: compute the downloadable facsimile per image as CSV.

    Reads the download rules and the document metadata, adds a ``download``
    entry to every page record (the allowed file name or ``None``) and
    writes all records as CSV to the configured output.
    """
    logging.basicConfig(level=logging.INFO)
    options = getargparser().parse_args()
    rules = download_config(options.archives)
    if logger.isEnabledFor(logging.DEBUG):
        # pformat is only worth calling when the message will be emitted.
        logger.debug('Rules: %s', pformat(rules))
    page_data = per_documents_data(options.document_metadata)

    # Route log records through tqdm so they do not break the progress bar.
    with logging_redirect_tqdm():
        for page in tqdm(page_data):
            page["download"] = find_allowed_facsimile(
                options.image_root, page["img"], rules.get(page["repo"], {})
            )

    # options.output is a nullcontext around stdout (parser default), stdout
    # itself ("-o -"), or a file opened by argparse. Entering it yields a
    # writable stream in each case; stdout is wrapped so it is never closed,
    # while a real file is closed when the block ends.
    sink = options.output
    output_cm = nullcontext(sink) if sink is sys.stdout else sink
    with output_cm as out:
        if not page_data:
            # The column names are taken from the first record, so there
            # is nothing that could be written for empty metadata.
            logger.warning(
                'No pages found in %s, nothing written', options.document_metadata
            )
            return
        writer = csv.DictWriter(out, fieldnames=list(page_data[0]))
        writer.writeheader()
        writer.writerows(page_data)


if __name__ == "__main__":
    main()