-
Notifications
You must be signed in to change notification settings - Fork 0
/
find-download-image.py
144 lines (127 loc) · 4.23 KB
/
find-download-image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import argparse
import csv
import json
import sys
from contextlib import nullcontext
from PIL import Image
from lxml import etree
from pathlib import Path
from tqdm.auto import tqdm
# Namespace prefix map passed as ``namespaces=`` to the XPath queries below;
# the prefix ``f`` stands for the faustedition namespace.
ns = {"f": "http://www.faustedition.net/ns"}
def download_config(archives_xml="../data/xml/archives.xml"):
    """Extract the per-repository download rules from archives.xml.

    Args:
        archives_xml: File name, open file object or URL of the archives
            file. ``http://`` and ``https://`` URLs are fetched with
            ``urllib`` and the response is handed to lxml, because the
            parser's own URL loading does not cover ``https``.

    Returns:
        A dict that maps the ``id`` attribute of every ``f:archive`` element
        to a plain dict with the attributes of its first ``f:facsimile``
        child. Archives without such a child are mapped to
        ``{"downloadable": "unknown"}``.
    """
    is_url = isinstance(archives_xml, str) and archives_xml.startswith(
        ("http://", "https://")
    )
    if is_url:
        # Local import: only needed when the rules come from a remote source.
        from urllib.request import urlopen

        with urlopen(archives_xml) as response:
            archives = etree.parse(response)
    else:
        archives = etree.parse(archives_xml)

    result = {}
    for archive in archives.xpath("//f:archive", namespaces=ns):
        archive_id = archive.get("id")
        facsimiles = archive.xpath("f:facsimile", namespaces=ns)
        if facsimiles:
            # Copy the attributes so the result does not keep referring to
            # the parsed tree.
            result[archive_id] = dict(facsimiles[0].attrib)
        else:
            # Diagnostics go to stderr so they cannot end up inside CSV
            # output that is written to stdout.
            print(f"No facsimile element for {archive_id}", file=sys.stderr)
            result[archive_id] = {"downloadable": "unknown"}
    return result
def find_allowed_facsimile(root: Path, path: str, rules: dict):
    """Pick the facsimile variant of one page that may be offered for download.

    Args:
        root: Folder that contains the scaled JPEG facsimiles.
        path: Image path relative to ``root`` without the
            ``_<variant>.jpg`` suffix.
        rules: Download rules of the repository that owns the page. The keys
            ``downloadable`` and ``resolution`` are read here; ``max-width``
            and ``max-dpi`` are evaluated by ``is_allowed``.

    Returns:
        The relative file name of the chosen variant, or ``None`` when
        downloading is not permitted or none of the variants 0 to 8
        satisfies the rules. Variants are tried in ascending order and the
        first acceptable one wins.
    """
    if rules.get("downloadable") != "yes":
        return None
    if rules.get("resolution") == "reduced":
        # Repositories that only allow reduced images always get variant 2;
        # no image file is opened in this case.
        return f"{path}_2.jpg"

    with Image.open(root / f"{path}_0.jpg") as img:
        orig_width, _ = img.size
        # The fallback has to be a pair: a bare ``300`` cannot be unpacked
        # and raised TypeError whenever the key was missing.
        # NOTE(review): Pillow's JPEG plugin reports the density under
        # "dpi", so for .jpg files this probably always uses the 300 dpi
        # fallback - confirm which key is intended.
        orig_dpi, _ = img.info.get("resolution", (300, 300))
    if is_allowed(orig_width, orig_dpi, rules):
        return f"{path}_0.jpg"

    for variant in range(1, 9):
        filename = f"{path}_{variant}.jpg"
        with Image.open(root / filename) as img:
            width, _ = img.size
        # Scaled variants carry no resolution of their own here; derive it
        # from the width ratio relative to variant 0.
        dpi = int(orig_dpi * (width / orig_width))
        if is_allowed(width, dpi, rules):
            return filename
    return None
def is_allowed(width: int, resolution: int, rules: dict[str, str]):
    """Tell whether an image of the given size satisfies the download rules.

    An image passes only if ``rules["downloadable"]`` equals ``"yes"`` and
    neither of the optional limits ``max-width`` (compared with ``width``)
    and ``max-dpi`` (compared with ``resolution``) is exceeded. Limits are
    converted with ``int`` before comparing; a limit that is absent from
    ``rules`` is not checked.
    """
    if rules.get("downloadable") != "yes":
        return False

    # Pair every optional limit with the measured value it restricts.
    limits = (("max-width", width), ("max-dpi", resolution))
    return not any(
        key in rules and actual > int(rules[key]) for key, actual in limits
    )
def per_documents_data(metadata_json="../build/www/data/document_metadata.json"):
    """Collect one record per facsimile image from the document metadata.

    Args:
        metadata_json: Path of the metadata file. If the file name ends in
            ``.js``, the first line is skipped and the remainder is parsed
            as JSON.

    Returns:
        A list of dicts with the keys ``repo`` (repository sigil), ``sigil``,
        ``base`` (a ``Path``), ``page`` (1-based page number) and ``img``.
        Only the first ``doc`` entry of a page contributes images; pages
        with more than one entry are reported on stderr.
    """
    md_path = Path(metadata_json)
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with md_path.open(encoding="utf-8") as f:
        if md_path.suffix == ".js":
            f.readline()
        metadata = json.load(f)

    pages = []
    for ms in metadata["metadata"]:
        sigil_t = ms["sigil"]
        repo_id = ms["sigils"]["repository"]
        base = Path(ms["base"])
        for page_number, page in enumerate(ms["page"], start=1):
            doc = page["doc"]
            if len(doc) > 1:
                # Diagnostics go to stderr so they cannot end up inside CSV
                # output that is written to stdout.
                print(
                    f"{sigil_t:>6} {page_number}: {len(doc)} docs",
                    file=sys.stderr,
                )
            if doc and doc[0]:
                pages.extend(
                    {
                        "repo": repo_id,
                        "sigil": sigil_t,
                        "base": base,
                        "page": page_number,
                        "img": img,
                    }
                    for img in doc[0]["img"]
                )
    return pages
def getargparser():
    """Build the command line parser of the script.

    Options: ``-a/--archives`` (defaults to the archives.xml of the
    faust-xml repository), ``-d/--document-metadata`` (required),
    ``-i/--image-root`` (converted to ``Path``) and ``-o/--output`` (opened
    for text writing; defaults to a ``nullcontext`` wrapping ``sys.stdout``).
    """
    option_specs = (
        (
            ("-a", "--archives"),
            {
                "metavar": "XML",
                "help": "URL for archives.xml",
                "default": "https://raw.githubusercontent.com/faustedition/faust-xml/master/xml/archives.xml",
            },
        ),
        (
            ("-d", "--document-metadata"),
            {
                "metavar": "JSON",
                "help": "Path to document_metadata.js[on]",
                "required": True,
            },
        ),
        (
            ("-i", "--image-root"),
            {
                "metavar": "PATH",
                "type": Path,
                "help": "root folder for scaled (jpg) facsimiles",
            },
        ),
        (
            ("-o", "--output"),
            {
                "metavar": "CSV",
                "type": argparse.FileType("wt"),
                "default": nullcontext(sys.stdout),
            },
        ),
    )

    parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    return parser
def main():
    """Write a CSV listing the downloadable facsimile of every page image.

    Reads the command line, loads the per-repository download rules and the
    per-page image records, determines the permitted facsimile variant for
    each record and writes all records, with an added ``download`` column,
    to the selected output.
    """
    options = getargparser().parse_args()
    rules = download_config(options.archives)
    page_data = per_documents_data(options.document_metadata)
    for page in tqdm(page_data):
        # download_config() keys the rules by repository id, so the rules of
        # the page's own repository have to be selected here. Repositories
        # without an entry get empty rules, i.e. nothing is downloadable.
        repo_rules = rules.get(page["repo"], {})
        page["download"] = find_allowed_facsimile(
            options.image_root, page["base"] / page["img"], repo_rules
        )

    # Fixed column list (same order as the record keys) so the header can be
    # written even if there are no records at all.
    fieldnames = ["repo", "sigil", "base", "page", "img", "download"]
    # The default output is a nullcontext around sys.stdout, which only
    # yields the stream when entered; a file opened by argparse is closed on
    # exit of the block.
    with options.output as output:
        writer = csv.DictWriter(output, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(page_data)
# Run the command line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()