This repository has been archived by the owner on Dec 18, 2019. It is now read-only.
/
ia_mapper.py
234 lines (199 loc) · 9.19 KB
/
ia_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
from itertools import cycle
from dplaingestion.utilities import iterify
from dplaingestion.selector import exists, getprop
from dplaingestion.mappers.mapper import Mapper
# Lookup table from a "collection" value found in a record's metadata to the
# name stored as that record's "intermediateProvider".
intermediate_providers = {
    "medicalheritagelibrary": "Medical Heritage Library",
    "blc": "Boston Library Consortium",
}
class IAMapper(Mapper):
    """Mapper for Internet Archive provider records.

    Descriptive fields are read from under ``metadata/`` in
    ``provider_data``; MARC-derived fields are read from
    ``record/datafield``.  Results are written through
    ``update_source_resource()`` or directly into ``mapped_data``, both of
    which are inherited from ``Mapper``.
    """

    def __init__(self, provider_data):
        super(IAMapper, self).__init__(provider_data)
        # Path prefix prepended to every property name looked up by
        # _generate_map_dict().
        self.meta_key = "metadata/"

    def _add_to_list(self, lst, item):
        """Append or extend `lst` in place, depending on the type of `item`.

        `list.extend()` iterates over whatever it is given, so extending
        with a string would add one element per character:

        >>> lst = ["1", "2", "3"]
        >>> lst.extend("abc")
        >>> lst
        ["1", "2", "3", "a", "b", "c"]

        Lists are therefore merged with `extend()`, while anything else
        (strings included) is added as a single element with `append()`.
        """
        if isinstance(item, list):
            lst.extend(item)
        else:
            lst.append(item)

    def _generate_map_dict(self, to_prop, from_props=None,
                           prevent_single_item_lists=True):
        """Generate a dict used for mapping metadata.

        to_prop specifies the name of the key in the resulting dict.

        from_props specifies the name of the key(s) in the incoming data,
        relative to `self.meta_key`.  If from_props is None, it is set to
        the value of to_prop.

        prevent_single_item_lists, when True, collapses a one-element
        result list to that single element.

        Returns `{to_prop: values}`, or an empty dict when nothing (or only
        a falsy value) was found.
        """
        if from_props is None:
            from_props = [to_prop]

        prop_values = []
        for prop in iterify(from_props):
            from_prop = self.meta_key + prop
            if exists(self.provider_data, from_prop):
                self._add_to_list(prop_values,
                                  getprop(self.provider_data, from_prop))

        if len(prop_values) == 1 and prevent_single_item_lists:
            prop_values = prop_values[0]

        if prop_values:
            return {to_prop: prop_values}
        return {}

    def _map_meta(self, to_prop, from_props=None, source_resource=True):
        """Helper that copies metadata values into the mapped record.

        to_prop and from_props are described in
        IAMapper._generate_map_dict().

        source_resource indicates whether to_prop should be updated in the
        context of source_resource (if True) or the top-level object (if
        False).  Nothing is written when no value was found.
        """
        _dict = self._generate_map_dict(to_prop, from_props)
        if not _dict:
            return
        if source_resource:
            self.update_source_resource(_dict)
        else:
            self.mapped_data.update(_dict)

    def map_creator(self):
        self._map_meta("creator")

    def map_date(self):
        self._map_meta("date")

    def map_description(self):
        self._map_meta("description")

    def map_language(self):
        self._map_meta("language")

    def map_publisher(self):
        self._map_meta("publisher")

    def map_subject(self):
        self._map_meta("subject")

    def map_rights(self):
        self._map_meta("rights", "possible-copyright-status")

    def map_type(self):
        self._map_meta("type", "mediatype")

    def map_identifier(self):
        self._map_meta("identifier", ["identifier", "call_number"])

    def map_title(self):
        """Set the title, pairing each title with a volume when present.

        When both titles and volumes exist, they are joined pairwise as
        "<title>, <volume>"; the shorter of the two lists is repeated
        (cycled) so that every entry of the longer list gets a partner.
        Otherwise the titles are passed through unchanged.
        """
        ia_titles = self._generate_map_dict("title", from_props=None,
                                            prevent_single_item_lists=False)
        volumes = self._generate_map_dict("volume", from_props=None,
                                          prevent_single_item_lists=False)

        title_list = ia_titles.get("title")
        volume_list = volumes.get("volume")

        # A record with volumes but no title previously crashed on
        # len(None); such a record now takes the plain-title path.
        if not (title_list and volume_list):
            self.update_source_resource(ia_titles)
            return

        if len(title_list) > len(volume_list):
            pairs = zip(title_list, cycle(volume_list))
        else:
            pairs = zip(cycle(title_list), volume_list)
        titles = [", ".join(pair) for pair in pairs]
        self.update_source_resource({"title": titles})

    def map_data_provider(self):
        self._map_meta("dataProvider", "contributor", False)

    def map_is_shown_at(self):
        self._map_meta("isShownAt", "identifier-access", False)

    def map_intermediate_provider(self):
        """Set intermediateProvider from the first recognised collection."""
        coll_key = self.meta_key + "collection"
        # Get all of the "collection" elements from the provider's record.
        # We only want the first one that matches one of our identifiers.
        cols = getprop(self.provider_data, coll_key, keyErrorAsNone=True)
        if cols is None:
            return

        # iterify() rather than list(): list("blc") would yield single
        # characters, so a record with one collection given as a plain
        # string could never match.
        # NOTE(review): relies on iterify() wrapping a bare string in a
        # list -- confirm against dplaingestion.utilities.
        for c in iterify(cols):
            try:
                provider_name = intermediate_providers.get(c)
            except TypeError:
                # Unhashable entry (e.g. a nested dict); cannot be a key.
                continue
            if provider_name is not None:
                self.mapped_data.update(
                    {"intermediateProvider": provider_name})
                return

    def map_has_view(self):
        """Set hasView to the record's PDF download, when one is known.

        Requires both "originalRecord/_id" and "files/pdf" to be present
        and truthy; otherwise nothing is written.
        """
        file_name = None
        _id = None
        if exists(self.provider_data, "originalRecord/_id"):
            _id = getprop(self.provider_data, "originalRecord/_id", True)
        if exists(self.provider_data, "files/pdf"):
            file_name = getprop(self.provider_data, "files/pdf", True)

        if not (_id and file_name):
            return

        _format = "application/pdf"
        url_prefix = "http://www.archive.org/download/{0}/{1}"
        rights = getprop(self.provider_data,
                         "metadata/possible-copyright-status", True)
        has_view = {"@id": url_prefix.format(_id, file_name),
                    "rights": rights,
                    "format": _format}
        self.mapped_data.update({"hasView": has_view})

    def map_marc(self):
        """Map extent, spatial and isPartOf from "record/datafield".

        Tag handling:
          * 300: subfield "c" goes to extent.
          * 6xx (three characters, starting with "6"): subfield "z" goes
            to spatial.
          * 440/490/800/810/830/785: only the first such field seen is
            used, all subfields except "w", joined with ". ", to isPartOf.
          * 780: all subfields except "w", joined with ". ", to isPartOf.

        Keys that end up empty are dropped and one-element lists are
        collapsed to their single value before the source resource is
        updated.  Nothing happens when "record/datafield" is absent.
        """
        def _update_field(d, k, v):
            # Merge v into d[k] without ever discarding what is already
            # there.  Falsy values (None, "") carry no information and are
            # ignored, so an empty extraction cannot wipe earlier results.
            if not v:
                return
            if k not in d:
                d[k] = v
                return
            current = d[k]
            if isinstance(current, list):
                if v not in current:
                    current.append(v)
            elif isinstance(current, dict):
                current.update(v)
            elif isinstance(current, basestring):
                if current != v:
                    d[k] = [current, v]

        def _extract_sub_field(d, subfield_name="subfield", codes=tuple(),
                               exclude_codes=tuple(), content_key="#text",
                               concat_str=", "):
            # Join the text of the selected subfields of one datafield.
            # exclude_codes, when given, takes precedence over codes; an
            # empty codes tuple selects every subfield.  Returns None when
            # the datafield has no subfield entry at all.
            if subfield_name not in d:
                return None

            values = []
            # iterify() so that a lone subfield stored as a bare dict is
            # treated like a one-element list instead of being skipped.
            # NOTE(review): relies on iterify() wrapping a dict in a list
            # -- confirm against dplaingestion.utilities.
            for sub in iterify(d[subfield_name]):
                if not (isinstance(sub, dict) and "code" in sub and
                        content_key in sub):
                    continue
                code = sub["code"]
                if exclude_codes:
                    wanted = code not in exclude_codes
                else:
                    wanted = not codes or code in codes
                if wanted:
                    values.append(sub[content_key])
            return concat_str.join(values)

        prop = "record/datafield"
        if not exists(self.provider_data, prop):
            return

        out = {"contributor": [], "extent": [], "format": [],
               "spatial": [], "isPartOf": []}
        first_is_part_of_set = False

        # iterify() so that a record with a single datafield (a bare dict)
        # is processed rather than iterated key by key.
        for _dict in iterify(getprop(self.provider_data, prop)):
            if not (isinstance(_dict, dict) and "tag" in _dict):
                continue
            tag = _dict["tag"]

            if tag in ("300",):
                value = _extract_sub_field(_dict, codes=("c",))
                _update_field(out, "extent", value)
            elif tag.startswith("6") and len(tag) == 3:
                value = _extract_sub_field(_dict, codes=("z",))
                _update_field(out, "spatial", value)
            elif (tag in ("440", "490", "800", "810", "830", "785") and
                    not first_is_part_of_set):
                # The flag is raised even if the extraction turns out
                # empty, so later fields with these tags are skipped.
                first_is_part_of_set = True
                value = _extract_sub_field(_dict,
                                           exclude_codes=("w",),
                                           concat_str=". ")
                _update_field(out, "isPartOf", value)
            elif tag in ("780",):
                value = _extract_sub_field(_dict,
                                           exclude_codes=("w",),
                                           concat_str=". ")
                _update_field(out, "isPartOf", value)

        # Build a fresh dict instead of deleting from `out` while
        # iterating over it, which is only safe when items() returns a
        # list copy.
        cleaned = {}
        for k, v in out.items():
            if not v:
                continue
            cleaned[k] = v[0] if len(v) == 1 else v
        self.update_source_resource(cleaned)

    def map_ia_provider(self):
        self.mapped_data.update({"provider": {
            "@id": "http://dp.la/api/contributor/internet_archive",
            "name": "Internet Archive"
        }})

    def map_multiple_fields(self):
        self.map_marc()
        self.map_ia_provider()