This repository has been archived by the owner on Dec 18, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
enrich-date.py
206 lines (176 loc) · 7.74 KB
/
enrich-date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import re
import timelib
from akara import logger
from akara import response
from akara.services import simple_service
from amara.thirdparty import json
from dateutil.parser import parse as dateutil_parse
from zen import dateparser
from dplaingestion.selector import getprop, setprop, delprop, exists
# HTTP status code and header values used by the services below when the
# request body cannot be parsed.
HTTP_INTERNAL_SERVER_ERROR = 500
HTTP_TYPE_JSON = 'application/json'
HTTP_TYPE_TEXT = 'text/plain'
HTTP_HEADER_TYPE = 'Content-Type'

# Default date handed to dateutil to populate date elements that are absent
# from the input, e.g. "1999" becomes "1999-01-01" instead of picking up the
# current month/day. It is set far in the future so that it can also be used
# to detect that date parsing simply failed.
DEFAULT_DATETIME_STR = "3000-01-01"
DEFAULT_DATETIME = dateutil_parse(DEFAULT_DATETIME_STR)
# The usual way to get DEFAULT_DATETIME in seconds is:
#     time.mktime(DEFAULT_DATETIME.timetuple())
# but that applies the local time zone, whose offset would then have to be
# added back to obtain real GMT (UTC) seconds. As a simple solution the UTC
# value is hardcoded.
DEFAULT_DATETIME_SECS = 32503680000.0 # UTC seconds for "3000-01-01"

# Two runs of digits/hyphens separated by a hyphen (optionally surrounded by
# whitespace). The EXT variant also accepts "/" as the separator.
DATE_RANGE_RE = r'([0-9-]+)\s*-\s*([0-9-]+)'
DATE_RANGE_EXT_RE = r'([0-9-]+)\s*[-/]\s*([0-9-]+)'
def split_date(d):
    """Split a date-range string and parse both endpoints.

    The endpoints are located with DATE_RANGE_EXT_RE (hyphen or slash as the
    separator). When the string contains exactly two slashes it is taken to be
    a slash-delimited date such as "2001 / 01 / 01", so only a hyphen is
    accepted as the range separator (DATE_RANGE_RE).

    Each endpoint is passed through robust_date_parser; endpoints that fail
    to parse are dropped, so fewer than two values may come back. No check is
    made that the pattern actually matched.
    """
    slash_pieces = d.split("/")
    pattern = DATE_RANGE_RE if len(slash_pieces) == 3 else DATE_RANGE_EXT_RE
    endpoints = re.search(pattern, d).groups()
    parsed = [robust_date_parser(endpoint) for endpoint in endpoints]
    return filter(None, parsed)
# strftime-style pattern for an ISO 8601 calendar date (YYYY-MM-DD).
# NOTE(review): no use of this constant is visible in this part of the file.
DATE_8601 = '%Y-%m-%d'
def robust_date_parser(d):
    """
    Best-effort conversion of a free-form date string to a single ISO 8601
    date string. No range checking is performed.

    The parsers are tried in this order:

    1. zen's dateparser.to_iso8601, on the input with a trailing "?" and any
       "ca." / "c." markers removed. If it yields a value, the trailing "?"
       (when there was one) is appended back to a non-empty result.
    2. dateutil's parser on the original string with fuzzy=True, which can
       pick a date out of surrounding text. Missing elements are filled from
       DEFAULT_DATETIME; a result carrying the default year is treated as a
       failed parse.
    3. timelib.strtodatetime, only when dateutil raised.

    Results from steps 2 and 3 are cut down to the date part (YYYY-MM-DD).

    Returns None if it fails
    """
    pieces = re.split(r"(\?$)", d)
    circa = re.compile(r"(ca\.|c\.)", re.I)
    cleaned = circa.sub("", pieces[0]).strip()  # simple cleanup prior to parse

    iso = dateparser.to_iso8601(cleaned)
    if iso is not None:
        # re.split with a capturing group yields more than one piece only
        # when the input ended with "?".
        if len(pieces) != 1 and iso:
            iso += pieces[1]
        return iso

    parsed = None
    try:
        parsed = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
        if parsed.year == DEFAULT_DATETIME.year:
            # Only the default year came back, i.e. nothing usable was found.
            parsed = None
    except Exception:
        try:
            parsed = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
        except ValueError:
            pass
        except Exception as e:
            logger.error(e)

    if parsed:
        stamp = parsed.isoformat()
        return stamp[:stamp.index('T')]
    return parsed
# Simple range of two four-digit years, e.g. "1999-2004" or "1999 / 2004".
year_range = re.compile(r"(\d{4})\s*[-/]\s*(\d{4})")
# Tricky circa range with an abbreviated end year, e.g. "c. 1970-90".
circa_range = re.compile(r"(?:ca\.|c\.)\s*(?P<century>\d{2})(?P<year_begin>\d{2})\s*-\s*(?P<year_end>\d{2})", re.I)
# Dates given as a century, e.g. "19th c.".
century_date = re.compile(r"(?P<century>\d{1,2})(?:th|st|nd|rd)\s+c\.", re.I)


def parse_date_or_range(d):
    """Parse a date or a date range into a (begin, end) pair.

    Arguments:
        d Str - Date text, e.g. "1999 - 2004", "c. 1890-95", "19th c." or
                "5/7/2012".

    Returns:
        A 2-tuple (begin, end). For a single date both items are the same
        value. Either item is None when the text could not be parsed.

    Handling, in order:
      * text that looks like a range (contains " - ", contains exactly one
        "/", or starts with "YYYY-YYYY") is split with split_date;
      * "c. 1970-90" style ranges borrow the century for the end year;
      * "19th c." becomes the first and last year of that century;
      * anything else is parsed as a single date.
    """
    # FIXME: could be more robust here,
    # e.g. use date range regex to handle:
    # June 1941 - May 1945
    # 1941-06-1945-05
    # and do not confuse with just YYYY-MM-DD regex
    looks_like_range = (' - ' in d
                        or len(d.split("/")) == 2
                        or year_range.match(d))
    if looks_like_range:
        try:
            endpoints = list(split_date(d))
        except AttributeError:
            # split_date cannot cope with text that has no numeric range
            # (e.g. "June 1941 - May 1945" or "and/or") and fails with
            # AttributeError on the missing regex match.
            endpoints = []
        if len(endpoints) == 2:
            return endpoints[0], endpoints[1]
        # One or both endpoints could not be parsed: unpacking would raise,
        # so fall back to treating the text as a single date.
        parsed = robust_date_parser(d)
        return parsed, parsed

    match = circa_range.match(d)
    if match:
        century = match.group("century")
        year_begin = century + match.group("year_begin")
        year_end = century + match.group("year_end")
        return robust_date_parser(year_begin), robust_date_parser(year_end)

    match = century_date.match(d)
    if match:
        year_begin = (int(match.group("century")) - 1) * 100
        year_end = year_begin + 99
        return str(year_begin), str(year_end)

    parsed = robust_date_parser(d)
    return parsed, parsed
def remove_brackets_and_strip(d):
    """Drop an opening "[" at the start and a closing "]" at the end of the
    date (range), together with the whitespace around them, then strip the
    result. Brackets in the middle of the text are left alone.
    """
    bracket_edges = r"(^\s*\[\s*|\s*\]\s*$)"
    unbracketed = re.sub(bracket_edges, '', d)
    return unbracketed.strip()
def test_parse_date_or_range():
    """Check parse_date_or_range against a table of input -> (begin, end)."""
    expectations = {
        "ca. July 1896": ("1896-07", "1896-07"),  # fuzzy dates
        "c. 1896": ("1896", "1896"),  # fuzzy dates
        "c. 1890-95": ("1890", "1895"),  # fuzzy date range
        "1999.11.01": ("1999-11-01", "1999-11-01"),  # period delim
        "2012-02-31": ("2012-03-02", "2012-03-02"),  # invalid date cleanup
        "12-19-2010": ("2010-12-19", "2010-12-19"),  # M-D-Y
        "5/7/2012": ("2012-05-07", "2012-05-07"),  # slash delim MDY
        "1999 - 2004": ("1999", "2004"),  # year range
        "1999-2004": ("1999", "2004"),  # year range without spaces
        " 1999 - 2004 ": ("1999", "2004"),  # range whitespace
    }
    for raw, expected in expectations.items():
        actual = parse_date_or_range(raw)
        assert actual == expected, "For input '%s', expected '%s' but got '%s'" % (raw, expected, actual)
def convert_dates(data, prop, earliest):
    """Converts dates.

    Every listed property that exists in data is read; its value (a string
    or an iterable of strings) is split on ";" and each piece is cleaned of
    surrounding brackets and of "?" characters. Pieces shorter than four
    characters are skipped. Each remaining piece is parsed into a dict with
    the keys "begin", "end" and "displayDate"; entries whose end is
    '3000-01-01' are discarded. The entries are sorted by "begin", an entry
    without a begin date being ordered as DEFAULT_DATETIME_STR.

    The result is written to the last property listed in prop; when no date
    was collected, that property is deleted instead.

    Arguments:
        data        Dict - Data for conversion.
        prop        Str  - Properties divided with comma.
        earliest    Bool - True  - the function will set only the earliest date.
                           False - the function will set all dates.
    Returns:
        Nothing, the replacement is done in place.
    """
    prop_names = prop.split(',')
    found = []
    for p in prop_names:
        if not exists(data, p):
            continue
        raw = getprop(data, p)
        values = [raw] if isinstance(raw, basestring) else raw
        for value in values:
            for piece in value.split(";"):
                display_date = remove_brackets_and_strip(piece)
                candidate = re.sub(r"\?", "", display_date)
                if len(candidate) < 4:
                    continue
                begin, end = parse_date_or_range(candidate)
                if end == '3000-01-01':
                    continue
                found.append({
                    "begin": begin,
                    "end": end,
                    "displayDate": display_date,
                })

    def begin_or_default(entry):
        # Entries without a begin date are ordered as the far-future default.
        begin = entry["begin"]
        return DEFAULT_DATETIME_STR if begin is None else begin

    found.sort(key=begin_or_default)

    # The value is stored under the last property named in prop.
    target = prop_names[-1]
    if earliest and found:
        result = found[0]
    else:
        result = found

    if result:
        setprop(data, target, result)
    else:
        delprop(data, target)
@simple_service('POST', 'http://purl.org/la/dp/enrich_earliest_date', 'enrich_earliest_date', HTTP_TYPE_JSON)
def enrich_earliest_date(body, ctype, action="enrich_earliest_date", prop="sourceResource/date"):
    """
    Service that accepts a JSON document and extracts the "created date" of the item, using the
    following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
       (comma-separated)
    b) Extracts all dates, and sets the created date to the earliest date

    Returns the enriched document serialized as JSON. If the body cannot be
    parsed as JSON, the response code is set to 500, the content type to
    text/plain, and an error message is returned instead.
    """
    try:
        data = json.loads(body)
    except Exception:
        # "except Exception" rather than a bare "except:" so that SystemExit
        # and KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    convert_dates(data, prop, True)
    return json.dumps(data)
@simple_service('POST', 'http://purl.org/la/dp/enrich_date', 'enrich_date', HTTP_TYPE_JSON)
def enrich_date(body, ctype, action="enrich_date", prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and converts the dates found in it,
    using the following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
       (comma-separated)
    b) Extracts all dates and stores all of them, sorted by begin date

    Returns the enriched document serialized as JSON. If the body cannot be
    parsed as JSON, the response code is set to 500, the content type to
    text/plain, and an error message is returned instead.
    """
    try:
        data = json.loads(body)
    except Exception:
        # "except Exception" rather than a bare "except:" so that SystemExit
        # and KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    convert_dates(data, prop, False)
    return json.dumps(data)