Skip to content

Commit

Permalink
v0.1.6 Ensure 4 digit years for very historic dates
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed Aug 23, 2020
1 parent fe9817f commit 2064148
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 31 deletions.
2 changes: 1 addition & 1 deletion dataflows/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.5
0.1.6
28 changes: 19 additions & 9 deletions dataflows/helpers/extended_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,21 @@
import decimal
import isodate


DATE_FORMAT = '%Y-%m-%d'
DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
# Formatting ("F") patterns used when serializing dates.
#
# On some platforms strftime('%Y') does not zero-pad years before 1000
# (e.g. year 1 is rendered as '1'), which strptime('%Y') cannot parse back.
# Where the C library honours a width flag, '%04Y' forces 4 digits.
# Support is probed once at import time: the padded form is used only when
# the probe yields exactly '0001'; any other output, or a ValueError from a
# platform strftime that rejects the directive, falls back to plain '%Y'.
try:
    _PADDED_YEAR_OK = datetime.date(1, 1, 1).strftime('%04Y') == '0001'
except ValueError:
    _PADDED_YEAR_OK = False

if _PADDED_YEAR_OK:
    DATE_F_FORMAT = '%04Y-%m-%d'
    DATETIME_F_FORMAT = '%04Y-%m-%dT%H:%M:%S'
else:
    DATE_F_FORMAT = '%Y-%m-%d'
    DATETIME_F_FORMAT = '%Y-%m-%dT%H:%M:%S'

# Legacy names kept as aliases of the formatting patterns.
DATE_FORMAT = DATE_F_FORMAT
DATETIME_FORMAT = DATETIME_F_FORMAT
TIME_FORMAT = '%H:%M:%S'

DATE_P_FORMAT = '%Y-%m-%d'
DATETIME_P_FORMAT = '%Y-%m-%dT%H:%M:%S'
TIME_P_FORMAT = TIME_F_FORMAT = TIME_FORMAT


class CommonJSONDecoder(json.JSONDecoder):
"""
Expand All @@ -26,15 +36,15 @@ def object_hook(cls, obj):
if 'type{time}' in obj:
try:
return datetime.datetime \
.strptime(obj["type{time}"], TIME_FORMAT) \
.strptime(obj["type{time}"], TIME_P_FORMAT) \
.time()
except ValueError:
pass
if 'type{datetime}' in obj:
try:
(isoformat, tzofs, tzname) = obj["type{datetime}"]
parsed = datetime.datetime \
.strptime(isoformat, DATETIME_FORMAT)
.strptime(isoformat, DATETIME_P_FORMAT)
if tzname is not None:
return datetime.datetime \
.combine(parsed.date(), parsed.time(),
Expand All @@ -46,7 +56,7 @@ def object_hook(cls, obj):
if 'type{date}' in obj:
try:
return datetime.datetime \
.strptime(obj["type{date}"], DATE_FORMAT) \
.strptime(obj["type{date}"], DATE_P_FORMAT) \
.date()
except ValueError:
pass
Expand Down Expand Up @@ -79,14 +89,14 @@ def default(self, obj):
if isinstance(obj, decimal.Decimal):
return {'type{decimal}': str(obj)}
elif isinstance(obj, datetime.time):
return {'type{time}': obj.strftime(TIME_FORMAT)}
return {'type{time}': obj.strftime(TIME_F_FORMAT)}
elif isinstance(obj, datetime.datetime):
return {'type{datetime}':
(obj.strftime(DATETIME_FORMAT),
(obj.strftime(DATETIME_F_FORMAT),
obj.utcoffset().seconds if obj.utcoffset() is not None else None,
obj.tzname())}
elif isinstance(obj, datetime.date):
return {'type{date}': obj.strftime(DATE_FORMAT)}
return {'type{date}': obj.strftime(DATE_F_FORMAT)}
elif isinstance(obj, (isodate.Duration, datetime.timedelta)):
return {'type{duration}': isodate.duration_isoformat(obj)}
elif isinstance(obj, set):
Expand Down
33 changes: 16 additions & 17 deletions dataflows/processors/dumpers/file_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
import logging
import datetime
from functools import partial


DATE_FORMAT = '%Y-%m-%d'
DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S'
TIME_FORMAT = '%H:%M:%S'
from dataflows.helpers.extended_json import (
DATETIME_F_FORMAT, DATE_F_FORMAT, TIME_F_FORMAT,
DATETIME_P_FORMAT, DATE_P_FORMAT, TIME_P_FORMAT,
)


def identity(x):
Expand Down Expand Up @@ -92,9 +91,9 @@ class CSVFormat(FileFormat):
SERIALIZERS = {
'array': json_dumps,
'object': json_dumps,
'datetime': lambda d: d.strftime(DATETIME_FORMAT),
'date': lambda d: d.strftime(DATE_FORMAT),
'time': lambda d: d.strftime(TIME_FORMAT),
'datetime': lambda d: d.strftime(DATETIME_F_FORMAT),
'date': lambda d: d.strftime(DATE_F_FORMAT),
'time': lambda d: d.strftime(TIME_F_FORMAT),
'duration': lambda d: isodate.duration_isoformat(d),
'geopoint': lambda d: '{}, {}'.format(*d),
'geojson': json.dumps,
Expand All @@ -110,13 +109,13 @@ class CSVFormat(FileFormat):
'groupChar': ''
},
'date': {
'format': DATE_FORMAT
'format': DATE_P_FORMAT
},
'time': {
'format': TIME_FORMAT
'format': TIME_P_FORMAT
},
'datetime': {
'format': DATETIME_FORMAT
'format': DATETIME_P_FORMAT
},
}

Expand Down Expand Up @@ -156,9 +155,9 @@ def finalize_file(self):
class JSONFormat(FileFormat):

SERIALIZERS = {
'datetime': lambda d: d.strftime(DATETIME_FORMAT),
'date': lambda d: d.strftime(DATE_FORMAT),
'time': lambda d: d.strftime(TIME_FORMAT),
'datetime': lambda d: d.strftime(DATETIME_F_FORMAT),
'date': lambda d: d.strftime(DATE_F_FORMAT),
'time': lambda d: d.strftime(TIME_F_FORMAT),
'number': float,
'duration': lambda d: isodate.duration_isoformat(d),
'geopoint': lambda d: list(map(float, d)),
Expand All @@ -169,13 +168,13 @@ class JSONFormat(FileFormat):

PYTHON_DIALECT = {
'date': {
'format': DATE_FORMAT
'format': DATE_P_FORMAT
},
'time': {
'format': TIME_FORMAT
'format': TIME_P_FORMAT
},
'datetime': {
'format': DATETIME_FORMAT
'format': DATETIME_P_FORMAT
},
}

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def read(*paths):
'tableschema-sql',
'xmljson',
'bitstring>=3',
'python-dateutil'
]
SPEEDUP_REQUIRES = [
'plyvel',
Expand Down
4 changes: 2 additions & 2 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def country_population():
for row in rows:
cells = row.findall('td')
if len(cells) > 3:
name = cells[1].find('.//a').attrib.get('title')
population = cells[2].text
name = cells[0].find('.//a').attrib.get('title')
population = cells[1].text
yield(dict(
name=name,
population=population
Expand Down
26 changes: 24 additions & 2 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,8 +937,8 @@ def test_load_dates_timezones():
import shutil

dates = [
datetime.now(),
datetime.now(timezone.utc).astimezone()
datetime.now().replace(microsecond=0),
datetime.now(timezone.utc).replace(microsecond=0).astimezone()
]

shutil.rmtree('.checkpoints/test_load_dates_timezones', ignore_errors=True)
Expand Down Expand Up @@ -1114,6 +1114,28 @@ def test_stream_simple():
assert results[1] == datas2


def test_stream_bad_dates():
    """Round-trip a year-1 date through stream/unstream.

    Writes a single row whose date is 0001-01-01 to a stream file, reads it
    back, and checks that the value comes back as ``datetime.date(1, 1, 1)``.
    """
    from dataflows import stream, unstream, set_type, dump_to_path
    import datetime
    import os

    # The stream file is opened directly here, so make sure its directory
    # exists even when this test runs on its own.
    os.makedirs('out', exist_ok=True)

    datas1 = [
        {'a': '0001/1/1'},
    ]
    # Close (and therefore flush) the written stream before reading it back.
    with open('out/test_stream_bad_dates.stream', 'w') as stream_out:
        Flow(
            datas1,
            set_type('a', type='date', format='%Y/%m/%d'),
            stream(stream_out)
        ).process()

    with open('out/test_stream_bad_dates.stream') as stream_in:
        results, _, _ = Flow(
            unstream(stream_in),
            dump_to_path('out/test_stream_bad_dates')
        ).results()

    assert results[0][0]['a'] == datetime.date(1, 1, 1)



def test_set_primary_key():
from dataflows import set_primary_key

Expand Down

0 comments on commit 2064148

Please sign in to comment.