From 1e040971b855aea4d3e7720f011389c37b3b35bf Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Thu, 3 Aug 2023 08:59:50 +0000 Subject: [PATCH 1/6] Make datetime fieldtype timezone aware --- flow/record/base.py | 5 +- flow/record/fieldtypes/__init__.py | 77 +++++++++++++++------- flow/record/jsonpacker.py | 2 +- flow/record/packer.py | 15 +++-- flow/record/stream.py | 4 +- pyproject.toml | 1 + tests/_utils.py | 4 +- tests/test_fieldtypes.py | 102 ++++++++++++++++++++++------- tests/test_json_packer.py | 4 +- tests/test_multi_timestamp.py | 26 ++++---- tests/test_packer.py | 8 ++- tests/test_rdump.py | 48 ++++++++++++++ tests/test_record_adapter.py | 2 +- tests/test_regression.py | 4 +- tests/test_selector.py | 4 +- 15 files changed, 223 insertions(+), 83 deletions(-) diff --git a/flow/record/base.py b/flow/record/base.py index eeea6d8..b9bfd2e 100644 --- a/flow/record/base.py +++ b/flow/record/base.py @@ -12,7 +12,7 @@ import re import sys import warnings -from datetime import datetime +from datetime import datetime, timezone from itertools import zip_longest from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple from urllib.parse import parse_qsl, urlparse @@ -44,6 +44,7 @@ from .whitelist import WHITELIST, WHITELIST_TREE log = logging.getLogger(__package__) +_utcnow = functools.partial(datetime.now, timezone.utc) RECORD_VERSION = 1 RESERVED_FIELDS = OrderedDict( @@ -422,7 +423,7 @@ def _generate_record_class(name: str, fields: Tuple[Tuple[str, str]]) -> type: _globals = { "Record": Record, "RECORD_VERSION": RECORD_VERSION, - "_utcnow": datetime.utcnow, + "_utcnow": _utcnow, "_zip_longest": zip_longest, } for field in all_fields.values(): diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index ca7d0af..31a4ac7 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -3,11 +3,13 @@ import os import pathlib import re +import sys from binascii import a2b_hex, b2a_hex from datetime import datetime as _dt from datetime import timezone from posixpath import basename, dirname -from typing import Any, Tuple +from typing import Any, Optional, Tuple +from zoneinfo import ZoneInfo try: import urlparse @@ -22,6 +24,12 @@ RE_STRIP_NANOSECS = re.compile(r"(\.\d{6})\d+") NATIVE_UNICODE = isinstance("", str) +UTC = timezone.utc +ISO_FORMAT = "%Y-%m-%dT%H:%M:%S%z" +ISO_FORMAT_WITH_MS = "%Y-%m-%dT%H:%M:%S.%f%z" + +PY_311 = sys.version_info >= (3, 11, 0) + PATH_POSIX = 0 PATH_WINDOWS = 1 @@ -32,6 +40,27 @@ path_type = pathlib.PurePath +def flow_record_tz(default_tz: str = "UTC") -> Optional[ZoneInfo]: + """Return a ZoneInfo object based on the ``FLOW_RECORD_TZ`` environment variable. + + Args: + default_tz: default timezone if ``FLOW_RECORD_TZ`` is not set (default: UTC) + + Returns: + None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` + """ + tz = os.environ.get("FLOW_RECORD_TZ", default_tz) + if tz.upper() == "NONE": + return None + return ZoneInfo(tz) + + +# The environment variable ``FLOW_RECORD_TZ`` affects the display of datetime fields. +# +# The timezone to use when displaying datetime fields. By default this is UTC. +DISPLAY_TZINFO = flow_record_tz("UTC") + + def defang(value: str) -> str: """Defangs the value to make URLs or ip addresses unclickable""" value = re.sub("^http://", "hxxp://", value, flags=re.IGNORECASE) @@ -238,24 +267,24 @@ def __new__(cls, *args, **kwargs): # String constructor is used for example in JsonRecordAdapter # Note: ISO 8601 is fully implemented in fromisoformat() from Python 3.11 and onwards. # Until then, we need to manually detect timezone info and handle it. - if any(z in arg[19:] for z in ["Z", "+", "-"]): - if "." in arg[19:]: - try: - return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S.%f%z") - except ValueError: - # Sometimes nanoseconds need to be stripped - return cls.strptime(re.sub(RE_STRIP_NANOSECS, "\\1", arg), "%Y-%m-%dT%H:%M:%S.%f%z") - return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S%z") + if not PY_311 and any(z in arg[19:] for z in ["Z", "+", "-"]): + spec = ISO_FORMAT_WITH_MS if "." in arg[19:] else ISO_FORMAT + try: + obj = cls.strptime(arg, spec) + except ValueError: + # Sometimes nanoseconds need to be stripped + obj = cls.strptime(re.sub(RE_STRIP_NANOSECS, "\\1", arg), spec) else: try: - return cls.fromisoformat(arg) + obj = cls.fromisoformat(arg) except ValueError: # Sometimes nanoseconds need to be stripped - return cls.fromisoformat(re.sub(RE_STRIP_NANOSECS, "\\1", arg)) + obj = cls.fromisoformat(re.sub(RE_STRIP_NANOSECS, "\\1", arg)) elif isinstance(arg, (int, float_type)): - return cls.utcfromtimestamp(arg) + obj = cls.fromtimestamp(arg, UTC) elif isinstance(arg, (_dt,)): - return _dt.__new__( + tzinfo = UTC if arg.tzinfo is None else arg.tzinfo + obj = _dt.__new__( cls, arg.year, arg.month, @@ -264,24 +293,24 @@ def __new__(cls, *args, **kwargs): arg.minute, arg.second, arg.microsecond, - arg.tzinfo, + tzinfo, ) + else: + obj = _dt.__new__(cls, *args, **kwargs) - return _dt.__new__(cls, *args, **kwargs) - - def __eq__(self, other): - # Avoid TypeError: can't compare offset-naive and offset-aware datetimes - # naive datetimes are treated as UTC in flow.record instead of local time - ts1 = self.timestamp() if self.tzinfo else self.replace(tzinfo=timezone.utc).timestamp() - ts2 = other.timestamp() if other.tzinfo else other.replace(tzinfo=timezone.utc).timestamp() - return ts1 == ts2 + # Ensure we always return a timezone aware datetime. Treat naive datetimes as UTC + if obj.tzinfo is None: + obj = obj.replace(tzinfo=UTC) + return obj def _pack(self): return self + def __str__(self): + return self.astimezone(DISPLAY_TZINFO).isoformat(" ") if DISPLAY_TZINFO else self.isoformat(" ") + def __repr__(self): - result = str(self) - return result + return str(self) def __hash__(self): return _dt.__hash__(self) diff --git a/flow/record/jsonpacker.py b/flow/record/jsonpacker.py index af8dec3..7808d24 100644 --- a/flow/record/jsonpacker.py +++ b/flow/record/jsonpacker.py @@ -58,7 +58,7 @@ def pack_obj(self, obj): } return serial if isinstance(obj, datetime): - serial = obj.strftime("%Y-%m-%dT%H:%M:%S.%f") + serial = obj.isoformat() return serial if isinstance(obj, fieldtypes.digest): return { diff --git a/flow/record/packer.py b/flow/record/packer.py index b8835b9..cc0c8ec 100644 --- a/flow/record/packer.py +++ b/flow/record/packer.py @@ -1,6 +1,6 @@ -import datetime import functools import warnings +from datetime import datetime, timezone import msgpack @@ -29,6 +29,8 @@ RECORD_PACK_TYPE_VARINT = 0x11 RECORD_PACK_TYPE_GROUPEDRECORD = 0x12 +UTC = timezone.utc + def identifier_to_str(identifier): if isinstance(identifier, tuple) and len(identifier) == 2: @@ -61,9 +63,11 @@ def register(self, desc, notify=False): def pack_obj(self, obj, unversioned=False): packed = None - if isinstance(obj, datetime.datetime): - t = obj.utctimetuple()[:6] + (obj.microsecond,) - packed = (RECORD_PACK_TYPE_DATETIME, t) + if isinstance(obj, datetime): + if obj.tzinfo is None or obj.tzinfo == UTC: + packed = (RECORD_PACK_TYPE_DATETIME, (*obj.timetuple()[:6], obj.microsecond)) + else: + packed = (RECORD_PACK_TYPE_DATETIME, (obj.isoformat(),)) elif isinstance(obj, int): neg = obj < 0 @@ -102,8 +106,7 @@ def unpack_obj(self, t, data): subtype, value = self.unpack(data) if subtype == RECORD_PACK_TYPE_DATETIME: - dt = fieldtypes.datetime(*value) - return dt + return fieldtypes.datetime(*value) if subtype == RECORD_PACK_TYPE_VARINT: neg, h = value diff --git a/flow/record/stream.py b/flow/record/stream.py index d7f71dd..76835ef 100644 --- a/flow/record/stream.py +++ b/flow/record/stream.py @@ -191,7 +191,7 @@ def __init__(self, path_template=None, name=None): def rotate_existing_file(self, path): if os.path.exists(path): - now = datetime.datetime.utcnow() + now = datetime.datetime.now(datetime.timezone.utc) src = os.path.realpath(path) src_dir = os.path.dirname(src) @@ -226,7 +226,7 @@ def record_stream_for_path(self, path): return self.writer def write(self, record): - ts = record._generated or datetime.datetime.utcnow() + ts = record._generated or datetime.datetime.now(datetime.timezone.utc) path = self.path_template.format(name=self.name, record=record, ts=ts) rs = self.record_stream_for_path(path) rs.write(record) diff --git a/pyproject.toml b/pyproject.toml index da3d157..97309dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ classifiers = [ ] dependencies = [ "msgpack>=0.5.2", + "backports.zoneinfo[tzdata]; python_version<'3.9'", ] dynamic = ["version"] diff --git a/tests/_utils.py b/tests/_utils.py index 6cf1584..fcaf4d1 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -19,7 +19,7 @@ def generate_records(count=100): ) for i in range(count): - embedded = TestRecordEmbedded(datetime.datetime.utcnow()) + embedded = TestRecordEmbedded(datetime.datetime.now(datetime.timezone.utc)) yield TestRecord(number=i, record=embedded) @@ -33,4 +33,4 @@ def generate_plain_records(count=100): ) for i in range(count): - yield TestRecord(number=i, dt=datetime.datetime.utcnow()) + yield TestRecord(number=i, dt=datetime.datetime.now(datetime.timezone.utc)) diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py index f1500b6..c1c0098 100644 --- a/tests/test_fieldtypes.py +++ b/tests/test_fieldtypes.py @@ -1,9 +1,9 @@ # coding: utf-8 -import datetime import hashlib import os import pathlib +from datetime import datetime, timedelta, timezone import pytest @@ -18,6 +18,8 @@ from flow.record.fieldtypes import datetime as dt from flow.record.fieldtypes import fieldtype_for_value, net, uri +UTC = timezone.utc + INT64_MAX = (1 << 63) - 1 INT32_MAX = (1 << 31) - 1 INT16_MAX = (1 << 15) - 1 @@ -398,29 +400,29 @@ def test_datetime(): ], ) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) r = TestRecord(now) assert r.ts == now r = TestRecord("2018-03-22T15:15:23") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.000000") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.123456") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) - dt = datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + dt = datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) dt_str = dt.isoformat() r = TestRecord(dt_str) assert r.ts == dt r = TestRecord(1521731723) - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord(1521731723.123456) - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.123456") test = {r.ts: "Success"} @@ -430,18 +432,18 @@ def test_datetime(): @pytest.mark.parametrize( "value,expected_dt", [ - ("2023-12-31T13:37:01.123456Z", datetime.datetime(2023, 12, 31, 13, 37, 1, 123456)), - ("2023-01-10T16:12:01+00:00", datetime.datetime(2023, 1, 10, 16, 12, 1)), - ("2023-01-10T16:12:01", datetime.datetime(2023, 1, 10, 16, 12, 1)), - ("2023-01-10T16:12:01Z", datetime.datetime(2023, 1, 10, 16, 12, 1)), - ("2022-12-01T13:00:23.499460Z", datetime.datetime(2022, 12, 1, 13, 0, 23, 499460)), - ("2019-09-26T07:58:30.996+0200", datetime.datetime(2019, 9, 26, 5, 58, 30, 996000)), - ("2011-11-04T00:05:23+04:00", datetime.datetime(2011, 11, 3, 20, 5, 23)), - ("2023-01-01T12:00:00+01:00", datetime.datetime(2023, 1, 1, 11, 0, 0, tzinfo=datetime.timezone.utc)), - ("2006-11-10T14:29:55.5851926", datetime.datetime(2006, 11, 10, 14, 29, 55, 585192)), - ("2006-11-10T14:29:55.585192699999999", datetime.datetime(2006, 11, 10, 14, 29, 55, 585192)), - (datetime.datetime(2023, 1, 1, tzinfo=datetime.timezone.utc), datetime.datetime(2023, 1, 1)), - (0, datetime.datetime(1970, 1, 1, 0, 0)), + ("2023-12-31T13:37:01.123456Z", datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=UTC)), + ("2023-01-10T16:12:01+00:00", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2023-01-10T16:12:01", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2023-01-10T16:12:01Z", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2022-12-01T13:00:23.499460Z", datetime(2022, 12, 1, 13, 0, 23, 499460, tzinfo=UTC)), + ("2019-09-26T07:58:30.996+0200", datetime(2019, 9, 26, 5, 58, 30, 996000, tzinfo=UTC)), + ("2011-11-04T00:05:23+04:00", datetime(2011, 11, 3, 20, 5, 23, tzinfo=UTC)), + ("2023-01-01T12:00:00+01:00", datetime(2023, 1, 1, 11, 0, 0, tzinfo=UTC)), + ("2006-11-10T14:29:55.5851926", datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC)), + ("2006-11-10T14:29:55.585192699999999", datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC)), + (datetime(2023, 1, 1, tzinfo=UTC), datetime(2023, 1, 1, tzinfo=UTC)), + (0, datetime(1970, 1, 1, 0, 0, tzinfo=UTC)), ], ) def test_datetime_formats(tmp_path, value, expected_dt): @@ -740,7 +742,7 @@ def test_fieldtype_for_value(): assert fieldtype_for_value(1.337) == "float" assert fieldtype_for_value(b"\r\n") == "bytes" assert fieldtype_for_value("hello world") == "string" - assert fieldtype_for_value(datetime.datetime.now()) == "datetime" + assert fieldtype_for_value(datetime.now()) == "datetime" assert fieldtype_for_value([1, 2, 3, 4, 5]) == "string" assert fieldtype_for_value([1, 2, 3, 4, 5], None) is None assert fieldtype_for_value(object(), None) is None @@ -775,7 +777,7 @@ def test_dynamic(): assert r.value == [1, 2, 3] assert isinstance(r.value, flow.record.fieldtypes.stringlist) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) r = TestRecord(now) assert r.value == now assert isinstance(r.value, flow.record.fieldtypes.datetime) @@ -899,11 +901,63 @@ def test_datetime_handle_nanoseconds_without_timezone(): d2 = dt("2006-11-10T14:29:55") assert isinstance(d1, dt) assert isinstance(d2, dt) - assert d1 == datetime.datetime(2006, 11, 10, 14, 29, 55, 585192) + assert d1 == datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC) assert d1.microsecond == 585192 - assert d2 == datetime.datetime(2006, 11, 10, 14, 29, 55) + assert d2 == datetime(2006, 11, 10, 14, 29, 55, tzinfo=UTC) assert d2.microsecond == 0 +@pytest.mark.parametrize( + "record_filename", + [ + "out.records.gz", + "out.records", + "out.json", + "out.jsonl", + ], +) +def test_datetime_timezone_aware(tmp_path, record_filename): + TestRecord = RecordDescriptor( + "test/tz", + [ + ("datetime", "ts"), + ], + ) + tz = timezone(timedelta(hours=1)) + stamp = datetime.now(tz) + + with RecordWriter(tmp_path / record_filename) as writer: + record = TestRecord(stamp) + writer.write(record) + assert record.ts == stamp + assert record.ts.utcoffset() == timedelta(hours=1) + assert record._generated.tzinfo == UTC + + with RecordReader(tmp_path / record_filename) as reader: + for record in reader: + assert record.ts == stamp + assert record.ts.utcoffset() == timedelta(hours=1) + assert record._generated.tzinfo == UTC + + +def test_datetime_comparisions(): + with pytest.raises(TypeError, match="can't compare offset-naive and offset-aware datetimes"): + assert dt("2023-01-01") > datetime(2022, 1, 1) + + with pytest.raises(TypeError, match="can't compare offset-naive and offset-aware datetimes"): + assert datetime(2022, 1, 1) < dt("2023-01-01") + + assert dt("2023-01-01") > datetime(2022, 1, 1, tzinfo=UTC) + assert dt("2023-01-01") == datetime(2023, 1, 1, tzinfo=UTC) + assert dt("2023-01-01") == datetime(2023, 1, 1, tzinfo=UTC) + assert dt("2023-01-01T13:36") <= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") <= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") >= datetime(2023, 1, 1, 13, 36, tzinfo=UTC) + assert dt("2023-01-01T13:37") >= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:36") < datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") > datetime(2023, 1, 1, 13, 36, tzinfo=UTC) + assert dt("2023-01-02") != datetime(2023, 3, 4, tzinfo=UTC) + + if __name__ == "__main__": __import__("standalone_test").main(globals()) diff --git a/tests/test_json_packer.py b/tests/test_json_packer.py index ccbea02..8b6119d 100644 --- a/tests/test_json_packer.py +++ b/tests/test_json_packer.py @@ -1,5 +1,5 @@ import json -from datetime import datetime +from datetime import datetime, timezone import pytest @@ -9,7 +9,7 @@ def test_record_in_record(): packer = JsonRecordPacker() - dt = datetime.utcnow() + dt = datetime.now(timezone.utc) RecordA = RecordDescriptor( "test/record_a", diff --git a/tests/test_multi_timestamp.py b/tests/test_multi_timestamp.py index e6143d0..8d0acc2 100644 --- a/tests/test_multi_timestamp.py +++ b/tests/test_multi_timestamp.py @@ -1,8 +1,10 @@ -import datetime +from datetime import datetime, timedelta, timezone from flow.record import RecordDescriptor, iter_timestamped_records from flow.record.base import merge_record_descriptors +UTC = timezone.utc + def test_multi_timestamp(): TestRecord = RecordDescriptor( @@ -15,22 +17,22 @@ def test_multi_timestamp(): ) test_record = TestRecord( - ctime=datetime.datetime(2020, 1, 1, 1, 1, 1), - atime=datetime.datetime(2022, 11, 22, 13, 37, 37), + ctime=datetime(2020, 1, 1, 1, 1, 1), + atime=datetime(2022, 11, 22, 13, 37, 37), data="test", ) ts_records = list(iter_timestamped_records(test_record)) for rec in ts_records: - assert rec.ctime == datetime.datetime(2020, 1, 1, 1, 1, 1) - assert rec.atime == datetime.datetime(2022, 11, 22, 13, 37, 37) + assert rec.ctime == datetime(2020, 1, 1, 1, 1, 1, tzinfo=UTC) + assert rec.atime == datetime(2022, 11, 22, 13, 37, 37, tzinfo=UTC) assert rec.data == "test" - assert ts_records[0].ts == datetime.datetime(2020, 1, 1, 1, 1, 1) + assert ts_records[0].ts == datetime(2020, 1, 1, 1, 1, 1, tzinfo=UTC) assert ts_records[0].ts_description == "ctime" - assert ts_records[1].ts == datetime.datetime(2022, 11, 22, 13, 37, 37) + assert ts_records[1].ts == datetime(2022, 11, 22, 13, 37, 37, tzinfo=UTC) assert ts_records[1].ts_description == "atime" @@ -58,7 +60,7 @@ def test_multi_timestamp_single_datetime(): ) test_record = TestRecord( - ctime=datetime.datetime(2020, 1, 1, 1, 1, 1), + ctime=datetime(2020, 1, 1, 1, 1, 1), data="test", ) ts_records = list(iter_timestamped_records(test_record)) @@ -77,7 +79,7 @@ def test_multi_timestamp_ts_fieldname(): ) test_record = TestRecord( - ts=datetime.datetime(2020, 1, 1, 1, 1, 1), + ts=datetime(2020, 1, 1, 1, 1, 1), data="test", ) ts_records = list(iter_timestamped_records(test_record)) @@ -95,7 +97,7 @@ def test_multi_timestamp_timezone(): ], ) - correct_ts = datetime.datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=datetime.timezone.utc) + correct_ts = datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=UTC) ts_notations = [ correct_ts, @@ -127,8 +129,8 @@ def test_multi_timestamp_descriptor_cache(): merge_record_descriptors.cache_clear() for i in range(10): test_record = TestRecord( - ctime=datetime.datetime.utcnow() + datetime.timedelta(hours=69), - atime=datetime.datetime.utcnow() + datetime.timedelta(hours=420), + ctime=datetime.now(UTC) + timedelta(hours=69), + atime=datetime.now(UTC) + timedelta(hours=420), count=i, data=f"test {i}", ) diff --git a/tests/test_packer.py b/tests/test_packer.py index 5ef017a..8ee012c 100644 --- a/tests/test_packer.py +++ b/tests/test_packer.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime, timezone import pytest @@ -7,6 +7,8 @@ from flow.record.fieldtypes import uri from flow.record.packer import RECORD_PACK_EXT_TYPE +UTC = timezone.utc + def test_uri_packing(): packer = RecordPacker() @@ -151,7 +153,7 @@ def test_dynamic_packer(): assert r.value == [1, True, b"b", "u"] assert isinstance(r.value, fieldtypes.stringlist) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) t = TestRecord(now) data = packer.pack(t) r = packer.unpack(data) @@ -195,7 +197,7 @@ def test_pack_digest(): def test_record_in_record(): packer = RecordPacker() - dt = datetime.datetime.utcnow() + dt = datetime.now(UTC) RecordA = RecordDescriptor( "test/record_a", diff --git a/tests/test_rdump.py b/tests/test_rdump.py index 51664d0..a0371ba 100644 --- a/tests/test_rdump.py +++ b/tests/test_rdump.py @@ -4,10 +4,13 @@ import os import platform import subprocess +from unittest import mock import pytest +import flow.record.fieldtypes from flow.record import RecordDescriptor, RecordReader, RecordWriter +from flow.record.fieldtypes import flow_record_tz from flow.record.tools import rdump @@ -509,3 +512,48 @@ def test_rdump_count_and_skip(tmp_path, capsysbinary, total_records, count, skip with RecordReader(subset_path) as reader: numbers = [rec.number for rec in reader] assert numbers == expected_numbers + + +@pytest.mark.parametrize( + "date_str,tz,expected_date_str", + [ + ("2023-08-02T22:28:06.12345+01:00", None, "2023-08-02 21:28:06.123450+00:00"), + ("2023-08-02T22:28:06.12345+01:00", "NONE", "2023-08-02 22:28:06.123450+01:00"), + ("2023-08-02T22:28:06.12345-08:00", "NONE", "2023-08-02 22:28:06.123450-08:00"), + ("2023-08-02T20:51:32.123456+00:00", "Europe/Amsterdam", "2023-08-02 22:51:32.123456+02:00"), + ("2023-08-02T20:51:32.123456+00:00", "America/New_York", "2023-08-02 16:51:32.123456-04:00"), + ], +) +@pytest.mark.parametrize( + "rdump_params", + [ + [], + ["--mode=csv"], + ["--mode=line"], + ], +) +def test_flow_record_tz_output(tmp_path, capsys, date_str, tz, expected_date_str, rdump_params): + TestRecord = RecordDescriptor( + "test/flow_record_tz", + [ + ("datetime", "stamp"), + ], + ) + with RecordWriter(tmp_path / "test.records") as writer: + writer.write(TestRecord(stamp=date_str)) + + env_dict = {} + if tz is not None: + env_dict["FLOW_RECORD_TZ"] = tz + + with mock.patch.dict(os.environ, env_dict, clear=True): + # Reconfigure DISPLAY_TZINFO + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz("UTC") + + rdump.main([str(tmp_path / "test.records")] + rdump_params) + captured = capsys.readouterr() + assert captured.err == "" + assert expected_date_str in captured.out + + # restore DISPLAY_TZINFO just in case + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz("UTC") diff --git a/tests/test_record_adapter.py b/tests/test_record_adapter.py index 7a51a28..39b87f9 100644 --- a/tests/test_record_adapter.py +++ b/tests/test_record_adapter.py @@ -203,7 +203,7 @@ def test_record_writer_stdout(): def test_record_adapter_archive(tmpdir): # archive some records, using "testing" as name writer = RecordWriter("archive://{}?name=testing".format(tmpdir)) - dt = datetime.datetime.utcnow() + dt = datetime.datetime.now(datetime.timezone.utc) count = 0 for rec in generate_records(): writer.write(rec) diff --git a/tests/test_regression.py b/tests/test_regression.py index c55d127..e48610c 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,10 +1,10 @@ import codecs -import datetime import json import os import pathlib import subprocess import sys +from datetime import datetime, timezone from unittest.mock import mock_open, patch import msgpack @@ -32,7 +32,7 @@ def test_datetime_serialization(): packer = RecordPacker() - now = datetime.datetime.utcnow() + now = datetime.now(timezone.utc) for tz in ["UTC", "Europe/Amsterdam"]: os.environ["TZ"] = tz diff --git a/tests/test_selector.py b/tests/test_selector.py index b393db7..ad5dde6 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone import pytest @@ -449,7 +449,7 @@ def test_record_in_records(): ) test_str = "this is a test" - dt = datetime.utcnow() + dt = datetime.now(timezone.utc) record_a = RecordA(some_dt=dt, field=test_str) record_b = RecordB(record=record_a, some_dt=dt) From 306fccc870c4163571137b960f213d1559992705 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Thu, 3 Aug 2023 09:00:14 +0000 Subject: [PATCH 2/6] Fix black linting --- flow/record/adapter/elastic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py index 82f3647..71fd6f2 100644 --- a/flow/record/adapter/elastic.py +++ b/flow/record/adapter/elastic.py @@ -99,7 +99,7 @@ def __init__( index: str = "records", http_compress: Union[str, bool] = True, selector: Union[None, Selector, CompiledSelector] = None, - **kwargs + **kwargs, ) -> None: self.index = index self.uri = uri From 73bfa309c921e68b3194d2c49ada5ff9fb7f55e8 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Thu, 3 Aug 2023 10:43:41 +0000 Subject: [PATCH 3/6] Fix regex for PyPy --- tests/test_fieldtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py index c1c0098..3f6e831 100644 --- a/tests/test_fieldtypes.py +++ b/tests/test_fieldtypes.py @@ -941,10 +941,10 @@ def test_datetime_timezone_aware(tmp_path, record_filename): def test_datetime_comparisions(): - with pytest.raises(TypeError, match="can't compare offset-naive and offset-aware datetimes"): + with pytest.raises(TypeError, match=".* compare .*naive"): assert dt("2023-01-01") > datetime(2022, 1, 1) - with pytest.raises(TypeError, match="can't compare offset-naive and offset-aware datetimes"): + with pytest.raises(TypeError, match=".* compare .*naive"): assert datetime(2022, 1, 1) < dt("2023-01-01") assert dt("2023-01-01") > datetime(2022, 1, 1, tzinfo=UTC) From 246db1491bc98c0cccaff13675e6bb17c549d6e4 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Wed, 16 Aug 2023 11:43:36 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- flow/record/fieldtypes/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index 31a4ac7..5fd0c6b 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -41,10 +41,10 @@ def flow_record_tz(default_tz: str = "UTC") -> Optional[ZoneInfo]: - """Return a ZoneInfo object based on the ``FLOW_RECORD_TZ`` environment variable. + """Return a ``ZoneInfo`` object based on the ``FLOW_RECORD_TZ`` environment variable. Args: - default_tz: default timezone if ``FLOW_RECORD_TZ`` is not set (default: UTC) + default_tz: Default timezone if ``FLOW_RECORD_TZ`` is not set (default: UTC). Returns: None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` @@ -283,7 +283,7 @@ def __new__(cls, *args, **kwargs): elif isinstance(arg, (int, float_type)): obj = cls.fromtimestamp(arg, UTC) elif isinstance(arg, (_dt,)): - tzinfo = UTC if arg.tzinfo is None else arg.tzinfo + tzinfo = arg.tzinfo or UTC obj = _dt.__new__( cls, arg.year, From d3310d4ab35325cb6f1b2a3962b1dbbf39501657 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Wed, 16 Aug 2023 11:45:48 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 97309dd..e0c5b7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ classifiers = [ dependencies = [ "msgpack>=0.5.2", "backports.zoneinfo[tzdata]; python_version<'3.9'", + "tzdata; platform_system=='Windows'", ] dynamic = ["version"] From 796f150a7699eb95ecccdad86a6da34b2e65ce66 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Wed, 16 Aug 2023 16:33:58 +0000 Subject: [PATCH 6/6] Show warning and fallback to UTC if invalid zone --- flow/record/fieldtypes/__init__.py | 27 ++++++++++++------------ tests/test_rdump.py | 34 ++++++++++++++++++++++++++++-- tox.ini | 2 +- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index 5fd0c6b..d4d4e1a 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -1,22 +1,19 @@ +from __future__ import annotations + import binascii import math import os import pathlib import re import sys +import warnings from binascii import a2b_hex, b2a_hex from datetime import datetime as _dt from datetime import timezone from posixpath import basename, dirname from typing import Any, Optional, Tuple -from zoneinfo import ZoneInfo - -try: - import urlparse -except ImportError: - import urllib.parse as urlparse - -import warnings +from urllib.parse import urlparse +from zoneinfo import ZoneInfo, ZoneInfoNotFoundError from flow.record.base import FieldType @@ -40,25 +37,29 @@ path_type = pathlib.PurePath -def flow_record_tz(default_tz: str = "UTC") -> Optional[ZoneInfo]: +def flow_record_tz(*, default_tz: str = "UTC") -> Optional[ZoneInfo | UTC]: """Return a ``ZoneInfo`` object based on the ``FLOW_RECORD_TZ`` environment variable. Args: default_tz: Default timezone if ``FLOW_RECORD_TZ`` is not set (default: UTC). Returns: - None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` + None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` or ``UTC`` if ZoneInfo is not found. """ tz = os.environ.get("FLOW_RECORD_TZ", default_tz) if tz.upper() == "NONE": return None - return ZoneInfo(tz) + try: + return ZoneInfo(tz) + except ZoneInfoNotFoundError as exc: + warnings.warn(f"{exc!r}, falling back to timezone.utc") + return UTC # The environment variable ``FLOW_RECORD_TZ`` affects the display of datetime fields. # # The timezone to use when displaying datetime fields. By default this is UTC. -DISPLAY_TZINFO = flow_record_tz("UTC") +DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") def defang(value: str) -> str: @@ -491,7 +492,7 @@ def _unpack(cls, data): class uri(string, FieldType): def __init__(self, value): - self._parsed = urlparse.urlparse(value) + self._parsed = urlparse(value) @staticmethod def normalize(path): diff --git a/tests/test_rdump.py b/tests/test_rdump.py index a0371ba..035a3a0 100644 --- a/tests/test_rdump.py +++ b/tests/test_rdump.py @@ -4,6 +4,7 @@ import os import platform import subprocess +from datetime import timezone from unittest import mock import pytest @@ -548,7 +549,7 @@ def test_flow_record_tz_output(tmp_path, capsys, date_str, tz, expected_date_str with mock.patch.dict(os.environ, env_dict, clear=True): # Reconfigure DISPLAY_TZINFO - flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz("UTC") + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") rdump.main([str(tmp_path / "test.records")] + rdump_params) captured = capsys.readouterr() @@ -556,4 +557,33 @@ def test_flow_record_tz_output(tmp_path, capsys, date_str, tz, expected_date_str assert expected_date_str in captured.out # restore DISPLAY_TZINFO just in case - flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz("UTC") + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") + + +def test_flow_record_invalid_tz(tmp_path, capsys): + TestRecord = RecordDescriptor( + "test/flow_record_tz", + [ + ("datetime", "stamp"), + ], + ) + with RecordWriter(tmp_path / "test.records") as writer: + writer.write(TestRecord(stamp="2023-08-16T17:46:55.390691+02:00")) + + env_dict = { + "FLOW_RECORD_TZ": "invalid", + } + + with mock.patch.dict(os.environ, env_dict, clear=True): + # Reconfigure DISPLAY_TZINFO + with pytest.warns(UserWarning, match=".* falling back to timezone.utc"): + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz() + + rdump.main([str(tmp_path / "test.records")]) + captured = capsys.readouterr() + assert captured.err == "" + assert "2023-08-16 15:46:55.390691+00:00" in captured.out + assert flow.record.fieldtypes.DISPLAY_TZINFO == timezone.utc + + # restore DISPLAY_TZINFO just in case + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") diff --git a/tox.ini b/tox.ini index a036145..a026ea9 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = vermin commands = flake8 flow tests - vermin -t=3.7- --no-tips --lint flow tests + vermin -t=3.7- --no-tips --lint --exclude zoneinfo flow tests [flake8] max-line-length = 120