Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.
This repository was archived by the owner on May 17, 2024. It is now read-only.

--min-age and --max-age should be computed relative to database session time #284

@nenb

Description

@nenb

Describe the bug
I expect the --min-age and --max-age command-line arguments to be relative to the database session time, rather than relative to the time on the machine where data-diff is run (eg consider a database in a different time zone to a local machine running data-diff). I believe that this gives a user a potentially misleading impression on what data they have analysed with data-diff.

I think the easiest way to see this situation is via TestCLI in test_cli.py, by changing the timezone in the database session during the test setup (self.conn.query("SET @@session.time_zone='-01:00'")) :

class TestCLI(unittest.TestCase):
    def setUp(self) -> None:
        self.conn = get_conn(MySQL)
        self.conn.query("drop table if exists test_cli")
        self.conn.query("drop table if exists test_cli_2")
        table_src_name = "test_cli"
        table_dst_name = "test_cli_2"

        src_table = table(table_src_name, schema={"id": int, "datetime": datetime, "text_comment": str})
        self.conn.query(src_table.create())

        self.conn.query("SET @@session.time_zone='-01:00'")  # change time zone from +00:00
        db_time = self.conn.query("select now()", datetime)
        self.now = now = arrow.get(db_time)

        rows = [
            (now, "now"),
            (self.now.shift(seconds=-10), "a"),
            (self.now.shift(seconds=-7), "b"),
            (self.now.shift(seconds=-6), "c"),
        ]

        self.conn.query(src_table.insert_rows((i, ts.datetime, s) for i, (ts, s) in enumerate(rows)))
        _commit(self.conn)

        self.conn.query(f"CREATE TABLE {table_dst_name} AS SELECT * FROM {table_src_name}")
        _commit(self.conn)

        self.conn.query(src_table.insert_row(len(rows), self.now.shift(seconds=-3).datetime, "3 seconds ago"))
        _commit(self.conn)

    def tearDown(self) -> None:
        self.conn.query("drop table if exists test_cli")
        self.conn.query("drop table if exists test_cli_2")
        _commit(self.conn)

        return super().tearDown()

    def test_basic(self):
        diff = run_datadiff_cli(TEST_MYSQL_CONN_STRING, "test_cli", TEST_MYSQL_CONN_STRING, "test_cli_2")
        assert len(diff) == 1

    def test_options(self):
        diff = run_datadiff_cli(
            TEST_MYSQL_CONN_STRING,
            "test_cli",
            TEST_MYSQL_CONN_STRING,
            "test_cli_2",
            "--bisection-factor",
            "16",
            "--bisection-threshold",
            "10000",
            "--limit",
            "5",
            "-t",
            "datetime",
            "--max-age",
            "1h",  # can make test pass again by changing to 2h
        )
        assert len(diff) == 1

The tests should now fail, and can only be fixed by changing the --max-age value.

I've opened a draft PR (#283) with a sketch of a fix. If you think it has potential, please let me know what can be edited/improved, and then I will update the PR with test coverage (and possibly support for other databases if required).

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions