From dcc8aaacdd23fed707e3794deab926cf89985d94 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 10 May 2022 15:39:03 +0200 Subject: [PATCH 01/18] Switched interface to -t/--update-column, now -c is meant only for other extra columns --- data_diff/__main__.py | 10 ++++++---- data_diff/diff_tables.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 9f68bdda..b53351b7 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -17,8 +17,9 @@ @click.argument("table1_name") @click.argument("db2_uri") @click.argument("table2_name") -@click.option("-k", "--key_column", default="id", help="Name of primary key column") -@click.option("-c", "--columns", default=["updated_at"], multiple=True, help="Names of extra columns to compare") +@click.option("-k", "--key-column", default="id", help="Name of primary key column") +@click.option("-t", "--update-column", default="updated_at", help="Name of updated_at/last_updated column") +@click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare") @click.option("-l", "--limit", default=None, help="Maximum number of differences to find") @click.option("--bisection-factor", default=32, help="Segments per iteration") @click.option("--bisection-threshold", default=1024**2, help="Minimal bisection threshold") @@ -31,6 +32,7 @@ def main( db2_uri, table2_name, key_column, + update_column, columns, limit, bisection_factor, @@ -53,8 +55,8 @@ def main( start = time.time() - table1 = TableSegment(db1, (table1_name,), key_column, columns) - table2 = TableSegment(db2, (table2_name,), key_column, columns) + table1 = TableSegment(db1, (table1_name,), key_column, update_column, columns) + table2 = TableSegment(db2, (table2_name,), key_column, update_column, columns) differ = TableDiffer(bisection_factor=bisection_factor, bisection_threshold=bisection_threshold, debug=debug) diff_iter = differ.diff_tables(table1, table2) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 53053826..8f63006e 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -23,6 +23,7 @@ class TableSegment: database: Database table_path: DbPath key_column: str + update_column: str extra_columns: Tuple[str, ...] start: DbKey = None end: DbKey = None @@ -102,7 +103,7 @@ def count(self) -> int: @property def _relevant_columns(self) -> List[str]: - return [self.key_column] + list(self.extra_columns) + return [self.key_column, self.update_column] + list(self.extra_columns) @property def checksum(self) -> int: From 23f1cca1c3815d42531293f56bef0ed579349992 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 10 May 2022 15:46:02 +0200 Subject: [PATCH 02/18] Fix docs --- README.md | 3 ++- example.sh | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 24bd8d5b..b0e3f4d4 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,8 @@ Usage: `data_diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` Options: - `--help` - Show help message and exit. - - `-k` or `--key_column` - Name of the primary key column + - `-k` or `--key-column` - Name of the primary key column + - `-t` or `--update-column` - Name of updated_at/last_updated column - `-c` or `--columns` - List of names of extra columns to compare - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) - `-s` or `--stats` - Print stats instead of a detailed diff diff --git a/example.sh b/example.sh index ff392ae6..4f3844aa 100755 --- a/example.sh +++ b/example.sh @@ -26,12 +26,12 @@ prepare_db() { } data_diff() { - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1 -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1 -e OPTIONS='-c timestamp --bisection-factor 4 -v' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update001p -e OPTIONS='-c timestamp --bisection-factor 64 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update50p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1 -e OPTIONS='-t timestamp --bisection-factor 4 -v -s' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1 -e OPTIONS='-t timestamp --bisection-factor 4 -v' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update001p -e OPTIONS='-t timestamp --bisection-factor 64 -v -s' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1p -e OPTIONS='-t timestamp --bisection-factor 4 -v -s' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1p -e OPTIONS='-t timestamp --bisection-factor 4 -v -s' data-diff + docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update50p -e OPTIONS='-t timestamp --bisection-factor 4 -v -s' data-diff } shutdown() { From 6d33dbc9b0ede335649a57d6966977236b284d15 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Tue, 10 May 2022 10:37:41 -0400 Subject: [PATCH 03/18] dev: simplify --- .dockerignore | 1 + Dockerfile | 11 ++ README.md | 118 ++++++++++++++----- dev/Dockerfile | 31 ----- dev/prepare_db.pql | 6 +- dev/prepdb.sh | 23 ---- dev/docker-compose.yml => docker-compose.yml | 14 +-- example.sh | 41 ------- pyproject.toml | 7 +- setup_testenv.md | 90 -------------- tests/setup.pql | 2 +- tests/test_diff_tables.py | 54 ++++----- 12 files changed, 142 insertions(+), 256 deletions(-) create mode 100644 Dockerfile delete mode 100644 dev/Dockerfile delete mode 100644 dev/prepdb.sh rename dev/docker-compose.yml => docker-compose.yml (86%) delete mode 100755 example.sh delete mode 100644 setup_testenv.md diff --git a/.dockerignore b/.dockerignore index 5beb7e93..4e4fa7d6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,3 @@ .venv ml-25m* +dev/ml-25m* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..bd17028a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.10 +RUN apt-get update && apt-get install -y \ + python3-dev libpq-dev wget unzip \ + python3-setuptools gcc bc +RUN pip install --no-cache-dir poetry==1.1.13 +COPY . /app +WORKDIR /app +# For now while we are in heavy development we install the latest with Poetry +# and execute directly with Poetry. Later, we'll move to the released Pip package. +RUN poetry install -E preql -E mysql -E pgsql -E snowflake +ENTRYPOINT ["poetry", "run", "python3", "-m", "data_diff"] diff --git a/README.md b/README.md index 24bd8d5b..7b708de5 100644 --- a/README.md +++ b/README.md @@ -2,27 +2,46 @@ A cross-database, efficient diff between mostly-similar database tables. +* + +It uses a bisection algorithm to + +```python +$ data-diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 +[16:55:19] INFO - Diffing tables of size 25000095 and 25000095 | segments: 4, bisection threshold: 1048576. +[16:55:36] INFO - Diffing segment 0/4 of size 8333364 and 8333364 +[16:55:45] INFO - . Diffing segment 0/4 of size 2777787 and 2777787 +[16:55:52] INFO - . . Diffing segment 0/4 of size 925928 and 925928 +[16:55:54] INFO - . . . Diff found 2 different rows. ++ (20000, 942013020) +- (20000, 942013021) +[16:55:54] INFO - . . Diffing segment 1/4 of size 925929 and 925929 +[16:55:55] INFO - . . Diffing segment 2/4 of size 925929 and 925929 +[16:55:55] INFO - . . Diffing segment 3/4 of size 1 and 1 +[16:55:56] INFO - . Diffing segment 1/4 of size 2777788 and 2777788 +[16:55:58] INFO - . Diffing segment 2/4 of size 2777788 and 2777788 +[16:55:59] INFO - . Diffing segment 3/4 of size 1 and 1 +[16:56:00] INFO - Diffing segment 1/4 of size 8333365 and 8333365 +[16:56:06] INFO - Diffing segment 2/4 of size 8333365 and 8333365 +[16:56:11] INFO - Diffing segment 3/4 of size 1 and 1 +[16:56:11] INFO - Duration: 53.51 seconds. +``` + + Use cases: - Quickly validate that a table was copied correctly - - Find changes between two versions of the same table We currently support the following databases: - PostgreSQL - - MySQL - - Oracle - - Snowflake - - BigQuery - - Redshift - # How does it work? Data Diff finds the differences between two tables by utilizing checksum calculations and logarithmic search. @@ -63,28 +82,6 @@ We ran it with a very low bisection factor, and with the verbose flag, to demons Note: It's usually much faster to use high bisection factors, especially when there are very few changes, like in this example. -```python -$ data_diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 -[16:55:19] INFO - Diffing tables of size 25000095 and 25000095 | segments: 4, bisection threshold: 1048576. -[16:55:36] INFO - Diffing segment 0/4 of size 8333364 and 8333364 -[16:55:45] INFO - . Diffing segment 0/4 of size 2777787 and 2777787 -[16:55:52] INFO - . . Diffing segment 0/4 of size 925928 and 925928 -[16:55:54] INFO - . . . Diff found 2 different rows. -+ (20000, 942013020) -- (20000, 942013021) -[16:55:54] INFO - . . Diffing segment 1/4 of size 925929 and 925929 -[16:55:55] INFO - . . Diffing segment 2/4 of size 925929 and 925929 -[16:55:55] INFO - . . Diffing segment 3/4 of size 1 and 1 -[16:55:56] INFO - . Diffing segment 1/4 of size 2777788 and 2777788 -[16:55:58] INFO - . Diffing segment 2/4 of size 2777788 and 2777788 -[16:55:59] INFO - . Diffing segment 3/4 of size 1 and 1 -[16:56:00] INFO - Diffing segment 1/4 of size 8333365 and 8333365 -[16:56:06] INFO - Diffing segment 2/4 of size 8333365 and 8333365 -[16:56:11] INFO - Diffing segment 3/4 of size 1 and 1 -[16:56:11] INFO - Duration: 53.51 seconds. -``` - - # How to install Requires Python 3.7+ with pip. @@ -97,7 +94,7 @@ or when you need extras like mysql and postgres # How to use -Usage: `data_diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` +Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` Options: @@ -115,6 +112,67 @@ Options: It's highly recommended that all involved columns are indexed. +## Development Setup + +The development setup centers around using `docker-compose` to boot up various +databases, and then inserting data into them. + +For Mac for performance of Docker, we suggest enabling in the UI: + +* Use new Virtualization Framework +* Enable VirtioFS accelerated directory sharing + +**1. Install Data Diff** + +When developing/debugging, it's recommended to install dependencies and run it +directly with `poetry` rather than go through the package. + +``` +poetry install -E preql -E mysql -E pgsql -E snowflake +``` + +**2. Download CSV of Testing Data** + +```shell-session +wget https://files.grouplens.org/datasets/movielens/ml-25m.zip +unzip ml-25m.zip -d dev/ +``` + +**3. Start Databases** + +```shell-session +docker-compose up -d mysql postgres +``` + +**4. Run Unit Tests** + +```shell-session +poetry run python3 -m unittest +``` + +**5. Seed the Database(s)** + +If you're just testing, we recommend just setting up one database (e.g. +Postgres) to avoid incurring the long setup time repeatedly. + +```shell-session +preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres +preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql +preql -f dev/prepare_db snowflake:// +preql -f dev/prepare_db mssql:// +preql -f dev/prepare_db_bigquery bigquery:/// # Bigquery has its own +``` + +**6. Run data-diff against seeded database** + +```bash +poetry run python3 -m data_diff postgres://user:password@host:db Rating mysql://user:password@host:db Rating_del1 -c timestamp --stats + +Diff-Total: 250156 changed rows out of 25000095 +Diff-Percent: 1.0006% +Diff-Split: +250156 -0 +``` + # License [MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE) diff --git a/dev/Dockerfile b/dev/Dockerfile deleted file mode 100644 index 4e619ef5..00000000 --- a/dev/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM python:3.10 - -# install dependencies -RUN apt update && apt install -y \ - python3-dev libpq-dev wget unzip \ - python3-setuptools gcc bc -RUN pip install poetry - -ADD . /app - -WORKDIR /app -RUN chmod +x dev/prepdb.sh - -RUN wget https://files.grouplens.org/datasets/movielens/ml-25m.zip -RUN unzip ml-25m.zip -d /app/dev - -RUN pip install 'data-diff[preql,mysql,pgsql]' - -ARG DB1_URI -ARG TABLE1_NAME -ARG DB2_URI -ARG TABLE2_NAME -ARG OPTIONS - -ENV DB1_URI ${DB1_URI} -ENV TABLE1_NAME ${TABLE1_NAME} -ENV DB2_URI ${DB2_URI} -ENV TABLE2_NAME ${TABLE2_NAME} -ENV OPTIONS ${OPTIONS} - -CMD data_diff ${DB1_URI} ${TABLE1_NAME} ${DB2_URI} ${TABLE2_NAME} ${OPTIONS} \ No newline at end of file diff --git a/dev/prepare_db.pql b/dev/prepare_db.pql index 83aff9f7..f0e4a146 100644 --- a/dev/prepare_db.pql +++ b/dev/prepare_db.pql @@ -24,7 +24,7 @@ if (db_type == "snowflake") { print "Uploading ratings CSV" run_sql("RM @~/ratings.csv.gz") - run_sql("PUT file://ml-25m/ratings.csv @~") + run_sql("PUT file://dev/ml-25m/ratings.csv @~") print "Loading ratings CSV" @@ -86,7 +86,7 @@ if (db_type == "snowflake") { run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)") table tmp_rating {...} print "Loading ratings CSV" - run_sql("BULK INSERT tmp_rating from 'ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") + run_sql("BULK INSERT tmp_rating from 'dev/ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") print "Populating actual table" rating += tmp_rating commit() @@ -99,7 +99,7 @@ if (db_type == "snowflake") { rating: float timestamp: int } - import_csv(rating, 'ml-25m/ratings.csv', true) + import_csv(rating, 'dev/ml-25m/ratings.csv', true) rating.add_index("id", true) rating.add_index("timestamp") run_sql("CREATE INDEX index_rating_id_timestamp ON rating (id, timestamp)") diff --git a/dev/prepdb.sh b/dev/prepdb.sh deleted file mode 100644 index e9a8447b..00000000 --- a/dev/prepdb.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -set -ex - -main () { - cd dev - prepare_db - cd .. -} - -prepare_db() { - START=$(date +%s) - preql -m prepare_db mysql://mysql:Password1@mysql/mysql - END=$(date +%s) - DIFF=$(echo "$END - $START" | bc) - echo "Prepare_db for mysql took: $DIFF s" - START=$(date +%s) - preql -m prepare_db postgres://postgres:Password1@postgresql/postgres - END=$(date +%s) - DIFF=$(echo "$END - $START" | bc) - echo "Prepare_db for postgres took: $DIFF s" -} - -main \ No newline at end of file diff --git a/dev/docker-compose.yml b/docker-compose.yml similarity index 86% rename from dev/docker-compose.yml rename to docker-compose.yml index 8728fa6e..41cc666d 100644 --- a/dev/docker-compose.yml +++ b/docker-compose.yml @@ -39,6 +39,9 @@ services: postgres: container_name: postgresql image: postgres:14.1-alpine + # work_mem: less tmp files + # maintenance_work_mem: improve table-level op perf + # max_wal_size: allow more time before merging to heap command: > -c work_mem=1GB -c maintenance_work_mem=1GB @@ -51,7 +54,7 @@ services: expose: - '5432' env_file: - - dev.env + - dev/dev.env tty: true networks: - local @@ -59,17 +62,14 @@ services: mysql: container_name: mysql image: mysql:oracle + # fsync less aggressively for insertion perf for test setup command: > --default-authentication-plugin=mysql_native_password - --innodb-buffer-pool-size=8G - --innodb_io_capacity=2000 - --innodb_log_file_size=1G --binlog-cache-size=16M --key_buffer_size=0 --max_connections=10 --innodb_flush_log_at_trx_commit=2 --innodb_flush_log_at_timeout=10 - --innodb_flush_method=O_DSYNC --innodb_log_compressed_pages=OFF --sync_binlog=0 restart: always @@ -81,7 +81,7 @@ services: expose: - '3306' env_file: - - dev.env + - dev/dev.env tty: true networks: - local @@ -93,4 +93,4 @@ volumes: networks: local: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/example.sh b/example.sh deleted file mode 100755 index ff392ae6..00000000 --- a/example.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -set -ex - -main () { - cd dev/ - initialize - prepare_db - data_diff - shutdown - cd .. -} - -initialize() { - docker-compose up -d postgres mysql - - until nc -z -v -w30 localhost 3306 && nc -z -v -w30 localhost 5432; do - echo "Databases not yet ready.." - sleep 5 - done - - docker-compose up -d data-diff prepdb -} - -prepare_db() { - . ./prepdb.sh -} - -data_diff() { - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1 -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1 -e OPTIONS='-c timestamp --bisection-factor 4 -v' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update001p -e OPTIONS='-c timestamp --bisection-factor 64 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update50p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff -} - -shutdown() { - docker-compose down -} - -main diff --git a/pyproject.toml b/pyproject.toml index 43da7aa0..7d26d959 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ packages = [{ include = "data_diff" }] python = "^3.7" runtype = "^0.2.4" dsnparse = "*" -pyparsing = "ˆ3.0" click = "^8.1" preql = { version = "^0.2.11", optional = true } @@ -35,9 +34,11 @@ mysql-connector-python = { version = "*", optional = true} snowflake-connector-python = { version = "*", optional = true } [tool.poetry.dev-dependencies] -protobuf = "^3.20.1" +mysql-connector-python = "*" +preql = "^0.2.11" [tool.poetry.extras] +# When adding, update also: README + Dockerfile + dev deps preql = ["preql"] mysql = ["mysql-connector-python"] pgsql = ["psycopg2"] @@ -48,4 +49,4 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -data_diff = 'data_diff.__main__:main' \ No newline at end of file +data-diff = 'data_diff.__main__:main' diff --git a/setup_testenv.md b/setup_testenv.md deleted file mode 100644 index 72f6a235..00000000 --- a/setup_testenv.md +++ /dev/null @@ -1,90 +0,0 @@ -# Test Data Diff with Postgres and MySQL - - -``` -chmod +x ./dev/example.sh -./dev/example.sh -``` - -NB for Mac. If the process takes very long (e.g. importing CSV file takes >30m), make sure that you have the latest version of Docker installed and have enabled the experimental features `Use the new Virtualization framework` and `Enable VirtioFS accelerated directory sharing`. Because the interaction with Docker and the MacOS FS is a bottleneck. - -## Manual setup - -1. Install Data Diff - -``` -pip install "data-diff[preql,mysql,pgsql]" -``` - -2. Download CSV - -``` -wget https://files.grouplens.org/datasets/movielens/ml-25m.zip -unzip ml-25m.zip -d dev/ -``` - -4. Setup databases - -(note: bigquery has its own setup script) - -``` -preql -f dev/prepare_db postgres:// - -preql -f dev/prepare_db mysql:// - -preql -f dev/prepare_db snowflake:// - -preql -f dev/prepare_db mssql:// - -preql -f dev/prepare_db_bigquery bigquery:/// - - -etc. -``` - -And it's ready to use! - -Example: - -```bash -data_diff postgres://user:password@host:db Rating mysql://user:password@host:db Rating_del1 -c timestamp --stats - -Diff-Total: 250156 changed rows out of 25000095 -Diff-Percent: 1.0006% -Diff-Split: +250156 -0 - -``` - -## Database settings with explanation -*Inline comments in docker-compose.yml will break the databases.* - -**PostgreSQL:** - -``` --c work_mem=1GB # Reduce writing temporary disk files. --c maintenance_work_mem=1GB # Improve VACUUM, CREATE INDEX, ALTER TABLE ADD FOREIGN KEY operations. --c max_wal_size=8GB # Filling of the table with movie lens data creates an higher write - # load than the default assumption of 1GB/hour. -``` -**MySQL:** -``` ---default-authentication-plugin=mysql_native_password # Required for setting password via env vars. ---innodb-buffer-pool-size=8G # Recommendation is to set to 50-75% of available - # memmory. However, this is no dedicated instance. ---innodb_io_capacity=2000 # Default setting is for hard drives. SSD benefits - # from higher values. ---innodb_log_file_size=1G # Tuning recommendation based on the - # innodb-buffer-pool-size setting. ---binlog-cache-size=16M # Tuning recommendation ---key_buffer_size=0 # No MyISAM tables, InnoDB engine is used. ---max_connections=10 # Test setup, not a lot connection needed. ---innodb_flush_log_at_trx_commit=2 # Reduce creation of logs for performance. ---innodb_flush_log_at_timeout=10 # Idem ---innodb_flush_method=O_DSYNC # Suffers less from race conditions than fsync. ---innodb_log_compressed_pages=OFF # To write less data to the redo_log. ---sync_binlog=0 # Disables synchronization of the binary log to disk - # by the MySQL server. Instead, the MySQL server relies - # on the operating system to flush the binary log to - # disk from time to time as it does for any other file. - # This setting provides the best performance. -``` \ No newline at end of file diff --git a/tests/setup.pql b/tests/setup.pql index c40f7f66..08e9f5a6 100644 --- a/tests/setup.pql +++ b/tests/setup.pql @@ -9,7 +9,7 @@ table ratings_test { timestamp: timestamp } -table ratings_est2 { +table ratings_test2 { userid: int movieid: int rating: float diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index f4409d1a..eb0bb6fe 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -18,20 +18,20 @@ def setUpClass(cls): cls.connection = connect_to_uri(TEST_MYSQL_CONN_STRING) def setUp(self): - self.table_name = "RatingsTest" + self.connection.query("DROP TABLE IF EXISTS ratings_test", None) + self.connection.query("DROP TABLE IF EXISTS ratings_test2", None) + self.preql.load("./tests/setup.pql") + self.preql.commit() + self.table = TableSegment(TestDiffTables.connection, - (self.table_name, ), + ('ratings_test', ), 'id', ('timestamp', )) self.table2 = TableSegment(TestDiffTables.connection, - ("RatingsTest2", ), + ("ratings_test2", ), 'id', ('timestamp', )) - self.connection.query("DROP TABLE IF EXISTS RatingsTest", None) - self.connection.query("DROP TABLE IF EXISTS RatingsTest2", None) - self.preql.load("./tests/setup.pql") - self.preql.commit() self.differ = TableDiffer(3, 4) @@ -43,7 +43,7 @@ def test_properties_on_empty_table(self): def test_get_values(self): time = "2022-01-01 00:00:00" res = self.preql(f""" - new RatingsTest(1, 1, 9, '{time}') + new ratings_test(1, 1, 9, '{time}') """) self.preql.commit() @@ -54,10 +54,10 @@ def test_get_values(self): def test_checkpoints(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() self.assertEqual([2, 4], self.table.choose_checkpoints(2)) @@ -65,10 +65,10 @@ def test_checkpoints(self): def test_diff_small_tables(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2)) @@ -78,16 +78,16 @@ def test_diff_small_tables(self): def test_diff_table_above_bisection_threshold(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 3, movieId: 3, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 4, movieId: 4, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 5, movieId: 5, rating: 9, timestamp: '{time}') - - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 3, movieId: 3, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 4, movieId: 4, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') + new ratings_test(userid: 3, movieid: 3, rating: 9, timestamp: '{time}') + new ratings_test(userid: 4, movieid: 4, rating: 9, timestamp: '{time}') + new ratings_test(userid: 5, movieid: 5, rating: 9, timestamp: '{time}') + + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 3, movieid: 3, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 4, movieid: 4, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2)) @@ -97,8 +97,8 @@ def test_diff_table_above_bisection_threshold(self): def test_return_empty_array_when_same(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2)) From 179bccdf2e17188777c1855b223b12aeb064d396 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 15:55:00 +0200 Subject: [PATCH 04/18] Now filtering based on update_column --- data_diff/diff_tables.py | 34 +++++++++++++++++++++------------- data_diff/sql.py | 2 ++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 8f63006e..033c969b 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -6,7 +6,7 @@ from runtype import dataclass -from .sql import Select, Checksum, Compare, DbPath, DbKey, Count, Enum, TableName, In, Value +from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value from .database import Database logger = logging.getLogger("diff_tables") @@ -25,22 +25,30 @@ class TableSegment: key_column: str update_column: str extra_columns: Tuple[str, ...] - start: DbKey = None - end: DbKey = None + start_key: DbKey = None + end_key: DbKey = None + start_time: DbTime = None + end_time: DbTime = None _count: int = None _checksum: int = None - def _make_range_pred(self): - if self.start is not None: - yield Compare("<=", str(self.start), self.key_column) - if self.end is not None: - yield Compare("<", self.key_column, str(self.end)) + def _make_key_range(self): + if self.start_key is not None: + yield Compare("<=", str(self.start_key), self.key_column) + if self.end_key is not None: + yield Compare("<", self.key_column, str(self.end_key)) + + def _make_update_range(self): + if self.start_time is not None: + yield Compare("<=", str(self.start_time), self.update_column) + if self.end_time is not None: + yield Compare("<", self.update_column, str(self.end_time)) def _make_select(self, *, table=None, columns=None, where=None, group_by=None, order_by=None): if columns is None: columns = [self.key_column] - where = list(self._make_range_pred()) + ([] if where is None else [where]) + where = list(self._make_key_range()) + list(self._make_update_range()) + ([] if where is None else [where]) order_by = None if order_by is None else [order_by] return Select( table=table or TableName(self.table_path), @@ -71,16 +79,16 @@ def find_checkpoints(self, checkpoints: List[DbKey]) -> List[DbKey]: def segment_by_checkpoints(self, checkpoints: List[DbKey]) -> List["TableSegment"]: "Split the current TableSegment to a bunch of smaller ones, separate by the given checkpoints" - if self.start and self.end: - assert all(self.start <= c < self.end for c in checkpoints) + if self.start_key and self.end_key: + assert all(self.start_key <= c < self.end_key for c in checkpoints) checkpoints.sort() # Calculate sub-segments - positions = [self.start] + checkpoints + [self.end] + positions = [self.start_key] + checkpoints + [self.end_key] ranges = list(zip(positions[:-1], positions[1:])) # Create table segments - tables = [self.new(start=s, end=e) for s, e in ranges] + tables = [self.new(start_key=s, end_key=e) for s, e in ranges] return tables diff --git a/data_diff/sql.py b/data_diff/sql.py index 3699b27a..273205ad 100644 --- a/data_diff/sql.py +++ b/data_diff/sql.py @@ -2,11 +2,13 @@ """ from typing import List, Union, Tuple, Optional +from datetime import datetime from runtype import dataclass DbPath = Tuple[str, ...] DbKey = Union[int, str, bytes] +DbTime = datetime class Sql: From f4d75bc5656fe580029ed19a31884487962bf990 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 16:57:41 +0200 Subject: [PATCH 05/18] Fix README after merge error --- README.md | 61 ------------------------------------------------------- 1 file changed, 61 deletions(-) diff --git a/README.md b/README.md index 6986d61d..7896d1e4 100644 --- a/README.md +++ b/README.md @@ -176,67 +176,6 @@ Diff-Percent: 1.0006% Diff-Split: +250156 -0 ``` -## Development Setup - -The development setup centers around using `docker-compose` to boot up various -databases, and then inserting data into them. - -For Mac for performance of Docker, we suggest enabling in the UI: - -* Use new Virtualization Framework -* Enable VirtioFS accelerated directory sharing - -**1. Install Data Diff** - -When developing/debugging, it's recommended to install dependencies and run it -directly with `poetry` rather than go through the package. - -``` -poetry install -E preql -E mysql -E pgsql -E snowflake -``` - -**2. Download CSV of Testing Data** - -```shell-session -wget https://files.grouplens.org/datasets/movielens/ml-25m.zip -unzip ml-25m.zip -d dev/ -``` - -**3. Start Databases** - -```shell-session -docker-compose up -d mysql postgres -``` - -**4. Run Unit Tests** - -```shell-session -poetry run python3 -m unittest -``` - -**5. Seed the Database(s)** - -If you're just testing, we recommend just setting up one database (e.g. -Postgres) to avoid incurring the long setup time repeatedly. - -```shell-session -preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres -preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql -preql -f dev/prepare_db snowflake:// -preql -f dev/prepare_db mssql:// -preql -f dev/prepare_db_bigquery bigquery:/// # Bigquery has its own -``` - -**6. Run data-diff against seeded database** - -```bash -poetry run python3 -m data_diff postgres://user:password@host:db Rating mysql://user:password@host:db Rating_del1 -c timestamp --stats - -Diff-Total: 250156 changed rows out of 25000095 -Diff-Percent: 1.0006% -Diff-Split: +250156 -0 -``` - # License [MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE) From 439635bdd19668366be8cad758583264beb519f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 17:42:46 +0200 Subject: [PATCH 06/18] Fix time formatting for SQL; Added tests (mysql only for now) --- data_diff/diff_tables.py | 16 ++++---- data_diff/sql.py | 7 ++++ tests/common.py | 3 +- tests/test_diff_tables.py | 82 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 93 insertions(+), 15 deletions(-) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 033c969b..67a675f5 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -6,7 +6,7 @@ from runtype import dataclass -from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value +from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value, Time from .database import Database logger = logging.getLogger("diff_tables") @@ -24,11 +24,11 @@ class TableSegment: table_path: DbPath key_column: str update_column: str - extra_columns: Tuple[str, ...] + extra_columns: Tuple[str, ...] = () start_key: DbKey = None end_key: DbKey = None - start_time: DbTime = None - end_time: DbTime = None + min_time: DbTime = None + max_time: DbTime = None _count: int = None _checksum: int = None @@ -40,10 +40,10 @@ def _make_key_range(self): yield Compare("<", self.key_column, str(self.end_key)) def _make_update_range(self): - if self.start_time is not None: - yield Compare("<=", str(self.start_time), self.update_column) - if self.end_time is not None: - yield Compare("<", self.update_column, str(self.end_time)) + if self.min_time is not None: + yield Compare("<=", Time(self.min_time), self.update_column) + if self.max_time is not None: + yield Compare("<", self.update_column, Time(self.max_time)) def _make_select(self, *, table=None, columns=None, where=None, group_by=None, order_by=None): if columns is None: diff --git a/data_diff/sql.py b/data_diff/sql.py index 273205ad..6bb98af9 100644 --- a/data_diff/sql.py +++ b/data_diff/sql.py @@ -141,3 +141,10 @@ def compile(self, c: Compiler): if self.column: return f"count({c.compile(self.column)})" return "count(*)" + +@dataclass +class Time(Sql): + time: datetime + + def compile(self, c: Compiler): + return "'%s'" % self.time.isoformat() \ No newline at end of file diff --git a/tests/common.py b/tests/common.py index db9feeb9..5c6b3e56 100644 --- a/tests/common.py +++ b/tests/common.py @@ -5,7 +5,8 @@ logging.basicConfig(level=logging.WARN) -TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" +# TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" +TEST_MYSQL_CONN_STRING = "mysql://erez:qweqwe123@localhost/erez" def str_to_checksum(str: str): # hello world diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index eb0bb6fe..f1304eb4 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -2,14 +2,14 @@ import unittest import preql +import arrow # comes with preql from data_diff.database import connect_to_uri from data_diff.diff_tables import TableDiffer, TableSegment from .common import TEST_MYSQL_CONN_STRING, str_to_checksum - -class TestDiffTables(unittest.TestCase): +class TestWithConnection(unittest.TestCase): @classmethod def setUpClass(cls): # Avoid leaking connections that require waiting for the GC, which can @@ -17,21 +17,91 @@ def setUpClass(cls): cls.preql = preql.Preql(TEST_MYSQL_CONN_STRING) cls.connection = connect_to_uri(TEST_MYSQL_CONN_STRING) +class TestDates(TestWithConnection): + def setUp(self): + self.connection.query("DROP TABLE IF EXISTS a", None) + self.connection.query("DROP TABLE IF EXISTS b", None) + self.preql(r""" + table a { + datetime: datetime + comment: string + } + commit() + + func add(date, comment) { + new a(date, comment) + } + """) + self.now = now = arrow.get(self.preql.now()) + self.preql.add(now.shift(days=-50), "50 days ago") + self.preql.add(now.shift(hours=-3), "3 hours ago") + self.preql.add(now.shift(minutes=-10), "10 mins ago") + self.preql.add(now.shift(seconds=-1), "1 second ago") + self.preql.add(now, "now") + + self.preql(r""" + const table b = a + commit() + """) + + self.preql.add(self.now.shift(seconds=-3), "2 seconds ago") + self.preql.commit() + + + def test_basic(self): + differ = TableDiffer(10, 100) + a = TableSegment(self.connection, ('a', ), 'id', 'datetime') + b = TableSegment(self.connection, ('b', ), 'id', 'datetime') + assert a.count == 6 + assert b.count == 5 + + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1 ) + + def test_offset(self): + differ = TableDiffer(2, 10) + sec1 = self.now.shift(seconds=-1).datetime + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', max_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', max_time=sec1) + assert a.count == 4 + assert b.count == 3 + + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1 ) + + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', min_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', min_time=sec1) + assert a.count == 2 + assert b.count == 2 + assert not list(differ.diff_tables(a, b)) + + day1 = self.now.shift(days=-1).datetime + + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', min_time=day1, max_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', min_time=day1, max_time=sec1) + assert a.count == 3 + assert b.count == 2 + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1) + + +class TestDiffTables(TestWithConnection): + def setUp(self): self.connection.query("DROP TABLE IF EXISTS ratings_test", None) self.connection.query("DROP TABLE IF EXISTS ratings_test2", None) self.preql.load("./tests/setup.pql") self.preql.commit() - self.table = TableSegment(TestDiffTables.connection, + self.table = TableSegment(self.connection, ('ratings_test', ), 'id', - ('timestamp', )) + 'timestamp') - self.table2 = TableSegment(TestDiffTables.connection, + self.table2 = TableSegment(self.connection, ("ratings_test2", ), 'id', - ('timestamp', )) + 'timestamp') self.differ = TableDiffer(3, 4) From a766ab71df0e89ef1e898a4f751a00ea70bfe2aa Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 18:05:03 +0200 Subject: [PATCH 07/18] -t/--update-column and TableSegment.update_column are now optional --- data_diff/__main__.py | 2 +- data_diff/diff_tables.py | 8 ++++++-- tests/test_diff_tables.py | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index b53351b7..57a76752 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -18,7 +18,7 @@ @click.argument("db2_uri") @click.argument("table2_name") @click.option("-k", "--key-column", default="id", help="Name of primary key column") -@click.option("-t", "--update-column", default="updated_at", help="Name of updated_at/last_updated column") +@click.option("-t", "--update-column", default=None, help="Name of updated_at/last_updated column") @click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare") @click.option("-l", "--limit", default=None, help="Maximum number of differences to find") @click.option("--bisection-factor", default=32, help="Segments per iteration") diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 67a675f5..eb08334f 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -23,7 +23,7 @@ class TableSegment: database: Database table_path: DbPath key_column: str - update_column: str + update_column: str = None extra_columns: Tuple[str, ...] = () start_key: DbKey = None end_key: DbKey = None @@ -33,6 +33,10 @@ class TableSegment: _count: int = None _checksum: int = None + def __post_init__(self): + if not self.update_column and (self.min_time or self.max_time): + raise ValueError("Error: min_time/max_time feature requires to specify 'update_column'") + def _make_key_range(self): if self.start_key is not None: yield Compare("<=", str(self.start_key), self.key_column) @@ -111,7 +115,7 @@ def count(self) -> int: @property def _relevant_columns(self) -> List[str]: - return [self.key_column, self.update_column] + list(self.extra_columns) + return [self.key_column] + ([self.update_column] if self.update_column is not None else []) + list(self.extra_columns) @property def checksum(self) -> int: diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index f1304eb4..9b340890 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -48,6 +48,10 @@ def setUp(self): self.preql.commit() + def test_init(self): + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', max_time=self.now.datetime) + self.assertRaises(ValueError, TableSegment, self.connection, ('a', ), 'id', max_time=self.now.datetime) + def test_basic(self): differ = TableDiffer(10, 100) a = TableSegment(self.connection, ('a', ), 'id', 'datetime') From 3be1a08d6bcf0dff9751a2f06e80b056c7081808 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 18:13:45 +0200 Subject: [PATCH 08/18] tests: Restore 'default' conn string --- tests/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/common.py b/tests/common.py index 5c6b3e56..db9feeb9 100644 --- a/tests/common.py +++ b/tests/common.py @@ -5,8 +5,7 @@ logging.basicConfig(level=logging.WARN) -# TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" -TEST_MYSQL_CONN_STRING = "mysql://erez:qweqwe123@localhost/erez" +TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" def str_to_checksum(str: str): # hello world From e0f3d995ed7f24a41af82bd5336bce6aac503c79 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 10 May 2022 15:39:03 +0200 Subject: [PATCH 09/18] Switched interface to -t/--update-column, now -c is meant only for other extra columns --- data_diff/__main__.py | 10 ++++++---- data_diff/diff_tables.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 9f68bdda..b53351b7 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -17,8 +17,9 @@ @click.argument("table1_name") @click.argument("db2_uri") @click.argument("table2_name") -@click.option("-k", "--key_column", default="id", help="Name of primary key column") -@click.option("-c", "--columns", default=["updated_at"], multiple=True, help="Names of extra columns to compare") +@click.option("-k", "--key-column", default="id", help="Name of primary key column") +@click.option("-t", "--update-column", default="updated_at", help="Name of updated_at/last_updated column") +@click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare") @click.option("-l", "--limit", default=None, help="Maximum number of differences to find") @click.option("--bisection-factor", default=32, help="Segments per iteration") @click.option("--bisection-threshold", default=1024**2, help="Minimal bisection threshold") @@ -31,6 +32,7 @@ def main( db2_uri, table2_name, key_column, + update_column, columns, limit, bisection_factor, @@ -53,8 +55,8 @@ def main( start = time.time() - table1 = TableSegment(db1, (table1_name,), key_column, columns) - table2 = TableSegment(db2, (table2_name,), key_column, columns) + table1 = TableSegment(db1, (table1_name,), key_column, update_column, columns) + table2 = TableSegment(db2, (table2_name,), key_column, update_column, columns) differ = TableDiffer(bisection_factor=bisection_factor, bisection_threshold=bisection_threshold, debug=debug) diff_iter = differ.diff_tables(table1, table2) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 53053826..8f63006e 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -23,6 +23,7 @@ class TableSegment: database: Database table_path: DbPath key_column: str + update_column: str extra_columns: Tuple[str, ...] start: DbKey = None end: DbKey = None @@ -102,7 +103,7 @@ def count(self) -> int: @property def _relevant_columns(self) -> List[str]: - return [self.key_column] + list(self.extra_columns) + return [self.key_column, self.update_column] + list(self.extra_columns) @property def checksum(self) -> int: From a89349360cce99645514243dc5febf1219cae09e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 10 May 2022 15:46:02 +0200 Subject: [PATCH 10/18] Fix docs --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index 8e061a10..985546e6 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,33 @@ We ran it with a very low bisection factor, and with the verbose flag, to demons Note: It's usually much faster to use high bisection factors, especially when there are very few changes, like in this example. +# How to install + +Requires Python 3.7+ with pip. + +```pip install data-diff``` + +or when you need extras like mysql and postgres + +```pip install "data-diff[mysql,pgsql]"``` + +# How to use + +Usage: `data_diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` + +Options: + + - `--help` - Show help message and exit. + - `-k` or `--key-column` - Name of the primary key column + - `-t` or `--update-column` - Name of updated_at/last_updated column + - `-c` or `--columns` - List of names of extra columns to compare + - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) + - `-s` or `--stats` - Print stats instead of a detailed diff + - `-d` or `--debug` - Print debug info + - `-v` or `--verbose` - Print extra info + - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. + - `--bisection-threshold` - Minimal bisection threshold. i.e. maximum size of pages to diff locally. + ## Tips for performance It's highly recommended that all involved columns are indexed. From b7aa5e02530670427fcaabf0eb6c555d099f38b3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 15:55:00 +0200 Subject: [PATCH 11/18] Now filtering based on update_column --- data_diff/diff_tables.py | 34 +++++++++++++++++++++------------- data_diff/sql.py | 2 ++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 8f63006e..033c969b 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -6,7 +6,7 @@ from runtype import dataclass -from .sql import Select, Checksum, Compare, DbPath, DbKey, Count, Enum, TableName, In, Value +from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value from .database import Database logger = logging.getLogger("diff_tables") @@ -25,22 +25,30 @@ class TableSegment: key_column: str update_column: str extra_columns: Tuple[str, ...] - start: DbKey = None - end: DbKey = None + start_key: DbKey = None + end_key: DbKey = None + start_time: DbTime = None + end_time: DbTime = None _count: int = None _checksum: int = None - def _make_range_pred(self): - if self.start is not None: - yield Compare("<=", str(self.start), self.key_column) - if self.end is not None: - yield Compare("<", self.key_column, str(self.end)) + def _make_key_range(self): + if self.start_key is not None: + yield Compare("<=", str(self.start_key), self.key_column) + if self.end_key is not None: + yield Compare("<", self.key_column, str(self.end_key)) + + def _make_update_range(self): + if self.start_time is not None: + yield Compare("<=", str(self.start_time), self.update_column) + if self.end_time is not None: + yield Compare("<", self.update_column, str(self.end_time)) def _make_select(self, *, table=None, columns=None, where=None, group_by=None, order_by=None): if columns is None: columns = [self.key_column] - where = list(self._make_range_pred()) + ([] if where is None else [where]) + where = list(self._make_key_range()) + list(self._make_update_range()) + ([] if where is None else [where]) order_by = None if order_by is None else [order_by] return Select( table=table or TableName(self.table_path), @@ -71,16 +79,16 @@ def find_checkpoints(self, checkpoints: List[DbKey]) -> List[DbKey]: def segment_by_checkpoints(self, checkpoints: List[DbKey]) -> List["TableSegment"]: "Split the current TableSegment to a bunch of smaller ones, separate by the given checkpoints" - if self.start and self.end: - assert all(self.start <= c < self.end for c in checkpoints) + if self.start_key and self.end_key: + assert all(self.start_key <= c < self.end_key for c in checkpoints) checkpoints.sort() # Calculate sub-segments - positions = [self.start] + checkpoints + [self.end] + positions = [self.start_key] + checkpoints + [self.end_key] ranges = list(zip(positions[:-1], positions[1:])) # Create table segments - tables = [self.new(start=s, end=e) for s, e in ranges] + tables = [self.new(start_key=s, end_key=e) for s, e in ranges] return tables diff --git a/data_diff/sql.py b/data_diff/sql.py index 3699b27a..273205ad 100644 --- a/data_diff/sql.py +++ b/data_diff/sql.py @@ -2,11 +2,13 @@ """ from typing import List, Union, Tuple, Optional +from datetime import datetime from runtype import dataclass DbPath = Tuple[str, ...] DbKey = Union[int, str, bytes] +DbTime = datetime class Sql: From 507e78895fc97688b8213852ff375e22a31e4842 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Tue, 10 May 2022 10:37:41 -0400 Subject: [PATCH 12/18] dev: simplify --- README.md | 12 ++++-------- pyproject.toml | 3 +++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 985546e6..433ac484 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,6 @@ tables. - Validate that a table was copied properly - Be alerted before your customer finds out, or your report is wrong - Validate that your replication mechnism is working correctly -- Find changes between two versions of the same table - -It uses a bisection algorithm to efficiently check if e.g. a table is the same -between MySQL and Postgres, or Postgres and Snowflake, or MySQL and RDS! ```python $ data-diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 @@ -125,7 +121,7 @@ or when you need extras like mysql and postgres # How to use -Usage: `data_diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` +Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` Options: @@ -191,9 +187,9 @@ Postgres) to avoid incurring the long setup time repeatedly. ```shell-session preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql -preql -f dev/prepare_db.psq snowflake:// -preql -f dev/prepare_db.psq mssql:// -preql -f dev/prepare_db_bigquery.pql bigquery:/// # Bigquery has its own +preql -f dev/prepare_db snowflake:// +preql -f dev/prepare_db mssql:// +preql -f dev/prepare_db_bigquery bigquery:/// # Bigquery has its own scripts ``` **6. Run data-diff against seeded database** diff --git a/pyproject.toml b/pyproject.toml index 81f868d5..4b4db2a7 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,8 +36,11 @@ snowflake-connector-python = { version = "*", optional = true } [tool.poetry.dev-dependencies] mysql-connector-python = "*" preql = "^0.2.11" +<<<<<<< HEAD snowflake-connector-python = "*" psycopg2 = "*" +======= +>>>>>>> 6d33dbc (dev: simplify) [tool.poetry.extras] # When adding, update also: README + Dockerfile + dev deps From 82390d4d8a18ae8649f6ce6ec8e870c4a52f68d5 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 16:57:41 +0200 Subject: [PATCH 13/18] Fix README after merge error --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 433ac484..3f6a6e4c 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,7 @@ Diff-Split: +250156 -0 # How to publish to PyPI Before you can publish, you need to increment the version number in the [pyproject.toml](pyproject.toml) and then run: + ```shell-session poetry build poetry publish From 2055d7e664bc8485983fc3587d2453c1a056083f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 17:42:46 +0200 Subject: [PATCH 14/18] Fix time formatting for SQL; Added tests (mysql only for now) --- data_diff/diff_tables.py | 16 ++++---- data_diff/sql.py | 7 ++++ tests/common.py | 3 +- tests/test_diff_tables.py | 82 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 93 insertions(+), 15 deletions(-) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 033c969b..67a675f5 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -6,7 +6,7 @@ from runtype import dataclass -from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value +from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, Enum, TableName, In, Value, Time from .database import Database logger = logging.getLogger("diff_tables") @@ -24,11 +24,11 @@ class TableSegment: table_path: DbPath key_column: str update_column: str - extra_columns: Tuple[str, ...] + extra_columns: Tuple[str, ...] = () start_key: DbKey = None end_key: DbKey = None - start_time: DbTime = None - end_time: DbTime = None + min_time: DbTime = None + max_time: DbTime = None _count: int = None _checksum: int = None @@ -40,10 +40,10 @@ def _make_key_range(self): yield Compare("<", self.key_column, str(self.end_key)) def _make_update_range(self): - if self.start_time is not None: - yield Compare("<=", str(self.start_time), self.update_column) - if self.end_time is not None: - yield Compare("<", self.update_column, str(self.end_time)) + if self.min_time is not None: + yield Compare("<=", Time(self.min_time), self.update_column) + if self.max_time is not None: + yield Compare("<", self.update_column, Time(self.max_time)) def _make_select(self, *, table=None, columns=None, where=None, group_by=None, order_by=None): if columns is None: diff --git a/data_diff/sql.py b/data_diff/sql.py index 273205ad..6bb98af9 100644 --- a/data_diff/sql.py +++ b/data_diff/sql.py @@ -141,3 +141,10 @@ def compile(self, c: Compiler): if self.column: return f"count({c.compile(self.column)})" return "count(*)" + +@dataclass +class Time(Sql): + time: datetime + + def compile(self, c: Compiler): + return "'%s'" % self.time.isoformat() \ No newline at end of file diff --git a/tests/common.py b/tests/common.py index db9feeb9..5c6b3e56 100644 --- a/tests/common.py +++ b/tests/common.py @@ -5,7 +5,8 @@ logging.basicConfig(level=logging.WARN) -TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" +# TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" +TEST_MYSQL_CONN_STRING = "mysql://erez:qweqwe123@localhost/erez" def str_to_checksum(str: str): # hello world diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index eb0bb6fe..f1304eb4 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -2,14 +2,14 @@ import unittest import preql +import arrow # comes with preql from data_diff.database import connect_to_uri from data_diff.diff_tables import TableDiffer, TableSegment from .common import TEST_MYSQL_CONN_STRING, str_to_checksum - -class TestDiffTables(unittest.TestCase): +class TestWithConnection(unittest.TestCase): @classmethod def setUpClass(cls): # Avoid leaking connections that require waiting for the GC, which can @@ -17,21 +17,91 @@ def setUpClass(cls): cls.preql = preql.Preql(TEST_MYSQL_CONN_STRING) cls.connection = connect_to_uri(TEST_MYSQL_CONN_STRING) +class TestDates(TestWithConnection): + def setUp(self): + self.connection.query("DROP TABLE IF EXISTS a", None) + self.connection.query("DROP TABLE IF EXISTS b", None) + self.preql(r""" + table a { + datetime: datetime + comment: string + } + commit() + + func add(date, comment) { + new a(date, comment) + } + """) + self.now = now = arrow.get(self.preql.now()) + self.preql.add(now.shift(days=-50), "50 days ago") + self.preql.add(now.shift(hours=-3), "3 hours ago") + self.preql.add(now.shift(minutes=-10), "10 mins ago") + self.preql.add(now.shift(seconds=-1), "1 second ago") + self.preql.add(now, "now") + + self.preql(r""" + const table b = a + commit() + """) + + self.preql.add(self.now.shift(seconds=-3), "2 seconds ago") + self.preql.commit() + + + def test_basic(self): + differ = TableDiffer(10, 100) + a = TableSegment(self.connection, ('a', ), 'id', 'datetime') + b = TableSegment(self.connection, ('b', ), 'id', 'datetime') + assert a.count == 6 + assert b.count == 5 + + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1 ) + + def test_offset(self): + differ = TableDiffer(2, 10) + sec1 = self.now.shift(seconds=-1).datetime + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', max_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', max_time=sec1) + assert a.count == 4 + assert b.count == 3 + + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1 ) + + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', min_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', min_time=sec1) + assert a.count == 2 + assert b.count == 2 + assert not list(differ.diff_tables(a, b)) + + day1 = self.now.shift(days=-1).datetime + + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', min_time=day1, max_time=sec1) + b = TableSegment(self.connection, ('b', ), 'id', 'datetime', min_time=day1, max_time=sec1) + assert a.count == 3 + assert b.count == 2 + assert not list(differ.diff_tables(a, a)) + self.assertEqual( len( list(differ.diff_tables(a, b)) ), 1) + + +class TestDiffTables(TestWithConnection): + def setUp(self): self.connection.query("DROP TABLE IF EXISTS ratings_test", None) self.connection.query("DROP TABLE IF EXISTS ratings_test2", None) self.preql.load("./tests/setup.pql") self.preql.commit() - self.table = TableSegment(TestDiffTables.connection, + self.table = TableSegment(self.connection, ('ratings_test', ), 'id', - ('timestamp', )) + 'timestamp') - self.table2 = TableSegment(TestDiffTables.connection, + self.table2 = TableSegment(self.connection, ("ratings_test2", ), 'id', - ('timestamp', )) + 'timestamp') self.differ = TableDiffer(3, 4) From 42a48c8de3e80166a448335fbddfcf1f33ea5af4 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 18:05:03 +0200 Subject: [PATCH 15/18] -t/--update-column and TableSegment.update_column are now optional --- data_diff/__main__.py | 2 +- data_diff/diff_tables.py | 8 ++++++-- tests/test_diff_tables.py | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index b53351b7..57a76752 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -18,7 +18,7 @@ @click.argument("db2_uri") @click.argument("table2_name") @click.option("-k", "--key-column", default="id", help="Name of primary key column") -@click.option("-t", "--update-column", default="updated_at", help="Name of updated_at/last_updated column") +@click.option("-t", "--update-column", default=None, help="Name of updated_at/last_updated column") @click.option("-c", "--columns", default=[], multiple=True, help="Names of extra columns to compare") @click.option("-l", "--limit", default=None, help="Maximum number of differences to find") @click.option("--bisection-factor", default=32, help="Segments per iteration") diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 67a675f5..eb08334f 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -23,7 +23,7 @@ class TableSegment: database: Database table_path: DbPath key_column: str - update_column: str + update_column: str = None extra_columns: Tuple[str, ...] = () start_key: DbKey = None end_key: DbKey = None @@ -33,6 +33,10 @@ class TableSegment: _count: int = None _checksum: int = None + def __post_init__(self): + if not self.update_column and (self.min_time or self.max_time): + raise ValueError("Error: min_time/max_time feature requires to specify 'update_column'") + def _make_key_range(self): if self.start_key is not None: yield Compare("<=", str(self.start_key), self.key_column) @@ -111,7 +115,7 @@ def count(self) -> int: @property def _relevant_columns(self) -> List[str]: - return [self.key_column, self.update_column] + list(self.extra_columns) + return [self.key_column] + ([self.update_column] if self.update_column is not None else []) + list(self.extra_columns) @property def checksum(self) -> int: diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index f1304eb4..9b340890 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -48,6 +48,10 @@ def setUp(self): self.preql.commit() + def test_init(self): + a = TableSegment(self.connection, ('a', ), 'id', 'datetime', max_time=self.now.datetime) + self.assertRaises(ValueError, TableSegment, self.connection, ('a', ), 'id', max_time=self.now.datetime) + def test_basic(self): differ = TableDiffer(10, 100) a = TableSegment(self.connection, ('a', ), 'id', 'datetime') From 5ec06ab5a0edc6bdd1d2cb7cdac6f54145ed3c1e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 May 2022 18:13:45 +0200 Subject: [PATCH 16/18] tests: Restore 'default' conn string --- tests/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/common.py b/tests/common.py index 5c6b3e56..db9feeb9 100644 --- a/tests/common.py +++ b/tests/common.py @@ -5,8 +5,7 @@ logging.basicConfig(level=logging.WARN) -# TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" -TEST_MYSQL_CONN_STRING = "mysql://erez:qweqwe123@localhost/erez" +TEST_MYSQL_CONN_STRING = "mysql://mysql:Password1@localhost/mysql" def str_to_checksum(str: str): # hello world From b9688f0083779fbb3d65f163630beb4e1f873bac Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Wed, 11 May 2022 15:41:24 -0400 Subject: [PATCH 17/18] readme: fix some conflicts after merging --- README.md | 39 ++++++++------------------------------- pyproject.toml | 3 --- 2 files changed, 8 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 3f6a6e4c..ec5fd77e 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ tables. - Validate that a table was copied properly - Be alerted before your customer finds out, or your report is wrong - Validate that your replication mechnism is working correctly +- Find changes between two versions of the same table + +It uses a bisection algorithm and checksums to efficiently check if e.g. a table +is the same between MySQL and Postgres, or Postgres and Snowflake, or MySQL and +RDS! ```python $ data-diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 @@ -109,33 +114,6 @@ We ran it with a very low bisection factor, and with the verbose flag, to demons Note: It's usually much faster to use high bisection factors, especially when there are very few changes, like in this example. -# How to install - -Requires Python 3.7+ with pip. - -```pip install data-diff``` - -or when you need extras like mysql and postgres - -```pip install "data-diff[mysql,pgsql]"``` - -# How to use - -Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` - -Options: - - - `--help` - Show help message and exit. - - `-k` or `--key-column` - Name of the primary key column - - `-t` or `--update-column` - Name of updated_at/last_updated column - - `-c` or `--columns` - List of names of extra columns to compare - - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) - - `-s` or `--stats` - Print stats instead of a detailed diff - - `-d` or `--debug` - Print debug info - - `-v` or `--verbose` - Print extra info - - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. - - `--bisection-threshold` - Minimal bisection threshold. i.e. maximum size of pages to diff locally. - ## Tips for performance It's highly recommended that all involved columns are indexed. @@ -187,9 +165,9 @@ Postgres) to avoid incurring the long setup time repeatedly. ```shell-session preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql -preql -f dev/prepare_db snowflake:// -preql -f dev/prepare_db mssql:// -preql -f dev/prepare_db_bigquery bigquery:/// # Bigquery has its own scripts +preql -f dev/prepare_db.pql snowflake:// +preql -f dev/prepare_db.pql mssql:// +preql -f dev/prepare_db_bigquery.pql bigquery:/// # Bigquery has its own scripts ``` **6. Run data-diff against seeded database** @@ -204,7 +182,6 @@ Diff-Split: +250156 -0 # How to publish to PyPI Before you can publish, you need to increment the version number in the [pyproject.toml](pyproject.toml) and then run: - ```shell-session poetry build poetry publish diff --git a/pyproject.toml b/pyproject.toml index 4b4db2a7..81f868d5 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,8 @@ snowflake-connector-python = { version = "*", optional = true } [tool.poetry.dev-dependencies] mysql-connector-python = "*" preql = "^0.2.11" -<<<<<<< HEAD snowflake-connector-python = "*" psycopg2 = "*" -======= ->>>>>>> 6d33dbc (dev: simplify) [tool.poetry.extras] # When adding, update also: README + Dockerfile + dev deps From 79a440ee5d208004135deb22362802d4de42359e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 12 May 2022 10:57:01 +0200 Subject: [PATCH 18/18] Reformat with black --- data_diff/diff_tables.py | 6 +++++- data_diff/sql.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index eb08334f..b372b661 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -115,7 +115,11 @@ def count(self) -> int: @property def _relevant_columns(self) -> List[str]: - return [self.key_column] + ([self.update_column] if self.update_column is not None else []) + list(self.extra_columns) + return ( + [self.key_column] + + ([self.update_column] if self.update_column is not None else []) + + list(self.extra_columns) + ) @property def checksum(self) -> int: diff --git a/data_diff/sql.py b/data_diff/sql.py index 6bb98af9..94978584 100644 --- a/data_diff/sql.py +++ b/data_diff/sql.py @@ -142,9 +142,10 @@ def compile(self, c: Compiler): return f"count({c.compile(self.column)})" return "count(*)" + @dataclass class Time(Sql): time: datetime def compile(self, c: Compiler): - return "'%s'" % self.time.isoformat() \ No newline at end of file + return "'%s'" % self.time.isoformat()