diff --git a/.dockerignore b/.dockerignore index 5beb7e93..4e4fa7d6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,3 @@ .venv ml-25m* +dev/ml-25m* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..bd17028a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.10 +RUN apt-get update && apt-get install -y \ + python3-dev libpq-dev wget unzip \ + python3-setuptools gcc bc +RUN pip install --no-cache-dir poetry==1.1.13 +COPY . /app +WORKDIR /app +# For now while we are in heavy development we install the latest with Poetry +# and execute directly with Poetry. Later, we'll move to the released Pip package. +RUN poetry install -E preql -E mysql -E pgsql -E snowflake +ENTRYPOINT ["poetry", "run", "python3", "-m", "data_diff"] diff --git a/README.md b/README.md index 24bd8d5b..7896d1e4 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,74 @@ # Data Diff -A cross-database, efficient diff between mostly-similar database tables. +A cross-database, efficient diff using checksums between mostly-similar database +tables. -Use cases: +- Validate that a table was copied properly +- Be alerted before your customer finds out, or your report is wrong +- Validate that your replication mechanism is working correctly +- Find changes between two versions of the same table -- Quickly validate that a table was copied correctly +It uses a bisection algorithm to efficiently check if e.g. a table is the same +between MySQL and Postgres, or Postgres and Snowflake, or MySQL and RDS! -- Find changes between two versions of the same table +```shell-session +$ data-diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 +[16:55:19] INFO - Diffing tables of size 25000095 and 25000095 | segments: 4, bisection threshold: 1048576. +[16:55:36] INFO - Diffing segment 0/4 of size 8333364 and 8333364 +[16:55:45] INFO - . Diffing segment 0/4 of size 2777787 and 2777787 +[16:55:52] INFO - . . 
Diffing segment 0/4 of size 925928 and 925928 +[16:55:54] INFO - . . . Diff found 2 different rows. ++ (20000, 942013020) +- (20000, 942013021) +[16:55:54] INFO - . . Diffing segment 1/4 of size 925929 and 925929 +[16:55:55] INFO - . . Diffing segment 2/4 of size 925929 and 925929 +[16:55:55] INFO - . . Diffing segment 3/4 of size 1 and 1 +[16:55:56] INFO - . Diffing segment 1/4 of size 2777788 and 2777788 +[16:55:58] INFO - . Diffing segment 2/4 of size 2777788 and 2777788 +[16:55:59] INFO - . Diffing segment 3/4 of size 1 and 1 +[16:56:00] INFO - Diffing segment 1/4 of size 8333365 and 8333365 +[16:56:06] INFO - Diffing segment 2/4 of size 8333365 and 8333365 +[16:56:11] INFO - Diffing segment 3/4 of size 1 and 1 +[16:56:11] INFO - Duration: 53.51 seconds. +``` We currently support the following databases: - PostgreSQL - - MySQL - - Oracle - - Snowflake - - BigQuery - - Redshift +We plan to add more, including NoSQL, and even APIs like Shopify! + +# How to install + +Requires Python 3.7+ with pip. + +```pip install data-diff``` + +or when you need extras like mysql and postgres + +```pip install "data-diff[mysql,pgsql]"``` + +# How to use + +Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` + +Options: + + - `--help` - Show help message and exit. + - `-k` or `--key_column` - Name of the primary key column + - `-c` or `--columns` - List of names of extra columns to compare + - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) + - `-s` or `--stats` - Print stats instead of a detailed diff + - `-d` or `--debug` - Print debug info + - `-v` or `--verbose` - Print extra info + - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. + - `--bisection-threshold` - Minimal bisection threshold. i.e. maximum size of pages to diff locally. + # How does it work? 
@@ -63,57 +110,71 @@ We ran it with a very low bisection factor, and with the verbose flag, to demons Note: It's usually much faster to use high bisection factors, especially when there are very few changes, like in this example. -```python -$ data_diff postgres:/// Original postgres:/// Original_1diff -v --bisection-factor=4 -[16:55:19] INFO - Diffing tables of size 25000095 and 25000095 | segments: 4, bisection threshold: 1048576. -[16:55:36] INFO - Diffing segment 0/4 of size 8333364 and 8333364 -[16:55:45] INFO - . Diffing segment 0/4 of size 2777787 and 2777787 -[16:55:52] INFO - . . Diffing segment 0/4 of size 925928 and 925928 -[16:55:54] INFO - . . . Diff found 2 different rows. -+ (20000, 942013020) -- (20000, 942013021) -[16:55:54] INFO - . . Diffing segment 1/4 of size 925929 and 925929 -[16:55:55] INFO - . . Diffing segment 2/4 of size 925929 and 925929 -[16:55:55] INFO - . . Diffing segment 3/4 of size 1 and 1 -[16:55:56] INFO - . Diffing segment 1/4 of size 2777788 and 2777788 -[16:55:58] INFO - . Diffing segment 2/4 of size 2777788 and 2777788 -[16:55:59] INFO - . Diffing segment 3/4 of size 1 and 1 -[16:56:00] INFO - Diffing segment 1/4 of size 8333365 and 8333365 -[16:56:06] INFO - Diffing segment 2/4 of size 8333365 and 8333365 -[16:56:11] INFO - Diffing segment 3/4 of size 1 and 1 -[16:56:11] INFO - Duration: 53.51 seconds. +## Tips for performance + +It's highly recommended that all involved columns are indexed. + +## Development Setup + +The development setup centers around using `docker-compose` to boot up various +databases, and then inserting data into them. + +For Mac for performance of Docker, we suggest enabling in the UI: + +* Use new Virtualization Framework +* Enable VirtioFS accelerated directory sharing + +**1. Install Data Diff** + +When developing/debugging, it's recommended to install dependencies and run it +directly with `poetry` rather than go through the package. 
+ +``` +brew install mysql postgresql # MacOS dependencies for C bindings +poetry install ``` +**2. Download CSV of Testing Data** -# How to install +```shell-session +wget https://files.grouplens.org/datasets/movielens/ml-25m.zip +unzip ml-25m.zip -d dev/ +``` -Requires Python 3.7+ with pip. +**3. Start Databases** -```pip install data-diff``` +```shell-session +docker-compose up -d mysql postgres +``` -or when you need extras like mysql and postgres +**4. Run Unit Tests** -```pip install "data-diff[mysql,pgsql]"``` +```shell-session +poetry run python3 -m unittest +``` -# How to use +**5. Seed the Database(s)** -Usage: `data_diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]` +If you're just testing, we recommend just setting up one database (e.g. +Postgres) to avoid incurring the long setup time repeatedly. -Options: +```shell-session +preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres +preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql +preql -f dev/prepare_db.pql snowflake:// +preql -f dev/prepare_db.pql mssql:// +preql -f dev/prepare_db_bigquery.pql bigquery:/// # Bigquery has its own +``` - - `--help` - Show help message and exit. - - `-k` or `--key_column` - Name of the primary key column - - `-c` or `--columns` - List of names of extra columns to compare - - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) - - `-s` or `--stats` - Print stats instead of a detailed diff - - `-d` or `--debug` - Print debug info - - `-v` or `--verbose` - Print extra info - - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. - - `--bisection-threshold` - Minimal bisection threshold. i.e. maximum size of pages to diff locally. +**6. 
Run data-diff against seeded database** -## Tips for performance +```bash +poetry run python3 -m data_diff postgres://user:password@host:db Rating mysql://user:password@host:db Rating_del1 -c timestamp --stats -It's highly recommended that all involved columns are indexed. +Diff-Total: 250156 changed rows out of 25000095 +Diff-Percent: 1.0006% +Diff-Split: +250156 -0 +``` # License diff --git a/dev/Dockerfile b/dev/Dockerfile deleted file mode 100644 index 4e619ef5..00000000 --- a/dev/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM python:3.10 - -# install dependencies -RUN apt update && apt install -y \ - python3-dev libpq-dev wget unzip \ - python3-setuptools gcc bc -RUN pip install poetry - -ADD . /app - -WORKDIR /app -RUN chmod +x dev/prepdb.sh - -RUN wget https://files.grouplens.org/datasets/movielens/ml-25m.zip -RUN unzip ml-25m.zip -d /app/dev - -RUN pip install 'data-diff[preql,mysql,pgsql]' - -ARG DB1_URI -ARG TABLE1_NAME -ARG DB2_URI -ARG TABLE2_NAME -ARG OPTIONS - -ENV DB1_URI ${DB1_URI} -ENV TABLE1_NAME ${TABLE1_NAME} -ENV DB2_URI ${DB2_URI} -ENV TABLE2_NAME ${TABLE2_NAME} -ENV OPTIONS ${OPTIONS} - -CMD data_diff ${DB1_URI} ${TABLE1_NAME} ${DB2_URI} ${TABLE2_NAME} ${OPTIONS} \ No newline at end of file diff --git a/dev/prepare_db.pql b/dev/prepare_db.pql index 83aff9f7..f0e4a146 100644 --- a/dev/prepare_db.pql +++ b/dev/prepare_db.pql @@ -24,7 +24,7 @@ if (db_type == "snowflake") { print "Uploading ratings CSV" run_sql("RM @~/ratings.csv.gz") - run_sql("PUT file://ml-25m/ratings.csv @~") + run_sql("PUT file://dev/ml-25m/ratings.csv @~") print "Loading ratings CSV" @@ -86,7 +86,7 @@ if (db_type == "snowflake") { run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)") table tmp_rating {...} print "Loading ratings CSV" - run_sql("BULK INSERT tmp_rating from 'ml-25m/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") + run_sql("BULK INSERT tmp_rating from 'dev/ml-25m/ratings.csv' with 
(fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") print "Populating actual table" rating += tmp_rating commit() @@ -99,7 +99,7 @@ if (db_type == "snowflake") { rating: float timestamp: int } - import_csv(rating, 'ml-25m/ratings.csv', true) + import_csv(rating, 'dev/ml-25m/ratings.csv', true) rating.add_index("id", true) rating.add_index("timestamp") run_sql("CREATE INDEX index_rating_id_timestamp ON rating (id, timestamp)") diff --git a/dev/prepdb.sh b/dev/prepdb.sh deleted file mode 100644 index e9a8447b..00000000 --- a/dev/prepdb.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -set -ex - -main () { - cd dev - prepare_db - cd .. -} - -prepare_db() { - START=$(date +%s) - preql -m prepare_db mysql://mysql:Password1@mysql/mysql - END=$(date +%s) - DIFF=$(echo "$END - $START" | bc) - echo "Prepare_db for mysql took: $DIFF s" - START=$(date +%s) - preql -m prepare_db postgres://postgres:Password1@postgresql/postgres - END=$(date +%s) - DIFF=$(echo "$END - $START" | bc) - echo "Prepare_db for postgres took: $DIFF s" -} - -main \ No newline at end of file diff --git a/dev/docker-compose.yml b/docker-compose.yml similarity index 55% rename from dev/docker-compose.yml rename to docker-compose.yml index 8728fa6e..58e006fa 100644 --- a/dev/docker-compose.yml +++ b/docker-compose.yml @@ -1,44 +1,12 @@ version: "3.8" services: - data-diff: - container_name: data-diff - build: - context: ../ - dockerfile: ./dev/Dockerfile - args: - - DB1_URI - - TABLE1_NAME - - DB2_URI - - TABLE2_NAME - - OPTIONS - ports: - - '9992:9992' - expose: - - '9992' - tty: true - networks: - - local - - prepdb: - container_name: prepdb - build: - context: ../ - dockerfile: ./dev/Dockerfile - command: ["bash", "./dev/prepdb.sh"] - volumes: - - prepdb-data:/app:delegated - ports: - - '9991:9991' - expose: - - '9991' - tty: true - networks: - - local - postgres: container_name: postgresql image: postgres:14.1-alpine + # work_mem: less tmp files + # maintenance_work_mem: improve 
table-level op perf + # max_wal_size: allow more time before merging to heap command: > -c work_mem=1GB -c maintenance_work_mem=1GB @@ -51,7 +19,7 @@ services: expose: - '5432' env_file: - - dev.env + - dev/dev.env tty: true networks: - local @@ -59,17 +27,14 @@ services: mysql: container_name: mysql image: mysql:oracle + # fsync less aggressively for insertion perf for test setup command: > --default-authentication-plugin=mysql_native_password - --innodb-buffer-pool-size=8G - --innodb_io_capacity=2000 - --innodb_log_file_size=1G --binlog-cache-size=16M --key_buffer_size=0 --max_connections=10 --innodb_flush_log_at_trx_commit=2 --innodb_flush_log_at_timeout=10 - --innodb_flush_method=O_DSYNC --innodb_log_compressed_pages=OFF --sync_binlog=0 restart: always @@ -81,7 +46,7 @@ services: expose: - '3306' env_file: - - dev.env + - dev/dev.env tty: true networks: - local @@ -89,8 +54,7 @@ services: volumes: postgresql-data: mysql-data: - prepdb-data: networks: local: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/example.sh b/example.sh deleted file mode 100755 index ff392ae6..00000000 --- a/example.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -set -ex - -main () { - cd dev/ - initialize - prepare_db - data_diff - shutdown - cd .. -} - -initialize() { - docker-compose up -d postgres mysql - - until nc -z -v -w30 localhost 3306 && nc -z -v -w30 localhost 5432; do - echo "Databases not yet ready.." - sleep 5 - done - - docker-compose up -d data-diff prepdb -} - -prepare_db() { - . 
./prepdb.sh -} - -data_diff() { - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1 -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1 -e OPTIONS='-c timestamp --bisection-factor 4 -v' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update001p -e OPTIONS='-c timestamp --bisection-factor 64 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_del1p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff - docker-compose run -e DB1_URI=postgres://postgres:Password1@postgresql/postgres -e TABLE1_NAME=Rating -e DB2_URI=mysql://mysql:Password1@mysql/mysql -e TABLE2_NAME=Rating_update50p -e OPTIONS='-c timestamp --bisection-factor 4 -v -s' data-diff -} - -shutdown() { - docker-compose down -} - -main diff --git a/pyproject.toml b/pyproject.toml index 43da7aa0..81f868d5 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ packages = [{ include = "data_diff" }] python = "^3.7" runtype = "^0.2.4" dsnparse = "*" -pyparsing = "ˆ3.0" click = "^8.1" preql = { version = "^0.2.11", optional = true } @@ -35,9 +34,13 @@ mysql-connector-python = { version = "*", optional = true} snowflake-connector-python = { version = "*", optional = true } 
[tool.poetry.dev-dependencies] -protobuf = "^3.20.1" +mysql-connector-python = "*" +preql = "^0.2.11" +snowflake-connector-python = "*" +psycopg2 = "*" [tool.poetry.extras] +# When adding, update also: README + Dockerfile + dev deps preql = ["preql"] mysql = ["mysql-connector-python"] pgsql = ["psycopg2"] @@ -48,4 +51,4 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -data_diff = 'data_diff.__main__:main' \ No newline at end of file +data-diff = 'data_diff.__main__:main' diff --git a/setup_testenv.md b/setup_testenv.md deleted file mode 100644 index 72f6a235..00000000 --- a/setup_testenv.md +++ /dev/null @@ -1,90 +0,0 @@ -# Test Data Diff with Postgres and MySQL - - -``` -chmod +x ./dev/example.sh -./dev/example.sh -``` - -NB for Mac. If the process takes very long (e.g. importing CSV file takes >30m), make sure that you have the latest version of Docker installed and have enabled the experimental features `Use the new Virtualization framework` and `Enable VirtioFS accelerated directory sharing`. Because the interaction with Docker and the MacOS FS is a bottleneck. - -## Manual setup - -1. Install Data Diff - -``` -pip install "data-diff[preql,mysql,pgsql]" -``` - -2. Download CSV - -``` -wget https://files.grouplens.org/datasets/movielens/ml-25m.zip -unzip ml-25m.zip -d dev/ -``` - -4. Setup databases - -(note: bigquery has its own setup script) - -``` -preql -f dev/prepare_db postgres:// - -preql -f dev/prepare_db mysql:// - -preql -f dev/prepare_db snowflake:// - -preql -f dev/prepare_db mssql:// - -preql -f dev/prepare_db_bigquery bigquery:/// - - -etc. -``` - -And it's ready to use! 
- -Example: - -```bash -data_diff postgres://user:password@host:db Rating mysql://user:password@host:db Rating_del1 -c timestamp --stats - -Diff-Total: 250156 changed rows out of 25000095 -Diff-Percent: 1.0006% -Diff-Split: +250156 -0 - -``` - -## Database settings with explanation -*Inline comments in docker-compose.yml will break the databases.* - -**PostgreSQL:** - -``` --c work_mem=1GB # Reduce writing temporary disk files. --c maintenance_work_mem=1GB # Improve VACUUM, CREATE INDEX, ALTER TABLE ADD FOREIGN KEY operations. --c max_wal_size=8GB # Filling of the table with movie lens data creates an higher write - # load than the default assumption of 1GB/hour. -``` -**MySQL:** -``` ---default-authentication-plugin=mysql_native_password # Required for setting password via env vars. ---innodb-buffer-pool-size=8G # Recommendation is to set to 50-75% of available - # memmory. However, this is no dedicated instance. ---innodb_io_capacity=2000 # Default setting is for hard drives. SSD benefits - # from higher values. ---innodb_log_file_size=1G # Tuning recommendation based on the - # innodb-buffer-pool-size setting. ---binlog-cache-size=16M # Tuning recommendation ---key_buffer_size=0 # No MyISAM tables, InnoDB engine is used. ---max_connections=10 # Test setup, not a lot connection needed. ---innodb_flush_log_at_trx_commit=2 # Reduce creation of logs for performance. ---innodb_flush_log_at_timeout=10 # Idem ---innodb_flush_method=O_DSYNC # Suffers less from race conditions than fsync. ---innodb_log_compressed_pages=OFF # To write less data to the redo_log. ---sync_binlog=0 # Disables synchronization of the binary log to disk - # by the MySQL server. Instead, the MySQL server relies - # on the operating system to flush the binary log to - # disk from time to time as it does for any other file. - # This setting provides the best performance. 
-``` \ No newline at end of file diff --git a/tests/setup.pql b/tests/setup.pql index c40f7f66..08e9f5a6 100644 --- a/tests/setup.pql +++ b/tests/setup.pql @@ -9,7 +9,7 @@ table ratings_test { timestamp: timestamp } -table ratings_est2 { +table ratings_test2 { userid: int movieid: int rating: float diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index f4409d1a..eb0bb6fe 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -18,20 +18,20 @@ def setUpClass(cls): cls.connection = connect_to_uri(TEST_MYSQL_CONN_STRING) def setUp(self): - self.table_name = "RatingsTest" + self.connection.query("DROP TABLE IF EXISTS ratings_test", None) + self.connection.query("DROP TABLE IF EXISTS ratings_test2", None) + self.preql.load("./tests/setup.pql") + self.preql.commit() + self.table = TableSegment(TestDiffTables.connection, - (self.table_name, ), + ('ratings_test', ), 'id', ('timestamp', )) self.table2 = TableSegment(TestDiffTables.connection, - ("RatingsTest2", ), + ("ratings_test2", ), 'id', ('timestamp', )) - self.connection.query("DROP TABLE IF EXISTS RatingsTest", None) - self.connection.query("DROP TABLE IF EXISTS RatingsTest2", None) - self.preql.load("./tests/setup.pql") - self.preql.commit() self.differ = TableDiffer(3, 4) @@ -43,7 +43,7 @@ def test_properties_on_empty_table(self): def test_get_values(self): time = "2022-01-01 00:00:00" res = self.preql(f""" - new RatingsTest(1, 1, 9, '{time}') + new ratings_test(1, 1, 9, '{time}') """) self.preql.commit() @@ -54,10 +54,10 @@ def test_get_values(self): def test_checkpoints(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: 
'{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() self.assertEqual([2, 4], self.table.choose_checkpoints(2)) @@ -65,10 +65,10 @@ def test_checkpoints(self): def test_diff_small_tables(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2)) @@ -78,16 +78,16 @@ def test_diff_small_tables(self): def test_diff_table_above_bisection_threshold(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 3, movieId: 3, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 4, movieId: 4, rating: 9, timestamp: '{time}') - new RatingsTest(userId: 5, movieId: 5, rating: 9, timestamp: '{time}') - - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 2, movieId: 2, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 3, movieId: 3, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 4, movieId: 4, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') + new ratings_test(userid: 3, movieid: 3, rating: 9, timestamp: 
'{time}') + new ratings_test(userid: 4, movieid: 4, rating: 9, timestamp: '{time}') + new ratings_test(userid: 5, movieid: 5, rating: 9, timestamp: '{time}') + + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 2, movieid: 2, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 3, movieid: 3, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 4, movieid: 4, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2)) @@ -97,8 +97,8 @@ def test_diff_table_above_bisection_threshold(self): def test_return_empty_array_when_same(self): time = "2022-01-01 00:00:00" self.preql(f""" - new RatingsTest(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') - new RatingsTest2(userId: 1, movieId: 1, rating: 9, timestamp: '{time}') + new ratings_test(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') + new ratings_test2(userid: 1, movieid: 1, rating: 9, timestamp: '{time}') """) self.preql.commit() diff = list(self.differ.diff_tables(self.table, self.table2))