From 8b8563199fc477a2db249a2c142af8239011bd8d Mon Sep 17 00:00:00 2001 From: Florian Valeye Date: Tue, 18 May 2021 20:12:41 +0200 Subject: [PATCH 1/2] Add get_file_paths_by_partitions in Python bindings --- python/Cargo.toml | 2 +- python/deltalake/table.py | 6 +---- python/src/lib.rs | 2 +- python/tests/test_table_read.py | 42 ++++++++++++++++++++++----------- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 7a838eea02..156c4554a2 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deltalake-python" -version = "0.4.8" +version = "0.4.9" authors = ["Qingping Hou "] homepage = "https://github.com/delta-io/delta-rs" license = "Apache-2.0" diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 6957d3a275..1bbdbfc8da 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -193,11 +193,7 @@ def to_pyarrow_dataset( if partitions is None: file_paths = self._table.file_paths() else: - table_path = self._table.table_path() - file_paths = [ - f"{table_path}/{file_name}" - for file_name in self._table.files_by_partitions(partitions) - ] + file_paths = self._table.files_by_partitions(partitions) paths = [urlparse(curr_file) for curr_file in file_paths] # Decide based on the first file, if the file is on cloud storage or local diff --git a/python/src/lib.rs b/python/src/lib.rs index f299e6a03f..92a2336d36 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -121,7 +121,7 @@ impl RawDeltaTable { match partition_filters { Ok(filters) => Ok(self ._table - .get_files_by_partitions(&filters) + .get_file_paths_by_partitions(&filters) .map_err(PyDeltaTableError::from_raw)?), Err(err) => Err(PyDeltaTableError::from_raw(err)), } diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py index 07931f4603..8b82a8f13f 100644 --- a/python/tests/test_table_read.py +++ b/python/tests/test_table_read.py @@ -29,6 +29,20 @@ def test_read_partitioned_table_to_dict(): assert dt.to_pyarrow_dataset().to_table().to_pydict() == expected +def test_read_partitioned_table_with_partitions_filters_to_dict(): + table_path = "../rust/tests/data/delta-0.8.0-partitioned" + dt = DeltaTable(table_path) + partitions = [("year", "=", "2021")] + expected = { + "value": ["6", "7", "5", "4"], + "year": ["2021", "2021", "2021", "2021"], + "month": ["12", "12", "12", "4"], + "day": ["20", "20", "4", "5"], + } + + assert dt.to_pyarrow_dataset(partitions).to_table().to_pydict() == expected + + def test_vacuum_dry_run_simple_table(): table_path = "../rust/tests/data/delta-0.2.0" dt = DeltaTable(table_path) @@ -73,32 +87,32 @@ def test_get_files_partitioned_table(): dt = DeltaTable(table_path) partition_filters = [("day", "=", "3")] assert dt.files_by_partitions(partition_filters=partition_filters) == [ - "year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet" + f"{table_path}/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet" ] partition_filters = [("day", "!=", "3")] assert dt.files_by_partitions(partition_filters=partition_filters) == [ - "year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet", - "year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", - "year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet", - "year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", - "year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", + f"{table_path}/year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet", + f"{table_path}/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", + f"{table_path}/year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet", + f"{table_path}/year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", + f"{table_path}/year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", ] partition_filters = [("day", "in", ["3", "20"])] assert dt.files_by_partitions(partition_filters=partition_filters) == [ - "year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet", - "year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet", + f"{table_path}/year=2020/month=2/day=3/part-00000-94d16827-f2fd-42cd-a060-f67ccc63ced9.c000.snappy.parquet", + f"{table_path}/year=2021/month=12/day=20/part-00000-9275fdf4-3961-4184-baa0-1c8a2bb98104.c000.snappy.parquet", ] partition_filters = [("day", "not in", ["3", "20"])] assert dt.files_by_partitions(partition_filters=partition_filters) == [ - "year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet", - "year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", - "year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", - "year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", + f"{table_path}/year=2020/month=1/day=1/part-00000-8eafa330-3be9-4a39-ad78-fd13c2027c7e.c000.snappy.parquet", + f"{table_path}/year=2020/month=2/day=5/part-00000-89cdd4c8-2af7-4add-8ea3-3990b2f027b5.c000.snappy.parquet", + f"{table_path}/year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", + f"{table_path}/year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", ] partition_filters = [("day", "not in", ["3", "20"]), ("year", "=", "2021")] assert dt.files_by_partitions(partition_filters=partition_filters) == [ - "year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", - "year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", + f"{table_path}/year=2021/month=12/day=4/part-00000-6dc763c0-3e8b-4d52-b19e-1f92af3fbb25.c000.snappy.parquet", + f"{table_path}/year=2021/month=4/day=5/part-00000-c5856301-3439-4032-a6fc-22b7bc92bebb.c000.snappy.parquet", ] partition_filters = [("invalid_operation", "=>", "3")] with pytest.raises(Exception) as exception: From a18229150627e255c3b4678336f7e3d96f44fbca Mon Sep 17 00:00:00 2001 From: Florian Date: Mon, 24 May 2021 13:35:18 +0200 Subject: [PATCH 2/2] Add test resources for vacuum tests --- rust/tests/data/delta-0.8.0/_change_data/.gitkeep | 0 rust/tests/data/delta-0.8.0/_delta_index/.gitkeep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 rust/tests/data/delta-0.8.0/_change_data/.gitkeep create mode 100644 rust/tests/data/delta-0.8.0/_delta_index/.gitkeep diff --git a/rust/tests/data/delta-0.8.0/_change_data/.gitkeep b/rust/tests/data/delta-0.8.0/_change_data/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/rust/tests/data/delta-0.8.0/_delta_index/.gitkeep b/rust/tests/data/delta-0.8.0/_delta_index/.gitkeep new file mode 100644 index 0000000000..e69de29bb2