Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enforce compression extensions for CSV Files #11903

Merged
merged 12 commits into from
May 29, 2024
16 changes: 15 additions & 1 deletion src/planner/binder/statement/bind_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt) {

auto original_options = stmt.info->options;
stmt.info->options.clear();

for (auto &option : original_options) {
auto loption = StringUtil::Lower(option.first);
if (loption == "use_tmp_file") {
Expand Down Expand Up @@ -105,6 +104,21 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt) {
auto converted = ConvertVectorToValue(std::move(option.second));
partition_cols = ParseColumnsOrdered(converted, select_node.names, loption);
} else {
if (loption == "compression") {
if (option.second.empty()) {
// This can't be empty
throw BinderException("COMPRESSION option, in the file scanner, can't be empty. It should be set "
"to AUTO, UNCOMPRESSED, GZIP, SNAPPY or ZSTD. Depending on the file format.");
}
auto parameter = StringUtil::Lower(option.second[0].ToString());
if (parameter == "gzip" && !StringUtil::EndsWith(bind_input.file_extension, ".gz")) {
// We just add .gz
bind_input.file_extension += ".gz";
} else if (parameter == "zstd" && !StringUtil::EndsWith(bind_input.file_extension, ".zst")) {
// We just add .zst
bind_input.file_extension += ".zst";
}
}
stmt.info->options[option.first] = option.second;
}
}
Expand Down
63 changes: 63 additions & 0 deletions test/sql/copy/csv/test_partition_compression.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# name: test/sql/copy/csv/test_partition_compression.test
# description: Test we can round-trip partitioned compressed CSV files
# group: [csv]

statement ok
PRAGMA enable_verification

require no_extension_autoloading

statement ok
CREATE TABLE test AS VALUES ('a', 'foo', 1), ('a', 'foo', 2), ('a', 'bar', 1), ('b', 'bar', 1);

statement ok
COPY (FROM test) TO '__TEST_DIR__/data.csv.d' (FORMAT 'csv', COMPRESSION 'gzip', PARTITION_BY ('col0', 'col1'));

# Gzip output is written with a .gz suffix, so a plain *.csv glob must match no files
statement error
FROM read_csv_auto('__TEST_DIR__/data.csv.d/*/*/*.csv')
----
No files found that match the pattern

query III
FROM read_csv_auto('__TEST_DIR__/data.csv.d/*/*/*.csv.gz');
----
a bar 1
a foo 1
a foo 2
b bar 1

query III
FROM read_csv_auto('__TEST_DIR__/data.csv.d/*/*/*.csv.*');
----
a bar 1
a foo 1
a foo 2
b bar 1

require parquet

statement ok
COPY (FROM test) TO '__TEST_DIR__/data.csv.d2' (FORMAT 'csv', COMPRESSION 'zstd', PARTITION_BY ('col0', 'col1'));

# Zstd output is written with a .zst suffix, so a plain *.csv glob must match no files
statement error
FROM read_csv_auto('__TEST_DIR__/data.csv.d2/*/*/*.csv')
----
No files found that match the pattern

query III
FROM read_csv_auto('__TEST_DIR__/data.csv.d2/*/*/*.csv.zst');
----
a bar 1
a foo 1
a foo 2
b bar 1

query III
FROM read_csv_auto('__TEST_DIR__/data.csv.d2/*/*/*.csv.*');
----
a bar 1
a foo 1
a foo 2
b bar 1