From 007407d41b1a53eb66b3fb9314ad14bc6faab80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Sun, 19 Dec 2021 15:33:37 +0100 Subject: [PATCH 01/10] adding parquet reader to node module --- .../sql/copy/csv/test_control_characters.test | 131 ++++++++++++++++++ tools/nodejs/.gitignore | 3 +- tools/nodejs/binding.gyp | 1 + tools/nodejs/configure | 9 +- tools/nodejs/src/database.cpp | 3 + tools/nodejs/test/parquet.js | 16 +++ 6 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 test/sql/copy/csv/test_control_characters.test create mode 100644 tools/nodejs/test/parquet.js diff --git a/test/sql/copy/csv/test_control_characters.test b/test/sql/copy/csv/test_control_characters.test new file mode 100644 index 00000000000..f30f0f590c2 --- /dev/null +++ b/test/sql/copy/csv/test_control_characters.test @@ -0,0 +1,131 @@ +# name: test/sql/copy/csv/test_copy_unicode.test +# description: Test copy statement with unicode delimiter/quote/escape +# group: [csv] + +# create tables for testing +statement ok +CREATE TABLE test_unicode_1 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); + +statement ok +CREATE TABLE test_unicode_2 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); + +statement ok +CREATE TABLE test_unicode_3 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); + +statement ok +CREATE TABLE test_unicode_4 (col_a VARCHAR, col_b VARCHAR); + +# test COPY ... FROM ... +# test unicode delimiter/quote/escape +query I +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +4 + +query ITTT +SELECT * FROM test_unicode_1 ORDER BY 1 LIMIT 4; +---- +0 du帅馃ck d水水u馃ck duck +1 dou水ble NULL duck +2 NULL NULL NULL +3 duck inv帅asion NULL NULL + +# test unicode delimiter/quote/escape that exceeds the buffer size a few times +query I +COPY test_unicode_2 FROM 'test/sql/copy/csv/data/test/multi_char_large.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +16384 + +query ITTT +SELECT * FROM test_unicode_2 ORDER BY 1 LIMIT 4; +---- +16 values hashing to aba4d1f7a3b0d043a672e09344cf7d55 + +query I +SELECT SUM(col_a) FROM test_unicode_2; +---- +134209536 + +# test correct shared substring behavior at buffer borders +query I +COPY test_unicode_4 FROM 'test/sql/copy/csv/data/test/shared_substring_large.csv' (DELIMITER 'AAA', ESCAPE 'AAB'); +---- +1 + +query TT +SELECT * FROM test_unicode_4; +---- +2 values hashing to 0bafca4292fa123d931621a1fff765a1 + +# test same string for delimiter and quote +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER '馃', QUOTE '馃'); + +# escape and quote cannot be substrings of each other +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (ESCAPE 'du', QUOTE 'duck'); + +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (ESCAPE 'duck', QUOTE 'du'); + +# delimiter and quote cannot be substrings of each other +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'du', QUOTE 'duck'); + +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'duck', QUOTE 'du'); + +# delimiter and escape cannot be substrings of each other +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'AA', ESCAPE 'AAAA'); + +statement error +COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'AAAA', ESCAPE 'AA'); + +# COPY ... TO ... +# test unicode delimiter/quote/escape +query I +COPY test_unicode_1 TO '__TEST_DIR__/test_unicode_1.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +4 + +statement ok +DELETE FROM test_unicode_1; + +query I +COPY test_unicode_1 FROM '__TEST_DIR__/test_unicode_1.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +4 + +query ITTT +SELECT * FROM test_unicode_1 ORDER BY 1 LIMIT 4; +---- +0 du帅馃ck d水水u馃ck duck +1 dou水ble NULL duck +2 NULL NULL NULL +3 duck inv帅asion NULL NULL + +# test unicode delimiter/quote/escape +query I +COPY test_unicode_2 TO '__TEST_DIR__/test_unicode_2.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +16384 + +statement ok +DELETE FROM test_unicode_2; + +query I +COPY test_unicode_2 FROM '__TEST_DIR__/test_unicode_2.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); +---- +16384 + +query ITTT +SELECT * FROM test_unicode_2 ORDER BY 1 LIMIT 4; +---- +16 values hashing to aba4d1f7a3b0d043a672e09344cf7d55 + +query R +SELECT SUM(col_a) FROM test_unicode_2; +---- +134209536.000000 + diff --git a/tools/nodejs/.gitignore b/tools/nodejs/.gitignore index 548500e9051..e23b307837c 100644 --- a/tools/nodejs/.gitignore +++ b/tools/nodejs/.gitignore @@ -5,4 +5,5 @@ src/duckdb.hpp lib/binding package-lock.json test/support/big.db* -test/tmp/* \ No newline at end of file +test/tmp/* +src/parquet-amalgamation* diff --git a/tools/nodejs/binding.gyp b/tools/nodejs/binding.gyp index 5d9c2fa5b3f..27e7435f508 100644 --- a/tools/nodejs/binding.gyp +++ b/tools/nodejs/binding.gyp @@ -8,6 +8,7 @@ "src/connection.cpp", "src/statement.cpp", "src/utils.cpp", + "src/parquet-amalgamation.cpp", "src/duckdb.cpp" # comment this out to build against existing lib ], "include_dirs": [ diff --git a/tools/nodejs/configure b/tools/nodejs/configure index c4f7937fd0e..413cc3f76df 100755 --- a/tools/nodejs/configure +++ b/tools/nodejs/configure @@ -6,12 +6,9 @@ set -x cd `dirname $0` if [ ! -f "../../scripts/amalgamation.py" ]; then - echo "Could find neither the amalgamation build script" + echo "Could not find the amalgamation build script" exit 1 fi -(cd ../.. && python scripts/amalgamation.py --source=tools/nodejs/src/duckdb.cpp --header=tools/nodejs/src/duckdb.hpp) -# (cd ../.. && python extension/parquet/parquet_amalgamation.py --source=tools/rpkg/src/parquet-extension.cpp --header=tools/rpkg/src/parquet-extension.h) -# cp src/parquet-extension.h src/parquet-extension.h.tmp -# sed 's/duckdb[.]hpp/duckdb.h/g' src/parquet-extension.h.tmp > src/parquet-extension.h -# rm src/parquet-extension.h.tmp +(cd ../.. && python scripts/amalgamation.py --extended --source=tools/nodejs/src/duckdb.cpp --header=tools/nodejs/src/duckdb.hpp) +(cd ../.. && python scripts/parquet_amalgamation.py && cp src/amalgamation/parquet-amalgamation* tools/nodejs/src/) diff --git a/tools/nodejs/src/database.cpp b/tools/nodejs/src/database.cpp index 6c1a727c096..36b7b28644c 100644 --- a/tools/nodejs/src/database.cpp +++ b/tools/nodejs/src/database.cpp @@ -1,4 +1,5 @@ #include "duckdb_node.hpp" +#include "parquet-amalgamation.hpp" namespace node_duckdb { @@ -28,6 +29,8 @@ struct OpenTask : public Task { void DoWork() override { try { Get().database = duckdb::make_unique(filename); + duckdb::ParquetExtension extension; + extension.Load(*Get().database); success = true; } catch (std::exception &ex) { diff --git a/tools/nodejs/test/parquet.js b/tools/nodejs/test/parquet.js new file mode 100644 index 00000000000..ac9bf90476b --- /dev/null +++ b/tools/nodejs/test/parquet.js @@ -0,0 +1,16 @@ +var sqlite3 = require('..'); +var assert = require('assert'); +var helper = require('./support/helper'); + +describe('can query parquet', function() { + var db; + + before(function(done) { + db = new sqlite3.Database(':memory:', done); + }); + + it('should be able to read parquet files', function(done) { + db.run("select * from parquet_scan('../../data/parquet-testing/userdata1.parquet')", done); + }); + +}); From 837ef82ea172b478ceda0b6871119006ca224957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Sun, 19 Dec 2021 15:35:42 +0100 Subject: [PATCH 02/10] removing spurious file --- .../sql/copy/csv/test_control_characters.test | 131 ------------------ 1 file changed, 131 deletions(-) delete mode 100644 test/sql/copy/csv/test_control_characters.test diff --git a/test/sql/copy/csv/test_control_characters.test b/test/sql/copy/csv/test_control_characters.test deleted file mode 100644 index f30f0f590c2..00000000000 --- a/test/sql/copy/csv/test_control_characters.test +++ /dev/null @@ -1,131 +0,0 @@ -# name: test/sql/copy/csv/test_copy_unicode.test -# description: Test copy statement with unicode delimiter/quote/escape -# group: [csv] - -# create tables for testing -statement ok -CREATE TABLE test_unicode_1 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); - -statement ok -CREATE TABLE test_unicode_2 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); - -statement ok -CREATE TABLE test_unicode_3 (col_a INTEGER, col_b VARCHAR(10), col_c VARCHAR(10), col_d VARCHAR(10)); - -statement ok -CREATE TABLE test_unicode_4 (col_a VARCHAR, col_b VARCHAR); - -# test COPY ... FROM ... -# test unicode delimiter/quote/escape -query I -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -4 - -query ITTT -SELECT * FROM test_unicode_1 ORDER BY 1 LIMIT 4; ----- -0 du帅馃ck d水水u馃ck duck -1 dou水ble NULL duck -2 NULL NULL NULL -3 duck inv帅asion NULL NULL - -# test unicode delimiter/quote/escape that exceeds the buffer size a few times -query I -COPY test_unicode_2 FROM 'test/sql/copy/csv/data/test/multi_char_large.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -16384 - -query ITTT -SELECT * FROM test_unicode_2 ORDER BY 1 LIMIT 4; ----- -16 values hashing to aba4d1f7a3b0d043a672e09344cf7d55 - -query I -SELECT SUM(col_a) FROM test_unicode_2; ----- -134209536 - -# test correct shared substring behavior at buffer borders -query I -COPY test_unicode_4 FROM 'test/sql/copy/csv/data/test/shared_substring_large.csv' (DELIMITER 'AAA', ESCAPE 'AAB'); ----- -1 - -query TT -SELECT * FROM test_unicode_4; ----- -2 values hashing to 0bafca4292fa123d931621a1fff765a1 - -# test same string for delimiter and quote -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER '馃', QUOTE '馃'); - -# escape and quote cannot be substrings of each other -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (ESCAPE 'du', QUOTE 'duck'); - -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (ESCAPE 'duck', QUOTE 'du'); - -# delimiter and quote cannot be substrings of each other -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'du', QUOTE 'duck'); - -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'duck', QUOTE 'du'); - -# delimiter and escape cannot be substrings of each other -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'AA', ESCAPE 'AAAA'); - -statement error -COPY test_unicode_1 FROM 'test/sql/copy/csv/data/test/multi_char.csv' (DELIMITER 'AAAA', ESCAPE 'AA'); - -# COPY ... TO ... -# test unicode delimiter/quote/escape -query I -COPY test_unicode_1 TO '__TEST_DIR__/test_unicode_1.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -4 - -statement ok -DELETE FROM test_unicode_1; - -query I -COPY test_unicode_1 FROM '__TEST_DIR__/test_unicode_1.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -4 - -query ITTT -SELECT * FROM test_unicode_1 ORDER BY 1 LIMIT 4; ----- -0 du帅馃ck d水水u馃ck duck -1 dou水ble NULL duck -2 NULL NULL NULL -3 duck inv帅asion NULL NULL - -# test unicode delimiter/quote/escape -query I -COPY test_unicode_2 TO '__TEST_DIR__/test_unicode_2.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -16384 - -statement ok -DELETE FROM test_unicode_2; - -query I -COPY test_unicode_2 FROM '__TEST_DIR__/test_unicode_2.csv' (DELIMITER '馃', QUOTE '水', ESCAPE '帅'); ----- -16384 - -query ITTT -SELECT * FROM test_unicode_2 ORDER BY 1 LIMIT 4; ----- -16 values hashing to aba4d1f7a3b0d043a672e09344cf7d55 - -query R -SELECT SUM(col_a) FROM test_unicode_2; ----- -134209536.000000 - From 2aa2d7f348a0cd545eb6dd84576676e4f1fb1dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Sun, 19 Dec 2021 20:52:36 +0100 Subject: [PATCH 03/10] thrift amalgamation windows fixes for nodejs --- third_party/thrift/thrift/config.h | 2 -- .../thrift/thrift/protocol/TCompactProtocol.tcc | 2 -- third_party/thrift/thrift/thrift-config.h | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) delete mode 100644 third_party/thrift/thrift/config.h diff --git a/third_party/thrift/thrift/config.h b/third_party/thrift/thrift/config.h deleted file mode 100644 index bc943fc1a7f..00000000000 --- a/third_party/thrift/thrift/config.h +++ /dev/null @@ -1,2 +0,0 @@ -#define SIGNED_RIGHT_SHIFT_IS 1 -#define ARITHMETIC_RIGHT_SHIFT 1 diff --git a/third_party/thrift/thrift/protocol/TCompactProtocol.tcc b/third_party/thrift/thrift/protocol/TCompactProtocol.tcc index 21fe48a4198..7b7d881c04d 100644 --- a/third_party/thrift/thrift/protocol/TCompactProtocol.tcc +++ b/third_party/thrift/thrift/protocol/TCompactProtocol.tcc @@ -21,8 +21,6 @@ #include -#include "thrift/config.h" - /* * TCompactProtocol::i*ToZigzag depend on the fact that the right shift * operator on a signed integer is an arithmetic (sign-extending) shift. diff --git a/third_party/thrift/thrift/thrift-config.h b/third_party/thrift/thrift/thrift-config.h index 9a10ac10e6e..19e473cf635 100755 --- a/third_party/thrift/thrift/thrift-config.h +++ b/third_party/thrift/thrift/thrift-config.h @@ -17,8 +17,18 @@ * under the License. */ +#ifndef THRIFT_CONFIG_H +#define THRIFT_CONFIG_H + + #ifdef _WIN32 -//#include +#if defined(_M_IX86) || defined(_M_X64) +#define ARITHMETIC_RIGHT_SHIFT 1 +#define SIGNED_RIGHT_SHIFT_IS 1 +#endif #else -#include "thrift/config.h" +#define SIGNED_RIGHT_SHIFT_IS 1 +#define ARITHMETIC_RIGHT_SHIFT 1 #endif + +#endif \ No newline at end of file From 7ee0420de57c7522207e30ed1e0fb8ea06415969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Mon, 20 Dec 2021 09:06:44 +0100 Subject: [PATCH 04/10] drop dynamic_pointer_cast --- extension/parquet/parquet-extension.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/parquet/parquet-extension.cpp b/extension/parquet/parquet-extension.cpp index 186c665b879..b82b2ed5e23 100644 --- a/extension/parquet/parquet-extension.cpp +++ b/extension/parquet/parquet-extension.cpp @@ -139,7 +139,7 @@ class ParquetScanFunction { FileSystem &fs = FileSystem::GetFileSystem(context); for (idx_t file_idx = 1; file_idx < bind_data.files.size(); file_idx++) { auto &file_name = bind_data.files[file_idx]; - auto metadata = std::dynamic_pointer_cast(cache.Get(file_name)); + auto metadata = (ParquetFileMetadataCache*)(cache.Get(file_name).get()); auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileSystem::DEFAULT_LOCK, FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context)); // but we need to check if the metadata cache entries are current From 42581365935100270d1abe6a76c5f639cfb5431f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Mon, 20 Dec 2021 09:24:51 +0100 Subject: [PATCH 05/10] format --- extension/parquet/parquet-extension.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/parquet/parquet-extension.cpp b/extension/parquet/parquet-extension.cpp index b82b2ed5e23..9dba49b2acd 100644 --- a/extension/parquet/parquet-extension.cpp +++ b/extension/parquet/parquet-extension.cpp @@ -139,7 +139,7 @@ class ParquetScanFunction { FileSystem &fs = FileSystem::GetFileSystem(context); for (idx_t file_idx = 1; file_idx < bind_data.files.size(); file_idx++) { auto &file_name = bind_data.files[file_idx]; - auto metadata = (ParquetFileMetadataCache*)(cache.Get(file_name).get()); + auto metadata = (ParquetFileMetadataCache *)(cache.Get(file_name).get()); auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileSystem::DEFAULT_LOCK, FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context)); // but we need to check if the metadata cache entries are current From 13fc65ccf3d88fbc28b35539271e882cd99a202e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Mon, 20 Dec 2021 14:33:05 +0100 Subject: [PATCH 06/10] maybe vs likes a move there? --- extension/parquet/parquet-extension.cpp | 4 ++-- extension/parquet/parquet_reader.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/extension/parquet/parquet-extension.cpp b/extension/parquet/parquet-extension.cpp index 9dba49b2acd..68eb7e6621d 100644 --- a/extension/parquet/parquet-extension.cpp +++ b/extension/parquet/parquet-extension.cpp @@ -139,7 +139,7 @@ class ParquetScanFunction { FileSystem &fs = FileSystem::GetFileSystem(context); for (idx_t file_idx = 1; file_idx < bind_data.files.size(); file_idx++) { auto &file_name = bind_data.files[file_idx]; - auto metadata = (ParquetFileMetadataCache *)(cache.Get(file_name).get()); + auto metadata = std::dynamic_pointer_cast(move(cache.Get(file_name))); auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileSystem::DEFAULT_LOCK, FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context)); // but we need to check if the metadata cache entries are current @@ -543,4 +543,4 @@ std::string ParquetExtension::Name() { return "parquet"; } -} // namespace duckdb +} // namespace duckdb \ No newline at end of file diff --git a/extension/parquet/parquet_reader.cpp b/extension/parquet/parquet_reader.cpp index 1cd41f61844..0e385fe1505 100644 --- a/extension/parquet/parquet_reader.cpp +++ b/extension/parquet/parquet_reader.cpp @@ -377,8 +377,8 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const if (!ObjectCache::ObjectCacheEnabled(context_p)) { metadata = LoadMetadata(allocator, *file_handle); } else { - metadata = - std::dynamic_pointer_cast(ObjectCache::GetObjectCache(context_p).Get(file_name)); + metadata = std::dynamic_pointer_cast( + move(ObjectCache::GetObjectCache(context_p).Get(file_name))); if (!metadata || (last_modify_time + 10 >= metadata->read_time)) { metadata = LoadMetadata(allocator, *file_handle); ObjectCache::GetObjectCache(context_p).Put(file_name, metadata); @@ -765,4 +765,4 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul return true; } -} // namespace duckdb +} // namespace duckdb \ No newline at end of file From 7dab12acd04c207f7c2ec43c94445da8f4aa1501 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Mon, 20 Dec 2021 17:08:42 +0100 Subject: [PATCH 07/10] Local variable? --- extension/parquet/parquet-extension.cpp | 3 ++- extension/parquet/parquet_reader.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/extension/parquet/parquet-extension.cpp b/extension/parquet/parquet-extension.cpp index 68eb7e6621d..b8c878b5f1a 100644 --- a/extension/parquet/parquet-extension.cpp +++ b/extension/parquet/parquet-extension.cpp @@ -139,7 +139,8 @@ class ParquetScanFunction { FileSystem &fs = FileSystem::GetFileSystem(context); for (idx_t file_idx = 1; file_idx < bind_data.files.size(); file_idx++) { auto &file_name = bind_data.files[file_idx]; - auto metadata = std::dynamic_pointer_cast(move(cache.Get(file_name))); + auto metadata_obj = cache.Get(file_name); + auto metadata = std::dynamic_pointer_cast(metadata_obj); auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileSystem::DEFAULT_LOCK, FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context)); // but we need to check if the metadata cache entries are current diff --git a/extension/parquet/parquet_reader.cpp b/extension/parquet/parquet_reader.cpp index 0e385fe1505..f67a804b17d 100644 --- a/extension/parquet/parquet_reader.cpp +++ b/extension/parquet/parquet_reader.cpp @@ -377,8 +377,8 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const if (!ObjectCache::ObjectCacheEnabled(context_p)) { metadata = LoadMetadata(allocator, *file_handle); } else { - metadata = std::dynamic_pointer_cast( - move(ObjectCache::GetObjectCache(context_p).Get(file_name))); + auto metadata_obj = ObjectCache::GetObjectCache(context_p).Get(file_name); + metadata = std::dynamic_pointer_cast(metadata_obj); if (!metadata || (last_modify_time + 10 >= metadata->read_time)) { metadata = LoadMetadata(allocator, *file_handle); ObjectCache::GetObjectCache(context_p).Put(file_name, metadata); From 7ad346ff930ccd60f259ee99e8b24daa91321183 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Mon, 20 Dec 2021 20:18:51 +0100 Subject: [PATCH 08/10] Avoid dynamic_pointer_cast --- .../parquet/include/parquet_file_metadata_cache.hpp | 9 +++++++++ extension/parquet/parquet-extension.cpp | 11 +++++++---- extension/parquet/parquet_reader.cpp | 3 +-- src/include/duckdb/storage/object_cache.hpp | 13 ++++++++++++- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/extension/parquet/include/parquet_file_metadata_cache.hpp b/extension/parquet/include/parquet_file_metadata_cache.hpp index 8103ec2ea46..79ac5629878 100644 --- a/extension/parquet/include/parquet_file_metadata_cache.hpp +++ b/extension/parquet/include/parquet_file_metadata_cache.hpp @@ -31,5 +31,14 @@ class ParquetFileMetadataCache : public ObjectCacheEntry { //! read time time_t read_time; + +public: + static string ObjectType() { + return "parquet_metadata"; + } + + string GetObjectType() override { + return ObjectType(); + } }; } // namespace duckdb diff --git a/extension/parquet/parquet-extension.cpp b/extension/parquet/parquet-extension.cpp index b8c878b5f1a..62067adfd1d 100644 --- a/extension/parquet/parquet-extension.cpp +++ b/extension/parquet/parquet-extension.cpp @@ -139,12 +139,15 @@ class ParquetScanFunction { FileSystem &fs = FileSystem::GetFileSystem(context); for (idx_t file_idx = 1; file_idx < bind_data.files.size(); file_idx++) { auto &file_name = bind_data.files[file_idx]; - auto metadata_obj = cache.Get(file_name); - auto metadata = std::dynamic_pointer_cast(metadata_obj); + auto metadata = cache.Get(file_name); + if (!metadata) { + // missing metadata entry in cache, no usable stats + return nullptr; + } auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileSystem::DEFAULT_LOCK, FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context)); - // but we need to check if the metadata cache entries are current - if (!metadata || (fs.GetLastModifiedTime(*handle) >= metadata->read_time)) { + // we need to check if the metadata cache entries are current + if (fs.GetLastModifiedTime(*handle) >= metadata->read_time) { // missing or invalid metadata entry in cache, no usable stats overall return nullptr; } diff --git a/extension/parquet/parquet_reader.cpp b/extension/parquet/parquet_reader.cpp index f67a804b17d..a3a66e7d66e 100644 --- a/extension/parquet/parquet_reader.cpp +++ b/extension/parquet/parquet_reader.cpp @@ -377,8 +377,7 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const if (!ObjectCache::ObjectCacheEnabled(context_p)) { metadata = LoadMetadata(allocator, *file_handle); } else { - auto metadata_obj = ObjectCache::GetObjectCache(context_p).Get(file_name); - metadata = std::dynamic_pointer_cast(metadata_obj); + auto metadata = ObjectCache::GetObjectCache(context_p).Get(file_name); if (!metadata || (last_modify_time + 10 >= metadata->read_time)) { metadata = LoadMetadata(allocator, *file_handle); ObjectCache::GetObjectCache(context_p).Put(file_name, metadata); diff --git a/src/include/duckdb/storage/object_cache.hpp b/src/include/duckdb/storage/object_cache.hpp index d4edfa76c40..f3033db935c 100644 --- a/src/include/duckdb/storage/object_cache.hpp +++ b/src/include/duckdb/storage/object_cache.hpp @@ -23,11 +23,13 @@ class ObjectCacheEntry { public: virtual ~ObjectCacheEntry() { } + + virtual string GetObjectType() = 0; }; class ObjectCache { public: - shared_ptr Get(string key) { + shared_ptr GetObject(const string &key) { lock_guard glock(lock); auto entry = cache.find(key); if (entry == cache.end()) { @@ -36,6 +38,15 @@ class ObjectCache { return entry->second; } + template + shared_ptr Get(const string &key) { + shared_ptr object = GetObject(key); + if (!object || object->GetObjectType() != T::ObjectType()) { + return nullptr; + } + return std::static_pointer_cast(object); + } + void Put(string key, shared_ptr value) { lock_guard glock(lock); cache[key] = move(value); From 4b124758fd33f49b75cf88811a93df321038c396 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Mon, 20 Dec 2021 22:20:56 +0100 Subject: [PATCH 09/10] Not like this --- extension/parquet/parquet_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/parquet/parquet_reader.cpp b/extension/parquet/parquet_reader.cpp index a3a66e7d66e..9c4b8607194 100644 --- a/extension/parquet/parquet_reader.cpp +++ b/extension/parquet/parquet_reader.cpp @@ -377,7 +377,7 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const if (!ObjectCache::ObjectCacheEnabled(context_p)) { metadata = LoadMetadata(allocator, *file_handle); } else { - auto metadata = ObjectCache::GetObjectCache(context_p).Get(file_name); + metadata = ObjectCache::GetObjectCache(context_p).Get(file_name); if (!metadata || (last_modify_time + 10 >= metadata->read_time)) { metadata = LoadMetadata(allocator, *file_handle); ObjectCache::GetObjectCache(context_p).Put(file_name, metadata); From 57309eedf7fed62b42d9f656699d1f2fc7d5ed1f Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 21 Dec 2021 23:26:43 +0100 Subject: [PATCH 10/10] Don't tell the thrift people --- third_party/thrift/thrift/protocol/TCompactProtocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/thrift/thrift/protocol/TCompactProtocol.h b/third_party/thrift/thrift/protocol/TCompactProtocol.h index 94341dfa8b9..b5912180b68 100755 --- a/third_party/thrift/thrift/protocol/TCompactProtocol.h +++ b/third_party/thrift/thrift/protocol/TCompactProtocol.h @@ -240,7 +240,7 @@ class TCompactProtocolFactoryT : public TProtocolFactory { void setContainerSizeLimit(int32_t container_limit) { container_limit_ = container_limit; } std::shared_ptr getProtocol(std::shared_ptr trans) override { - std::shared_ptr specific_trans = std::dynamic_pointer_cast(trans); + std::shared_ptr specific_trans = std::static_pointer_cast(trans); TProtocol* prot; if (specific_trans) { prot = new TCompactProtocolT(specific_trans, string_limit_, container_limit_);