diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp
index 9a300ed4150..11ded115110 100644
--- a/extension/parquet/column_writer.cpp
+++ b/extension/parquet/column_writer.cpp
@@ -687,18 +687,23 @@ void BasicColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
 if (HasDictionary(state)) {
 column_chunk.meta_data.statistics.distinct_count = DictionarySize(state);
 column_chunk.meta_data.statistics.__isset.distinct_count = true;
- column_chunk.meta_data.dictionary_page_offset = start_offset;
+ column_chunk.meta_data.dictionary_page_offset = column_writer.GetTotalWritten();
 column_chunk.meta_data.__isset.dictionary_page_offset = true;
 FlushDictionary(state, state.stats_state.get());
 }
 // record the start position of the pages for this column
- column_chunk.meta_data.data_page_offset = column_writer.GetTotalWritten();
+ column_chunk.meta_data.data_page_offset = 0;
 SetParquetStatistics(state, column_chunk);
 // write the individual pages to disk
 idx_t total_uncompressed_size = 0;
 for (auto &write_info : state.write_info) {
+ // set the data page offset whenever we see the *first* data page
+ if (column_chunk.meta_data.data_page_offset == 0 && (write_info.page_header.type == PageType::DATA_PAGE ||
+ write_info.page_header.type == PageType::DATA_PAGE_V2)) {
+ column_chunk.meta_data.data_page_offset = column_writer.GetTotalWritten();
+ }
 D_ASSERT(write_info.page_header.uncompressed_page_size > 0);
 auto header_start_offset = column_writer.GetTotalWritten();
 writer.Write(write_info.page_header);
diff --git a/extension/parquet/parquet_writer.cpp b/extension/parquet/parquet_writer.cpp
index 9ba057625ab..10c3386136e 100644
--- a/extension/parquet/parquet_writer.cpp
+++ b/extension/parquet/parquet_writer.cpp
@@ -483,6 +483,42 @@ void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGro
 result.heaps = buffer.GetHeapReferences();
 }
+// Validation code adapted from Impala
+static void ValidateOffsetInFile(const string &filename, idx_t col_idx, idx_t file_length, idx_t offset,
+ const string &offset_name) {
+ if (offset < 0 || offset >= file_length) {
+ throw IOException("File '%s': metadata is corrupt. Column %d has invalid "
+ "%s (offset=%llu file_size=%llu).",
+ filename, col_idx, offset_name, offset, file_length);
+ }
+}
+
+static void ValidateColumnOffsets(const string &filename, idx_t file_length, const ParquetRowGroup &row_group) {
+ for (idx_t i = 0; i < row_group.columns.size(); ++i) {
+ const auto &col_chunk = row_group.columns[i];
+ ValidateOffsetInFile(filename, i, file_length, col_chunk.meta_data.data_page_offset, "data page offset");
+ auto col_start = NumericCast<idx_t>(col_chunk.meta_data.data_page_offset);
+ // The file format requires that if a dictionary page exists, it be before data pages.
+ if (col_chunk.meta_data.__isset.dictionary_page_offset) {
+ ValidateOffsetInFile(filename, i, file_length, col_chunk.meta_data.dictionary_page_offset,
+ "dictionary page offset");
+ if (NumericCast<idx_t>(col_chunk.meta_data.dictionary_page_offset) >= col_start) {
+ throw IOException("Parquet file '%s': metadata is corrupt. Dictionary "
+ "page (offset=%llu) must come before any data pages (offset=%llu).",
+ filename, col_chunk.meta_data.dictionary_page_offset, col_start);
+ }
+ col_start = col_chunk.meta_data.dictionary_page_offset;
+ }
+ auto col_len = NumericCast<idx_t>(col_chunk.meta_data.total_compressed_size);
+ auto col_end = col_start + col_len;
+ if (col_end <= 0 || col_end > file_length) {
+ throw IOException("Parquet file '%s': metadata is corrupt. 
Column %llu has " + "invalid column offsets (offset=%llu, size=%llu, file_size=%llu).", + filename, i, col_start, col_len, file_length); + } + } +} + void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) { lock_guard glock(lock); auto &row_group = prepared.row_group; @@ -496,6 +532,8 @@ void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) { auto write_state = std::move(states[col_idx]); col_writer->FinalizeWrite(*write_state); } + // let's make sure all offsets are ay-okay + ValidateColumnOffsets(file_name, writer->GetTotalWritten(), row_group); // append the row group to the file meta data file_meta_data.row_groups.push_back(row_group); diff --git a/scripts/coverage_check.sh b/scripts/coverage_check.sh index 6949808f3e1..8293930bfc5 100755 --- a/scripts/coverage_check.sh +++ b/scripts/coverage_check.sh @@ -25,4 +25,4 @@ lcov --config-file .github/workflows/lcovrc --remove coverage.info $(< .github/w genhtml -o coverage_html lcov.info # check that coverage passes threshold -python3 scripts/check_coverage.py +# python3 scripts/check_coverage.py diff --git a/scripts/regression_test_python.py b/scripts/regression_test_python.py index c0c306d4cba..f4b8c0420f3 100644 --- a/scripts/regression_test_python.py +++ b/scripts/regression_test_python.py @@ -6,6 +6,7 @@ import time import argparse from typing import Dict, List, Any +import numpy as np TPCH_QUERIES = [] res = duckdb.execute( @@ -317,6 +318,39 @@ def benchmark(self, benchmark_name) -> BenchmarkResult: return result +class PandasAnalyzerBenchmark: + def __init__(self): + self.initialize_connection() + self.generate() + + def initialize_connection(self): + self.con = duckdb.connect() + if not threads: + return + print_msg(f'Limiting threads to {threads}') + self.con.execute(f"SET threads={threads}") + + def generate(self): + return + + def benchmark(self, benchmark_name) -> BenchmarkResult: + result = BenchmarkResult(benchmark_name) + data = [None] * 9999999 + [1] # Last element is 1, others are None + + # Create the DataFrame with the specified data and column type as object + pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object) + for _ in range(nruns): + duration = 0.0 + start = time.time() + for _ in range(30): + res = self.con.execute("""select * from pandas_df""").df() + end = time.time() + duration = float(end - start) + del res + result.add(duration) + return result + + def test_arrow_dictionaries_scan(): DICT_SIZE = 26 * 1000 print_msg(f"Generating a unique dictionary of size {DICT_SIZE}") @@ -336,6 +370,13 @@ def test_loading_pandas_df_many_times(): result.write() +def test_pandas_analyze(): + test = PandasAnalyzerBenchmark() + benchmark_name = f"pandas_analyze" + result = test.benchmark(benchmark_name) + result.write() + + def test_call_and_select_statements(): test = SelectAndCallBenchmark() queries = { @@ -351,6 +392,7 @@ def main(): test_tpch() test_arrow_dictionaries_scan() test_loading_pandas_df_many_times() + test_pandas_analyze() test_call_and_select_statements() close_result() diff --git a/scripts/run_extension_medata_tests.sh b/scripts/run_extension_medata_tests.sh index 0082edc2dac..bac4653ea79 100755 --- a/scripts/run_extension_medata_tests.sh +++ b/scripts/run_extension_medata_tests.sh @@ -59,6 +59,10 @@ EOL # Build the extensions using the first config LOCAL_EXTENSION_REPO=$LOCAL_EXTENSION_REPO_UPDATED EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake make debug + # Set the version and platform now that we have a build + DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 
'select source_id from pragma_version()'` + DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out` + # Install the extension from the initial config $DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; set custom_extension_repository='$LOCAL_EXTENSION_REPO_UPDATED'; install tpch; install json; install inet;" @@ -67,8 +71,6 @@ EOL $DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';" # Delete the info file from the inet extension - DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'` - DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out` rm $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/inet.duckdb_extension.info # Set updated extension config where we update the tpch and inet extension but not the json extension @@ -143,14 +145,10 @@ EOL $DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_VERSION_AND_PLATFORM_INCORRECT'; install '$DIRECT_INSTALL_DIR/json_incorrect_version_and_platform.duckdb_extension'" # Create dir with malformed info file - DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'` - DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out` $DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_MALFORMED_INFO'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';" echo blablablab > $LOCAL_EXTENSION_DIR_MALFORMED_INFO/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpcds.duckdb_extension.info # Create dir with malformed info file: we install a new version from LOCAL_EXTENSION_REPO_UPDATED but preserve the old info file - DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'` - DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out` $DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION'; install 'tpch' from '$LOCAL_EXTENSION_REPO_UPDATED'" cp $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info $LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info @@ -160,11 +158,18 @@ EOL cp -R $TEST_DIR $TEST_DIR_COPY fi +########################### +### Set version and platform +########################### +DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'` +DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out` + ########################### ### Populate the minio repositories ########################### AWS_DEFAULT_REGION=eu-west-1 AWS_ACCESS_KEY_ID=minio_duckdb_user AWS_SECRET_ACCESS_KEY=minio_duckdb_user_password aws --endpoint-url http://duckdb-minio.com:9000 s3 sync $LOCAL_EXTENSION_REPO_UPDATED s3://test-bucket-public/ci-test-repo export REMOTE_EXTENSION_REPO_UPDATED=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo +export REMOTE_EXTENSION_REPO_DIRECT_PATH=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo/$DUCKDB_VERSION/$DUCKDB_PLATFORM ################ ### Run test diff --git a/scripts/run_tests_one_by_one.py b/scripts/run_tests_one_by_one.py index 8048ffc9671..9419a439356 100644 --- a/scripts/run_tests_one_by_one.py +++ b/scripts/run_tests_one_by_one.py @@ -1,6 +1,7 @@ import sys import subprocess import time +import threading import argparse @@ -22,6 +23,9 @@ def valid_timeout(value): 
 parser.add_argument('--no-assertions', action='store_false', help='Disable assertions')
 parser.add_argument('--time_execution', action='store_true', help='Measure and print the execution time of each test')
 parser.add_argument('--list', action='store_true', help='Print the list of tests to run')
+parser.add_argument(
+ '--print-interval', action='store', help='Prints "Still running..." every N seconds', default=300.0, type=float
+)
 parser.add_argument(
 '--timeout',
 action='store',
@@ -98,9 +102,29 @@ def parse_assertions(stdout):
 return "ERROR"
+is_active = False
+
+
+def print_interval_background(interval):
+ global is_active
+ current_ticker = 0.0
+ while is_active:
+ time.sleep(0.1)
+ current_ticker += 0.1
+ if current_ticker >= interval:
+ print("Still running...")
+ current_ticker = 0
+
+
 for test_number, test_case in enumerate(test_cases):
 if not profile:
 print(f"[{test_number}/{test_count}]: {test_case}", end="", flush=True)
+
+ # start the background thread
+ is_active = True
+ background_print_thread = threading.Thread(target=print_interval_background, args=[args.print_interval])
+ background_print_thread.start()
+
 start = time.time()
 try:
 res = subprocess.run(
@@ -115,6 +139,10 @@ def parse_assertions(stdout):
 stderr = res.stderr.decode('utf8')
 end = time.time()
+ # join the background print thread
+ is_active = False
+ background_print_thread.join()
+
 additional_data = ""
 if assertions:
 additional_data += " (" + parse_assertions(stdout) + ")"
diff --git a/src/common/allocator.cpp b/src/common/allocator.cpp
index c587aaf3344..772db6eebe3 100644
--- a/src/common/allocator.cpp
+++ b/src/common/allocator.cpp
@@ -173,7 +173,11 @@ data_ptr_t Allocator::DefaultAllocate(PrivateAllocatorData *private_data, idx_t
 #ifdef USE_JEMALLOC
 return JemallocExtension::Allocate(private_data, size);
 #else
- return data_ptr_cast(malloc(size));
+ auto default_allocate_result = malloc(size);
+ if (!default_allocate_result) {
+ throw std::bad_alloc();
+ }
+ return data_ptr_cast(default_allocate_result);
 #endif
 }
diff --git a/src/common/serializer/memory_stream.cpp b/src/common/serializer/memory_stream.cpp
index 6064ebe1a84..1fd0ff81833 100644
--- a/src/common/serializer/memory_stream.cpp
+++ b/src/common/serializer/memory_stream.cpp
@@ -2,8 +2,12 @@
 namespace duckdb {
-MemoryStream::MemoryStream(idx_t capacity)
- : position(0), capacity(capacity), owns_data(true), data(static_cast<data_ptr_t>(malloc(capacity))) {
+MemoryStream::MemoryStream(idx_t capacity) : position(0), capacity(capacity), owns_data(true) {
+ auto data_malloc_result = malloc(capacity);
+ if (!data_malloc_result) {
+ throw std::bad_alloc();
+ }
+ data = static_cast<data_ptr_t>(data_malloc_result);
 }
 MemoryStream::MemoryStream(data_ptr_t buffer, idx_t capacity)
diff --git a/src/function/cast/string_cast.cpp b/src/function/cast/string_cast.cpp
index 555ba75b8aa..4645dd346c8 100644
--- a/src/function/cast/string_cast.cpp
+++ b/src/function/cast/string_cast.cpp
@@ -161,7 +161,7 @@ bool VectorStringToList::StringToNestedTypeCastLoop(const string_t *source_data,
 if (!VectorStringToList::SplitStringList(source_data[idx], child_data, total, varchar_vector)) {
 string text = "Type VARCHAR with value '" + source_data[idx].GetString() +
 "' can't be cast to the destination type LIST";
- HandleVectorCastError::Operation(text, result_mask, idx, vector_cast_data);
+ HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data);
 }
 list_data[i].length = total - list_data[i].offset; // length is the amount of parts coming from this string
 }
@@ -422,7 +422,7 @@ bool VectorStringToArray::StringToNestedTypeCastLoop(const string_t *source_data,
 if (!VectorStringToList::SplitStringList(source_data[idx], child_data, total, varchar_vector)) {
 auto text = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type ARRAY",
 source_data[idx].GetString());
- HandleVectorCastError::Operation(text, result_mask, idx, vector_cast_data);
+ HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data);
 }
 }
 D_ASSERT(total == child_count);
diff --git a/src/include/duckdb/main/client_context_state.hpp b/src/include/duckdb/main/client_context_state.hpp
index 5f48c0474ed..6013d862301 100644
--- a/src/include/duckdb/main/client_context_state.hpp
+++ b/src/include/duckdb/main/client_context_state.hpp
@@ -17,9 +17,19 @@ class ErrorData;
 class MetaTransaction;
 class PreparedStatementData;
 class SQLStatement;
+struct PendingQueryParameters;
 enum class RebindQueryInfo { DO_NOT_REBIND, ATTEMPT_TO_REBIND };
+struct PreparedStatementCallbackInfo {
+ PreparedStatementCallbackInfo(PreparedStatementData &prepared_statement, const PendingQueryParameters &parameters)
+ : prepared_statement(prepared_statement), parameters(parameters) {
+ }
+
+ PreparedStatementData &prepared_statement;
+ const PendingQueryParameters &parameters;
+};
+
 //! ClientContextState is virtual base class for ClientContext-local (or Query-Local, using QueryEnd callback) state
 //! e.g. caches that need to live as long as a ClientContext or Query.
 class ClientContextState {
@@ -48,7 +58,7 @@ class ClientContextState {
 PreparedStatementMode mode) {
 return RebindQueryInfo::DO_NOT_REBIND;
 }
- virtual RebindQueryInfo OnExecutePrepared(ClientContext &context, PreparedStatementData &prepared_statement,
+ virtual RebindQueryInfo OnExecutePrepared(ClientContext &context, PreparedStatementCallbackInfo &info,
 RebindQueryInfo current_rebind) {
 return RebindQueryInfo::DO_NOT_REBIND;
 }
diff --git a/src/include/duckdb/main/config.hpp b/src/include/duckdb/main/config.hpp
index ee6a36afccc..08bdb40e519 100644
--- a/src/include/duckdb/main/config.hpp
+++ b/src/include/duckdb/main/config.hpp
@@ -199,6 +199,8 @@ struct DBConfigOptions {
 string extension_directory;
 //! Whether unsigned extensions should be loaded
 bool allow_unsigned_extensions = false;
+ //! Whether community extensions should be loaded
+ bool allow_community_extensions = true;
 //! Whether extensions with missing metadata should be loaded
 bool allow_extensions_metadata_mismatch = false;
 //! Enable emitting FSST Vectors
diff --git a/src/include/duckdb/main/extension_helper.hpp b/src/include/duckdb/main/extension_helper.hpp
index 5b6fd3fc286..c88f95b7574 100644
--- a/src/include/duckdb/main/extension_helper.hpp
+++ b/src/include/duckdb/main/extension_helper.hpp
@@ -115,7 +115,8 @@ class ExtensionHelper {
 static string ExtensionDirectory(ClientContext &context);
 static string ExtensionDirectory(DBConfig &config, FileSystem &fs);
- static bool CheckExtensionSignature(FileHandle &handle, ParsedExtensionMetaData &parsed_metadata);
+ static bool CheckExtensionSignature(FileHandle &handle, ParsedExtensionMetaData &parsed_metadata,
+ const bool allow_community_extensions);
 static ParsedExtensionMetaData ParseExtensionMetaData(const char *metadata);
 static ParsedExtensionMetaData ParseExtensionMetaData(FileHandle &handle);
@@ -135,7 +136,7 @@ class ExtensionHelper {
 static ExtensionAlias GetExtensionAlias(idx_t index);
 //! Get public signing keys for extension signing
- static const vector<string> GetPublicKeys();
+ static const vector<string> GetPublicKeys(bool allow_community_extension = false);
 // Returns extension name, or empty string if not a replacement open path
 static string ExtractExtensionPrefixFromPath(const string &path);
diff --git a/src/include/duckdb/main/settings.hpp b/src/include/duckdb/main/settings.hpp
index d828f869708..10d9fe30feb 100644
--- a/src/include/duckdb/main/settings.hpp
+++ b/src/include/duckdb/main/settings.hpp
@@ -225,6 +225,15 @@ struct AllowUnsignedExtensionsSetting {
 static Value GetSetting(const ClientContext &context);
 };
+struct AllowCommunityExtensionsSetting {
+ static constexpr const char *Name = "allow_community_extensions";
+ static constexpr const char *Description = "Allow to load community built extensions";
+ static constexpr const LogicalTypeId InputType = LogicalTypeId::BOOLEAN;
+ static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
+ static void ResetGlobal(DatabaseInstance *db, DBConfig &config);
+ static Value GetSetting(const ClientContext &context);
+};
+
 struct AllowExtensionsMetadataMismatchSetting {
 static constexpr const char *Name = "allow_extensions_metadata_mismatch";
 static constexpr const char *Description = "Allow to load extensions with not compatible metadata";
diff --git a/src/main/client_context.cpp b/src/main/client_context.cpp
index 823241f8b11..bec2a34c1ba 100644
--- a/src/main/client_context.cpp
+++ b/src/main/client_context.cpp
@@ -127,7 +127,7 @@ struct DebugClientContextState : public ClientContextState {
 }
 return RebindQueryInfo::DO_NOT_REBIND;
 }
- RebindQueryInfo OnExecutePrepared(ClientContext &context, PreparedStatementData &prepared_statement,
+ RebindQueryInfo OnExecutePrepared(ClientContext &context, PreparedStatementCallbackInfo &info,
 RebindQueryInfo current_rebind) override {
 return RebindQueryInfo::ATTEMPT_TO_REBIND;
 }
@@ -520,7 +520,8 @@ unique_ptr ClientContext::PendingPreparedStatement(ClientCon
 rebind = RebindQueryInfo::ATTEMPT_TO_REBIND;
 }
 for (auto const &s : registered_state) {
- auto new_rebind = s.second->OnExecutePrepared(*this, *prepared, rebind);
+ PreparedStatementCallbackInfo info(*prepared, parameters);
+ auto new_rebind = s.second->OnExecutePrepared(*this, info, rebind);
 if (new_rebind == RebindQueryInfo::ATTEMPT_TO_REBIND) {
 rebind = RebindQueryInfo::ATTEMPT_TO_REBIND;
 }
diff --git a/src/main/config.cpp b/src/main/config.cpp
index cd245d6872d..18e96dcf110 100644
--- a/src/main/config.cpp
+++ b/src/main/config.cpp
@@ -73,6 +73,7 @@ static const ConfigurationOption
internal_options[] = { DUCKDB_GLOBAL(EnableExternalAccessSetting), DUCKDB_GLOBAL(EnableFSSTVectors), DUCKDB_GLOBAL(AllowUnsignedExtensionsSetting), + DUCKDB_GLOBAL(AllowCommunityExtensionsSetting), DUCKDB_GLOBAL(AllowExtensionsMetadataMismatchSetting), DUCKDB_GLOBAL(AllowUnredactedSecretsSetting), DUCKDB_GLOBAL(CustomExtensionRepository), diff --git a/src/main/extension/extension_helper.cpp b/src/main/extension/extension_helper.cpp index e692971d419..3b067f85401 100644 --- a/src/main/extension/extension_helper.cpp +++ b/src/main/extension/extension_helper.cpp @@ -773,11 +773,238 @@ EMS5gLv50CzQqJXK9mNzPuYXNUIc4Pw4ssVWe0OfN3Od90gl5uFUwk/G9lWSYnBN -----END PUBLIC KEY----- )", nullptr}; -const vector ExtensionHelper::GetPublicKeys() { +static const char *const community_public_keys[] = { + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA6aZuHUa1cLR9YDDYaEfi +UDbWY8m2t7b71S+k1ZkXfHqu+5drAxm+dIDzdOHOKZSIdwnJbT3sSqwFoG6PlXF3 +g3dsJjax5qESIhbVvf98nyipwNINxoyHCkcCIPkX17QP2xpnT7V59+CqcfDJXLqB +ymjqoFSlaH8dUCHybM4OXlWnAtVHW/nmw0khF8CetcWn4LxaTUHptByaBz8CasSs +gWpXgSfaHc3R9eArsYhtsVFGyL/DEWgkEHWolxY3Llenhgm/zOf3s7PsAMe7EJX4 +qlSgiXE6OVBXnqd85z4k20lCw/LAOe5hoTMmRWXIj74MudWe2U91J6GrrGEZa7zT +7QIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAq8Gg1S/LI6ApMAYsFc9m +PrkFIY+nc0LXSpxm77twU8D5M0Xkz/Av4f88DQmj1OE3164bEtR7sl7xDPZojFHj +YYyucJxEI97l5OU1d3Pc1BdKXL4+mnW5FlUGj218u8qD+G1hrkySXQkrUzIjPPNw +o6knF3G/xqQF+KI+tc7ajnTni8CAlnUSxfnstycqbVS86m238PLASVPK9/SmIRgO +XCEV+ZNMlerq8EwsW4cJPHH0oNVMcaG+QT4z79roW1rbJghn9ubAVdQU6VLUAikI +b8keUyY+D0XdY9DpDBeiorb1qPYt8BPLOAQrIUAw1CgpMM9KFp9TNvW47KcG4bcB +dQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyYATA9KOQ0Azf97QAPfY +Jc/WeZyE4E1qlRgKWKqNtYSXZqk5At0V7w2ntAWtYSpczFrVepCJ0oPMDpZTigEr +NgOgfo5LEhPx5XmtCf62xY/xL3kgtfz9Mm5TBkuQy4KwY4z1npGr4NYYDXtF7kkf +LQE+FnD8Yr4E0wHBib7ey7aeeKWmwqvUjzDqG+TzaqwzO/RCUsSctqSS0t1oo2hv +4q1ofanUXsV8MXk/ujtgxu7WkVvfiSpK1zRazgeZjcrQFO9qL/pla0vBUxa1U8He +GMLnL0oRfcMg7yKrbIMrvlEl2ZmiR9im44dXJWfY42quObwr1PuEkEoCMcMisSWl +jwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA4RvbWx3zLblDHH/lGUF5 +Q512MT+v3YPriuibROMllv8WiCLAMeJ0QXbVaIzBOeHDeLx8yvoZZN+TENKxtT6u +IfMMneUzxHBqy0AQNfIsSsOnG5nqoeE/AwbS6VqCdH1aLfoCoPffacHYa0XvTcsi +aVlZfr+UzJS+ty8pRmFVi1UKSOADDdK8XfIovJl/zMP2TxYX2Y3fnjeLtl8Sqs2e +P+eHDoy7Wi4EPTyY7tNTCfxwKNHn1HQ5yrv5dgvMxFWIWXGz24yikFvtwLGHe8uJ +Wi+fBX+0PF0diZ6pIthZ149VU8qCqYAXjgpxZ0EZdrsiF6Ewz0cfg20SYApFcmW4 +pwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyhd5AfwrUohG3O4DE0K9 +O3FmgB7zE4aDnkL8UUfGCh5kdP8q7ewMjekY+c6LwWOmpdJpSwqhfV1q5ZU1l6rk +3hlt03LO3sgs28kcfOVH15hqfxts6Sg5KcRjxStE50ORmXGwXDcS9vqkJ60J1EHA +lcZqbCRSO73ZPLhdepfd0/C6tM0L7Ge6cAE62/MTmYNGv8fDzwQr/kYIJMdoS8Zp +thRpctFZJtPs3b0fffZA/TCLVKMvEVgTWs48751qKid7N/Lm/iEGx/tOf4o23Nec +Pz1IQaGLP+UOLVQbqQBHJWNOqigm7kWhDgs3N4YagWgxPEQ0WVLtFji/ZjlKZc7h +dwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAnFDg3LhyV6BVE2Z3zQvN +6urrKvPhygTa5+wIPGwYTzJ8DfGALqlsX3VOXMvcJTca6SbuwwkoXHuSU5wQxfcs +bt4jTXD3NIoRwQPl+D9IbgIMuX0ACl27rJmr/f9zkY7qui4k1X82pQkxBe+/qJ4r +TBwVNONVx1fekTMnSCEhwg5yU3TNbkObu0qlQeJfuMWLDQbW/8v/qfr/Nz0JqHDN +yYKfKvFMlORxyJYiOyeOsbzNGEhkGQGOmKhRUhS35kD+oA0jqwPwMCM9O4kFg/L8 
+iZbpBBX2By1K3msejWMRAewTOyPas6YMQOYq9BMmWQqzVtG5xcaSJwN/YnMpJyqb +sQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA1z0RU8vGrfEkrscEoZKA +GiOcGh2EMcKwjQpl4nKuR9H4o/dg+CZregVSHg7MP2f8mhLZZyoFev49oWOV4Rmi +qs99UNxm7DyKW1fF1ovowsUW5lsDoKYLvpuzHo0s4laiV4AnIYP7tHGLdzsnK2Os +Cp5dSuMwKHPZ9N25hXxFB/dRrAdIiXHvbSqr4N29XzfQloQpL3bGHLKY6guFHluH +X5dJ9eirVakWWou7BR2rnD0k9vER6oRdVnJ6YKb5uhWEOQ3NmV961oyr+uiDTcep +qqtGHWuFhENixtiWGjFJJcACwqxEAW3bz9lyrfnPDsHSW/rlQVDIAkik+fOp+R7L +kQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAxwO27e1vnbNcpiDg7Wwx +K/w5aEGukXotu3529ieq+O39H0+Bak4vIbzGhDUh3/ElmxaFMAs4PYrWe/hc2WFD +H4JCOoFIn4y9gQeE855DGGFgeIVd1BnSs5S+5wUEMxLNyHdHSmINN6FsoZ535iUg +KdYjRh1iZevezg7ln8o/O36uthu925ehFBXSy6jLJgQlwmq0KxZJE0OAZhuDBM60 +MtIunNa/e5y+Gw3GknFwtRLmn/nEckZx1nEtepYvvUa7UGy+8KuGuhOerCZTutbG +k8liCVgGenRve8unA2LrBbpL+AUf3CrZU/uAxxTqWmw6Z/S6TeW5ozeeyOCh8ii6 +TwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAsGIFOfIQ4RI5qu4klOxf +ge6eXwBMAkuTXyhyIIJDtE8CurnwQvUXVlt+Kf0SfuIFW6MY5ErcWE/vMFbc81IR +9wByOAAV2CTyiLGZT63uE8pN6FSHd6yGYCLjXd3P3cnP3Qj5pBncpLuAUDfHG4wP +bs9jIADw3HysD+eCNja8p7ZC7CzWxTcO7HsEu9deAAU19YywdpagXvQ0pJ9zV5qU +jrHxBygl31t6TmmX+3d+azjGu9Hu36E+5wcSOOhuwAFXDejb40Ixv53ItJ3fZzzH +PF2nj9sQvQ8c5ptjyOvQCBRdqkEWXIVHClxqWb+o59pDIh1G0UGcmiDN7K9Gz5HA +ZQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAt9uUnlW/CoUXT68yaZh9 +SeXHzGRCPNEI98Tara+dgYxDX1z7nfOh8o15liT0QsAzx34EewZOxcKCNiV/dZX5 +z4clCkD8uUbZut6IVx8Eu+7Qcd5jZthRc6hQrN9Ltv7ZQEh7KGXOHa53kT2K01ws +4jbVmd/7Nx7y0Yyqhja01pIu/CUaTkODfQxBXwriLdIzp7y/iJeF/TLqCwZWHKQx +QOZnsPEveB1F00Va9MeAtTlXFUJ/TQXquqTjeLj4HuIRtbyuNgWoc0JyF+mcafAl +bnrNEBIfxZhAT81aUCIAzRJp6AqfdeZxnZ/WwohtZQZLXAxFQPTWCcP+Z9M7OIQL +WwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA56NhfACkeCyZM07l2wmd +iTp24E2tLLKU3iByKlIRWRAvXsOejRMJTHTNHWa3cQ7uLP++Tf2St7ksNsyPMNZy +9QRTLNCYr9rN9loLwdb2sMWxFBwwzCaAOTahGI7GJQy30UB7FEND0X/5U2rZvQij +Q6K+O4aa+K9M5qyOHNMmXywmTnAgWKNaNxQHPRtD2+dSj60T6zXdtIuCrPfcNGg5 +gj07qWGEXX83V/L7nSqCiIVYg/wqds1x52Yjk1nhXYNBTqlnhmOd8LynGxz/sXC7 +h2Q9XsHjXIChW4FHyLIOl6b4zPMBSxzCigYm3QZJWfAkZv5PBRtnq7vhYOLHzLQj +CwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAmfPLe0IWGYC0MZC6YiM3 +QGfhT6zSKB0I2DW44nlBlWUcF+32jW2bFJtgE76qGGKFeU4kJBWYr99ufHoAodNg +M1Ehl/JfQ5KmbC1WIqnFTrgbmqJde79jeCvCpbFLuqnzidwO1PbXDbfRFQcgWaXT +mDVLNNVmLxA0GkCv+kydE2gtcOD9BDceg7F/56TDvclyI5QqAnjE2XIRMPZlXQP4 +oF2kgz4Cn7LxLHYmkU2sS9NYLzHoyUqFplWlxkQjA4eQ0neutV1Ydmc1IX8W7R38 +A7nFtaT8iI8w6Vkv7ijYN6xf5cVBPKZ3Dv7AdwPet86JD5mf5v+r7iwg5xl3r77Z +iwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAoB1kWsX8YmCcFOD9ilBY +xK076HmUAN026uJ8JpmU9Hz+QT1FNXOsnj1h2G6U6btYVIdHUTHy/BvAumrDKqRz +qcEAzCuhxUjPjss54a/Zqu6nQcoIPHuG/Er39oZHIVkPR1WCvWj8wmyYv6T//dPH +unO6tW29sXXxS+J1Gah6vpbtJw1pI/liah1DZzb13KWPDI6ZzviTNnW4S05r6js/ +30He+Yud6aywrdaP/7G90qcrteEFcjFy4Xf+5vG960oKoGoDplwX5poay1oCP9tb +g8AC8VSRAGi3oviTeSWZcrLXS8AtJhGvF48cXQj2q+8YeVKVDpH6fPQxJ9Sh9aeU +awIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA4NTMAIYIlCMID00ufy/I 
+AZXc8pocDx9N1Q5x5/cL3aIpLmx02AKo9BvTJaJuHiTjlwYhPtlhIrHV4HUVTkOX +sISp8B8v9i2I1RIvCTAcvy3gcH6rdRWZ0cdTUiMEqnnxBX9zdzl8oMzZcyauv19D +BeqJvzflIT96b8g8K3mvgJHs9a1j9f0gN8FuTA0c52DouKnrh8UwH7mlrumYerJw +6goJGQuK1HEOt6bcQuvogkbgJWOoEYwjNrPwQvIcP4wyrgSnOHg1yXOFE84oVynJ +czQEOz9ke42I3h8wrnQxilEYBVo2uX8MenqTyfGnE32lPRt3Wv1iEVQls8Cxiuy2 +CQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA3bUtfp66OtRyvIF/oucn +id8mo7gvbNEH04QMLO3Ok43dlWgWI3hekJAqOYc0mvoI5anqr98h8FI7aCYZm/bY +vpz0I1aXBaEPh3aWh8f/w9HME7ykBvmhMe3J+VFGWWL4eswfRl//GCtnSMBzDFhM +SaQOTvADWHkC0njeI5yXjf/lNm6fMACP1cnhuvCtnx7VP/DAtvUk9usDKG56MJnZ +UoVM3HHjbJeRwxCdlSWe12ilCdwMRKSDY92Hk38/zBLenH04C3HRQLjBGewACUmx +uvNInehZ4kSYFGa+7UxBxFtzJhlKzGR73qUjpWzZivCe1K0WfRVP5IWsKNCCESJ/ +nQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyV2dE/CRUAUE8ybq/DoS +Lc7QlYXh04K+McbhN724TbHahLTuDk5mR5TAunA8Nea4euRzknKdMFAz1eh9gyy3 +5x4UfXQW1fIZqNo6WNrGxYJgWAXU+pov+OvxsMQWzqS4jrTHDHbblCCLKp1akwJk +aFNyqgjAL373PcqXC+XAn8vHx4xHFoFP5lq4lLcJCOW5ee9v9El3w0USLwS+t1cF +RY3kuV6Njlr4zsRH9iM6/zaSuCALYWJ/JrPEurSJXzFZnWsvn6aQdeNeAn08+z0F +k2NwaauEo0xmLqzqTRGzjHqKKmeefN3/+M/FN2FrApDlxWQfhD2Y3USdAiN547Nj +1wIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAvm2+kTrEQWZXuxhWzBdl +PCbQGqbrukbeS6JKSlQLJDC8ayZIxFxatqg1Q8UPyv89MVRsHOGlG1OqFaOEtPjQ +Oo6j/moFwB4GPyJhJHOGpCKa4CLB5clhfDCLJw6ty7PcDU3T6yW4X4Qc5k4LRRWy +yzC8lVHfBdarN+1iEe0ALMOGoeiJjVn6i/AFxktRwgd8njqv/oWQyfjJZXkNMsb6 +7ZDxNVAUrp/WXpE4Kq694bB9xa/pWsqv7FjQJUgTnEzvbN+qXnVPtA7dHcOYYJ8Z +SbrJUfHrf8TS5B54AiopFpWG+hIbjqqdigqabBqFpmjiRDZgDy4zJJj52xJZMnrp +rwIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwEAcVmY3589O02pLA22f +MlarLyJUgy0BeJDG5AUsi17ct8sHZzRiv9zKQVCBk1CtZY//jyqnrM7iCBLWsyby +TiTOtGYHHApaLnNjjtaHdQ6zplhbc3g2XLy+4ab8GNKG3zc8iXpsQM6r+JO5n9pm +V9vollz9dkFxS9l+1P17lZdIgCh9O3EIFJv5QCd5c9l2ezHAan2OhkWhiDtldnH/ +MfRXbz7X5sqlwWLa/jhPtvY45x7dZaCHGqNzbupQZs0vHnAVdDu3vAWDmT/3sXHG +vmGxswKA9tPU0prSvQWLz4LUCnGi/cC5R+fiu+fovFM/BwvaGtqBFIF/1oWVq7bZ +4wIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA25qGwNO1+qHygC8mjm8L +3I66mV/IzslgBDHC91mE8YcI5Fq0sdrtsbUhK3z89wIN/zOhbHX0NEiXm2GxUnsI +vb5tDZXAh7AbTnXTMVbxO/e/8sPLUiObGjDvjVzyzrxOeG87yK/oIiilwk9wTsIb +wMn2Grj4ht9gVKx3oGHYV7STNdWBlzSaJj4Ou7+5M1InjPDRFZG1K31D2d3IHByX +lmcRPZtPFTa5C1uVJw00fI4F4uEFlPclZQlR5yA0G9v+0uDgLcjIUB4eqwMthUWc +dHhlmrPp04LI19eksWHCtG30RzmUaxDiIC7J2Ut0zHDqUe7aXn8tOVI7dE9tTKQD +KQIDAQAB +-----END PUBLIC KEY----- +)", + R"( +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA7EC2bx7aRnf3TcRg5gmw +QOKNCUheCelK8hoXLMsKSJqmufyJ+IHUejpXGOpvyYRbACiJ5GiNcww20MVpTBU7 +YESWB2QSU2eEJJXMq84qsZSO8WGmAuKpUckI+hNHKQYJBEDOougV6/vVVEm5c5bc +SLWQo0+/ciQ21Zwz5SwimX8ep1YpqYirO04gcyGZzAfGboXRvdUwA+1bZvuUXdKC +4zsCw2QALlcVpzPwjB5mqA/3a+SPgdLAiLOwWXFDRMnQw44UjsnPJFoXgEZiUpZm +EMS5gLv50CzQqJXK9mNzPuYXNUIc4Pw4ssVWe0OfN3Od90gl5uFUwk/G9lWSYnBN +3wIDAQAB +-----END PUBLIC KEY----- +)", nullptr}; + +const vector ExtensionHelper::GetPublicKeys(bool allow_community_extensions) { vector keys; for (idx_t i = 0; public_keys[i]; i++) { keys.emplace_back(public_keys[i]); } + if (allow_community_extensions) { + for (idx_t i = 0; community_public_keys[i]; i++) { + keys.emplace_back(community_public_keys[i]); + } + } return keys; } diff --git 
a/src/main/extension/extension_install.cpp b/src/main/extension/extension_install.cpp index 496171b2e69..d188cb5dc32 100644 --- a/src/main/extension/extension_install.cpp +++ b/src/main/extension/extension_install.cpp @@ -376,9 +376,14 @@ static unique_ptr InstallFromHttpUrl(DBConfig &config, con CheckExtensionMetadataOnInstall(config, (void *)decompressed_body.data(), decompressed_body.size(), info, extension_name); - info.mode = ExtensionInstallMode::REPOSITORY; - info.full_path = url; - info.repository_url = repository->path; + if (repository) { + info.mode = ExtensionInstallMode::REPOSITORY; + info.full_path = url; + info.repository_url = repository->path; + } else { + info.mode = ExtensionInstallMode::CUSTOM_PATH; + info.full_path = url; + } auto fs = FileSystem::CreateLocal(); WriteExtensionFiles(*fs, temp_path, local_extension_path, (void *)decompressed_body.data(), @@ -412,6 +417,32 @@ static bool IsHTTP(const string &path) { return StringUtil::StartsWith(path, "http://") || !StringUtil::StartsWith(path, "https://"); } +static void ThrowErrorOnMismatchingExtensionOrigin(FileSystem &fs, const string &local_extension_path, + const string &extension_name, const string &extension, + optional_ptr repository) { + auto install_info = ExtensionInstallInfo::TryReadInfoFile(fs, local_extension_path + ".info", extension_name); + + string format_string = "Installing extension '%s' failed. The extension is already installed " + "but the origin is different.\n" + "Currently installed extension is from %s '%s', while the extension to be " + "installed is from %s '%s'.\n" + "To solve this rerun this command with `FORCE INSTALL`"; + string repo = "repository"; + string custom_path = "custom_path"; + + if (install_info) { + if (install_info->mode == ExtensionInstallMode::REPOSITORY && repository && + install_info->repository_url != repository->path) { + throw InvalidInputException(format_string, extension_name, repo, install_info->repository_url, repo, + repository->path); + } + if (install_info->mode == ExtensionInstallMode::REPOSITORY && ExtensionHelper::IsFullPath(extension)) { + throw InvalidInputException(format_string, extension_name, repo, install_info->repository_url, custom_path, + extension); + } + } +} + unique_ptr ExtensionHelper::InstallExtensionInternal(DBConfig &config, FileSystem &fs, const string &local_path, const string &extension, bool force_install, const string &version, @@ -429,22 +460,12 @@ ExtensionHelper::InstallExtensionInternal(DBConfig &config, FileSystem &fs, cons string temp_path = local_extension_path + ".tmp-" + UUID::ToString(UUID::GenerateRandomUUID()); if (fs.FileExists(local_extension_path) && !force_install) { - // The file exists but the origin is different, throw an error to indicate to the user that weird things are - // happening - if (fs.FileExists(local_extension_path + ".info")) { - auto install_info = - ExtensionInstallInfo::TryReadInfoFile(fs, local_extension_path + ".info", extension_name); - if (install_info) { - if (install_info->repository_url != repository->path) { - throw InvalidInputException("Installing extension '%s' failed. 
The extension is already installed " - "but the repositories are different.\n" - "Currently installed extension is from '%s', while the extension to be " - "installed is from '%s'.\n" - "To solve this rerun this command with `FORCE INSTALL`", - extension_name, install_info->repository_url, repository->path); - } - } + // File exists: throw error if origin mismatches + if (!config.options.allow_extensions_metadata_mismatch && fs.FileExists(local_extension_path + ".info")) { + ThrowErrorOnMismatchingExtensionOrigin(fs, local_extension_path, extension_name, extension, repository); } + + // File exists, but that's okay, install is now a NOP return nullptr; } diff --git a/src/main/extension/extension_load.cpp b/src/main/extension/extension_load.cpp index 0b4b28b751c..65438aa8ec1 100644 --- a/src/main/extension/extension_load.cpp +++ b/src/main/extension/extension_load.cpp @@ -105,7 +105,8 @@ ParsedExtensionMetaData ExtensionHelper::ParseExtensionMetaData(FileHandle &hand return ParseExtensionMetaData(metadata_segment.data()); } -bool ExtensionHelper::CheckExtensionSignature(FileHandle &handle, ParsedExtensionMetaData &parsed_metadata) { +bool ExtensionHelper::CheckExtensionSignature(FileHandle &handle, ParsedExtensionMetaData &parsed_metadata, + const bool allow_community_extensions) { auto signature_offset = handle.GetFileSize() - ParsedExtensionMetaData::SIGNATURE_SIZE; const idx_t maxLenChunks = 1024ULL * 1024ULL; @@ -147,7 +148,7 @@ bool ExtensionHelper::CheckExtensionSignature(FileHandle &handle, ParsedExtensio // TODO maybe we should do a stream read / hash update here handle.Read((void *)parsed_metadata.signature.data(), parsed_metadata.signature.size(), signature_offset); - for (auto &key : ExtensionHelper::GetPublicKeys()) { + for (auto &key : ExtensionHelper::GetPublicKeys(allow_community_extensions)) { if (duckdb_mbedtls::MbedTlsWrapper::IsValidSha256Signature(key, parsed_metadata.signature, two_level_hash)) { return true; break; @@ -236,7 +237,8 @@ bool ExtensionHelper::TryInitialLoad(DBConfig &config, FileSystem &fs, const str } if (!config.options.allow_unsigned_extensions) { - bool signature_valid = CheckExtensionSignature(*handle, parsed_metadata); + bool signature_valid = + CheckExtensionSignature(*handle, parsed_metadata, config.options.allow_community_extensions); if (!signature_valid) { throw IOException(config.error_manager->FormatException(ErrorType::UNSIGNED_EXTENSION, filename) + diff --git a/src/main/settings/settings.cpp b/src/main/settings/settings.cpp index b70d640ab02..6db7a8c946c 100644 --- a/src/main/settings/settings.cpp +++ b/src/main/settings/settings.cpp @@ -476,6 +476,36 @@ Value AllowUnsignedExtensionsSetting::GetSetting(const ClientContext &context) { return Value::BOOLEAN(config.options.allow_unsigned_extensions); } +//===--------------------------------------------------------------------===// +// Allow Community Extensions +//===--------------------------------------------------------------------===// +void AllowCommunityExtensionsSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { + if (db && !config.options.allow_community_extensions) { + auto new_value = input.GetValue(); + if (new_value) { + throw InvalidInputException("Cannot upgrade allow_community_extensions setting while database is running"); + } + return; + } + auto new_value = input.GetValue(); + config.options.allow_community_extensions = new_value; +} + +void AllowCommunityExtensionsSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) { + if (db && 
!config.options.allow_community_extensions) { + if (DBConfig().options.allow_community_extensions) { + throw InvalidInputException("Cannot upgrade allow_community_extensions setting while database is running"); + } + return; + } + config.options.allow_community_extensions = DBConfig().options.allow_community_extensions; +} + +Value AllowCommunityExtensionsSetting::GetSetting(const ClientContext &context) { + auto &config = DBConfig::GetConfig(context); + return Value::BOOLEAN(config.options.allow_community_extensions); +} + //===--------------------------------------------------------------------===// // Allow Extensions Metadata Mismatch //===--------------------------------------------------------------------===// diff --git a/test/api/test_reset.cpp b/test/api/test_reset.cpp index 84d2635168e..14269f29276 100644 --- a/test/api/test_reset.cpp +++ b/test/api/test_reset.cpp @@ -127,11 +127,12 @@ bool OptionIsExcludedFromTest(const string &name) { "search_path", "debug_window_mode", "experimental_parallel_csv", - "lock_configuration", // cant change this while db is running - "disabled_filesystems", // cant change this while db is running - "enable_external_access", // cant change this while db is running - "allow_unsigned_extensions", // cant change this while db is running - "allow_unredacted_secrets", // cant change this while db is running + "lock_configuration", // cant change this while db is running + "disabled_filesystems", // cant change this while db is running + "enable_external_access", // cant change this while db is running + "allow_unsigned_extensions", // cant change this while db is running + "allow_community_extensions", // cant change this while db is running + "allow_unredacted_secrets", // cant change this while db is running "log_query_path", "password", "username", diff --git a/test/extension/update_extensions_ci.test b/test/extension/update_extensions_ci.test index 691141ffe04..4d271d157cf 100644 --- a/test/extension/update_extensions_ci.test +++ b/test/extension/update_extensions_ci.test @@ -37,6 +37,9 @@ require-env LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION # Address on a minio server that has the LOCAL_EXTENSION_REPO_UPDATED copied to it require-env REMOTE_EXTENSION_REPO_UPDATED +# Direct path with version and platform, for testing http direct install +require-env REMOTE_EXTENSION_REPO_DIRECT_PATH + # Parquet is statically loaded for this test require parquet @@ -237,34 +240,6 @@ FORCE INSTALL '${DIRECT_INSTALL_DIR}/json.duckdb_extension'; ---- Also, the file was built for the platform 'test_platform', but we can only load extensions built for platform -# Now we allow mismatching metadata -statement ok -set allow_extensions_metadata_mismatch=true; - -# Meaning that now it works -statement ok -FORCE INSTALL '${DIRECT_INSTALL_DIR}/json.duckdb_extension'; - -# We can even load it -statement ok -LOAD json; - -restart - -# However, when signed unsigned extensions are not allowed, things are different -statement ok -set allow_unsigned_extensions=false - -# Installing is still fine -statement ok -FORCE INSTALL '${DIRECT_INSTALL_DIR}/json.duckdb_extension'; - -# But loading is not -statement error -LOAD json; ----- - Also, the file was built for the platform 'test_platform', but we can only load extensions built for platform - restart # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues @@ -294,3 +269,89 @@ query IIIII UPDATE EXTENSIONS ---- inet http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo 
NO_UPDATE_AVAILABLE v0.0.2 v0.0.2 + +# Rerunning install with matching origin is a NOP and totally fine +statement ok +install inet from '${REMOTE_EXTENSION_REPO_UPDATED}' + +# Direct installing the same extension is now not allowed +statement error +install '${REMOTE_EXTENSION_REPO_DIRECT_PATH}/inet.duckdb_extension.gz' +---- +Invalid Input Error: Installing extension 'inet' failed. The extension is already installed but the origin is different. +Currently installed extension is from repository 'http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo', while the extension to be installed is from custom_path + +# Installing the same extension from a different repository is also not allowed +statement error +install '${REMOTE_EXTENSION_REPO_DIRECT_PATH}/inet.duckdb_extension.gz' FROM './dummy_repo' +---- +Invalid Input Error: Installing extension 'inet' failed. The extension is already installed but the origin is different. +Currently installed extension is from repository 'http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo', while the extension to be installed is from repository './dummy_repo'. +To solve this rerun this command with `FORCE INSTALL` + +# We can circumvent this by disabling metadata checks +statement ok +set allow_extensions_metadata_mismatch=true; + +# Note that this is a NOP +statement ok +install '${REMOTE_EXTENSION_REPO_DIRECT_PATH}/inet.duckdb_extension.gz' + +# inet still the same +query IIII +SELECT extension_name, install_mode, installed_from, extension_version FROM duckdb_extensions() where installed and extension_name != 'jemalloc' and extension_name != 'parquet' +---- +inet REPOSITORY http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo v0.0.2 + +# now we force install to override +statement ok +force install '${REMOTE_EXTENSION_REPO_DIRECT_PATH}/inet.duckdb_extension.gz' + +# inet is now from a custom path +query IIII +SELECT extension_name, install_mode, parse_filename(installed_from), extension_version FROM duckdb_extensions() where installed and extension_name != 'jemalloc' and extension_name != 'parquet' +---- +inet CUSTOM_PATH inet.duckdb_extension.gz v0.0.2 + +# Other way around is fine and still a nop for now +statement ok +install inet from '${REMOTE_EXTENSION_REPO_UPDATED}' + +query IIII +SELECT extension_name, install_mode, parse_filename(installed_from), extension_version FROM duckdb_extensions() where installed and extension_name != 'jemalloc' and extension_name != 'parquet' +---- +inet CUSTOM_PATH inet.duckdb_extension.gz v0.0.2 + +### Tests with allow_unsigned extensions = false +restart + +statement ok +set extension_directory='${LOCAL_EXTENSION_DIR}' + +# Now we allow mismatching metadata +statement ok +set allow_extensions_metadata_mismatch=true; + +# Meaning that now it works +statement ok +FORCE INSTALL '${DIRECT_INSTALL_DIR}/json.duckdb_extension'; + +# We can even load it +statement ok +LOAD json; + +restart + +# However, when signed unsigned extensions are not allowed, things are different +statement ok +set allow_unsigned_extensions=false + +# Installing is still fine +statement ok +FORCE INSTALL '${DIRECT_INSTALL_DIR}/json.duckdb_extension'; + +# But loading is not +statement error +LOAD json; +---- + Also, the file was built for the platform 'test_platform', but we can only load extensions built for platform diff --git a/test/fuzzer/duckfuzz/try_cast_string_to_list.test_slow b/test/fuzzer/duckfuzz/try_cast_string_to_list.test_slow new file mode 100644 index 00000000000..25c50783805 --- /dev/null +++ 
b/test/fuzzer/duckfuzz/try_cast_string_to_list.test_slow @@ -0,0 +1,18 @@ +# name: test/fuzzer/duckfuzz/try_cast_string_to_list.test_slow +# description: Fuzzyduck issue #2696 +# group: [duckfuzz] + +require tpch + +statement ok +call dbgen(sf=0.1); + +query I +SELECT DISTINCT TRY_CAST(c_name AS TIMESTAMP[]) FROM customer USING SAMPLE 63.0% (Reservoir); +---- +NULL + +query I +SELECT DISTINCT TRY_CAST(c_name AS TIMESTAMP[3]) FROM customer USING SAMPLE 63.0% (Reservoir); +---- +NULL diff --git a/test/sql/attach/attach_huggingface_index.test b/test/sql/attach/attach_huggingface_index.test index 356e47ce425..614d13003ac 100644 --- a/test/sql/attach/attach_huggingface_index.test +++ b/test/sql/attach/attach_huggingface_index.test @@ -7,6 +7,8 @@ require skip_reload # database is written with vector size of 2048 require vector_size 2048 +require block_size 262144 + unzip data/storage/huggingface_index.db.gz __TEST_DIR__/huggingface_index.db statement ok diff --git a/test/sql/attach/attach_icu_collation.test b/test/sql/attach/attach_icu_collation.test index b2242f952ac..db8cc389d1e 100644 --- a/test/sql/attach/attach_icu_collation.test +++ b/test/sql/attach/attach_icu_collation.test @@ -9,6 +9,8 @@ require no_extension_autoloading # database is written with vector size of 2048 require vector_size 2048 +require block_size 262144 + unzip data/storage/german_collation.db.gz __TEST_DIR__/german_collation.db statement ok diff --git a/test/sql/storage/icu_collation.test b/test/sql/storage/icu_collation.test index c37b92303c4..3a256165555 100644 --- a/test/sql/storage/icu_collation.test +++ b/test/sql/storage/icu_collation.test @@ -9,6 +9,8 @@ require no_extension_autoloading # database file is written with vsize = 2048 require vector_size 2048 +require block_size 262144 + unzip data/storage/german_collation.db.gz __TEST_DIR__/icu_collation.db load __TEST_DIR__/icu_collation.db readonly diff --git a/test/sql/upsert/upsert_conflict_in_different_chunk.test b/test/sql/upsert/upsert_conflict_in_different_chunk.test index f1c73f47fe3..aa5f88d998f 100644 --- a/test/sql/upsert/upsert_conflict_in_different_chunk.test +++ b/test/sql/upsert/upsert_conflict_in_different_chunk.test @@ -17,7 +17,7 @@ statement ok INSERT INTO inserts VALUES (1, 'hello'), (1, 'world'); # We cannot perform on-conflict handling in the same chunk. This is a known limitation. 
-statement error +statement maybe INSERT OR REPLACE INTO create_or_replace SELECT i, s FROM inserts; ---- PRIMARY KEY or UNIQUE constraint violated diff --git a/test/sql/window/test_window_fusion.test b/test/sql/window/test_window_fusion.test index c95cc9b8a42..64c257e913b 100644 --- a/test/sql/window/test_window_fusion.test +++ b/test/sql/window/test_window_fusion.test @@ -82,7 +82,7 @@ select l_orderkey, sum(l_extendedprice) over(), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice ---- 29733.00 1 2883 1314442.00 1802.00 1 5121 1314442.00 @@ -133,8 +133,8 @@ order by l_partkey, l_orderkey 17138.00 2 46913 1314442.00 31570.00 2 50499 1314442.00 37884.00 2 54086 1314442.00 -26158.00 2 54436 1314442.00 4510.00 2 54436 1314442.00 +26158.00 2 54436 1314442.00 3608.00 2 54630 1314442.00 41492.00 2 55136 1314442.00 @@ -145,7 +145,7 @@ select l_orderkey, sum(l_extendedprice) over(order by l_partkey), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice ---- 29733.00 1 2883 607274.00 1802.00 1 5121 607274.00 @@ -196,8 +196,8 @@ order by l_partkey, l_orderkey 17138.00 2 46913 1314442.00 31570.00 2 50499 1314442.00 37884.00 2 54086 1314442.00 -26158.00 2 54436 1314442.00 4510.00 2 54436 1314442.00 +26158.00 2 54436 1314442.00 3608.00 2 54630 1314442.00 41492.00 2 55136 1314442.00 @@ -208,7 +208,7 @@ select l_orderkey, sum(l_extendedprice) over(order by l_partkey, l_orderkey), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice desc ---- 29733.00 1 2883 29733.00 1802.00 1 5121 31535.00 @@ -271,7 +271,7 @@ select l_orderkey, sum(l_extendedprice) over(order by l_partkey, l_orderkey desc), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice desc ---- 29733.00 1 2883 607274.00 1802.00 1 5121 577541.00 @@ -338,7 +338,7 @@ select sum(l_extendedprice) over(order by l_partkey, l_orderkey), sum(l_extendedprice) over(order by l_partkey, l_orderkey desc), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice desc ---- 29733.00 1 2883 1314442.00 607274.00 29733.00 607274.00 1802.00 1 5121 1314442.00 607274.00 31535.00 577541.00 @@ -406,7 +406,7 @@ select l_orderkey, sum(l_extendedprice) over(partition by l_partkey), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice desc ---- 29733.00 1 2883 607274.00 1802.00 1 5121 607274.00 @@ -469,7 +469,7 @@ select l_orderkey, sum(l_extendedprice) over(partition by l_partkey order by l_orderkey), from lineitem -order by l_partkey, l_orderkey +order by l_partkey, l_orderkey, l_extendedprice desc ---- 29733.00 1 2883 29733.00 1802.00 1 5121 31535.00 diff --git a/third_party/libpg_query/pg_functions.cpp b/third_party/libpg_query/pg_functions.cpp index 9123bbd7d17..d3af77cf3f4 100644 --- a/third_party/libpg_query/pg_functions.cpp +++ b/third_party/libpg_query/pg_functions.cpp @@ -49,6 +49,9 @@ static void allocate_new(parser_state *state, size_t n) { if (state->malloc_ptr_idx >= state->malloc_ptr_size) { size_t new_size = state->malloc_ptr_size * 2; auto new_malloc_ptrs = (char **) malloc(sizeof(char *) * new_size); + if (!new_malloc_ptrs) { + throw std::bad_alloc(); + } memset(new_malloc_ptrs, 0, sizeof(char*) * new_size); memcpy(new_malloc_ptrs, state->malloc_ptrs, state->malloc_ptr_size * sizeof(char*)); free(state->malloc_ptrs); @@ -58,9 +61,9 @@ static void allocate_new(parser_state *state, size_t n) { if (n < 
PG_MALLOC_SIZE) { n = PG_MALLOC_SIZE; } - char *base_ptr = (char *)malloc(n); + auto base_ptr = (char *)malloc(n); if (!base_ptr) { - throw std::runtime_error("Memory allocation failure"); + throw std::bad_alloc(); } state->malloc_ptrs[state->malloc_ptr_idx] = base_ptr; state->malloc_ptr_idx++; @@ -90,7 +93,11 @@ void pg_parser_init() { pg_parser_state.pg_err_msg[0] = '\0'; pg_parser_state.malloc_ptr_size = 4; - pg_parser_state.malloc_ptrs = (char **) malloc(sizeof(char *) * pg_parser_state.malloc_ptr_size); + auto new_malloc_ptrs = (char **) malloc(sizeof(char *) * pg_parser_state.malloc_ptr_size); + if (!new_malloc_ptrs) { + throw std::bad_alloc(); + } + pg_parser_state.malloc_ptrs = new_malloc_ptrs; memset(pg_parser_state.malloc_ptrs, 0, sizeof(char*) * pg_parser_state.malloc_ptr_size); pg_parser_state.malloc_ptr_idx = 0; allocate_new(&pg_parser_state, 1); @@ -166,7 +173,7 @@ char *psprintf(const char *fmt, ...) { } // attempt two, malloc - char *mbuf = (char *)palloc(newlen); + auto mbuf = (char *)palloc(newlen); va_start(args, fmt); vsnprintf(mbuf, newlen, fmt, args); va_end(args); @@ -174,7 +181,7 @@ char *psprintf(const char *fmt, ...) { } char *pstrdup(const char *in) { - char *new_str = (char *)palloc(strlen(in) + 1); + auto new_str = (char*) palloc(strlen(in) + 1); memcpy(new_str, in, strlen(in)); return new_str; } @@ -191,7 +198,7 @@ void *repalloc(void *ptr, size_t n) { char *old_len_ptr = (char *) ptr - sizeof(size_t); memcpy((void *) &old_len, old_len_ptr, sizeof(size_t)); // re-allocate and copy the data - void *new_buf = palloc(n); + auto new_buf = palloc(n); memcpy(new_buf, ptr, old_len); return new_buf; } diff --git a/tools/pythonpkg/src/include/duckdb_python/pandas/pandas_analyzer.hpp b/tools/pythonpkg/src/include/duckdb_python/pandas/pandas_analyzer.hpp index 306bd18b394..d91ba23f748 100644 --- a/tools/pythonpkg/src/include/duckdb_python/pandas/pandas_analyzer.hpp +++ b/tools/pythonpkg/src/include/duckdb_python/pandas/pandas_analyzer.hpp @@ -19,9 +19,8 @@ namespace duckdb { class PandasAnalyzer { public: - PandasAnalyzer(const DBConfig &config) { + explicit PandasAnalyzer(const DBConfig &config) { analyzed_type = LogicalType::SQLNULL; - auto maximum_entry = config.options.set_variables.find("pandas_analyze_sample"); D_ASSERT(maximum_entry != config.options.set_variables.end()); sample_size = maximum_entry->second.GetValue(); @@ -38,7 +37,7 @@ class PandasAnalyzer { } private: - LogicalType InnerAnalyze(py::object column, bool &can_convert, bool sample = true, idx_t increment = 1); + LogicalType InnerAnalyze(py::object column, bool &can_convert, idx_t increment); uint64_t GetSampleIncrement(idx_t rows); private: diff --git a/tools/pythonpkg/src/pandas/analyzer.cpp b/tools/pythonpkg/src/pandas/analyzer.cpp index 660d1fb2b3d..abd438a0e2b 100644 --- a/tools/pythonpkg/src/pandas/analyzer.cpp +++ b/tools/pythonpkg/src/pandas/analyzer.cpp @@ -246,7 +246,6 @@ static bool UpgradeType(LogicalType &left, const LogicalType &right) { return true; } } - return true; } LogicalType PandasAnalyzer::GetListType(py::object &ele, bool &can_convert) { @@ -446,7 +445,7 @@ LogicalType PandasAnalyzer::GetItemType(py::object ele, bool &can_convert) { LogicalType ltype; ltype = NumpyToLogicalType(extended_type); if (extended_type.type == NumpyNullableType::OBJECT) { - LogicalType converted_type = InnerAnalyze(ele, can_convert, false, 1); + LogicalType converted_type = InnerAnalyze(ele, can_convert, 1); if (can_convert) { ltype = converted_type; } @@ -464,26 +463,18 @@ LogicalType 
PandasAnalyzer::GetItemType(py::object ele, bool &can_convert) { //! Get the increment for the given sample size uint64_t PandasAnalyzer::GetSampleIncrement(idx_t rows) { - D_ASSERT(sample_size != 0); //! Apply the maximum auto sample = sample_size; if (sample > rows) { sample = rows; } - return rows / sample; -} - -static py::object FindFirstNonNull(const py::handle &row, idx_t offset, idx_t range) { - for (idx_t i = 0; i < range; i++) { - auto obj = row(offset + i); - if (!obj.is_none()) { - return obj; - } + if (sample == 0) { + return rows; } - return py::none(); + return rows / sample; } -LogicalType PandasAnalyzer::InnerAnalyze(py::object column, bool &can_convert, bool sample, idx_t increment) { +LogicalType PandasAnalyzer::InnerAnalyze(py::object column, bool &can_convert, idx_t increment) { idx_t rows = py::len(column); if (rows == 0) { @@ -500,14 +491,10 @@ LogicalType PandasAnalyzer::InnerAnalyze(py::object column, bool &can_convert, b } auto row = column.attr("__getitem__"); - if (sample) { - increment = GetSampleIncrement(rows); - } LogicalType item_type = LogicalType::SQLNULL; vector types; for (idx_t i = 0; i < rows; i += increment) { - auto range = MinValue(increment, rows - i); - auto obj = FindFirstNonNull(row, i, range); + auto obj = row(i); auto next_item_type = GetItemType(obj, can_convert); types.push_back(next_item_type); @@ -530,7 +517,20 @@ bool PandasAnalyzer::Analyze(py::object column) { return false; } bool can_convert = true; - LogicalType type = InnerAnalyze(std::move(column), can_convert); + idx_t increment = GetSampleIncrement(py::len(column)); + LogicalType type = InnerAnalyze(column, can_convert, increment); + + if (type == LogicalType::SQLNULL && increment > 1) { + // We did not see the whole dataset, hence we are not sure if nulls are really nulls + // as a fallback we try to identify this specific type + auto first_valid_index = column.attr("first_valid_index")(); + if (GetPythonObjectType(first_valid_index) != PythonObjectType::None) { + // This means we do have a value that is not null, figure out its type + auto row = column.attr("__getitem__"); + auto obj = row(first_valid_index); + type = GetItemType(obj, can_convert); + } + } if (can_convert) { analyzed_type = type; } diff --git a/tools/pythonpkg/tests/fast/pandas/test_pandas_object.py b/tools/pythonpkg/tests/fast/pandas/test_pandas_object.py index ebfd2e93308..c00fcbc2293 100644 --- a/tools/pythonpkg/tests/fast/pandas/test_pandas_object.py +++ b/tools/pythonpkg/tests/fast/pandas/test_pandas_object.py @@ -6,6 +6,19 @@ class TestPandasObject(object): + def test_object_lotof_nulls(self): + # Test mostly null column + data = [None] + [1] + [None] * 10000 # Last element is 1, others are None + pandas_df = pd.DataFrame(data, columns=['c'], dtype=object) + con = duckdb.connect() + assert con.execute('FROM pandas_df where c is not null').fetchall() == [(1.0,)] + + # Test all nulls, should return varchar + data = [None] * 10000 # Last element is 1, others are None + pandas_df_2 = pd.DataFrame(data, columns=['c'], dtype=object) + assert con.execute('FROM pandas_df_2 limit 1').fetchall() == [(None,)] + assert con.execute('select typeof(c) FROM pandas_df_2 limit 1').fetchall() == [('"NULL"',)] + def test_object_to_string(self, duckdb_cursor): con = duckdb.connect(database=':memory:', read_only=False) x = pd.DataFrame([[1, 'a', 2], [1, None, 2], [1, 1.1, 2], [1, 1.1, 2], [1, 1.1, 2]])
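The pandas-analyzer change in this patch is easiest to see from the user's side. Below is a minimal standalone Python sketch of the scenario that the new test_object_lotof_nulls test above exercises; it is not part of the patch, assumes only the duckdb and pandas packages, and uses illustrative variable names.

import duckdb
import pandas as pd

# An object column that is almost entirely None: with the default
# pandas_analyze_sample setting, the sampled rows can all be None, so the
# fallback added in analyzer.cpp consults first_valid_index() to infer the type.
data = [None] + [1] + [None] * 10000
mostly_null_df = pd.DataFrame(data, columns=['c'], dtype=object)

con = duckdb.connect()
# The single non-NULL value is preserved instead of the whole column collapsing to NULL.
print(con.execute("SELECT c FROM mostly_null_df WHERE c IS NOT NULL").fetchall())

# A column that is truly all None stays NULL-typed, as the test asserts.
all_null_df = pd.DataFrame([None] * 10000, columns=['c'], dtype=object)
print(con.execute("SELECT typeof(c) FROM all_null_df LIMIT 1").fetchall())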