2 changes: 1 addition & 1 deletion src/duckdb/extension/icu/icu-makedate.cpp
@@ -23,7 +23,7 @@ struct ICUMakeDate : public ICUDateFunc {
 		}

 		// Extract the time zone parts
-		auto micros = SetTime(calendar, instant);
+		SetTime(calendar, instant);
 		const auto era = ExtractField(calendar, UCAL_ERA);
 		const auto year = ExtractField(calendar, UCAL_YEAR);
 		const auto mm = ExtractField(calendar, UCAL_MONTH) + 1;
2 changes: 0 additions & 2 deletions src/duckdb/extension/icu/icu-strptime.cpp
@@ -96,7 +96,6 @@ struct ICUStrptime : public ICUDateFunc {
 		auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
 		CalendarPtr calendar_ptr(info.calendar->clone());
 		auto calendar = calendar_ptr.get();
-		auto &formats = info.formats;

 		D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

@@ -126,7 +125,6 @@ struct ICUStrptime : public ICUDateFunc {
 		auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
 		CalendarPtr calendar_ptr(info.calendar->clone());
 		auto calendar = calendar_ptr.get();
-		auto &formats = info.formats;

 		D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

1 change: 0 additions & 1 deletion src/duckdb/extension/icu/icu_extension.cpp
@@ -223,7 +223,6 @@ static void SetICUCalendar(ClientContext &context, SetScope scope, Value &parameter) {

 void IcuExtension::Load(DuckDB &ddb) {
	auto &db = *ddb.instance;
-	auto &catalog = Catalog::GetSystemCatalog(db);

	// iterate over all the collations
	int32_t count;
11 changes: 1 addition & 10 deletions src/duckdb/extension/json/json_functions.cpp
@@ -189,16 +189,7 @@ vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {

 unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                        ReplacementScanData *data) {
-	auto lower_name = StringUtil::Lower(table_name);
-	// remove any compression
-	if (StringUtil::EndsWith(lower_name, ".gz")) {
-		lower_name = lower_name.substr(0, lower_name.size() - 3);
-	} else if (StringUtil::EndsWith(lower_name, ".zst")) {
-		lower_name = lower_name.substr(0, lower_name.size() - 4);
-	}
-	if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
-	    !StringUtil::EndsWith(lower_name, ".jsonl") && !StringUtil::Contains(lower_name, ".jsonl?") &&
-	    !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
+	if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) {
		return nullptr;
	}
	auto table_function = make_uniq<TableFunctionRef>();
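
Both replacement scans in this PR now delegate suffix matching to ReplacementScan::CanReplace. As a rough sketch of what that helper centralizes (reconstructed from the logic deleted above; names and details here are illustrative, not DuckDB's actual implementation), it lowercases the name, strips a known compression suffix, then checks each candidate extension, including the ".ext?" form for URLs carrying query strings:

#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

// Hypothetical stand-in for ReplacementScan::CanReplace, based on the
// deleted per-extension logic above.
static bool EndsWith(const std::string &str, const std::string &suffix) {
	return str.size() >= suffix.size() &&
	       str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

static bool CanReplaceSketch(std::string name, const std::vector<std::string> &extensions) {
	std::transform(name.begin(), name.end(), name.begin(),
	               [](unsigned char c) { return std::tolower(c); });
	// strip any known compression suffix first
	if (EndsWith(name, ".gz")) {
		name = name.substr(0, name.size() - 3);
	} else if (EndsWith(name, ".zst")) {
		name = name.substr(0, name.size() - 4);
	}
	for (const auto &ext : extensions) {
		// match "file.ext" as well as "http://host/file.ext?query=..."
		if (EndsWith(name, "." + ext) || name.find("." + ext + "?") != std::string::npos) {
			return true;
		}
	}
	return false;
}

int main() {
	bool a = CanReplaceSketch("data/file.JSON.gz", {"json", "jsonl", "ndjson"});          // true
	bool b = CanReplaceSketch("http://host/f.ndjson?sig=abc", {"json", "jsonl", "ndjson"}); // true
	bool c = CanReplaceSketch("file.csv", {"json", "jsonl", "ndjson"});                   // false
	return (a && b && !c) ? 0 : 1;
}

The same helper is reused by the Parquet extension further down, so a new file format only needs to pass its extension list.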
41 changes: 27 additions & 14 deletions src/duckdb/extension/json/json_functions/json_create.cpp
@@ -682,20 +682,33 @@ BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
 }

 void JSONFunctions::RegisterJSONCreateCastFunctions(CastFunctionSet &casts) {
-	auto json_to_any_cost = casts.ImplicitCastCost(LogicalType::ANY, JSONCommon::JSONType());
-	casts.RegisterCastFunction(LogicalType::ANY, JSONCommon::JSONType(), AnyToJSONCastBind, json_to_any_cost);
-
-	const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
-	auto struct_to_json_cost = casts.ImplicitCastCost(struct_type, LogicalType::VARCHAR) - 2;
-	casts.RegisterCastFunction(struct_type, JSONCommon::JSONType(), AnyToJSONCastBind, struct_to_json_cost);
-
-	const auto list_type = LogicalType::LIST(LogicalType::ANY);
-	auto list_to_json_cost = casts.ImplicitCastCost(list_type, LogicalType::VARCHAR) - 2;
-	casts.RegisterCastFunction(list_type, JSONCommon::JSONType(), AnyToJSONCastBind, list_to_json_cost);
-
-	const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
-	auto map_to_json_cost = casts.ImplicitCastCost(map_type, LogicalType::VARCHAR) - 2;
-	casts.RegisterCastFunction(map_type, JSONCommon::JSONType(), AnyToJSONCastBind, map_to_json_cost);
+	// Anything can be cast to JSON
+	for (const auto &type : LogicalType::AllTypes()) {
+		LogicalType source_type;
+		switch (type.id()) {
+		case LogicalTypeId::STRUCT:
+			source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+			break;
+		case LogicalTypeId::LIST:
+			source_type = LogicalType::LIST(LogicalType::ANY);
+			break;
+		case LogicalTypeId::MAP:
+			source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+			break;
+		case LogicalTypeId::UNION:
+			source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+			break;
+		case LogicalTypeId::VARCHAR:
+			// We skip this one here as it's handled in json_functions.cpp
+			continue;
+		default:
+			source_type = type;
+		}
+		// We prefer going to JSON over going to VARCHAR if a function can do either
+		const auto source_to_json_cost =
+		    MaxValue<int64_t>(casts.ImplicitCastCost(source_type, LogicalType::VARCHAR) - 1, 0);
+		casts.RegisterCastFunction(source_type, JSONCommon::JSONType(), AnyToJSONCastBind, source_to_json_cost);
+	}
 }

 } // namespace duckdb
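
The cost arithmetic is the interesting part of this change: previously only STRUCT, LIST, and MAP got a "- 2" discount relative to VARCHAR, while now every type gets exactly "- 1", clamped at zero, so a JSON-taking overload beats a VARCHAR-taking one by the smallest possible margin. A standalone illustration of the clamp (not DuckDB code; the cost values are made up):

#include <algorithm>
#include <cstdint>
#include <iostream>

// JSON as a cast target is made exactly one step cheaper than VARCHAR,
// clamped so the cost never goes negative.
static int64_t SourceToJsonCost(int64_t source_to_varchar_cost) {
	return std::max<int64_t>(source_to_varchar_cost - 1, 0);
}

int main() {
	std::cout << SourceToJsonCost(149) << '\n'; // 148: the JSON overload wins over VARCHAR
	std::cout << SourceToJsonCost(0) << '\n';   // 0: already free, stays non-negative
}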
40 changes: 26 additions & 14 deletions src/duckdb/extension/json/json_functions/json_transform.cpp
@@ -898,20 +898,32 @@ BoundCastInfo JSONToAnyCastBind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
 }

 void JSONFunctions::RegisterJSONTransformCastFunctions(CastFunctionSet &casts) {
-	auto json_to_any_cost = casts.ImplicitCastCost(JSONCommon::JSONType(), LogicalType::ANY);
-	casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::ANY, JSONToAnyCastBind, json_to_any_cost);
-
-	const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
-	auto json_to_struct_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, struct_type) - 2;
-	casts.RegisterCastFunction(JSONCommon::JSONType(), struct_type, JSONToAnyCastBind, json_to_struct_cost);
-
-	const auto list_type = LogicalType::LIST(LogicalType::ANY);
-	auto json_to_list_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, list_type) - 2;
-	casts.RegisterCastFunction(JSONCommon::JSONType(), list_type, JSONToAnyCastBind, json_to_list_cost);
-
-	const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
-	auto json_to_map_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, map_type) - 2;
-	casts.RegisterCastFunction(JSONCommon::JSONType(), map_type, JSONToAnyCastBind, json_to_map_cost);
+	// JSON can be cast to anything
+	for (const auto &type : LogicalType::AllTypes()) {
+		LogicalType target_type;
+		switch (type.id()) {
+		case LogicalTypeId::STRUCT:
+			target_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+			break;
+		case LogicalTypeId::LIST:
+			target_type = LogicalType::LIST(LogicalType::ANY);
+			break;
+		case LogicalTypeId::MAP:
+			target_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+			break;
+		case LogicalTypeId::UNION:
+			target_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+			break;
+		case LogicalTypeId::VARCHAR:
+			// We skip this one here as it's handled in json_functions.cpp
+			continue;
+		default:
+			target_type = type;
+		}
+		// Going from JSON to another type has the same cost as going from VARCHAR to that type
+		const auto json_to_target_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, target_type);
+		casts.RegisterCastFunction(JSONCommon::JSONType(), target_type, JSONToAnyCastBind, json_to_target_cost);
+	}
 }

 } // namespace duckdb
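
The practical effect of the two loops is symmetric: any type can be cast to JSON (preferred over VARCHAR by one cost unit), and JSON can be cast to any type at the same cost as the equivalent VARCHAR cast. A hedged usage sketch via the DuckDB C++ API (assuming the JSON extension is loaded; exact SQL support may vary by version):

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// JSON -> STRUCT now goes through JSONToAnyCastBind rather than
	// requiring an explicit detour via VARCHAR.
	auto result = con.Query("SELECT '{\"a\": 42}'::JSON::STRUCT(a INTEGER) AS s");
	result->Print();
	return 0;
}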
27 changes: 26 additions & 1 deletion src/duckdb/extension/parquet/column_reader.cpp
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) {
 void ColumnReader::PrepareRead(parquet_filter_t &filter) {
	dict_decoder.reset();
	defined_decoder.reset();
+	bss_decoder.reset();
	block.reset();
	PageHeader page_hdr;
	page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
		PrepareDeltaByteArray(*block);
		break;
	}
+	case Encoding::BYTE_STREAM_SPLIT: {
+		// Subtract 1 from length as the block is allocated with 1 extra byte,
+		// but the byte stream split encoder needs to know the correct data size.
+		bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+		block->inc(block->len);
+		break;
+	}
	case Encoding::PLAIN:
		// nothing to do here, will be read directly below
		break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {

	idx_t null_count = 0;

-	if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
+	if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
		// we need the null count because the dictionary offsets have no entries for nulls
		for (idx_t i = 0; i < read_now; i++) {
			if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@
	} else if (byte_array_data) {
		// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
		DeltaByteArray(define_out, read_now, filter, result_offset, result);
+	} else if (bss_decoder) {
+		auto read_buf = make_shared<ResizeableBuffer>();
+
+		switch (schema.type) {
+		case duckdb_parquet::format::Type::FLOAT:
+			read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
+			bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
+			break;
+		case duckdb_parquet::format::Type::DOUBLE:
+			read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
+			bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
+			break;
+		default:
+			throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+		}
+
+		Plain(read_buf, define_out, read_now, filter, result_offset, result);
	} else {
		PlainReference(block, result);
		Plain(block, define_out, read_now, filter, result_offset, result);
11 changes: 10 additions & 1 deletion src/duckdb/extension/parquet/column_writer.cpp
@@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator {
	}
 };

+struct ParquetTimeTZOperator : public BaseParquetOperator {
+	template <class SRC, class TGT>
+	static TGT Operation(SRC input) {
+		return input.time().micros;
+	}
+};
+
 struct ParquetHugeintOperator {
	template <class SRC, class TGT>
	static TGT Operation(SRC input) {
@@ -1975,12 +1982,14 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parquet::format::SchemaElement> &schemas,
	                                          max_define, can_have_nulls);
	case LogicalTypeId::BIGINT:
	case LogicalTypeId::TIME:
-	case LogicalTypeId::TIME_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP_MS:
		return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema_idx, std::move(schema_path), max_repeat,
		                                                         max_define, can_have_nulls);
+	case LogicalTypeId::TIME_TZ:
+		return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
+		    writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
	case LogicalTypeId::HUGEINT:
		return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
		    writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
2 changes: 2 additions & 0 deletions src/duckdb/extension/parquet/include/column_reader.hpp
@@ -9,6 +9,7 @@
 #pragma once

 #include "duckdb.hpp"
+#include "parquet_bss_decoder.hpp"
 #include "parquet_dbp_decoder.hpp"
 #include "parquet_rle_bp_decoder.hpp"
 #include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ class ColumnReader {
	unique_ptr<RleBpDecoder> repeated_decoder;
	unique_ptr<DbpDecoder> dbp_decoder;
	unique_ptr<RleBpDecoder> rle_decoder;
+	unique_ptr<BssDecoder> bss_decoder;

	// dummies for Skip()
	parquet_filter_t none_filter;
49 changes: 49 additions & 0 deletions src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// parquet_bss_decoder.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "parquet_types.h"
+#include "resizable_buffer.hpp"
+
+namespace duckdb {
+
+/// Decoder for the Byte Stream Split encoding
+class BssDecoder {
+public:
+	/// Create a decoder object. buffer/buffer_len is the encoded data.
+	BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
+	}
+
+public:
+	template <typename T>
+	void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
+		if (buffer_.len % sizeof(T) != 0) {
+			std::stringstream error;
+			error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
+			      << ") should be a multiple of the type size (" << sizeof(T) << ")";
+			throw std::runtime_error(error.str());
+		}
+		uint32_t num_buffer_values = buffer_.len / sizeof(T);
+
+		buffer_.available((value_offset_ + batch_size) * sizeof(T));
+
+		for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
+			data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
+			for (uint32_t i = 0; i < batch_size; ++i) {
+				values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
+			}
+		}
+		value_offset_ += batch_size;
+	}
+
+private:
+	ByteBuffer buffer_;
+	uint32_t value_offset_;
+};
+
+} // namespace duckdb
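
For context, BYTE_STREAM_SPLIT stores the k-th byte of every value contiguously, one "stream" per byte position, which tends to make float/double pages far more compressible by general-purpose codecs. A minimal standalone round-trip sketch of the layout that GetBatch reverses (illustrative, not DuckDB code):

#include <cassert>
#include <cstdint>
#include <vector>

// Encode: byte k of value i goes to position i within stream k.
template <typename T>
std::vector<uint8_t> ByteStreamSplitEncode(const std::vector<T> &values) {
	const size_t n = values.size();
	std::vector<uint8_t> encoded(n * sizeof(T));
	const auto *raw = reinterpret_cast<const uint8_t *>(values.data());
	for (size_t i = 0; i < n; i++) {
		for (size_t k = 0; k < sizeof(T); k++) {
			encoded[k * n + i] = raw[i * sizeof(T) + k];
		}
	}
	return encoded;
}

// Decode: gather byte k of value i back out of stream k.
template <typename T>
std::vector<T> ByteStreamSplitDecode(const std::vector<uint8_t> &encoded) {
	const size_t n = encoded.size() / sizeof(T);
	std::vector<T> values(n);
	auto *raw = reinterpret_cast<uint8_t *>(values.data());
	for (size_t k = 0; k < sizeof(T); k++) {
		for (size_t i = 0; i < n; i++) {
			raw[i * sizeof(T) + k] = encoded[k * n + i];
		}
	}
	return values;
}

int main() {
	std::vector<double> vals = {1.5, -2.25, 3.75};
	assert(ByteStreamSplitDecode<double>(ByteStreamSplitEncode(vals)) == vals);
}

The decoder above performs the same gather, but additionally tracks value_offset_ so batches can be pulled incrementally across calls.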
7 changes: 3 additions & 4 deletions src/duckdb/extension/parquet/parquet_extension.cpp
@@ -20,6 +20,8 @@
 #include "duckdb/common/enums/file_compression_type.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
+#include "duckdb/common/serializer/deserializer.hpp"
+#include "duckdb/common/serializer/serializer.hpp"
 #include "duckdb/common/types/chunk_collection.hpp"
 #include "duckdb/function/copy_function.hpp"
 #include "duckdb/function/table_function.hpp"
@@ -34,8 +36,6 @@
 #include "duckdb/planner/operator/logical_get.hpp"
 #include "duckdb/storage/statistics/base_statistics.hpp"
 #include "duckdb/storage/table/row_group.hpp"
-#include "duckdb/common/serializer/serializer.hpp"
-#include "duckdb/common/serializer/deserializer.hpp"
 #endif

 namespace duckdb {
@@ -983,8 +983,7 @@ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_data) {
 //===--------------------------------------------------------------------===//
 unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
                                             ReplacementScanData *data) {
-	auto lower_name = StringUtil::Lower(table_name);
-	if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) {
+	if (!ReplacementScan::CanReplace(table_name, {"parquet"})) {
		return nullptr;
	}
	auto table_function = make_uniq<TableFunctionRef>();
7 changes: 3 additions & 4 deletions src/duckdb/extension/parquet/parquet_timestamp.cpp
@@ -66,10 +66,9 @@ dtime_t ParquetIntToTimeNs(const int64_t &raw_time) {
	return Time::FromTimeNs(raw_time);
 }

-dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_time) {
-	dtime_tz_t result;
-	result.bits = raw_time;
-	return result;
+dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) {
+	dtime_t t(raw_micros);
+	return dtime_tz_t(t, 0);
 }

 } // namespace duckdb
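
Taken together with the ParquetTimeTZOperator added in column_writer.cpp, this fixes the TIME_TZ round trip: the writer now stores only the microsecond time-of-day instead of dtime_tz_t's packed bit representation, and the reader reconstructs a value with a zero (UTC) offset instead of copying raw bits into the packed field. A standalone sketch of the before/after behavior (the struct here is illustrative; DuckDB's dtime_tz_t actually packs time and offset into a single 64-bit value):

#include <cstdint>

struct TimeTZ {
	int64_t micros; // time of day in microseconds
	int32_t offset; // UTC offset in seconds
};

// Writer side: only the time-of-day survives, matching
// ParquetTimeTZOperator::Operation returning input.time().micros.
int64_t WriteTimeTZ(TimeTZ t) {
	return t.micros;
}

// Reader side: reconstruct with a zero (UTC) offset, matching the fixed
// ParquetIntToTimeTZ. The old code copied the raw int64 into the packed
// bit representation, which produced garbage offsets.
TimeTZ ReadTimeTZ(int64_t raw_micros) {
	return TimeTZ{raw_micros, 0};
}

int main() {
	TimeTZ in{5LL * 60 * 60 * 1000000, -18000}; // 05:00:00 at UTC-5
	TimeTZ out = ReadTimeTZ(WriteTimeTZ(in));
	// out.micros == in.micros, out.offset == 0: the offset is not round-tripped
	return out.micros == in.micros && out.offset == 0 ? 0 : 1;
}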
4 changes: 2 additions & 2 deletions src/duckdb/src/common/arrow/appender/list_data.cpp
@@ -69,10 +69,10 @@ void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
	result->buffers[1] = append_data.main_buffer.data();

	auto &child_type = ListType::GetChildType(type);
-	append_data.child_pointers.resize(1);
+	ArrowAppender::AddChildren(append_data, 1);
	result->children = append_data.child_pointers.data();
	result->n_children = 1;
-	append_data.child_pointers[0] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[0]);
+	append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0]));
 }

 } // namespace duckdb
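
This appender change swaps pointer-filling for value storage: AddChildren presumably sizes both an owning child_arrays vector and the child_pointers the Arrow C ABI consumer sees, so the finalized child array no longer lives behind a pointer with unclear ownership. A minimal sketch of the pattern (hypothetical; not DuckDB's actual implementation):

#include <cstdint>
#include <vector>

struct ArrowArrayLike { // stand-in for the Arrow C ABI struct
	int64_t length;
};

struct AppendState {
	std::vector<ArrowArrayLike> child_arrays;     // owns the child structs
	std::vector<ArrowArrayLike *> child_pointers; // what the C ABI consumer sees
};

void AddChildren(AppendState &state, size_t count) {
	state.child_arrays.resize(count);
	state.child_pointers.resize(count);
	for (size_t i = 0; i < count; i++) {
		// pointers stay valid as long as child_arrays is not resized again
		state.child_pointers[i] = &state.child_arrays[i];
	}
}

int main() {
	AppendState state;
	AddChildren(state, 1);
	state.child_arrays[0].length = 42;
	// the consumer reads through child_pointers[0], which aliases child_arrays[0]
	return state.child_pointers[0]->length == 42 ? 0 : 1;
}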