Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix logical error with IPv4 in Protobuf, add support for Date32 #48486

Merged
merged 5 commits into from
Apr 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
164 changes: 159 additions & 5 deletions src/Formats/ProtobufSerializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ namespace ErrorCodes
extern const int PROTOBUF_BAD_CAST;
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
}

namespace
Expand Down Expand Up @@ -298,7 +299,10 @@ namespace
try
{
/// TODO: use accurate::convertNumeric() maybe?
result = boost::numeric_cast<DestType>(value);
if constexpr (std::is_same_v<SrcType, IPv4>)
result = boost::numeric_cast<DestType>(value.toUnderType());
Avogar marked this conversation as resolved.
Show resolved Hide resolved
else
result = boost::numeric_cast<DestType>(value);
}
catch (boost::numeric::bad_numeric_cast &)
{
Expand Down Expand Up @@ -504,7 +508,7 @@ namespace
{
UInt64 u64 = readUInt();
if (u64 < 2)
return static_cast<NumberType>(u64);
return castNumber<NumberType>(u64);
Avogar marked this conversation as resolved.
Show resolved Hide resolved
else
cannotConvertValue(toString(u64), field_descriptor.type_name(), TypeName<NumberType>);
};
Expand Down Expand Up @@ -1482,6 +1486,157 @@ namespace
}
};

class ProtobufSerializerDate32 : public ProtobufSerializerNumber<Int32>
{
public:
ProtobufSerializerDate32(
std::string_view column_name_,
const FieldDescriptor & field_descriptor_,
const ProtobufReaderOrWriter & reader_or_writer_)
: ProtobufSerializerNumber<Int32>(column_name_, field_descriptor_, reader_or_writer_)
{
setFunctions();
}

void describeTree(WriteBuffer & out, size_t indent) const override
{
writeIndent(out, indent) << "ProtobufSerializerDate32: column " << quoteString(column_name) << " -> field "
<< quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n";
}

private:
void setFunctions()
{
switch (field_typeid)
{
case FieldTypeId::TYPE_INT32:
case FieldTypeId::TYPE_SINT32:
case FieldTypeId::TYPE_UINT32:
case FieldTypeId::TYPE_INT64:
case FieldTypeId::TYPE_SINT64:
case FieldTypeId::TYPE_UINT64:
case FieldTypeId::TYPE_FIXED32:
case FieldTypeId::TYPE_SFIXED32:
case FieldTypeId::TYPE_FIXED64:
case FieldTypeId::TYPE_SFIXED64:
case FieldTypeId::TYPE_FLOAT:
case FieldTypeId::TYPE_DOUBLE:
break; /// already set in ProtobufSerializerNumber<Int32>::setFunctions().

case FieldTypeId::TYPE_STRING:
case FieldTypeId::TYPE_BYTES:
{
write_function = [this](Int32 value)
{
dateToString(static_cast<ExtendedDayNum>(value), text_buffer);
writeStr(text_buffer);
};

read_function = [this]() -> Int32
{
readStr(text_buffer);
return stringToDate(text_buffer);
};

default_function = [this]() -> Int32 { return stringToDate(field_descriptor.default_value_string()); };
break;
}

default:
incompatibleColumnType("Date32");
}
}

static void dateToString(ExtendedDayNum date, String & str)
{
WriteBufferFromString buf{str};
writeDateText(date, buf);
}

static ExtendedDayNum stringToDate(const String & str)
{
ExtendedDayNum date;
ReadBufferFromString buf{str};
readDateText(date, buf);
return date;
}
};

class ProtobufSerializerIPv4 : public ProtobufSerializerNumber<IPv4>
Copy link
Member

@vitlibar vitlibar Apr 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we write : public ProtobufSerializerNumber<UInt32> here for consistency with ProtobufSerializerDate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot, because otherwise we will get logical error Bad cast from type DB::ColumnVector<StrongTypedef<unsigned int, DB::IPv4Tag>> to DB::ColumnVector<unsigned int>'. IPv4 type is not the same as UInt32

Copy link
Member

@vitlibar vitlibar Apr 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I see. DataTypeDate is derived from DataTypeNumberBase<UInt16> however for unknown reason DataTypeIPv4 is not derived from DataTypeNumberBase<UInt32>.

Probably if DataTypeIPv4 was derived from DataTypeNumberBase<UInt32> with ColumnType = ColumnVector<UInt32> , that would make things easier.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to change it to DataTypeNumberBase<UInt32> with ColumnType = ColumnVector<UInt32>, but seems like it won't work as in some places we need to distinguish between ColumnVector<UInt32> and ColumnIPv4 (for example for proper CAST I guess) and that's the reason it's implemented as separate DataType. Let's keep it as is.

{
public:
ProtobufSerializerIPv4(
std::string_view column_name_,
const FieldDescriptor & field_descriptor_,
const ProtobufReaderOrWriter & reader_or_writer_)
: ProtobufSerializerNumber<IPv4>(column_name_, field_descriptor_, reader_or_writer_)
{
setFunctions();
}

void describeTree(WriteBuffer & out, size_t indent) const override
{
writeIndent(out, indent) << "ProtobufSerializerDate: column " << quoteString(column_name) << " -> field "
<< quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n";
}

private:
void setFunctions()
{
switch (field_typeid)
{
case FieldTypeId::TYPE_INT32:
case FieldTypeId::TYPE_SINT32:
case FieldTypeId::TYPE_UINT32:
case FieldTypeId::TYPE_INT64:
case FieldTypeId::TYPE_SINT64:
case FieldTypeId::TYPE_UINT64:
case FieldTypeId::TYPE_FIXED32:
case FieldTypeId::TYPE_SFIXED32:
case FieldTypeId::TYPE_FIXED64:
case FieldTypeId::TYPE_SFIXED64:
case FieldTypeId::TYPE_FLOAT:
case FieldTypeId::TYPE_DOUBLE:
break; /// already set in ProtobufSerializerNumber<IPv4>::setFunctions().

case FieldTypeId::TYPE_STRING:
case FieldTypeId::TYPE_BYTES:
{
write_function = [this](IPv4 value)
{
ipv4ToString(value, text_buffer);
writeStr(text_buffer);
};

read_function = [this]() -> IPv4
{
readStr(text_buffer);
return stringToIPv4(text_buffer);
};

default_function = [this]() -> IPv4 { return stringToIPv4(field_descriptor.default_value_string()); };
break;
}

default:
incompatibleColumnType("IPv4");
}
}

static void ipv4ToString(IPv4 value, String & str)
{
WriteBufferFromString buf{str};
writeIPv4Text(value, buf);
}

static IPv4 stringToIPv4(const String & str)
{
IPv4 value;
ReadBufferFromString buf{str};
readIPv4Text(value, buf);
return value;
}
};

/// Serializes a ColumnVector<UInt32> containing datetimes to a field of any type except TYPE_MESSAGE, TYPE_GROUP, TYPE_BOOL, TYPE_ENUM.
class ProtobufSerializerDateTime : public ProtobufSerializerNumber<UInt32>
Expand Down Expand Up @@ -1722,8 +1877,6 @@ namespace
String text_buffer;
};

using ProtobufSerializerIPv4 = ProtobufSerializerNumber<UInt32>;

using ProtobufSerializerInterval = ProtobufSerializerNumber<Int64>;


Expand Down Expand Up @@ -3341,6 +3494,7 @@ namespace
case TypeIndex::UInt256: return std::make_unique<ProtobufSerializerNumber<UInt256>>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Int8: return std::make_unique<ProtobufSerializerNumber<Int8>>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Int16: return std::make_unique<ProtobufSerializerNumber<Int16>>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Date32: return std::make_unique<ProtobufSerializerDate32>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Int32: return std::make_unique<ProtobufSerializerNumber<Int32>>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Int64: return std::make_unique<ProtobufSerializerNumber<Int64>>(column_name, field_descriptor, reader_or_writer);
case TypeIndex::Int128: return std::make_unique<ProtobufSerializerNumber<Int128>>(column_name, field_descriptor, reader_or_writer);
Expand Down Expand Up @@ -3528,7 +3682,7 @@ namespace
}

default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown data type: {}", data_type->getName());
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in Protobuf format", data_type->getName());
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not throw LOGICAL_ERROR here. We can add new types in future and forget to support it in Protobuf format. It's ok to thow just ILLEGAL_COLUMN.

}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0.0.0.0 0.0.0.0 0.0.0.0 2020-01-01 2020-01-01 2020-01-01
1.2.3.4 1.2.3.4 1.2.3.4
255.255.255.255 255.255.255.255 255.255.255.255
17 changes: 17 additions & 0 deletions tests/queries/0_stateless/02710_protobuf_ipv4_date32.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Tags: no-fasttest

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

SCHEMADIR=$CURDIR/format_schemas

$CLICKHOUSE_LOCAL -q "select '0.0.0.0'::IPv4 as ipv4, ipv4 as ipv4_bytes, ipv4 as ipv4_int64, '2020-01-01'::Date32 as date32, date32 as date32_bytes, date32 as date32_int64 format Protobuf settings format_schema = '$SCHEMADIR/02710_schema:Message'" | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02710_schema:Message" --structure="ipv4 IPv4, ipv4_bytes IPv4, ipv4_int64 IPv4, date32 Date32, date32_bytes Date32, date32_int64 Date32" -q "select * from table"
Avogar marked this conversation as resolved.
Show resolved Hide resolved

$CLICKHOUSE_LOCAL -q "select '1.2.3.4'::IPv4 as ipv4, ipv4 as ipv4_bytes, ipv4 as ipv4_int64 format Protobuf settings format_schema = '$SCHEMADIR/02710_schema:Message'" | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02710_schema:Message" --structure="ipv4 IPv4, ipv4_bytes IPv4, ipv4_int64 IPv4" -q "select * from table"

$CLICKHOUSE_LOCAL -q "select '255.255.255.255'::IPv4 as ipv4, ipv4 as ipv4_bytes, ipv4 as ipv4_int64 format Protobuf settings format_schema = '$SCHEMADIR/02710_schema:Message'" | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02710_schema:Message" --structure="ipv4 IPv4, ipv4_bytes IPv4, ipv4_int64 IPv4" -q "select * from table"



Avogar marked this conversation as resolved.
Show resolved Hide resolved
11 changes: 11 additions & 0 deletions tests/queries/0_stateless/format_schemas/02710_schema.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
syntax = "proto3";

message Message
{
uint32 ipv4 = 1;
bytes ipv4_bytes = 2;
int64 ipv4_int64 = 3;
int32 date32 = 4;
bytes date32_bytes = 5;
int64 date32_int64 = 6;
}