Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WeakHash32 #9735

Merged
merged 45 commits into from
Mar 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
9e0c257
WeakHash32 [part 1].
KochetovNicolai Mar 13, 2020
21d9752
WeakHash for ColumnVector and ColumnString.
KochetovNicolai Mar 13, 2020
a19937a
Refactor weakHash for ColumnString.
KochetovNicolai Mar 18, 2020
e26eac4
Implement updateWeakHash32 for other columns.
KochetovNicolai Mar 18, 2020
e76877c
Fix build.
KochetovNicolai Mar 18, 2020
0a61f32
Fix build.
KochetovNicolai Mar 18, 2020
e94f572
Fix build.
KochetovNicolai Mar 18, 2020
9f82f43
Fix build.
KochetovNicolai Mar 18, 2020
6d16406
Fix build.
KochetovNicolai Mar 18, 2020
e6f5181
Fix build.
KochetovNicolai Mar 18, 2020
7246f81
Fix build.
KochetovNicolai Mar 18, 2020
fe36c27
Fix style.
KochetovNicolai Mar 19, 2020
dcd8321
Add some tests.
KochetovNicolai Mar 19, 2020
c9d808e
Fix weakHash
KochetovNicolai Mar 20, 2020
c4e73c1
Fix weakHash
KochetovNicolai Mar 20, 2020
33db7f3
Fix weakHash
KochetovNicolai Mar 20, 2020
ebba6a6
Fix weakHash
KochetovNicolai Mar 20, 2020
75192d4
Fix weakHash
KochetovNicolai Mar 20, 2020
0344978
Fix weakHash
KochetovNicolai Mar 20, 2020
46f63c1
Fix weakHash
KochetovNicolai Mar 20, 2020
1d94b8b
Merge branch 'master' into weak-hash
KochetovNicolai Mar 23, 2020
7f6dda4
Update unit test.
KochetovNicolai Mar 23, 2020
05fb176
Update unit test.
KochetovNicolai Mar 23, 2020
9ab6f6a
Update unit test.
KochetovNicolai Mar 23, 2020
7582f66
Update unit test.
KochetovNicolai Mar 23, 2020
2e22727
Update unit test.
KochetovNicolai Mar 23, 2020
0bd502f
Update unit test.
KochetovNicolai Mar 23, 2020
3c31139
More tests.
KochetovNicolai Mar 23, 2020
9e105b8
Update intHash.
KochetovNicolai Mar 23, 2020
c0dea7b
Fix test.
KochetovNicolai Mar 23, 2020
4cfdef4
Fix style and suppress pvs warnings.
KochetovNicolai Mar 23, 2020
ad56da8
Added comments.
KochetovNicolai Mar 23, 2020
ee907da
Update weak hash.
KochetovNicolai Mar 23, 2020
6d63d4b
Update weak hash.
KochetovNicolai Mar 23, 2020
c437088
Update weak hash.
KochetovNicolai Mar 23, 2020
67ed4d4
Update weak hash.
KochetovNicolai Mar 24, 2020
488b21c
Update weak hash.
KochetovNicolai Mar 24, 2020
20d3e34
Update weak hash.
KochetovNicolai Mar 24, 2020
07b2945
Update weak hash.
KochetovNicolai Mar 24, 2020
08a9486
Update weak hash.
KochetovNicolai Mar 24, 2020
5400ffb
Update weak hash.
KochetovNicolai Mar 24, 2020
778be31
Fix pvs warnings.
KochetovNicolai Mar 24, 2020
a70a9e5
Update WeakHash.h
alexey-milovidov Mar 25, 2020
1a37afa
Review fixes.
KochetovNicolai Mar 25, 2020
f37b30f
Review fixes.
KochetovNicolai Mar 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 21 additions & 0 deletions dbms/src/Columns/ColumnAggregateFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <Common/AlignedBuffer.h>
#include <Common/typeid_cast.h>
#include <Common/Arena.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>

#include <AggregateFunctions/AggregateFunctionMLMethod.h>

Expand Down Expand Up @@ -282,6 +284,25 @@ void ColumnAggregateFunction::updateHashWithValue(size_t n, SipHash & hash) cons
hash.update(wbuf.str().c_str(), wbuf.str().size());
}

/// Folds the serialized state of every aggregate function stored in this column
/// into the matching entry of `hash`.
/// Throws LOGICAL_ERROR when `hash` does not hold exactly one entry per row.
void ColumnAggregateFunction::updateWeakHash32(WeakHash32 & hash) const
{
    auto & row_hashes = hash.getData();
    const auto rows = data.size();

    if (row_hashes.size() != data.size())
        throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(rows) +
                        ", hash size is " + std::to_string(row_hashes.size()), ErrorCodes::LOGICAL_ERROR);

    /// A single byte buffer is shared by all rows: each iteration serializes one state
    /// into it and then hashes the resulting bytes.
    std::vector<UInt8> serialized;
    for (size_t row = 0; row != rows; ++row)
    {
        WriteBufferFromVector<std::vector<UInt8>> out(serialized);
        func->serialize(data[row], out);
        out.finalize();

        /// The previous hash value of the row is passed in and replaced by the result.
        row_hashes[row] = ::updateWeakHash32(serialized.data(), serialized.size(), row_hashes[row]);
    }
}

/// The returned size is less than real size. The reason is that some parts of
/// aggregate function data may be allocated on shared arenas. These arenas are
/// used for several blocks, and also may be updated concurrently from other
Expand Down
2 changes: 2 additions & 0 deletions dbms/src/Columns/ColumnAggregateFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ class ColumnAggregateFunction final : public COWHelper<IColumn, ColumnAggregateF

void updateHashWithValue(size_t n, SipHash & hash) const override;

void updateWeakHash32(WeakHash32 & hash) const override;

size_t byteSize() const override;

size_t allocatedBytes() const override;
Expand Down
32 changes: 32 additions & 0 deletions dbms/src/Columns/ColumnArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>


namespace DB
Expand Down Expand Up @@ -213,6 +215,36 @@ void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
getData().updateHashWithValue(offset + i, hash);
}

/// Updates every entry of `hash` (one entry per array row) with the hashes of the elements of that row.
/// Element hashes are first computed for the whole nested data column and then folded,
/// element by element, into the hash of the row that owns them.
/// Throws LOGICAL_ERROR when the size of `hash` differs from the number of rows (offsets->size()).
void ColumnArray::updateWeakHash32(WeakHash32 & hash) const
{
auto s = offsets->size();
if (hash.getData().size() != s)
throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(s) +
", hash size is " + std::to_string(hash.getData().size()), ErrorCodes::LOGICAL_ERROR);

/// One hash slot per element of the nested data column.
WeakHash32 internal_hash(data->size());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't get it. Why don't simply update hashes with all corresponding array elements in chain?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we don't have method for single array element.
It would be possible if offsets will be added to updateWeakHash as the second argument.

data->updateWeakHash32(internal_hash);

Offset prev_offset = 0;
auto & offsets_data = getOffsets();
auto & hash_data = hash.getData();
auto & internal_hash_data = internal_hash.getData();

for (size_t i = 0; i < s; ++i)
{
/// This line improves the hash a little bit according to integration tests.
/// It is equivalent to using the previous hash value as the first element of the array.
hash_data[i] = intHashCRC32(hash_data[i]);

/// Elements of row i occupy positions [prev_offset, offsets_data[i]) of the nested column.
for (size_t row = prev_offset; row < offsets_data[i]; ++row)
/// It is probably not the best way to combine hashes.
/// But it is much better than xor, which leads to similar hashes for arrays like [1], [1, 1, 1], [1, 1, 1, 1, 1], ...
/// A much better implementation would add offsets as an optional argument to updateWeakHash32.
hash_data[i] = intHashCRC32(internal_hash_data[row], hash_data[i]);

prev_offset = offsets_data[i];
}
}

void ColumnArray::insert(const Field & x)
{
Expand Down
1 change: 1 addition & 0 deletions dbms/src/Columns/ColumnArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class ColumnArray final : public COWHelper<IColumn, ColumnArray>
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src_, size_t n) override;
Expand Down
17 changes: 17 additions & 0 deletions dbms/src/Columns/ColumnConst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <Columns/ColumnsCommon.h>
#include <Common/PODArray.h>
#include <Common/typeid_cast.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>


namespace DB
Expand All @@ -12,6 +14,7 @@ namespace DB
namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
}

ColumnConst::ColumnConst(const ColumnPtr & data_, size_t s_)
Expand Down Expand Up @@ -103,4 +106,18 @@ void ColumnConst::getPermutation(bool /*reverse*/, size_t /*limit*/, int /*nan_d
res[i] = i;
}

/// Mixes the hash of the single value held in `data` into every entry of `hash`.
/// Throws LOGICAL_ERROR when the size of `hash` differs from the column size `s`.
void ColumnConst::updateWeakHash32(WeakHash32 & hash) const
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe don't update hash at all?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need it.
Because data could be represented as const column, but different in different chunks.
And also hash will be unreasonably different after materializing.

{
if (hash.getData().size() != s)
throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(s) +
", hash size is " + std::to_string(hash.getData().size()), ErrorCodes::LOGICAL_ERROR);

/// Hash the nested column once into a one-entry hash
/// (presumably `data` always holds exactly one row; confirm against the constructor).
WeakHash32 element_hash(1);
data->updateWeakHash32(element_hash);
size_t data_hash = element_hash.getData()[0];

/// Combine that one value hash with the current hash of every row.
for (auto & value : hash.getData())
value = intHashCRC32(data_hash, value);
}

}
2 changes: 2 additions & 0 deletions dbms/src/Columns/ColumnConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ class ColumnConst final : public COWHelper<IColumn, ColumnConst>
data->updateHashWithValue(0, hash);
}

void updateWeakHash32(WeakHash32 & hash) const override;

ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
Expand Down
24 changes: 24 additions & 0 deletions dbms/src/Columns/ColumnDecimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <Common/Arena.h>
#include <Common/SipHash.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>

#include <common/unaligned.h>

Expand All @@ -22,6 +24,7 @@ namespace ErrorCodes
extern const int PARAMETER_OUT_OF_BOUND;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
}

template <typename T>
Expand Down Expand Up @@ -65,6 +68,27 @@ void ColumnDecimal<T>::updateHashWithValue(size_t n, SipHash & hash) const
hash.update(data[n]);
}

/// Combines each decimal value of the column with the matching entry of `hash`.
/// Throws LOGICAL_ERROR when `hash` does not hold exactly one entry per row.
template <typename T>
void ColumnDecimal<T>::updateWeakHash32(WeakHash32 & hash) const
{
    const auto rows = data.size();
    auto & row_hashes = hash.getData();

    if (row_hashes.size() != rows)
        throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(rows) +
                        ", hash size is " + std::to_string(row_hashes.size()), ErrorCodes::LOGICAL_ERROR);

    /// The previous hash of a row is passed to intHashCRC32 and replaced by its result.
    for (size_t row = 0; row < rows; ++row)
        row_hashes[row] = intHashCRC32(data[row], row_hashes[row]);
}

template <typename T>
void ColumnDecimal<T>::getPermutation(bool reverse, size_t limit, int , IColumn::Permutation & res) const
{
Expand Down
1 change: 1 addition & 0 deletions dbms/src/Columns/ColumnDecimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;

Expand Down
23 changes: 23 additions & 0 deletions dbms/src/Columns/ColumnFixedString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <Common/memcpySmall.h>
#include <Common/memcmpSmall.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>

#include <DataStreams/ColumnGathererStream.h>

Expand All @@ -25,6 +27,7 @@ namespace ErrorCodes
extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int PARAMETER_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}


Expand Down Expand Up @@ -101,6 +104,26 @@ void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
hash.update(reinterpret_cast<const char *>(&chars[n * index]), n);
}

/// Updates each entry of `hash` with the `n` bytes of the corresponding fixed-size string.
/// Throws LOGICAL_ERROR when `hash` does not hold exactly one entry per row.
void ColumnFixedString::updateWeakHash32(WeakHash32 & hash) const
{
    const auto rows = size();
    auto & row_hashes = hash.getData();

    if (row_hashes.size() != rows)
        throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(rows) +
                        ", hash size is " + std::to_string(row_hashes.size()), ErrorCodes::LOGICAL_ERROR);

    const UInt8 * const first_byte = chars.data();

    /// Values are stored back to back in `chars`, `n` bytes per row.
    for (size_t row = 0; row < rows; ++row)
        row_hashes[row] = ::updateWeakHash32(first_byte + row * n, n, row_hashes[row]);
}

template <bool positive>
struct ColumnFixedString::less
{
Expand Down
2 changes: 2 additions & 0 deletions dbms/src/Columns/ColumnFixedString.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class ColumnFixedString final : public COWHelper<ColumnVectorHelper, ColumnFixed

void updateHashWithValue(size_t index, SipHash & hash) const override;

void updateWeakHash32(WeakHash32 & hash) const override;

int compareAt(size_t p1, size_t p2, const IColumn & rhs_, int /*nan_direction_hint*/) const override
{
const ColumnFixedString & rhs = assert_cast<const ColumnFixedString &>(rhs_);
Expand Down
5 changes: 5 additions & 0 deletions dbms/src/Columns/ColumnFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
throw Exception("updateHashWithValue is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

/// Weak hashing is not supported for this column type: the call always throws NOT_IMPLEMENTED.
void updateWeakHash32(WeakHash32 &) const override
{
    const std::string message = "updateWeakHash32 is not implemented for " + getName();
    throw Exception(message, ErrorCodes::NOT_IMPLEMENTED);
}

void popBack(size_t) override
{
throw Exception("popBack is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
Expand Down
34 changes: 34 additions & 0 deletions dbms/src/Columns/ColumnLowCardinality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <DataTypes/NumberTraits.h>
#include <Common/HashTable/HashMap.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>


namespace DB
Expand Down Expand Up @@ -241,6 +242,21 @@ const char * ColumnLowCardinality::deserializeAndInsertFromArena(const char * po
return new_pos;
}

/// Updates `hash` row by row: the nested dictionary column is hashed once,
/// then the index combines the dictionary hashes into the row hashes.
/// Throws LOGICAL_ERROR when `hash` does not hold exactly one entry per row.
void ColumnLowCardinality::updateWeakHash32(WeakHash32 & hash) const
{
    const auto rows = size();
    const auto hash_size = hash.getData().size();

    if (hash_size != rows)
        throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(rows) +
                        ", hash size is " + std::to_string(hash_size), ErrorCodes::LOGICAL_ERROR);

    /// One hash per dictionary entry.
    const auto & nested_dictionary = getDictionary().getNestedColumn();
    WeakHash32 dict_hash(nested_dictionary->size());
    nested_dictionary->updateWeakHash32(dict_hash);

    idx.updateWeakHash(hash, dict_hash);
}

void ColumnLowCardinality::gather(ColumnGathererStream & gatherer)
{
gatherer.gather(*this);
Expand Down Expand Up @@ -645,6 +661,24 @@ bool ColumnLowCardinality::Index::containsDefault() const
return contains;
}

/// For every stored position, combines the dictionary hash found at that position
/// with the current hash of the row.
/// `hash` holds the per-row hashes and is updated in place; `dict_hash` is only read here.
void ColumnLowCardinality::Index::updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const
{
    auto & row_hashes = hash.getData();
    auto & dictionary_hashes = dict_hash.getData();

    /// The argument only carries the index type; its value is not used.
    auto combine_rows = [&](auto index_type_tag)
    {
        using IndexType = decltype(index_type_tag);
        auto & positions_data = getPositionsData<IndexType>();
        const auto rows = positions_data.size();

        for (size_t row = 0; row < rows; ++row)
        {
            const auto dictionary_hash = dictionary_hashes[positions_data[row]];
            row_hashes[row] = intHashCRC32(dictionary_hash, row_hashes[row]);
        }
    };

    callForType(std::move(combine_rows), size_of_type);
}


ColumnLowCardinality::Dictionary::Dictionary(MutableColumnPtr && column_unique_, bool is_shared)
: column_unique(std::move(column_unique_)), shared(is_shared)
Expand Down
4 changes: 4 additions & 0 deletions dbms/src/Columns/ColumnLowCardinality.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinalit
return getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);
}

void updateWeakHash32(WeakHash32 & hash) const override;

ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{
return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().filter(filt, result_size_hint));
Expand Down Expand Up @@ -230,6 +232,8 @@ class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinalit

bool containsDefault() const;

void updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const;

private:
WrappedPtr positions;
size_t size_of_type = 0;
Expand Down
21 changes: 21 additions & 0 deletions dbms/src/Columns/ColumnNullable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <Common/NaNUtils.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <DataStreams/ColumnGathererStream.h>
Expand Down Expand Up @@ -42,6 +43,26 @@ void ColumnNullable::updateHashWithValue(size_t n, SipHash & hash) const
getNestedColumn().updateHashWithValue(n, hash);
}

/// Updates `hash` with the nested column, but leaves the entries of NULL rows unchanged.
/// Throws LOGICAL_ERROR when `hash` does not hold exactly one entry per row.
void ColumnNullable::updateWeakHash32(WeakHash32 & hash) const
{
    const auto rows = size();
    const auto hash_size = hash.getData().size();

    if (hash_size != rows)
        throw Exception("Size of WeakHash32 does not match size of column: column size is " + std::to_string(rows) +
                        ", hash size is " + std::to_string(hash_size), ErrorCodes::LOGICAL_ERROR);

    /// Keep a copy of the incoming hashes: the nested column updates every row, NULL rows included.
    WeakHash32 hash_before_update = hash;
    nested_column->updateWeakHash32(hash);

    auto & updated_hashes = hash.getData();
    auto & saved_hashes = hash_before_update.getData();
    auto & is_null = getNullMapData();

    /// Restore the saved value for every NULL row.
    for (size_t row = 0; row < rows; ++row)
    {
        if (is_null[row])
            updated_hashes[row] = saved_hashes[row];
    }
}

MutableColumnPtr ColumnNullable::cloneResized(size_t new_size) const
{
Expand Down
1 change: 1 addition & 0 deletions dbms/src/Columns/ColumnNullable.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class ColumnNullable final : public COWHelper<IColumn, ColumnNullable>
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void getExtremes(Field & min, Field & max) const override;

MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
Expand Down