Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add StringHashMap to optimize string aggregation #5417

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 5 additions & 17 deletions dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class AggregateFunctionGroupUniqArray
auto & set = this->data(place).value;
size_t size = set.size();
writeVarUInt(size, buf);
for (auto & elem : set)
for (const auto & elem : set)
writeIntBinary(elem, buf);
}

Expand Down Expand Up @@ -216,18 +216,13 @@ class AggregateFunctionGroupUniqArrayGeneric
return;
StringRef str_serialized = getSerialization(*columns[0], row_num, *arena);

set.emplace(str_serialized, it, inserted);
set.emplace(str_serialized, it, inserted, *arena);

if constexpr (!is_plain_column)
{
if (!inserted)
arena->rollback(str_serialized.size);
}
else
{
if (inserted)
it->getValueMutable().data = arena->insert(str_serialized.data, str_serialized.size);
}
}

void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
Expand All @@ -237,18 +232,11 @@ class AggregateFunctionGroupUniqArrayGeneric

bool inserted;
State::Set::iterator it;
for (auto & rhs_elem : rhs_set)
for (const auto & rhs_elem : rhs_set)
{
if (limit_num_elems && cur_set.size() >= max_elems)
return ;
cur_set.emplace(rhs_elem.getValue(), it, inserted);
if (inserted)
{
if (it->getValue().size)
it->getValueMutable().data = arena->insert(it->getValue().data, it->getValue().size);
else
it->getValueMutable().data = nullptr;
}
cur_set.emplace(rhs_elem.getValue(), it, inserted, *arena);
}
}

Expand All @@ -261,7 +249,7 @@ class AggregateFunctionGroupUniqArrayGeneric
auto & set = this->data(place).value;
offsets_to.push_back(offsets_to.back() + set.size());

for (auto & elem : set)
for (const auto & elem : set)
{
deserializeAndInsert(elem.getValue(), data_to);
}
Expand Down
2 changes: 2 additions & 0 deletions dbms/src/Columns/ColumnArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ StringRef ColumnArray::getDataAt(size_t n) const

size_t offset_of_first_elem = offsetAt(n);
StringRef first = getData().getDataAtWithTerminatingZero(offset_of_first_elem);
if (reinterpret_cast<uintptr_t>(first.data) == 0xffffffffffffffff)
throw Exception("getDataAt returns a StringRef with -1 pointer", ErrorCodes::BAD_GET);

size_t array_size = sizeAt(n);
if (array_size == 0)
Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Columns/ColumnLowCardinality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace
auto & data = res_col->getData();

data.resize(hash_map.size());
for (auto val : hash_map)
for (const auto & val : hash_map)
data[val.getSecond()] = val.getFirst();

for (auto & ind : index)
Expand Down
37 changes: 15 additions & 22 deletions dbms/src/Common/ColumnsHashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ struct HashMethodString

const IColumn::Offset * offsets;
const UInt8 * chars;
static constexpr bool key_to_arena = place_string_to_arena;

HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
{
Expand All @@ -94,15 +95,6 @@ struct HashMethodString

protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;

static ALWAYS_INLINE void onNewKey([[maybe_unused]] StringRef & key, [[maybe_unused]] Arena & pool)
{
akuzm marked this conversation as resolved.
Show resolved Hide resolved
if constexpr (place_string_to_arena)
{
if (key.size)
key.data = pool.insert(key.data, key.size);
}
}
};


Expand All @@ -116,6 +108,7 @@ struct HashMethodFixedString

size_t n;
const ColumnFixedString::Chars * chars;
static constexpr bool key_to_arena = place_string_to_arena;

HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
{
Expand All @@ -131,11 +124,6 @@ struct HashMethodFixedString

protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
static ALWAYS_INLINE void onNewKey([[maybe_unused]] StringRef & key, [[maybe_unused]] Arena & pool)
{
if constexpr (place_string_to_arena)
key.data = pool.insert(key.data, key.size);
}
};


Expand Down Expand Up @@ -350,22 +338,27 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod

bool inserted = false;
typename Data::iterator it;
if (saved_hash)
data.emplace(key, it, inserted, saved_hash[row]);
if constexpr (SingleColumnMethod::key_to_arena)
{
if (saved_hash)
data.emplace(key, it, inserted, saved_hash[row], pool);
else
data.emplace(key, it, inserted, pool);
}
else
data.emplace(key, it, inserted);
{
if (saved_hash)
data.emplace(key, it, inserted, saved_hash[row]);
else
data.emplace(key, it, inserted);
}

visit_cache[row] = VisitValue::Found;

if (inserted)
{
if constexpr (has_mapped)
{
new(&it->getSecond()) Mapped();
Base::onNewKey(it->getFirstMutable(), pool);
}
else
Base::onNewKey(*it, pool);
}

if constexpr (has_mapped)
Expand Down
32 changes: 9 additions & 23 deletions dbms/src/Common/ColumnsHashingImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ struct LastElementCache
bool empty = true;
bool found = false;

IGNORE_MAYBE_UNINITIALIZED_PUSH
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can probably remove this one as well, I tried and there's no warning.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing that leads to

/home/amos/git/chorigin/dbms/src/Common/ColumnsHashingImpl.h:40:54: error: 'state' may be used uninitialized in this function [-Werror=maybe-uninitialized]
     bool check(const Value & value_) { return !empty && value == value_; }
                                               ~~~~~~~^~~~~~~~~~~~~~~~~~

bool check(const Value & value_) { return !empty && value == value_; }

template <typename Key>
bool check(const Key & key) { return !empty && value.first == key; }
IGNORE_MAYBE_UNINITIALIZED_POP
};

template <typename Data>
Expand Down Expand Up @@ -108,6 +110,7 @@ class HashMethodBase
using EmplaceResult = EmplaceResultImpl<Mapped>;
using FindResult = FindResultImpl<Mapped>;
static constexpr bool has_mapped = !std::is_same<Mapped, void>::value;
static constexpr bool key_to_arena = false;
using Cache = LastElementCache<Value, consecutive_keys_optimization>;

static HashMethodContextPtr createContext(const HashMethodContext::Settings &) { return nullptr; }
Expand All @@ -116,7 +119,7 @@ class HashMethodBase
ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool)
{
auto key = static_cast<Derived &>(*this).getKey(row, pool);
return emplaceKeyImpl(key, data, pool);
return emplaceKeyImpl<Derived::key_to_arena>(key, data, pool);
}

template <typename Data>
Expand All @@ -140,27 +143,12 @@ class HashMethodBase
protected:
Cache cache;

HashMethodBase()
{
if constexpr (consecutive_keys_optimization)
{
if constexpr (has_mapped)
{
/// Init PairNoInit elements.
cache.value.second = Mapped();
cache.value.first = {};
}
else
cache.value = Value();
}
}

template <typename Key>
static ALWAYS_INLINE void onNewKey(Key & /*key*/, Arena & /*pool*/) {}
template <typename Key>
static ALWAYS_INLINE void onExistingKey(Key & /*key*/, Arena & /*pool*/) {}

template <typename Data, typename Key>
template <bool may_place_key_to_arena, typename Data, typename Key>
ALWAYS_INLINE EmplaceResult emplaceKeyImpl(Key key, Data & data, Arena & pool)
{
if constexpr (Cache::consecutive_keys_optimization)
Expand All @@ -178,7 +166,10 @@ class HashMethodBase

typename Data::iterator it;
bool inserted = false;
data.emplace(key, it, inserted);
if constexpr (may_place_key_to_arena)
data.emplace(key, it, inserted, pool);
else
data.emplace(key, it, inserted);

[[maybe_unused]] Mapped * cached = nullptr;
if constexpr (has_mapped)
Expand All @@ -187,12 +178,7 @@ class HashMethodBase
if (inserted)
{
if constexpr (has_mapped)
{
new(&it->getSecond()) Mapped();
static_cast<Derived &>(*this).onNewKey(it->getFirstMutable(), pool);
}
else
static_cast<Derived &>(*this).onNewKey(it->getValueMutable(), pool);
}
else
static_cast<Derived &>(*this).onExistingKey(key, pool);
Expand Down
1 change: 0 additions & 1 deletion dbms/src/Common/HashTable/FixedClearableHashSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ struct FixedClearableHashTableCell
struct CellExt
{
Key key;
value_type & getValueMutable() { return key; }
const value_type & getValue() const { return key; }
void update(Key && key_, FixedClearableHashTableCell *) { key = key_; }
};
Expand Down
89 changes: 87 additions & 2 deletions dbms/src/Common/HashTable/FixedHashMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ struct FixedHashMapCell
using State = TState;

using value_type = PairNoInit<Key, Mapped>;
bool full;
Mapped mapped;
bool full;

FixedHashMapCell() {}
FixedHashMapCell(const Key &, const State &) : full(true) {}
Expand Down Expand Up @@ -53,12 +53,87 @@ class FixedHashMap : public FixedHashTable<Key, FixedHashMapCell<Key, Mapped>, A
{
public:
using Base = FixedHashTable<Key, FixedHashMapCell<Key, Mapped>, Allocator>;
using Self = FixedHashMap;
using key_type = Key;
using mapped_type = Mapped;
using value_type = typename Base::cell_type::value_type;
using Cell = typename Base::cell_type;
using value_type = typename Cell::value_type;

using Base::Base;

/// Merge every cell's value of current map into the destination map.
/// Func should have signature void(Mapped & dst, Mapped & src, bool emplaced).
/// Each filled cell in current map will invoke func once. If that map doesn't
/// have a key equals to the given cell, a new cell gets emplaced into that map,
/// and func is invoked with the third argument emplaced set to true. Otherwise
/// emplaced is set to false.
template <typename Func>
void mergeToViaEmplace(Self & that, Func && func)
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's worth to have a comment about why we use internal iteration, like you described in the pull request, and also mention what arguments the Func should take.

The names of the functions are somewhat unclear. Maybe something like mergeToEmplace -> merge and mergeToFind -> findInBoth?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, comments are needed among other places as well. Naming things is hard (https://martinfowler.com/bliki/TwoHardThings.html) . I'm open to suggestions.

auto this_buf = this->buf;
auto that_buf = that.buf;
for (auto i = 0ul; i < this->BUFFER_SIZE; ++i)
{
if (this_buf[i].isZero(*this))
continue;
if (that_buf[i].isZero(that))
{
new (&that_buf[i]) Cell(i, that);
++that.m_size;
func(that_buf[i].getSecond(), this_buf[i].getSecond(), true);
}
else
func(that_buf[i].getSecond(), this_buf[i].getSecond(), false);
}
}

/// Merge every cell's value of current map into the destination map via find.
/// Func should have signature void(Mapped & dst, Mapped & src, bool exist).
/// Each filled cell in current map will invoke func once. If that map doesn't
/// have a key equals to the given cell, func is invoked with the third argument
/// exist set to false. Otherwise exist is set to true.
template <typename Func>
void mergeToViaFind(Self & that, Func && func)
{
auto this_buf = this->buf;
auto that_buf = that.buf;
for (auto i = 0ul; i < this->BUFFER_SIZE; ++i)
{
if (this_buf[i].isZero(*this))
continue;
if (that_buf[i].isZero(that))
func(this_buf[i].getSecond(), this_buf[i].getSecond(), false);
else
func(that_buf[i].getSecond(), this_buf[i].getSecond(), true);
}
}

template <typename Func>
void forEachValue(Func && func)
{
ValueHolder value;
auto this_buf = this->buf;
for (auto i = 0u; i < this->BUFFER_SIZE; ++i)
{
if (this_buf[i].isZero(*this))
continue;
value = {static_cast<Key>(i), &this_buf[i].getSecond()};
func(value);
}
}

template <typename Func>
void forEachMapped(Func && func)
{
auto this_buf = this->buf;
akuzm marked this conversation as resolved.
Show resolved Hide resolved
for (auto i = 0u; i < this->BUFFER_SIZE; ++i)
{
if (this_buf[i].isZero(*this))
continue;
func(this_buf[i].getSecond());
}
}

mapped_type & ALWAYS_INLINE operator[](Key x)
{
typename Base::iterator it;
Expand All @@ -69,4 +144,14 @@ class FixedHashMap : public FixedHashTable<Key, FixedHashMapCell<Key, Mapped>, A

return it->getSecond();
}

struct ValueHolder
{
Key key;
mapped_type * mapped;
auto * operator-> () { return this; }
mapped_type & getSecond() { return *mapped; }
const mapped_type & getSecond() const { return *mapped; }
value_type getValue() const { return {key, *mapped}; }
};
};