Skip to content

Commit

Permalink
Merge branch 'master' of github.com:cliqz/keyvi into remove-c-string-…
Browse files Browse the repository at this point in the history
…limitation-rebase2

Conflicts:
	pykeyvi/src/pykeyvi.cpp
  • Loading branch information
Hendrik Muhs committed Aug 22, 2016
2 parents 793f0ff + d7f8dbb commit 0c498ef
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 809 deletions.
85 changes: 55 additions & 30 deletions keyvi/src/cpp/dictionary/dictionary_merger.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#define DICTIONARY_MERGER_H_

#include <queue>
#include <memory>

#include "dictionary/fsa/generator.h"
#include "dictionary/fsa/automata.h"
Expand All @@ -44,25 +45,50 @@ typedef const fsa::internal::IValueStoreWriter::vs_param_t merger_param_t;
template<class PersistenceT, class ValueStoreT = fsa::internal::NullValueStore>
class DictionaryMerger
final {
private:
/**
 * Pairs a copy of an fsa::EntryIterator with an integer priority so that
 * entries can be ordered by operator< (e.g. inside a std::priority_queue).
 */
struct SegmentEntryForMerge
{
  fsa::EntryIterator entry_iterator;
  int priority;

  /// Stores a copy of `it` together with the priority `prio`.
  SegmentEntryForMerge(fsa::EntryIterator& it, int prio)
      : entry_iterator(it), priority(prio) {}

  /**
   * Ordering used for merging. The semantics differ from a plain key
   * comparison on purpose: for equal keys the entry with the higher
   * priority has to be taken.
   *
   * NOTE(review): both branches compare "this iterator against rhs", once
   * through operator> and once through operator< with swapped operands; the
   * tie-breaking presumably depends on how fsa::EntryIterator defines these
   * two operators for equal keys - confirm before simplifying.
   */
  bool operator<(const SegmentEntryForMerge& rhs) const
  {
    const bool has_lower_priority = priority < rhs.priority;

    return has_lower_priority ? (entry_iterator > rhs.entry_iterator)
                              : (rhs.entry_iterator < entry_iterator);
  }
};
private:
/**
 * Wraps an fsa::EntryIterator together with an integer priority and provides
 * the ordering needed to keep such iterators in a std::priority_queue.
 *
 * The wrapped iterator is held through a std::shared_ptr, so copying a
 * SegmentIterator copies the pointer only: all copies refer to (and advance)
 * the same underlying fsa::EntryIterator.
 */
class SegmentIterator {
  using EntryIteratorPtr = std::shared_ptr<fsa::EntryIterator>;

 public:
  /// Takes a copy of `e` (owned via shared_ptr) and remembers the priority `p`.
  SegmentIterator(const fsa::EntryIterator& e, int p) :
    entry_iterator_ptr_(std::make_shared<fsa::EntryIterator>(e)),
    priority_(p)
  {}

  /**
   * Ordering used for merging. The semantics differ from a plain key
   * comparison on purpose: for equal keys the iterator with the higher
   * priority has to be taken.
   *
   * NOTE(review): both branches compare "this iterator against rhs", once
   * through operator> and once through operator< with swapped operands; the
   * tie-breaking presumably depends on how fsa::EntryIterator defines these
   * two operators for equal keys - confirm before simplifying.
   */
  bool operator<(const SegmentIterator& rhs) const {
    if (priority_ < rhs.priority_) {
      return entryIterator() > rhs.entryIterator();
    }

    return rhs.entryIterator() < entryIterator();
  }

  /**
   * True as long as the wrapped iterator differs from the end iterator.
   *
   * Declared explicit so that it only takes part in contextual conversions
   * (`if (it)`, `while (it)`, `!it`) and cannot silently turn a
   * SegmentIterator into a bool/int inside arithmetic or comparisons.
   */
  explicit operator bool() const {
    return entryIterator() != endIterator();
  }

  /// Advances the wrapped (shared) iterator by one entry.
  SegmentIterator& operator++() {
    ++(*entry_iterator_ptr_);
    return *this;
  }

  /// Read-only access to the wrapped iterator.
  const fsa::EntryIterator& entryIterator() const {
    return *entry_iterator_ptr_;
  }

 private:
  /// Default-constructed fsa::EntryIterator used as the end marker; created once.
  static const fsa::EntryIterator& endIterator() {
    static fsa::EntryIterator end_it;
    return end_it;
  }

  EntryIteratorPtr entry_iterator_ptr_;
  int priority_;
};

public:
DictionaryMerger(size_t memory_limit = 1073741824,
Expand All @@ -84,13 +110,12 @@ final {
}

void Merge(const std::string& filename){
std::priority_queue<SegmentEntryForMerge> pqueue;
fsa::EntryIterator end_it;
std::priority_queue<SegmentIterator> pqueue;

int i = 0;
for (auto fsa: dicts_to_merge_) {
fsa::EntryIterator e_it(fsa);
pqueue.push(SegmentEntryForMerge(e_it, i++));
pqueue.push(SegmentIterator(e_it, i++));
}

ValueStoreT* value_store = new ValueStoreT(params_);
Expand All @@ -100,19 +125,19 @@ final {
std::string top_key;

while(!pqueue.empty()){
auto entry_it = pqueue.top();
auto segment_it = pqueue.top();
pqueue.pop();

top_key = entry_it.entry_iterator.GetKey();
top_key = segment_it.entryIterator().GetKey();

// check for same keys and merge only the most recent one
while (!pqueue.empty() and pqueue.top().entry_iterator.operator==(top_key)) {
while (!pqueue.empty() and pqueue.top().entryIterator().operator==(top_key)) {

auto to_inc = pqueue.top();
TRACE("removing element with prio %d (in favor of %d)", to_inc.priority, entry_it.priority);

pqueue.pop();
if (++to_inc.entry_iterator != end_it) {
if (++to_inc) {
TRACE("push iterator");
pqueue.push(to_inc);
}
Expand All @@ -125,15 +150,15 @@ final {
//handle.weight = value_store_->GetWeightValue(value);
handle.weight = 0;

handle.value_idx = value_store->GetValue(entry_it.entry_iterator.GetFsa()->GetValueStore()->GetValueStorePayload(),
entry_it.entry_iterator.GetValueId(),
handle.value_idx = value_store->GetValue(segment_it.entryIterator().GetFsa()->GetValueStore()->GetValueStorePayload(),
segment_it.entryIterator().GetValueId(),
handle.no_minimization);

TRACE("Add key: %s", top_key.c_str());
generator.Add(std::move(top_key), handle);

if (++entry_it.entry_iterator != end_it) {
pqueue.push(entry_it);
if (++segment_it) {
pqueue.push(segment_it);
}
}
TRACE("finished iterating, do final compile.");
Expand Down
9 changes: 9 additions & 0 deletions keyvi/src/cpp/dictionary/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,15 @@ struct Match {
return fsa_->GetRawValueAsString(state_);
}

std::string GetMsgPackedValueAsString() const {
const std::string raw_value = GetRawValueAsString();
if (raw_value.empty()) {
return raw_value;
}
const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value);
return decompressor(raw_value);
}

/**
* being able to set the value, e.g. when keyvi is used over network boundaries
*
Expand Down
1 change: 0 additions & 1 deletion pykeyvi/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ def run(self):

install_requires = [
'msgpack-python',
'python-snappy',
]

setup(
Expand Down
15 changes: 3 additions & 12 deletions pykeyvi/src/addons/Match.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,11 @@

def GetValue(self):
"""Decodes a keyvi value and returns it."""
value = self.inst.get().GetRawValueAsString()
if value is None or len(value) == 0:
cdef libcpp_string packed_value = self.inst.get().GetMsgPackedValueAsString()
if packed_value.empty():
return None

elif value[0] == '\x00':
return msgpack.loads(value[1:])

elif value[0] == '\x01':
value = zlib.decompress(value[1:])

elif value[0] == '\x02':
value = snappy.decompress(value[1:])

return msgpack.loads(value)
return msgpack.loads(packed_value)


def dumps(self):
Expand Down
2 changes: 0 additions & 2 deletions pykeyvi/src/addons/autwrap_workarounds.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,3 @@ from libc.stdint cimport uint32_t

import json
import msgpack
import zlib
import snappy
1 change: 1 addition & 0 deletions pykeyvi/src/pxds/match.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ cdef extern from "dictionary/match.h" namespace "keyvi::dictionary":
PyObject* GetAttributePy(libcpp_string) nogil except + # wrap-ignore
libcpp_string GetValueAsString() except +
libcpp_string GetRawValueAsString() except +
libcpp_string GetMsgPackedValueAsString() except + # wrap-ignore
void SetRawValue(libcpp_string) except + # wrap-ignore
void SetAttribute(libcpp_string, libcpp_string) except + # wrap-ignore
void SetAttribute(libcpp_string, float) except + # wrap-ignore
Expand Down
Loading

0 comments on commit 0c498ef

Please sign in to comment.