Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Api cleanup + SetManifest for Merger #118

Merged
merged 10 commits into from
Aug 23, 2016
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ final {
};

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

// get query length
size_t query_length = strlen(query);
const size_t query_length = query.size();

// get tokens
std::vector<std::string> strs;
Expand Down
2 changes: 1 addition & 1 deletion keyvi/src/cpp/dictionary/completion/multiword_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class MultiWordCompletion final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) const {

uint64_t state = fsa_->GetStartState();
size_t number_of_tokens;
Expand Down
14 changes: 7 additions & 7 deletions keyvi/src/cpp/dictionary/completion/prefix_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;

std::vector<unsigned char> traversal_stack;
Expand Down Expand Up @@ -93,7 +93,7 @@ final {
if (fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(reinterpret_cast<char*> (&data->traversal_stack[0]), query_length + data->traverser.GetDepth()).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
Expand Down Expand Up @@ -137,16 +137,16 @@ final {
}

MatchIterator::MatchIteratorPair GetFuzzyCompletions(
const char* query, int max_edit_distance) {
const std::string& query, int max_edit_distance) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;
const size_t minimum_exact_prefix = 2;
size_t exact_prefix = std::min(query_length, minimum_exact_prefix);
std::vector<int> codepoints;

utf8::unchecked::utf8to32(query, query + query_length,
utf8::unchecked::utf8to32(query.c_str(), query.c_str() + query_length,
back_inserter(codepoints));

stringdistance::Levenshtein metric(codepoints, 20, 3);
Expand Down Expand Up @@ -185,7 +185,7 @@ final {
if (depth == query_length && fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(query, query_length).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
Expand Down
29 changes: 14 additions & 15 deletions keyvi/src/cpp/dictionary/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ final {
* @param filename the filename
* @param load_lazy whether to load lazy.
*/
Dictionary(const char* filename, bool load_lazy)
Dictionary(const std::string& filename, bool load_lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, load_lazy)) {
TRACE("Dictionary from file %s", filename);
}
Expand All @@ -60,7 +60,7 @@ final {
* @param filename filename to load keyvi file from.
* @param loading_strategy optional: Loading strategy to use.
*/
explicit Dictionary(const char* filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
explicit Dictionary(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, loading_strategy)) {
TRACE("Dictionary from file %s", filename);
}
Expand All @@ -70,7 +70,7 @@ final {
}

// temporary implementation
fsa::automata_t GetFsa() {
fsa::automata_t GetFsa() const {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest keeping this getter without the const qualifier; otherwise it's possible to get the pointer from a const Dictionary and modify its internal state from outside.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or we can keep the const qualifier but change the return type to std::shared_ptr<const Automata>, which ensures read-only behavior.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, there is no internal state in Automata. So the only state would be the shared pointer itself. I will try it.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return fsa_;
}

Expand All @@ -88,9 +88,9 @@ final {
* @param key The key
* @return True if key is in the dictionary, False otherwise.
*/
bool Contains(const char* key) const {
bool Contains(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t key_length = strlen(key);
const size_t key_length = key.size();

TRACE("Contains for %s", key);
for (size_t i = 0; i < key_length; ++i) {
Expand All @@ -111,9 +111,9 @@ final {
return false;
}

Match operator[](const char* key) const {
Match operator[](const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -142,9 +142,9 @@ final {
* @param key the key to lookup.
* @return a match iterator
*/
MatchIterator::MatchIteratorPair Get(const char* key) const {
MatchIterator::MatchIteratorPair Get(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -256,11 +256,11 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair Lookup(const char* text,
MatchIterator::MatchIteratorPair Lookup(const std::string& text,
size_t offset = 0) {

uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(text);
const size_t text_length = text.size();
uint64_t last_final_state = 0;
size_t last_final_state_position = 0;

Expand All @@ -287,8 +287,7 @@ final {
m = Match(
offset,
last_final_state_position,
/*text.substr(0, last_final_state_position),*/
std::string(text + offset, last_final_state_position - offset),
text.substr(offset, last_final_state_position - offset),
0,
fsa_,
fsa_->GetStateValue(last_final_state));
Expand All @@ -313,9 +312,9 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair LookupText(const char* text) {
MatchIterator::MatchIteratorPair LookupText(const std::string& text) {

size_t text_length = strlen(text);
const size_t text_length = text.size();
std::queue<MatchIterator> iterators;

TRACE("LookupText, 1st lookup for: %s", text);
Expand Down
4 changes: 3 additions & 1 deletion keyvi/src/cpp/dictionary/dictionary_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ class DictionaryCompiler
if (params_.count(TEMPORARY_PATH_KEY) == 0) {
params_[TEMPORARY_PATH_KEY] =
boost::filesystem::temp_directory_path().string();

} else {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// set temp path for tpie
initializer_.SetTempDirectory(params_[TEMPORARY_PATH_KEY]);
}

TRACE("tmp path set to %s", params_[TEMPORARY_PATH_KEY].c_str());
Expand Down
6 changes: 3 additions & 3 deletions keyvi/src/cpp/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ class Automata
final {

public:
Automata(const char * filename, bool load_lazy):
Automata(const std::string& filename, bool load_lazy):
Automata(filename, load_lazy ? loading_strategy_types::default_os : loading_strategy_types::populate) {}

explicit Automata(const char * filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
explicit Automata(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
std::ifstream in_stream(filename, std::ios::binary);

if (!in_stream.good()) {
Expand Down Expand Up @@ -85,7 +85,7 @@ final {
size_t offset = in_stream.tellg();

file_mapping_ = new boost::interprocess::file_mapping(
filename, boost::interprocess::read_only);
filename.c_str(), boost::interprocess::read_only);
size_t array_size = boost::lexical_cast<size_t>(sparse_array_properties_.get<std::string>("size"));

in_stream.seekg(offset + array_size + bucket_size * array_size - 1);
Expand Down
23 changes: 9 additions & 14 deletions keyvi/src/cpp/dictionary/fsa/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,11 @@ namespace fsa {
* @returns length of the longest common prefix of given strings
*/

inline size_t get_common_prefix_length(const char* first, const char* second) {
inline size_t get_common_prefix_length(const std::string& first, const std::string& second) {

size_t common_prefix_length = 0;

while (first[common_prefix_length] == second[common_prefix_length]
&& first[common_prefix_length] != 0) {
while (first[common_prefix_length] == second[common_prefix_length] && common_prefix_length < first.size()) {
++common_prefix_length;
}
return common_prefix_length;
Expand Down Expand Up @@ -190,9 +189,7 @@ final {
void Add(const std::string& input_key, typename ValueStoreT::value_t value =
ValueStoreT::no_value) {

const char* key = input_key.c_str();

size_t commonPrefixLength = get_common_prefix_length(last_key_.c_str(), key);
const size_t commonPrefixLength = get_common_prefix_length(last_key_, input_key);

// keys are equal, just return
if (commonPrefixLength == input_key.size() && last_key_.size() == input_key.size()) {
Expand All @@ -203,7 +200,7 @@ final {
ConsumeStack(commonPrefixLength);

// put everything that is not common between the two strings (the suffix) into the stack
FeedStack(commonPrefixLength, input_key.size(), key);
FeedStack(commonPrefixLength, input_key.size(), input_key.c_str());
Copy link

@ghost ghost Aug 23, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about changing the signature of FeedStack to take a const std::string& input_key parameter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch! FeedStack already takes a const std::string&, so an implicit conversion happens here.


// get value and mark final state
bool no_minimization = false;
Expand All @@ -220,7 +217,7 @@ final {
stack_->UpdateWeights(0, input_key.size() + 1, weight);
}

last_key_ = key;
last_key_ = input_key;
state_ = generator_state::FEEDING;
}

Expand All @@ -231,9 +228,7 @@ final {
*/
void Add(const std::string& input_key, const ValueHandle& handle) {

const char* key = input_key.c_str();

size_t commonPrefixLength = get_common_prefix_length(last_key_.c_str(), key);
const size_t commonPrefixLength = get_common_prefix_length(last_key_, input_key);

// keys are equal, just return
if (commonPrefixLength == input_key.size() && last_key_.size() == input_key.size()) {
Expand All @@ -244,7 +239,7 @@ final {
ConsumeStack(commonPrefixLength);

// put everything that is not common between the two strings (the suffix) into the stack
FeedStack(commonPrefixLength, input_key.size(), key);
FeedStack(commonPrefixLength, input_key.size(), input_key.c_str());

stack_->InsertFinalState(input_key.size(), handle.value_idx, handle.no_minimization);

Expand All @@ -256,7 +251,7 @@ final {
stack_->UpdateWeights(0, input_key.size() + 1, handle.weight);
}

last_key_ = std::move(input_key);
last_key_ = input_key;
state_ = generator_state::FEEDING;
}

Expand Down Expand Up @@ -358,7 +353,7 @@ final {
internal::SerializationUtils::WriteJsonRecord(stream, pt);
}

inline void FeedStack(const size_t start, const size_t end, const char* key) {
inline void FeedStack(const size_t start, const size_t end, const std::string& key) {
for (size_t i = start; i < end; ++i) {
uint32_t ukey =
static_cast<uint32_t>(static_cast<unsigned char>(key[i]));
Expand Down
4 changes: 2 additions & 2 deletions pykeyvi/src/addons/Dictionary.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
key = key.encode('utf-8')
assert isinstance(key, bytes), 'arg in_0 wrong type'

cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<const_char *>key)]))
cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<libcpp_string>key)]))

if _r.get().IsEmpty():
return default
Expand All @@ -30,7 +30,7 @@

assert isinstance(key, bytes), 'arg in_0 wrong type'

cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<const_char *>key)]))
cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<libcpp_string>key)]))

if _r.get().IsEmpty():
raise KeyError(key)
Expand Down
4 changes: 2 additions & 2 deletions pykeyvi/src/addons/JsonDictionaryCompiler.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

if isinstance(key, unicode):
key = key.encode('UTF-8')
cdef const_char * input_in_0 = <const_char *> key
cdef libcpp_string input_in_0 = <libcpp_string> key

if isinstance(value, unicode):
value = value.encode('UTF-8')
cdef const_char * input_in_1 = <const_char *> value
cdef libcpp_string input_in_1 = <libcpp_string> value

self.inst.get().Add(input_in_0, input_in_1)

Expand Down
14 changes: 7 additions & 7 deletions pykeyvi/src/pxds/dictionary.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ cdef extern from "dictionary/dictionary.h" namespace "keyvi::dictionary":
populate_key_part_no_readahead_value_part # populate the key part, but disable read ahead value part

cdef cppclass Dictionary:
Dictionary (const_char* filename) except +
Dictionary (const_char* filename, loading_strategy_types) except +
bool Contains (const_char*) # wrap-ignore
Match operator[](const_char*) # wrap-ignore
_MatchIteratorPair Get (const_char*)
Dictionary (libcpp_string filename) except +
Dictionary (libcpp_string filename, loading_strategy_types) except +
bool Contains (libcpp_string) # wrap-ignore
Match operator[](libcpp_string) # wrap-ignore
_MatchIteratorPair Get (libcpp_string)
_MatchIteratorPair GetNear (libcpp_string, size_t minimum_prefix_length) except +
_MatchIteratorPair GetNear (libcpp_string, size_t minimum_prefix_length, bool greedy) except +
_MatchIteratorPair GetAllItems () # wrap-ignore
_MatchIteratorPair Lookup(const_char*)
_MatchIteratorPair LookupText(const_char*)
_MatchIteratorPair Lookup(libcpp_string)
_MatchIteratorPair LookupText(libcpp_string)
libcpp_string GetManifestAsString() except + # wrap-ignore
libcpp_string GetStatistics() # wrap-ignore
uint32_t GetSize() # wrap-ignore
22 changes: 11 additions & 11 deletions pykeyvi/src/pxds/dictionary_compiler.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ cdef extern from "dictionary/dictionary_types.h" namespace "keyvi::dictionary":
CompletionDictionaryCompiler() except +
CompletionDictionaryCompiler(size_t memory_limit) except +
CompletionDictionaryCompiler(size_t memory_limit, libcpp_map[libcpp_string, libcpp_string] value_store_params) except +
void Add(const_char*, int) except +
void __setitem__ (const_char*, int) except +
void Add(libcpp_string, int) except +
void __setitem__ (libcpp_string, int) except +
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifestFromString(const_char*) # wrap-ignore
Expand All @@ -20,7 +20,7 @@ cdef extern from "dictionary/dictionary_types.h" namespace "keyvi::dictionary":
KeyOnlyDictionaryCompiler() except +
KeyOnlyDictionaryCompiler(size_t memory_limit) except +
KeyOnlyDictionaryCompiler(size_t memory_limit, libcpp_map[libcpp_string, libcpp_string] value_store_params) except +
void Add(const_char*) except +
void Add(libcpp_string) except +
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifestFromString(const_char*) # wrap-ignore
Expand All @@ -30,21 +30,21 @@ cdef extern from "dictionary/dictionary_types.h" namespace "keyvi::dictionary":
JsonDictionaryCompiler() except +
JsonDictionaryCompiler(size_t memory_limit) except +
JsonDictionaryCompiler(size_t memory_limit, libcpp_map[libcpp_string, libcpp_string] value_store_params) except +
void Add(const_char*, const_char*) except + # wrap-ignore
void __setitem__(const_char*, const_char*) except +
void Add(libcpp_string, libcpp_string) except + # wrap-ignore
void __setitem__(libcpp_string, libcpp_string) except +
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifestFromString(const_char*) except + # wrap-ignore
void WriteToFile(const_char*)
void SetManifestFromString(libcpp_string) except + # wrap-ignore
void WriteToFile(libcpp_string)

cdef cppclass StringDictionaryCompiler:
StringDictionaryCompiler() except +
StringDictionaryCompiler(size_t memory_limit) except +
StringDictionaryCompiler(size_t memory_limit, libcpp_map[libcpp_string, libcpp_string] value_store_params) except +
void Add(const_char*, const_char*) except +
void __setitem__(const_char*, const_char*) except +
void Add(libcpp_string, libcpp_string) except +
void __setitem__(libcpp_string, libcpp_string) except +
void Compile() nogil # wrap-ignore
void Compile(callback_t, void*) nogil # wrap-ignore
void SetManifestFromString(const_char*) # wrap-ignore
void WriteToFile(const_char*)
void SetManifestFromString(libcpp_string) # wrap-ignore
void WriteToFile(libcpp_string)

Loading