Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: Remove c string limitation rebase2 #115

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ final {
};

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

// get query length
size_t query_length = strlen(query);
const size_t query_length = query.size();

// get tokens
std::vector<std::string> strs;
Expand Down
8 changes: 3 additions & 5 deletions keyvi/src/cpp/dictionary/completion/multiword_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class MultiWordCompletion final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) const {

uint64_t state = fsa_->GetStartState();
size_t number_of_tokens;
Expand Down Expand Up @@ -107,10 +107,8 @@ class MultiWordCompletion final {
TRACE("prefix completion callback called");

for (;;) {
unsigned char label = data->traverser.GetStateLabel();

if (label) {

if (data->traverser) {
unsigned char label = data->traverser.GetStateLabel();
if (label == 0x1b){
data->multi_word_boundary = data->traverser.GetDepth();
TRACE("found MW boundary at %d", data->multi_word_boundary);
Expand Down
26 changes: 11 additions & 15 deletions keyvi/src/cpp/dictionary/completion/prefix_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;

std::vector<unsigned char> traversal_stack;
Expand Down Expand Up @@ -93,20 +93,18 @@ final {
if (fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(reinterpret_cast<char*> (&data->traversal_stack[0]), query_length + data->traverser.GetDepth()).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
[data, query_length] () {
TRACE("prefix completion callback called");

for (;;) {
unsigned char label = data->traverser.GetStateLabel();

if (label) {
if (data->traverser) {

data->traversal_stack.resize(query_length+data->traverser.GetDepth()-1);
data->traversal_stack.push_back(label);
data->traversal_stack.push_back(data->traverser.GetStateLabel());
TRACE("Current depth %d (%d)", query_length + data->traverser.GetDepth() -1, data->traversal_stack.size());
if (data->traverser.IsFinalState()) {
std::string match_str = std::string(reinterpret_cast<char*> (&data->traversal_stack[0]), query_length + data->traverser.GetDepth())
Expand Down Expand Up @@ -137,16 +135,16 @@ final {
}

MatchIterator::MatchIteratorPair GetFuzzyCompletions(
const char* query, int max_edit_distance) {
const std::string& query, int max_edit_distance) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;
const size_t minimum_exact_prefix = 2;
size_t exact_prefix = std::min(query_length, minimum_exact_prefix);
std::vector<int> codepoints;

utf8::unchecked::utf8to32(query, query + query_length,
utf8::unchecked::utf8to32(query.c_str(), query.c_str() + query_length,
back_inserter(codepoints));

stringdistance::Levenshtein metric(codepoints, 20, 3);
Expand Down Expand Up @@ -185,20 +183,18 @@ final {
if (depth == query_length && fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(query, query_length).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
[data, query_length, max_edit_distance, exact_prefix] () {
TRACE("prefix completion callback called");
for (;;) {
int label = data->traverser.GetStateLabel();

if (label) {
if (data->traverser) {

TRACE("Current depth %d", exact_prefix + data->traverser.GetDepth() -1);

int score = data->metric.Put(label, exact_prefix + data->traverser.GetDepth() - 1);
int score = data->metric.Put(data->traverser.GetStateLabel(), exact_prefix + data->traverser.GetDepth() - 1);

TRACE("Intermediate score %d", score);

Expand Down
51 changes: 23 additions & 28 deletions keyvi/src/cpp/dictionary/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ final {
* @param filename the filename
* @param load_lazy whether to load lazy.
*/
Dictionary(const char* filename, bool load_lazy)
Dictionary(const std::string& filename, bool load_lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, load_lazy)) {
TRACE("Dictionary from file %s", filename);
TRACE("Dictionary from file %s", filename.c_str());
}

/**
Expand All @@ -60,17 +60,17 @@ final {
* @param filename filename to load keyvi file from.
* @param loading_strategy optional: Loading strategy to use.
*/
explicit Dictionary(const char* filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
explicit Dictionary(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, loading_strategy)) {
TRACE("Dictionary from file %s", filename);
TRACE("Dictionary from file %s", filename.c_str());
}

Dictionary(fsa::automata_t f)
: fsa_(f) {
}

// temporary implementation
fsa::automata_t GetFsa() {
fsa::automata_t GetFsa() const {
return fsa_;
}

Expand All @@ -88,11 +88,11 @@ final {
* @param key The key
* @return True if key is in the dictionary, False otherwise.
*/
bool Contains(const char* key) const {
bool Contains(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t key_length = strlen(key);
const size_t key_length = key.size();

TRACE("Contains for %s", key);
TRACE("Contains for %s", key.c_str());
for (size_t i = 0; i < key_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);

Expand All @@ -111,9 +111,9 @@ final {
return false;
}

Match operator[](const char* key) const {
Match operator[](const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -142,9 +142,9 @@ final {
* @param key the key to lookup.
* @return a match iterator
*/
MatchIterator::MatchIteratorPair Get(const char* key) const {
MatchIterator::MatchIteratorPair Get(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -217,11 +217,9 @@ final {
TRACE("GetAllKeys callback called");

for (;;) {
unsigned char label = data->traverser.GetStateLabel();

if (label) {
if (!data->traverser.AtEnd()) {
data->traversal_stack.resize(data->traverser.GetDepth()-1);
data->traversal_stack.push_back(label);
data->traversal_stack.push_back(data->traverser.GetStateLabel());
TRACE("Current depth %d (%d)", data->traverser.GetDepth() -1, data->traversal_stack.size());

if (data->traverser.IsFinalState()) {
Expand Down Expand Up @@ -256,11 +254,11 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair Lookup(const char* text,
MatchIterator::MatchIteratorPair Lookup(const std::string& text,
size_t offset = 0) {

uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(text);
const size_t text_length = text.size();
uint64_t last_final_state = 0;
size_t last_final_state_position = 0;

Expand All @@ -287,8 +285,7 @@ final {
m = Match(
offset,
last_final_state_position,
/*text.substr(0, last_final_state_position),*/
std::string(text + offset, last_final_state_position - offset),
text.substr(offset, last_final_state_position - offset),
0,
fsa_,
fsa_->GetStateValue(last_final_state));
Expand All @@ -313,12 +310,12 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair LookupText(const char* text) {
MatchIterator::MatchIteratorPair LookupText(const std::string& text) {

size_t text_length = strlen(text);
const size_t text_length = text.size();
std::queue<MatchIterator> iterators;

TRACE("LookupText, 1st lookup for: %s", text);
TRACE("LookupText, 1st lookup for: %s", text.c_str());

iterators.push(Lookup(text).begin());
size_t position = 1;
Expand All @@ -330,7 +327,7 @@ final {
}

++position;
TRACE("LookupText, starting lookup for: %s", text+position);
TRACE("LookupText, starting lookup for: %s", text.c_str()+position);
iterators.push(Lookup(text, position).begin());
}

Expand Down Expand Up @@ -407,13 +404,11 @@ final {


for (;;) {
unsigned char label = data->traverser.GetStateLabel();

// check minimum depth
if (label && data->traverser.GetDepth() > data->matched_depth) {
if (!data->traverser.AtEnd() && data->traverser.GetDepth() > data->matched_depth) {

data->traversal_stack.resize(data->traverser.GetDepth()-1);
data->traversal_stack.push_back(label);
data->traversal_stack.push_back(data->traverser.GetStateLabel());
TRACE("Current depth %d (%d)", minimum_prefix_length + data->traverser.GetDepth() -1, data->traversal_stack.size());
if (data->traverser.IsFinalState()) {
// optimize? fill vector upfront?
Expand Down
4 changes: 3 additions & 1 deletion keyvi/src/cpp/dictionary/dictionary_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ class DictionaryCompiler
if (params_.count(TEMPORARY_PATH_KEY) == 0) {
params_[TEMPORARY_PATH_KEY] =
boost::filesystem::temp_directory_path().string();

} else {
// set temp path for tpie
initializer_.SetTempDirectory(params_[TEMPORARY_PATH_KEY]);
}

TRACE("tmp path set to %s", params_[TEMPORARY_PATH_KEY].c_str());
Expand Down
37 changes: 5 additions & 32 deletions keyvi/src/cpp/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ class Automata
final {

public:
Automata(const char * filename, bool load_lazy):
Automata(const std::string& filename, bool load_lazy):
Automata(filename, load_lazy ? loading_strategy_types::default_os : loading_strategy_types::populate) {}

explicit Automata(const char * filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
explicit Automata(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
std::ifstream in_stream(filename, std::ios::binary);

if (!in_stream.good()) {
Expand Down Expand Up @@ -85,7 +85,7 @@ final {
size_t offset = in_stream.tellg();

file_mapping_ = new boost::interprocess::file_mapping(
filename, boost::interprocess::read_only);
filename.c_str(), boost::interprocess::read_only);
size_t array_size = boost::lexical_cast<size_t>(sparse_array_properties_.get<std::string>("size"));

in_stream.seekg(offset + array_size + bucket_size * array_size - 1);
Expand Down Expand Up @@ -194,24 +194,12 @@ final {
TRACE ("Bitmask %d", mask_int);

if (mask_int != 0) {
if (offset == 0) {
// in this case we have to ignore the first bit, so start counting from 1
mask_int = mask_int >> 1;
for (auto i=1; i<16; ++i) {
if ((mask_int & 1) == 1) {
TRACE("push symbol+%d", symbol + i);
traversal_state.Add(ResolvePointer(starting_state, symbol + i), symbol + i, payload);
}
mask_int = mask_int >> 1;
}
} else {
for (auto i=0; i<16; ++i) {
if ((mask_int & 1) == 1) {
TRACE("push symbol+%d", symbol + i);
traversal_state.Add(ResolvePointer(starting_state, symbol + i), symbol + i, payload);
}
mask_int = mask_int >> 1;
}
}
}

Expand All @@ -229,7 +217,7 @@ final {

uint64_t xor_labels_with_mask = *labels_as_ll^*mask_as_ll;

if (((xor_labels_with_mask & 0x00000000000000ffULL) == 0) && offset > 0){
if (((xor_labels_with_mask & 0x00000000000000ffULL) == 0)){
traversal_state.Add(ResolvePointer(starting_state, symbol), symbol, payload);
}
if ((xor_labels_with_mask & 0x000000000000ff00ULL)== 0){
Expand Down Expand Up @@ -291,20 +279,6 @@ final {
TRACE ("Bitmask %d", mask_int);

if (mask_int != 0) {
if (offset == 0) {
// in this case we have to ignore the first bit, so start counting from 1
mask_int = mask_int >> 1;
for (auto i=1; i<16; ++i) {
if ((mask_int & 1) == 1) {
TRACE("push symbol+%d", symbol + i);
uint64_t child_state = ResolvePointer(starting_state, symbol + i);
uint32_t weight = GetWeightValue(child_state);
weight = weight != 0 ? weight : parent_weight;
traversal_state.Add(child_state, weight, symbol + i, payload);
}
mask_int = mask_int >> 1;
}
} else {
for (auto i=0; i<16; ++i) {
if ((mask_int & 1) == 1) {
TRACE("push symbol+%d", symbol + i);
Expand All @@ -315,7 +289,6 @@ final {
}
mask_int = mask_int >> 1;
}
}
}

++labels_as_m128;
Expand All @@ -332,7 +305,7 @@ final {

uint64_t xor_labels_with_mask = *labels_as_ll^*mask_as_ll;

if (((xor_labels_with_mask & 0x00000000000000ffULL) == 0) && offset > 0){
if (((xor_labels_with_mask & 0x00000000000000ffULL) == 0)){
uint64_t child_state = ResolvePointer(starting_state, symbol);
uint32_t weight = GetWeightValue(child_state);
weight = weight != 0 ? weight : parent_weight;
Expand Down
Loading