Skip to content

Commit

Permalink
Merge pull request #164 from hendrik-cliqz/0_2_cleanup-remove-deprecated
Browse files Browse the repository at this point in the history
master to 0_2_cleanup
  • Loading branch information
hendrikmuhs committed Dec 6, 2016
2 parents 554f8bb + 63946d2 commit 66b56ce
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 38 deletions.
2 changes: 1 addition & 1 deletion keyvi/src/cpp/dictionary/fsa/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ final {

void WriteHeader(std::ostream& stream) {
boost::property_tree::ptree pt;
pt.put("version", "1");
pt.put("version", "2");
pt.put("start_state", std::to_string(start_state_));
pt.put("number_of_keys", std::to_string(number_of_keys_added_));
pt.put("value_store_type", std::to_string(value_store_->GetValueStoreType()));
Expand Down
13 changes: 13 additions & 0 deletions keyvi/src/cpp/dictionary/fsa/internal/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@
#ifndef CONSTANTS_H_
#define CONSTANTS_H_

// File format definitions used when reading keyvi files.

// Magic bytes expected at the very start of a keyvi file.
// KEYVI_FILE_MAGIC_LEN is the number of characters in KEYVI_FILE_MAGIC
// (without the terminating NUL), i.e. the number of bytes that are read and
// compared; keep the two definitions in sync.
#define KEYVI_FILE_MAGIC "KEYVIFSA"
#define KEYVI_FILE_MAGIC_LEN 8

// Minimum supported value of the "version" entry in the automata properties;
// KeyViFile rejects files with a lower version (std::invalid_argument).
static const int KEYVI_FILE_VERSION_MIN = 1;

// Minimum supported value of the "version" entry in the persistence
// (sparse array) properties; KeyViFile rejects files with a lower version
// (std::invalid_argument).
static const int KEYVI_FILE_PERSISTENCE_VERSION_MIN = 1;


#define NUMBER_OF_STATE_CODINGS 255
#define FINAL_OFFSET_TRANSITION 256
#define FINAL_OFFSET_CODE 1
Expand Down
24 changes: 9 additions & 15 deletions keyvi/src/cpp/dictionary/fsa/internal/sparse_array_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode
SlidingWindowBitArrayPositionTracker zerobyte_scrambling_state_start_positions_; //< special construct to mark states already in use for zerobyte handling

OffsetTypeT FindFreeBucket(
const UnpackedState<SparseArrayPersistence<uint16_t>>& unpacked_state) const {
UnpackedState<SparseArrayPersistence<uint16_t>>& unpacked_state) const {

// states (state ids) start with 1 as 0 is reserved to mark a 'none-state'
OffsetTypeT start_position =
Expand Down Expand Up @@ -186,7 +186,7 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode
}

if (unpacked_state[0].label != 0 && !taken_positions_in_sparsearray_.IsSet(start_position)) {
TRACE("Need special handling for zero-byte state");
TRACE("Need special handling for zero-byte state, position %ld", start_position);

// state has no 0-byte, we have to 'scramble' the 0-byte to avoid a ghost state
if (start_position >= NUMBER_OF_STATE_CODINGS) {
Expand All @@ -209,6 +209,10 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode
continue;
}

TRACE("Found zero byte label %d ,position %ld", zerobyte_scrambling_label, zerobyte_scrambling_state);

unpacked_state.SetZerobyteState(zerobyte_scrambling_state);
unpacked_state.SetZerobyteLabel(zerobyte_scrambling_label);
}
}

Expand Down Expand Up @@ -248,23 +252,13 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode

// check if something is already written there
if (!taken_positions_in_sparsearray_.IsSet(offset)) {

// no 0-byte, we have to 'scramble' the 0-byte to avoid a ghost state
unsigned char invalid_label = 0xff;
if (offset >= NUMBER_OF_STATE_CODINGS) {
OffsetTypeT next_free_slot = state_start_positions_.NextFreeSlot(offset - NUMBER_OF_STATE_CODINGS);

invalid_label = static_cast<unsigned char> (offset - next_free_slot);

TRACE ("Write bogus label %d, and block start state at %d (%d %d)", invalid_label, next_free_slot);

// block the position as a possible start state
//state_start_positions_.Set(next_free_slot);
zerobyte_scrambling_state_start_positions_.Set(next_free_slot);;
zerobyte_scrambling_state_start_positions_.Set(unpacked_state.GetZerobyteState());
}

// write the bogus label (it can get overridden later, which is ok)
WriteTransition(offset, invalid_label, 0);
// write the zerobyte label (it can get overridden later, which is ok)
WriteTransition(offset, unpacked_state.GetZerobyteLabel(), 0);
}
} else {
// first bit is a 0 byte, so check [1]
Expand Down
28 changes: 28 additions & 0 deletions keyvi/src/cpp/dictionary/fsa/internal/unpacked_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,17 @@ final {
hashcode_(other.hashcode_),
no_minimization_counter_(other.no_minimization_counter_),
weight_(other.weight_),
zerobyte_state_ (other.zerobyte_state_),
zerobyte_label_ (other.zerobyte_label_),
final_(other.final_)
{
other.persistence_ = 0;
other.used_ = 0;
other.hashcode_ = 0;
other.no_minimization_counter_ = 0;
other.weight_ = 0;
other.zerobyte_state_ = 0;
other.zerobyte_label_ = 0xff;
other.final_ = false;
}

Expand All @@ -86,13 +90,17 @@ final {
hashcode_ = other.hashcode_;
no_minimization_counter_ = other.no_minimization_counter_;
weight_ = other.weight_;
zerobyte_state_ = other.zerobyte_state_;
zerobyte_label_ = other.zerobyte_label_;
final_ = other.final_;

other.persistence_ = 0;
other.used_ = 0;
other.hashcode_ = 0;
other.no_minimization_counter_ = 0;
other.weight_ = 0;
other.zerobyte_state_ = 0;
other.zerobyte_label_ = 0xff;
other.final_ = false;

return *this;
Expand Down Expand Up @@ -128,6 +136,8 @@ final {
bitvector_.Clear();
no_minimization_counter_ = 0;
weight_ = 0;
zerobyte_state_ = 0;
zerobyte_label_ = 0xff;
final_ = false;
}

Expand All @@ -153,6 +163,22 @@ final {
return weight_;
}

// Stores the position selected for zero-byte scrambling of this state.
// SparseArrayBuilder::FindFreeBucket sets it after it has found a usable
// scrambling position; the builder later reads it back via
// GetZerobyteState() to block that position as a start state.
void SetZerobyteState(size_t position) {
zerobyte_state_ = position;
}

// Returns the position stored by SetZerobyteState(); 0 (the member's
// default) if none has been set.
size_t GetZerobyteState() const {
return zerobyte_state_;
}

// Stores the label the builder writes at this state's offset when the state
// has no real 0-byte transition (the 'scrambled' zero byte that avoids a
// ghost state). SparseArrayBuilder::FindFreeBucket sets it together with the
// zero-byte state position.
void SetZerobyteLabel(unsigned char label) {
zerobyte_label_ = label;
}

// Returns the label stored by SetZerobyteLabel(); 0xff (the member's
// default) if none has been set.
unsigned char GetZerobyteLabel() const {
return zerobyte_label_;
}

inline int64_t GetHashcode() {
if (hashcode_ == -1) {
int64_t b;
Expand Down Expand Up @@ -267,6 +293,8 @@ final {
int64_t hashcode_ = -1;
int no_minimization_counter_ = 0;
uint32_t weight_ = 0;
size_t zerobyte_state_ = 0;
unsigned char zerobyte_label_ = 0xff;
bool final_ = false;
};

Expand Down
50 changes: 30 additions & 20 deletions keyvi/src/cpp/dictionary/keyvi_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include <fstream>

#include "dictionary/fsa/internal/constants.h"
#include "dictionary/fsa/internal/serialization_utils.h"

namespace keyvi {
Expand All @@ -38,57 +39,66 @@ class KeyViFile {
using ptree=boost::property_tree::ptree;
public:
explicit KeyViFile(const std::string& filename)
: fileStream_(filename, std::ios::binary)
: file_stream_(filename, std::ios::binary)
{
using namespace ::boost;
using namespace fsa::internal;

if (!fileStream_.good()) {
if (!file_stream_.good()) {
throw std::invalid_argument("file not found");
}

char magic[8];
fileStream_.read(magic, sizeof(magic));
char magic[KEYVI_FILE_MAGIC_LEN];
file_stream_.read(magic, KEYVI_FILE_MAGIC_LEN);
// check magic
if (std::strncmp(magic, "KEYVIFSA", 8)){
if (std::strncmp(magic, KEYVI_FILE_MAGIC, KEYVI_FILE_MAGIC_LEN)){
throw std::invalid_argument("not a keyvi file");
}

automataProperties_ = SerializationUtils::ReadJsonRecord(fileStream_);
persistenceOffset_ = fileStream_.tellg();
automata_properties_ = SerializationUtils::ReadJsonRecord(file_stream_);
persistence_offset_ = file_stream_.tellg();

if (lexical_cast<int> (automata_properties_.get<std::string>("version")) < KEYVI_FILE_VERSION_MIN) {
throw std::invalid_argument("this version of keyvi file is unsupported");
}

const ptree sparse_array_properties = SerializationUtils::ReadJsonRecord(file_stream_);

if (lexical_cast<int> (sparse_array_properties.get<std::string>("version")) < KEYVI_FILE_PERSISTENCE_VERSION_MIN) {
throw std::invalid_argument("this versions of keyvi file is unsupported");
}

// check for file truncation
const ptree sparse_array_properties = SerializationUtils::ReadJsonRecord(fileStream_);
const bool compact_size = lexical_cast<uint32_t> (sparse_array_properties.get<std::string>("version")) == 2;
const size_t bucket_size = compact_size ? sizeof(uint16_t) : sizeof(uint32_t);
const size_t array_size = lexical_cast<size_t>(sparse_array_properties.get<std::string>("size"));

fileStream_.seekg((size_t)fileStream_.tellg() + array_size + bucket_size * array_size - 1);
if (fileStream_.peek() == EOF) {
// check for file truncation
file_stream_.seekg((size_t)file_stream_.tellg() + array_size + bucket_size * array_size - 1);
if (file_stream_.peek() == EOF) {
throw std::invalid_argument("file is corrupt(truncated)");
}

fileStream_.get();
valueStoreOffset_ = fileStream_.tellg();
file_stream_.get();
value_store_offset_ = file_stream_.tellg();
}

ptree automataProperties() const {
return automataProperties_;
return automata_properties_;
}

std::istream& persistenceStream() {
return fileStream_.seekg(persistenceOffset_);
return file_stream_.seekg(persistence_offset_);
}

std::istream& valueStoreStream() {
return fileStream_.seekg(valueStoreOffset_);
return file_stream_.seekg(value_store_offset_);
}

private:
std::ifstream fileStream_;
ptree automataProperties_;
std::streampos persistenceOffset_;
std::streampos valueStoreOffset_;
std::ifstream file_stream_;
ptree automata_properties_;
std::streampos persistence_offset_;
std::streampos value_store_offset_;
};

} /* namespace dictionary */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,15 @@ BOOST_AUTO_TEST_CASE( writeTransitionRelativeOverflowZerobyteGhostState ) {
u2.Add(65, 100);
u2.Add(66, 101);
u2.Add(233, 102);
for (int i = 1; i < 255 + 65; ++i) {
// mark transitions
if (i == 255) {
continue;
}
b.taken_positions_in_sparsearray_.Set(i);
}

b.FindFreeBucket(u2);
b.WriteState(0xff,u2);

// 0 + 255 -> 255 should not exist as it would mean u1 has a transition 255
Expand Down
4 changes: 2 additions & 2 deletions pykeyvi/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def run(self):

setup(
name=PACKAGE_NAME,
version='0.1.30',
version='0.1.31',
description='Python bindings for keyvi',
author='Hendrik Muhs',
author_email='hendrik.muhs@gmail.com',
Expand All @@ -206,7 +206,7 @@ def run(self):
ext_modules=ext_modules,
zip_safe=False,
url='https://github.com/cliqz/keyvi',
download_url='https://github.com/cliqz/keyvi/tarball/v0.1.30',
download_url='https://github.com/cliqz/keyvi/tarball/v0.1.31',
keywords=['FST'],
classifiers=[],
install_requires=install_requires,
Expand Down

0 comments on commit 66b56ce

Please sign in to comment.