From 11c33fdc64f6a16e081659b1b3f5eb9380905e6a Mon Sep 17 00:00:00 2001 From: Anders Johannsen Date: Tue, 29 Apr 2014 11:32:02 +0200 Subject: [PATCH] Introduced dictionary-based hashing --- rungsted/feat_map.pxd | 27 +++++++++ rungsted/feat_map.pyx | 105 +++++++++++++++++++++++++++++++++ rungsted/hashing.pxd | 4 -- rungsted/hashing.pyx | 71 ---------------------- rungsted/input.pxd | 1 - rungsted/input.pyx | 56 +++++++++++------- rungsted/runner.py | 25 +++++--- rungsted/setup.py | 30 +--------- rungsted/struct_perceptron.pyx | 23 ++++---- 9 files changed, 194 insertions(+), 148 deletions(-) create mode 100644 rungsted/feat_map.pxd create mode 100644 rungsted/feat_map.pyx delete mode 100644 rungsted/hashing.pxd delete mode 100644 rungsted/hashing.pyx diff --git a/rungsted/feat_map.pxd b/rungsted/feat_map.pxd new file mode 100644 index 0000000..95a374c --- /dev/null +++ b/rungsted/feat_map.pxd @@ -0,0 +1,27 @@ +from libc.stdint cimport uint32_t, int32_t + +cdef class FeatMap(object): + cdef: + int frozen + + cdef int32_t feat_i(self, char * feat) + cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label) + cpdef int32_t n_feats(self) + cpdef int freeze(self) + cpdef int unfreeze(self) + +cdef class HashingFeatMap(FeatMap): + cdef: + int b + uint32_t mask + + + +cdef class DictFeatMap(FeatMap): + cdef: + int n_labels + int next_i + object feat2index + + + diff --git a/rungsted/feat_map.pyx b/rungsted/feat_map.pyx new file mode 100644 index 0000000..a338874 --- /dev/null +++ b/rungsted/feat_map.pyx @@ -0,0 +1,105 @@ +#cython: boundscheck=False +#cython: nonecheck=False +#cython: wraparound=False + +from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t + +cdef extern from "string.h": + char * strncpy(char *, char *, size_t) nogil + int strlen(char *) nogil + void * memset(void *, int, size_t) nogil + +cdef extern from "MurmurHash3.h": + void MurmurHash3_x86_32 (void *, int, uint32_t, void *) nogil + + +cdef class FeatMap(object): + cdef int32_t feat_i(self, char * feat): + return -1 + cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label): + return -1 + cpdef int32_t n_feats(self): + return -1 + cpdef int freeze(self): + self.frozen = 1 + return self.frozen + cpdef int unfreeze(self): + self.frozen = 0 + return self.frozen + + +DEF MURMUR_SEED = 100 +# MAX_PADDED_LEN Should be a multiple of 4 +DEF MAX_PADDED_LEN = 4*512 + +cdef class HashingFeatMap(FeatMap): + def __init__(self, int b): + self.b = b + self.mask = ((1 << b) - 1) + + cdef int32_t feat_i(self, char * feat): + cdef: + uint32_t out = 0 + int pad_len = 0 + char padded_key[MAX_PADDED_LEN] + int padded_len + int key_len + + key_len = strlen(feat) + # Truncate key + if key_len > MAX_PADDED_LEN: + key_len = MAX_PADDED_LEN + + # Pad the string with the null byte making the length a multiple of 4. + # padded_len never exceeds MAX_PADDED_LEN, because the constant is a multiple of 4 + padded_len = key_len + (key_len % 4) + memset(padded_key, 0, padded_len) + + # Write the string on top of the padding + strncpy(padded_key, feat, key_len) + + MurmurHash3_x86_32(padded_key, padded_len, MURMUR_SEED, &out) + + return out & self.mask + + cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label): + cdef: + uint32_t out = 0 + uint64_t input + + # Combine the bits of the two 32-bit integers into a 64-bit int + input = label + input <<= 32 + input |= feat_i + + MurmurHash3_x86_32(&input, sizeof(uint64_t), MURMUR_SEED, &out) + + return out & self.mask + + cpdef int32_t n_feats(self): + return 2**self.b + + +cdef class DictFeatMap(FeatMap): + def __init__(self, int n_labels): + self.next_i = 0 + self.n_labels = n_labels + self.feat2index = {} + + cdef int32_t feat_i(self, char * feat): + cdef int32_t key + key = self.feat2index.get(feat, -1) + if key != -1 or self.frozen == 1: + return key + else: + key = self.next_i + self.feat2index[feat] = key + self.next_i += 1 + return key + + cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label): + # The weight weight has `n_labels` sections, each with `next_i` entries + return self.next_i * (label - 1) + feat_i + + cpdef int32_t n_feats(self): + return self.next_i * self.n_labels \ No newline at end of file diff --git a/rungsted/hashing.pxd b/rungsted/hashing.pxd deleted file mode 100644 index efbef23..0000000 --- a/rungsted/hashing.pxd +++ /dev/null @@ -1,4 +0,0 @@ -from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t - -cpdef uint32_t hash_feat(char* key, int b) -cpdef uint32_t hash_ints(int32_t int1, int32_t int2, int b) \ No newline at end of file diff --git a/rungsted/hashing.pyx b/rungsted/hashing.pyx deleted file mode 100644 index 2f77230..0000000 --- a/rungsted/hashing.pyx +++ /dev/null @@ -1,71 +0,0 @@ -#cython: boundscheck=False -#cython: nonecheck=False -#cython: wraparound=False - -from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t - -cdef extern from "string.h": - char * strncpy(char *, char *, size_t) - int strlen(char *) - void * memset(void *, int, size_t) - -cdef extern from "MurmurHash3.h": - void MurmurHash3_x86_32 (void *, int, uint32_t, void *) - - -DEF MURMUR_SEED = 100 -# MAX_PADDED_LEN Should be a multiple of 4 -DEF MAX_PADDED_LEN = 4*512 - - -cpdef inline uint32_t hash_feat(char* key, int b): - cdef uint32_t out = 0 - cdef int pad_len = 0 - cdef char padded_key[MAX_PADDED_LEN] - cdef int padded_len - cdef uint32_t mask = ((1 << b) - 1) - cdef int key_len - - key_len = len(key) - # Truncate key - if key_len > MAX_PADDED_LEN: - key_len = MAX_PADDED_LEN - - # Pad the string with the null byte making the length a multiple of 4. - # padded_len never exceeds MAX_PADDED_LEN, because the constant is a multiple of 4 - padded_len = key_len + (key_len % 4) - memset(padded_key, 0, padded_len) - - # Write the string on top of the padding - strncpy(padded_key, key, key_len) - - MurmurHash3_x86_32(padded_key, padded_len, MURMUR_SEED, &out) - - return out & mask - -cpdef inline uint32_t hash_ints(int32_t int1, int32_t int2, int b): - cdef uint32_t out = 0 - cdef uint32_t mask = ((1 << b) - 1) - cdef uint64_t input - - # Combine the bits of the two 32-bit integers into a 64-bit int - input = int1 - input <<= 32 - input |= int2 - - MurmurHash3_x86_32(&input, sizeof(uint64_t), MURMUR_SEED, &out) - - return out & mask - -# import numpy as np -# arr1 = np.random.random_integers(0, 2**18, size=n_pairs).astype(np.int32) -# arr2 = np.random.random_integers(0, 2**18, size=n_pairs).astype(np.int32) -# In [5]: %timeit perf_hash_ints(arr1, arr2, 1000, 1000) -# 100 loops, best of 3: 12.3 ms per loop -def perf_hash_ints(int[:] arr1, int[:] arr2, n_pairs, n_rounds): - cdef int i, j - - for i in range(n_rounds): - for j in range(n_pairs): - hash_ints(arr1[j], arr2[j], 18) - diff --git a/rungsted/input.pxd b/rungsted/input.pxd index a6718bc..3c801c4 100644 --- a/rungsted/input.pxd +++ b/rungsted/input.pxd @@ -3,7 +3,6 @@ cdef class Dataset(object): char *quadratic char *ignore int nnz - int hash_bits int n_labels diff --git a/rungsted/input.pyx b/rungsted/input.pyx index 19785b9..5f8de59 100644 --- a/rungsted/input.pyx +++ b/rungsted/input.pyx @@ -1,3 +1,7 @@ +#cython: boundscheck=False +#cython: nonecheck=False +#cython: wraparound=False + from libc.stdio cimport FILE, sscanf from libc.stdint cimport uint32_t, int64_t from libc.stdlib cimport malloc, free @@ -12,9 +16,8 @@ from cpython cimport array cimport cython import sys +from feat_map cimport FeatMap -import hashing -from hashing cimport hash_feat cnp.import_array() @@ -47,7 +50,6 @@ cdef extern from "ctype.h": DEF MAX_LEN = 2048 DEF MAX_FEAT_NAME_LEN = 1024 cdef char* DEFAULT_NS = "" -DEF HASH_BITS = 18 DEF BLOCK_SIZE = 50*1000 # DEF BLOCK_SIZE = 50 @@ -102,7 +104,7 @@ cdef inline int add_feature(Example e, int index, double val) except -1: return 0 cdef class Dataset(object): - def __init__(self, n_labels, hash_bits=18, quadratic=[], ignore=[]): + def __init__(self, n_labels, quadratic=[], ignore=[]): for combo in quadratic: if not isinstance(combo, str) or len(combo) != 2: raise StandardError("Invalid quadratic combination: {}".format(combo)) @@ -116,7 +118,6 @@ cdef class Dataset(object): self.ignore = ignore_str self.nnz = 0 self.n_labels = n_labels - self.hash_bits = hash_bits cdef class Example(object): @@ -228,9 +229,9 @@ cdef double separate_and_parse_val(char* string_with_value) except -1: cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char[] ns, int n_features, - char** feature_begin) except -1: + char** feature_begin, FeatMap feat_map) except -1: cdef: - int arg_i + int arg_i, feat_i = -1 char arg1, arg2 int arg1_begin, arg2_begin int arg1_i, arg2_i @@ -273,17 +274,18 @@ cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char ns[arg1_i], feature_begin[arg1_i], ns[arg2_i], feature_begin[arg2_i]) - - add_feature(e, - hash_feat(combined_name, HASH_BITS), - e.val[arg1_i] * e.val[arg2_i]) - n_combos += 1 + feat_i = feat_map.feat_i(combined_name) + if feat_i >= 0: + add_feature(e, + feat_i, + e.val[arg1_i] * e.val[arg2_i]) + n_combos += 1 return n_combos -cdef parse_features(char* feature_str, Example e, char* quadratic): +cdef parse_features(char* feature_str, Example e, char* quadratic, FeatMap feat_map): cdef: char ns[MAX_LEN] char * feature_begin[MAX_LEN] @@ -291,6 +293,8 @@ cdef parse_features(char* feature_str, Example e, char* quadratic): # Indexes of the beginning of the namespace int ns_begin[255] # Maximum value of char + int feat_i = -1 + int n_features = 0 double cur_ns_mult = 1.0 @@ -323,22 +327,25 @@ cdef parse_features(char* feature_str, Example e, char* quadratic): raise StandardError("Number of features on line exceeds maximum allowed (defined by MAX_LEN)") snprintf(ns_and_feature_name, MAX_FEAT_NAME_LEN, "%s^%s", cur_ns, feat_and_val) - add_feature(e, - hash_feat(ns_and_feature_name, HASH_BITS), - cur_ns_mult * separate_and_parse_val(feat_and_val)) + feat_i = feat_map.feat_i(ns_and_feature_name) - feature_begin[n_features] = feat_and_val - ns[n_features] = cur_ns_first - n_features += 1 + if feat_i >= 0: + add_feature(e, + feat_i, + cur_ns_mult * separate_and_parse_val(feat_and_val)) + + feature_begin[n_features] = feat_and_val + ns[n_features] = cur_ns_first + n_features += 1 feat_and_val = strsep(&feature_str, " ") - n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin) + n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin, feat_map) return n_features -def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]): +def read_vw_seq(filename, n_labels, FeatMap feat_map, quadratic=[], ignore=[]): cdef: char* fname FILE* cfile @@ -384,7 +391,12 @@ def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]): parse_header(header, e) free(header) - features_parsed = parse_features(bar_pos, e, dataset.quadratic) + features_parsed = parse_features(bar_pos, e, dataset.quadratic, feat_map) + + # Add constant feature + add_feature(e, feat_map.feat_i("^Constant"), 1) + features_parsed += 1 + e.length = features_parsed e.init_views() diff --git a/rungsted/runner.py b/rungsted/runner.py index 0e81775..7ef3572 100644 --- a/rungsted/runner.py +++ b/rungsted/runner.py @@ -4,6 +4,7 @@ import random import numpy as np import sys +from feat_map import HashingFeatMap, DictFeatMap from input import read_vw_seq from struct_perceptron import Weights, viterbi, update_weights @@ -13,7 +14,7 @@ parser = argparse.ArgumentParser(description="""Structured perceptron tagger""") parser.add_argument('--train', help="Training data (vw format)") parser.add_argument('--test', help="Test data (vw format)") -parser.add_argument('--hash-bits', '-b', help="Size of feature vector in bits (2**b)", type=int, default=20) +parser.add_argument('--hash-bits', '-b', help="Size of feature vector in bits (2**b)", type=int) parser.add_argument('--n-labels', '-k', help="Number of different labels", required=True, type=int) parser.add_argument('--passes', help="Number of passes over the training set", type=int, default=5) parser.add_argument('--predictions', '-p', help="File for outputting predictions") @@ -31,14 +32,20 @@ logging.info("Tagger started. \nCalled with {}".format(args)) n_labels = args.n_labels -train = read_vw_seq(args.train, args.n_labels, ignore=args.ignore) +if args.hash_bits: + feat_map = HashingFeatMap(args.hash_bits) +else: + feat_map = DictFeatMap(args.n_labels) + +train = read_vw_seq(args.train, args.n_labels, ignore=args.ignore, feat_map=feat_map) logging.info("Training data {} sentences".format(len(train))) -test = read_vw_seq(args.test, args.n_labels, ignore=args.ignore) +# Prevents the addition of new features when loading the test set +feat_map.freeze() +test = read_vw_seq(args.test, args.n_labels, ignore=args.ignore, feat_map=feat_map) logging.info("Test data {} sentences".format(len(test))) -n_feats = 2**args.hash_bits - -w = Weights(n_labels, n_feats, args.hash_bits) +w = Weights(n_labels, feat_map.n_feats()) +logging.info("Weight vector size {}".format(feat_map.n_feats())) n_updates = 0 @@ -54,11 +61,11 @@ # print flattened_labels, list(sent.cost) gold_seq = np.array(flattened_labels, dtype=np.int32) - pred_seq = np.array(viterbi(sent, n_labels, w), dtype=np.int32) + pred_seq = np.array(viterbi(sent, n_labels, w, feat_map), dtype=np.int32) assert len(gold_seq) == len(pred_seq) - update_weights(pred_seq, gold_seq, sent, w, n_updates, learning_rate, n_labels) + update_weights(pred_seq, gold_seq, sent, w, n_updates, learning_rate, n_labels, feat_map) n_updates += 1 @@ -76,7 +83,7 @@ w.average_weights(n_updates) for sent in test: - y_pred_sent = viterbi(sent, n_labels, w) + y_pred_sent = viterbi(sent, n_labels, w, feat_map) y_gold += [e.flat_label() for e in sent] y_pred += y_pred_sent diff --git a/rungsted/setup.py b/rungsted/setup.py index 55a3d76..478bff6 100644 --- a/rungsted/setup.py +++ b/rungsted/setup.py @@ -5,47 +5,21 @@ from distutils.command.build_clib import build_clib - -# ext_modules = [Extension('murmurhash', sources=['rungsted/MurmurHash3.cpp', -# 'rungsted/MurmurHash3.h'], language='c++')] -# ext_modules += cythonize(["rungsted/*.pyx"], -# language='c++', -# define_macros=[("NPY_NO_DEPRECATED_API", None)], -# ) - - - -# define_macros=[("NPY_NO_DEPRECATED_API", None)] - -# setup( -# cmdclass={'build_ext': build_ext}, -# ext_modules=[Extension("cython_test", sources=["cython_test.pyx", "c_test.cc"])] -# ) - # TODO Change the setup to be more like the one described here, which handles dependencies between modules better # https://github.com/cython/cython/wiki/enhancements-distutils_preprocessing extra_compile_args=['-Wno-deprecated', '-Wno-unused-function', '-Wno-#warnings', '-Wno-deprecated-writable-strings'] -# extra_compile_args=['-w -ffast-math -O3 -fopenmp'], -# extra_link_args=['-fopenmp'])] - setup( name='Structured perceptron', cmdclass={'build_ext': build_ext, 'build_clib': build_clib}, - # libraries=[('MurmurHash3', {'sources': ['rungsted/MurmurHash3.cpp'], - # 'language': 'c++', - # })], - - # ext_modules=cythonize("rungsted/*.pyx", sources=['rungsted/MurmurHash3.cpp']) ext_modules=[ Extension('struct_perceptron', sources=['struct_perceptron.pyx'], extra_compile_args=extra_compile_args, language='c++'), - Extension('input', sources=['input.pyx', - 'MurmurHash3.cpp'], + Extension('input', sources=['input.pyx'], extra_compile_args=extra_compile_args, language='c++'), - Extension('hashing', sources=['hashing.pyx', + Extension('feat_map', sources=['feat_map.pyx', 'MurmurHash3.cpp'], extra_compile_args=extra_compile_args, language='c++') ], diff --git a/rungsted/struct_perceptron.pyx b/rungsted/struct_perceptron.pyx index b67d32d..a065619 100644 --- a/rungsted/struct_perceptron.pyx +++ b/rungsted/struct_perceptron.pyx @@ -7,9 +7,9 @@ from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t import cython import numpy as np cimport numpy as cnp +from feat_map cimport HashingFeatMap, FeatMap from .input cimport Example, Dataset, DataBlock -from .hashing cimport hash_ints cdef extern from "math.h": float INFINITY @@ -25,9 +25,7 @@ cdef class Weights: cpdef public double [::1] e_acc cpdef public int [::1] e_last_update - cpdef public int hash_bits - - def __init__(self, n_labels, n_e_feats, hash_bits): + def __init__(self, n_labels, n_e_feats): self.t = np.zeros((n_labels+1, n_labels), dtype=np.float64) self.t_acc = np.zeros_like(self.t, dtype=np.float64) self.t_last_update = np.zeros_like(self.t, dtype=np.int32) @@ -36,8 +34,6 @@ cdef class Weights: self.e_acc = np.zeros_like(self.e, dtype=np.float64) self.e_last_update = np.zeros_like(self.e, dtype=np.int32) - self.hash_bits = hash_bits - def average_weights(self, n_updates): e = np.asarray(self.e) e_acc = np.asarray(self.e_acc) @@ -72,7 +68,8 @@ cdef class Weights: self.t[label_i, label_j] += val -def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n_updates, double alpha, int n_labels): +def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n_updates, double alpha, int n_labels, + FeatMap feat_map): cdef int word_i, i cdef Example cur cdef int pred_label, gold_label @@ -86,9 +83,9 @@ def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n # Update if prediction is not correct if gold_label != pred_label: for i in range(cur.index.shape[0]): - w.update_e(hash_ints(cur.index[i], gold_label, w.hash_bits), + w.update_e(feat_map.feat_i_for_label(cur.index[i], gold_label), cur.val[i] * alpha, n_updates) - w.update_e(hash_ints(cur.index[i], pred_label, w.hash_bits), + w.update_e(feat_map.feat_i_for_label(cur.index[i], pred_label), -cur.val[i] * alpha, n_updates) # Transition from from initial state @@ -105,14 +102,14 @@ def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n @cython.wraparound(True) -def viterbi(list sent, int n_labels, Weights w): +def viterbi(list sent, int n_labels, Weights w, FeatMap feat_map): """Returns best predicted sequence""" # Allocate trellis and back pointers path = np.zeros((len(sent), n_labels), dtype=np.int32)*-1 # trellis = sent.allowed_label_matrix(n_labels) trellis = np.zeros_like(path, dtype=np.float64) - viterbi_path(sent, n_labels, w, trellis, path) + viterbi_path(sent, n_labels, w, trellis, path, feat_map) best_seq = [trellis[-1].argmax()] for word_i in reversed(range(1, len(path))): @@ -121,7 +118,7 @@ def viterbi(list sent, int n_labels, Weights w): return [label + 1 for label in reversed(best_seq)] -cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int[:, ::1] path): +cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int[:, ::1] path, FeatMap feat_map): cdef: double min_score double score @@ -148,7 +145,7 @@ cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int # Emission score for i in range(cur.index.shape[0]): - feat_i = hash_ints(cur.index[i], cur_label_0 + 1, w.hash_bits) + feat_i = feat_map.feat_i_for_label(cur.index[i], cur_label_0 + 1) e_score += w.e[feat_i] * cur.val[i] # Previous label