Permalink
Browse files

Introduced dictionary-based hashing

  • Loading branch information...
1 parent deaad1b commit 11c33fdc64f6a16e081659b1b3f5eb9380905e6a @andersjo andersjo committed Apr 29, 2014
Showing with 194 additions and 148 deletions.
  1. +27 −0 rungsted/feat_map.pxd
  2. +105 −0 rungsted/feat_map.pyx
  3. +0 −4 rungsted/hashing.pxd
  4. +0 −71 rungsted/hashing.pyx
  5. +0 −1 rungsted/input.pxd
  6. +34 −22 rungsted/input.pyx
  7. +16 −9 rungsted/runner.py
  8. +2 −28 rungsted/setup.py
  9. +10 −13 rungsted/struct_perceptron.pyx
View
@@ -0,0 +1,27 @@
+from libc.stdint cimport uint32_t, int32_t
+
+# Declaration of the common feature-map interface: maps feature names, and
+# (feature index, label) pairs, to integer indices.
+cdef class FeatMap(object):
+ cdef:
+ # Flag set to 1 by freeze() and reset to 0 by unfreeze()
+ int frozen
+
+ # Index of the feature named `feat`
+ cdef int32_t feat_i(self, char * feat)
+ # Index of the combination of a feature index and a label
+ cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label)
+ # Size of the index space reported by the map
+ cpdef int32_t n_feats(self)
+ # Set the `frozen` flag and return its new value
+ cpdef int freeze(self)
+ # Clear the `frozen` flag and return its new value
+ cpdef int unfreeze(self)
+
+# Feature map declaration for the hashing-based implementation.
+cdef class HashingFeatMap(FeatMap):
+ cdef:
+ # Number of hash bits given to the constructor
+ int b
+ # Bit mask applied to hash values before they are returned as indices
+ uint32_t mask
+
+
+
+# Feature map declaration for the dictionary-based implementation.
+cdef class DictFeatMap(FeatMap):
+ cdef:
+ # Number of labels given to the constructor
+ int n_labels
+ # Index that will be handed out to the next unseen feature name
+ int next_i
+ # Python mapping from feature name to its assigned index
+ object feat2index
+
+
+
View
@@ -0,0 +1,105 @@
+#cython: boundscheck=False
+#cython: nonecheck=False
+#cython: wraparound=False
+
+from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t
+
+cdef extern from "string.h":
+ char * strncpy(char *, char *, size_t) nogil
+ int strlen(char *) nogil
+ void * memset(void *, int, size_t) nogil
+
+cdef extern from "MurmurHash3.h":
+ void MurmurHash3_x86_32 (void *, int, uint32_t, void *) nogil
+
+
+cdef class FeatMap(object):
+    # Base class for feature maps. The lookup methods are placeholders that
+    # always answer -1; subclasses supply the real mappings. Only the
+    # freeze/unfreeze flag handling is implemented here.
+
+    cdef int32_t feat_i(self, char * feat):
+        # Placeholder: no feature is known to the base class.
+        return -1
+
+    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
+        # Placeholder: no (feature, label) combination is known to the base class.
+        return -1
+
+    cpdef int32_t n_feats(self):
+        # Placeholder: the base class has no index space.
+        return -1
+
+    cpdef int freeze(self):
+        # Raise the `frozen` flag and report the new state.
+        self.frozen = 1
+        return 1
+
+    cpdef int unfreeze(self):
+        # Lower the `frozen` flag and report the new state.
+        self.frozen = 0
+        return 0
+
+
+DEF MURMUR_SEED = 100
+# MAX_PADDED_LEN Should be a multiple of 4
+DEF MAX_PADDED_LEN = 4*512
+
+cdef class HashingFeatMap(FeatMap):
+    # Feature map that hashes feature names (and feature/label pairs) with
+    # MurmurHash3 and keeps the lowest `b` bits of the hash as the index.
+
+    def __init__(self, int b):
+        # `b` is the number of hash bits; the index space has 2**b entries.
+        # Indices and n_feats() are returned as int32_t, so 2**b itself must fit
+        # in a signed 32-bit integer. Reject values that would overflow.
+        if b < 0 or b > 30:
+            raise ValueError("Number of hash bits must be between 0 and 30, got {}".format(b))
+        self.b = b
+        # Build the mask in 64 bits so the shift can never overflow.
+        self.mask = <uint32_t> (((<uint64_t> 1) << b) - 1)
+
+    cdef int32_t feat_i(self, char * feat):
+        # Return the masked hash of the feature name `feat`.
+        # Names longer than MAX_PADDED_LEN bytes are truncated before hashing.
+        cdef:
+            uint32_t out = 0
+            char padded_key[MAX_PADDED_LEN]
+            int padded_len
+            int key_len
+
+        key_len = strlen(feat)
+        # Truncate key
+        if key_len > MAX_PADDED_LEN:
+            key_len = MAX_PADDED_LEN
+
+        # Round the length up to the next multiple of 4 and zero-fill the buffer.
+        # padded_len never exceeds MAX_PADDED_LEN, because the constant is a
+        # multiple of 4. (The previous `key_len + key_len % 4` did not produce a
+        # multiple of 4 and could exceed the buffer for key_len == 2047.)
+        # NOTE: hash values change for names whose length is 1 or 3 modulo 4.
+        padded_len = (key_len + 3) & ~3
+        memset(padded_key, 0, padded_len)
+
+        # Write the string on top of the padding
+        strncpy(padded_key, feat, key_len)
+
+        MurmurHash3_x86_32(padded_key, padded_len, MURMUR_SEED, &out)
+
+        return out & self.mask
+
+    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
+        # Return the masked hash of the (feat_i, label) pair.
+        cdef:
+            uint32_t out = 0
+            uint64_t input
+
+        # Combine the bits of the two 32-bit integers into a 64-bit int:
+        # `label` in the upper half, `feat_i` in the lower half.
+        input = label
+        input <<= 32
+        input |= feat_i
+
+        # The raw bytes of the 64-bit value are hashed.
+        # NOTE(review): the result presumably depends on platform byte order.
+        MurmurHash3_x86_32(&input, sizeof(uint64_t), MURMUR_SEED, &out)
+
+        return out & self.mask
+
+    cpdef int32_t n_feats(self):
+        # Size of the index space, 2**b. The constructor guarantees b <= 30, so
+        # the shift stays within int32_t.
+        return (<int32_t> 1) << self.b
+
+
+cdef class DictFeatMap(FeatMap):
+    # Feature map backed by a Python dict: every new feature name receives the
+    # next consecutive index, unless the map is frozen.
+
+    def __init__(self, int n_labels):
+        self.feat2index = {}
+        self.n_labels = n_labels
+        self.next_i = 0
+
+    cdef int32_t feat_i(self, char * feat):
+        # Look up the index of `feat`. An unseen name is assigned a fresh index
+        # while the map is not frozen; on a frozen map the answer is -1.
+        cdef int32_t idx
+
+        idx = self.feat2index.get(feat, -1)
+        # Either the name is already known, or the map is frozen and the
+        # "missing" marker (-1) is passed on to the caller.
+        if idx != -1 or self.frozen == 1:
+            return idx
+
+        # Unseen name on an unfrozen map: hand out the next free index.
+        idx = self.next_i
+        self.feat2index[feat] = idx
+        self.next_i += 1
+        return idx
+
+    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
+        # The weight vector has `n_labels` sections, each with `next_i` entries.
+        # NOTE(review): the `label - 1` offset presumably assumes labels start
+        # at 1 - confirm against the callers.
+        return self.next_i * (label - 1) + feat_i
+
+    cpdef int32_t n_feats(self):
+        # One section of `next_i` entries per label.
+        return self.next_i * self.n_labels
View
@@ -1,4 +0,0 @@
-from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t
-
-cpdef uint32_t hash_feat(char* key, int b)
-cpdef uint32_t hash_ints(int32_t int1, int32_t int2, int b)
View
@@ -1,71 +0,0 @@
-#cython: boundscheck=False
-#cython: nonecheck=False
-#cython: wraparound=False
-
-from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t
-
-cdef extern from "string.h":
- char * strncpy(char *, char *, size_t)
- int strlen(char *)
- void * memset(void *, int, size_t)
-
-cdef extern from "MurmurHash3.h":
- void MurmurHash3_x86_32 (void *, int, uint32_t, void *)
-
-
-DEF MURMUR_SEED = 100
-# MAX_PADDED_LEN Should be a multiple of 4
-DEF MAX_PADDED_LEN = 4*512
-
-
-cpdef inline uint32_t hash_feat(char* key, int b):
- cdef uint32_t out = 0
- cdef int pad_len = 0
- cdef char padded_key[MAX_PADDED_LEN]
- cdef int padded_len
- cdef uint32_t mask = ((1 << b) - 1)
- cdef int key_len
-
- key_len = len(key)
- # Truncate key
- if key_len > MAX_PADDED_LEN:
- key_len = MAX_PADDED_LEN
-
- # Pad the string with the null byte making the length a multiple of 4.
- # padded_len never exceeds MAX_PADDED_LEN, because the constant is a multiple of 4
- padded_len = key_len + (key_len % 4)
- memset(padded_key, 0, padded_len)
-
- # Write the string on top of the padding
- strncpy(padded_key, key, key_len)
-
- MurmurHash3_x86_32(padded_key, padded_len, MURMUR_SEED, &out)
-
- return out & mask
-
-cpdef inline uint32_t hash_ints(int32_t int1, int32_t int2, int b):
- cdef uint32_t out = 0
- cdef uint32_t mask = ((1 << b) - 1)
- cdef uint64_t input
-
- # Combine the bits of the two 32-bit integers into a 64-bit int
- input = int1
- input <<= 32
- input |= int2
-
- MurmurHash3_x86_32(&input, sizeof(uint64_t), MURMUR_SEED, &out)
-
- return out & mask
-
-# import numpy as np
-# arr1 = np.random.random_integers(0, 2**18, size=n_pairs).astype(np.int32)
-# arr2 = np.random.random_integers(0, 2**18, size=n_pairs).astype(np.int32)
-# In [5]: %timeit perf_hash_ints(arr1, arr2, 1000, 1000)
-# 100 loops, best of 3: 12.3 ms per loop
-def perf_hash_ints(int[:] arr1, int[:] arr2, n_pairs, n_rounds):
- cdef int i, j
-
- for i in range(n_rounds):
- for j in range(n_pairs):
- hash_ints(arr1[j], arr2[j], 18)
-
View
@@ -3,7 +3,6 @@ cdef class Dataset(object):
char *quadratic
char *ignore
int nnz
- int hash_bits
int n_labels
View
@@ -1,3 +1,7 @@
+#cython: boundscheck=False
+#cython: nonecheck=False
+#cython: wraparound=False
+
from libc.stdio cimport FILE, sscanf
from libc.stdint cimport uint32_t, int64_t
from libc.stdlib cimport malloc, free
@@ -12,9 +16,8 @@ from cpython cimport array
cimport cython
import sys
+from feat_map cimport FeatMap
-import hashing
-from hashing cimport hash_feat
cnp.import_array()
@@ -47,7 +50,6 @@ cdef extern from "ctype.h":
DEF MAX_LEN = 2048
DEF MAX_FEAT_NAME_LEN = 1024
cdef char* DEFAULT_NS = ""
-DEF HASH_BITS = 18
DEF BLOCK_SIZE = 50*1000
# DEF BLOCK_SIZE = 50
@@ -102,7 +104,7 @@ cdef inline int add_feature(Example e, int index, double val) except -1:
return 0
cdef class Dataset(object):
- def __init__(self, n_labels, hash_bits=18, quadratic=[], ignore=[]):
+ def __init__(self, n_labels, quadratic=[], ignore=[]):
for combo in quadratic:
if not isinstance(combo, str) or len(combo) != 2:
raise StandardError("Invalid quadratic combination: {}".format(combo))
@@ -116,7 +118,6 @@ cdef class Dataset(object):
self.ignore = ignore_str
self.nnz = 0
self.n_labels = n_labels
- self.hash_bits = hash_bits
cdef class Example(object):
@@ -228,9 +229,9 @@ cdef double separate_and_parse_val(char* string_with_value) except -1:
cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char[] ns, int n_features,
- char** feature_begin) except -1:
+ char** feature_begin, FeatMap feat_map) except -1:
cdef:
- int arg_i
+ int arg_i, feat_i = -1
char arg1, arg2
int arg1_begin, arg2_begin
int arg1_i, arg2_i
@@ -273,24 +274,27 @@ cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char
ns[arg1_i], feature_begin[arg1_i],
ns[arg2_i], feature_begin[arg2_i])
-
- add_feature(e,
- hash_feat(combined_name, HASH_BITS),
- e.val[arg1_i] * e.val[arg2_i])
- n_combos += 1
+ feat_i = feat_map.feat_i(combined_name)
+ if feat_i >= 0:
+ add_feature(e,
+ feat_i,
+ e.val[arg1_i] * e.val[arg2_i])
+ n_combos += 1
return n_combos
-cdef parse_features(char* feature_str, Example e, char* quadratic):
+cdef parse_features(char* feature_str, Example e, char* quadratic, FeatMap feat_map):
cdef:
char ns[MAX_LEN]
char * feature_begin[MAX_LEN]
# Indexes of the beginning of the namespace
int ns_begin[255] # Maximum value of char
+ int feat_i = -1
+
int n_features = 0
double cur_ns_mult = 1.0
@@ -323,22 +327,25 @@ cdef parse_features(char* feature_str, Example e, char* quadratic):
raise StandardError("Number of features on line exceeds maximum allowed (defined by MAX_LEN)")
snprintf(ns_and_feature_name, MAX_FEAT_NAME_LEN, "%s^%s", cur_ns, feat_and_val)
- add_feature(e,
- hash_feat(ns_and_feature_name, HASH_BITS),
- cur_ns_mult * separate_and_parse_val(feat_and_val))
+ feat_i = feat_map.feat_i(ns_and_feature_name)
- feature_begin[n_features] = feat_and_val
- ns[n_features] = cur_ns_first
- n_features += 1
+ if feat_i >= 0:
+ add_feature(e,
+ feat_i,
+ cur_ns_mult * separate_and_parse_val(feat_and_val))
+
+ feature_begin[n_features] = feat_and_val
+ ns[n_features] = cur_ns_first
+ n_features += 1
feat_and_val = strsep(&feature_str, " ")
- n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin)
+ n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin, feat_map)
return n_features
-def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]):
+def read_vw_seq(filename, n_labels, FeatMap feat_map, quadratic=[], ignore=[]):
cdef:
char* fname
FILE* cfile
@@ -384,7 +391,12 @@ def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]):
parse_header(header, e)
free(header)
- features_parsed = parse_features(bar_pos, e, dataset.quadratic)
+ features_parsed = parse_features(bar_pos, e, dataset.quadratic, feat_map)
+
+ # Add constant feature
+ add_feature(e, feat_map.feat_i("^Constant"), 1)
+ features_parsed += 1
+
e.length = features_parsed
e.init_views()
Oops, something went wrong.

0 comments on commit 11c33fd

Please sign in to comment.