Skip to content

Commit

Permalink
Introduced dictionary-based hashing
Browse files Browse the repository at this point in the history
  • Loading branch information
andersjo committed Apr 29, 2014
1 parent deaad1b commit 11c33fd
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 148 deletions.
27 changes: 27 additions & 0 deletions rungsted/feat_map.pxd
@@ -0,0 +1,27 @@
from libc.stdint cimport uint32_t, int32_t

cdef class FeatMap(object):
    # Flag written by freeze() (1) and unfreeze() (0).
    cdef int frozen

    # Map a feature name to an index.
    cdef int32_t feat_i(self, char * feat)
    # Map a (feature index, label) pair to an index.
    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label)
    # Number of indices the map reports.
    cpdef int32_t n_feats(self)
    # Set / clear the `frozen` flag; both return the new flag value.
    cpdef int freeze(self)
    cpdef int unfreeze(self)

cdef class HashingFeatMap(FeatMap):
    # Number of hash bits passed to the constructor.
    cdef int b
    # Bit mask applied to every hash value before it is returned.
    cdef uint32_t mask



cdef class DictFeatMap(FeatMap):
    # Number of labels passed to the constructor.
    cdef int n_labels
    # Index that will be handed to the next previously unseen feature.
    cdef int next_i
    # Mapping from feature name to its assigned index.
    cdef object feat2index



105 changes: 105 additions & 0 deletions rungsted/feat_map.pyx
@@ -0,0 +1,105 @@
#cython: boundscheck=False
#cython: nonecheck=False
#cython: wraparound=False

from libc.stdint cimport uint32_t, int32_t, int64_t, uint64_t

cdef extern from "string.h":
char * strncpy(char *, char *, size_t) nogil
int strlen(char *) nogil
void * memset(void *, int, size_t) nogil

cdef extern from "MurmurHash3.h":
void MurmurHash3_x86_32 (void *, int, uint32_t, void *) nogil


cdef class FeatMap(object):
    """Base class for feature maps.

    The lookup methods of this base class do not map anything: each of them
    returns -1. Only the `frozen` flag handling is implemented here.
    """

    cdef int32_t feat_i(self, char * feat):
        # The base class knows no features.
        return -1

    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
        # The base class knows no (feature, label) combinations.
        return -1

    cpdef int32_t n_feats(self):
        # The base class has no index space.
        return -1

    cpdef int freeze(self):
        """Set the `frozen` flag to 1 and return the new flag value."""
        self.frozen = 1
        return 1

    cpdef int unfreeze(self):
        """Reset the `frozen` flag to 0 and return the new flag value."""
        self.frozen = 0
        return 0


DEF MURMUR_SEED = 100
# MAX_PADDED_LEN must be a multiple of 4: a (possibly truncated) key length
# rounded up to the next multiple of 4 then never exceeds the buffer size.
DEF MAX_PADDED_LEN = 4*512

cdef class HashingFeatMap(FeatMap):
    """Feature map that derives indices by hashing with MurmurHash3.

    Every index is the 32-bit hash value masked down to its `b` lowest bits,
    so all indices lie in the range ``[0, 2**b)``.
    """

    def __init__(self, int b):
        """Create a map with ``2**b`` hash buckets.

        Raises:
            ValueError: if `b` is outside ``0..30``.
        """
        # n_feats() returns 2**b as an int32_t, and shifting a 32-bit value
        # by 32 or more bits is undefined in C, so larger values of `b`
        # cannot be represented.
        if b < 0 or b > 30:
            raise ValueError("b must be between 0 and 30, got {}".format(b))
        self.b = b
        self.mask = ((<uint32_t>1 << b) - 1)

    cdef int32_t feat_i(self, char * feat):
        """Return the hashed index of the NUL-terminated feature name `feat`.

        Names longer than MAX_PADDED_LEN bytes are truncated before hashing.
        """
        cdef:
            uint32_t out = 0
            char padded_key[MAX_PADDED_LEN]
            int padded_len
            int key_len

        key_len = strlen(feat)
        # Truncate key
        if key_len > MAX_PADDED_LEN:
            key_len = MAX_PADDED_LEN

        # Round the length up to the next multiple of 4. Because
        # MAX_PADDED_LEN is itself a multiple of 4, padded_len can never
        # exceed the size of padded_key.
        # NOTE: the previous formula, key_len + (key_len % 4), did not yield
        # a multiple of 4 and could overrun the buffer by up to two bytes.
        # Keys whose length is 1 or 3 modulo 4 therefore hash differently
        # than they did before this fix.
        padded_len = ((key_len + 3) // 4) * 4
        memset(padded_key, 0, padded_len)

        # Write the string on top of the zero padding
        strncpy(padded_key, feat, key_len)

        MurmurHash3_x86_32(padded_key, padded_len, MURMUR_SEED, &out)

        return out & self.mask

    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
        """Return the hashed index for the pair (`feat_i`, `label`)."""
        cdef:
            uint32_t out = 0
            uint64_t input

        # Combine the two 32-bit integers into one 64-bit integer:
        # `label` occupies the high 32 bits, `feat_i` the low 32 bits.
        input = label
        input <<= 32
        input |= feat_i

        # The hash is computed over the in-memory bytes of `input`.
        MurmurHash3_x86_32(&input, sizeof(uint64_t), MURMUR_SEED, &out)

        return out & self.mask

    cpdef int32_t n_feats(self):
        """Return the number of hash buckets, ``2**b``."""
        return <int32_t>1 << self.b


cdef class DictFeatMap(FeatMap):
    """Feature map backed by a dictionary from feature name to index.

    Indices are handed out consecutively, starting at 0, in the order in
    which previously unseen feature names are looked up.
    """

    def __init__(self, int n_labels):
        self.feat2index = {}
        self.n_labels = n_labels
        self.next_i = 0

    cdef int32_t feat_i(self, char * feat):
        """Return the index of `feat`.

        An unknown feature is assigned the next free index, unless the map
        is frozen (`frozen == 1`), in which case -1 is returned.
        """
        cdef int32_t idx
        idx = self.feat2index.get(feat, -1)
        if idx == -1 and self.frozen != 1:
            # First sighting of this feature while the map may still grow:
            # register it under the next free index.
            idx = self.next_i
            self.feat2index[feat] = idx
            self.next_i += 1
        return idx

    cdef int32_t feat_i_for_label(self, uint32_t feat_i, uint32_t label):
        """Return the index for the pair (`feat_i`, `label`)."""
        # The weight vector has `n_labels` sections, each with `next_i` entries.
        # NOTE(review): `label - 1` suggests labels are 1-based; confirm with
        # the callers. The section size is the current `next_i`, so the result
        # changes whenever a new feature is added.
        return feat_i + self.next_i * (label - 1)

    cpdef int32_t n_feats(self):
        """Return `n_labels` times the number of features seen so far."""
        return self.n_labels * self.next_i
4 changes: 0 additions & 4 deletions rungsted/hashing.pxd

This file was deleted.

71 changes: 0 additions & 71 deletions rungsted/hashing.pyx

This file was deleted.

1 change: 0 additions & 1 deletion rungsted/input.pxd
Expand Up @@ -3,7 +3,6 @@ cdef class Dataset(object):
char *quadratic
char *ignore
int nnz
int hash_bits
int n_labels


Expand Down
56 changes: 34 additions & 22 deletions rungsted/input.pyx
@@ -1,3 +1,7 @@
#cython: boundscheck=False
#cython: nonecheck=False
#cython: wraparound=False

from libc.stdio cimport FILE, sscanf
from libc.stdint cimport uint32_t, int64_t
from libc.stdlib cimport malloc, free
Expand All @@ -12,9 +16,8 @@ from cpython cimport array

cimport cython
import sys
from feat_map cimport FeatMap

import hashing
from hashing cimport hash_feat

cnp.import_array()

Expand Down Expand Up @@ -47,7 +50,6 @@ cdef extern from "ctype.h":
DEF MAX_LEN = 2048
DEF MAX_FEAT_NAME_LEN = 1024
cdef char* DEFAULT_NS = ""
DEF HASH_BITS = 18
DEF BLOCK_SIZE = 50*1000
# DEF BLOCK_SIZE = 50

Expand Down Expand Up @@ -102,7 +104,7 @@ cdef inline int add_feature(Example e, int index, double val) except -1:
return 0

cdef class Dataset(object):
def __init__(self, n_labels, hash_bits=18, quadratic=[], ignore=[]):
def __init__(self, n_labels, quadratic=[], ignore=[]):
for combo in quadratic:
if not isinstance(combo, str) or len(combo) != 2:
raise StandardError("Invalid quadratic combination: {}".format(combo))
Expand All @@ -116,7 +118,6 @@ cdef class Dataset(object):
self.ignore = ignore_str
self.nnz = 0
self.n_labels = n_labels
self.hash_bits = hash_bits


cdef class Example(object):
Expand Down Expand Up @@ -228,9 +229,9 @@ cdef double separate_and_parse_val(char* string_with_value) except -1:


cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char[] ns, int n_features,
char** feature_begin) except -1:
char** feature_begin, FeatMap feat_map) except -1:
cdef:
int arg_i
int arg_i, feat_i = -1
char arg1, arg2
int arg1_begin, arg2_begin
int arg1_i, arg2_i
Expand Down Expand Up @@ -273,24 +274,27 @@ cdef int quadratic_combinations(char* quadratic, Example e, int[] ns_begin, char
ns[arg1_i], feature_begin[arg1_i],
ns[arg2_i], feature_begin[arg2_i])


add_feature(e,
hash_feat(combined_name, HASH_BITS),
e.val[arg1_i] * e.val[arg2_i])
n_combos += 1
feat_i = feat_map.feat_i(combined_name)
if feat_i >= 0:
add_feature(e,
feat_i,
e.val[arg1_i] * e.val[arg2_i])
n_combos += 1

return n_combos



cdef parse_features(char* feature_str, Example e, char* quadratic):
cdef parse_features(char* feature_str, Example e, char* quadratic, FeatMap feat_map):
cdef:
char ns[MAX_LEN]
char * feature_begin[MAX_LEN]

# Indexes of the beginning of the namespace
int ns_begin[255] # Maximum value of char

int feat_i = -1

int n_features = 0
double cur_ns_mult = 1.0

Expand Down Expand Up @@ -323,22 +327,25 @@ cdef parse_features(char* feature_str, Example e, char* quadratic):
raise StandardError("Number of features on line exceeds maximum allowed (defined by MAX_LEN)")

snprintf(ns_and_feature_name, MAX_FEAT_NAME_LEN, "%s^%s", cur_ns, feat_and_val)
add_feature(e,
hash_feat(ns_and_feature_name, HASH_BITS),
cur_ns_mult * separate_and_parse_val(feat_and_val))
feat_i = feat_map.feat_i(ns_and_feature_name)

feature_begin[n_features] = feat_and_val
ns[n_features] = cur_ns_first
n_features += 1
if feat_i >= 0:
add_feature(e,
feat_i,
cur_ns_mult * separate_and_parse_val(feat_and_val))

feature_begin[n_features] = feat_and_val
ns[n_features] = cur_ns_first
n_features += 1

feat_and_val = strsep(&feature_str, " ")

n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin)
n_features += quadratic_combinations(quadratic, e, ns_begin, ns, n_features, feature_begin, feat_map)

return n_features


def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]):
def read_vw_seq(filename, n_labels, FeatMap feat_map, quadratic=[], ignore=[]):
cdef:
char* fname
FILE* cfile
Expand Down Expand Up @@ -384,7 +391,12 @@ def read_vw_seq(filename, n_labels, quadratic=[], ignore=[]):
parse_header(header, e)
free(header)

features_parsed = parse_features(bar_pos, e, dataset.quadratic)
features_parsed = parse_features(bar_pos, e, dataset.quadratic, feat_map)

# Add constant feature
add_feature(e, feat_map.feat_i("^Constant"), 1)
features_parsed += 1

e.length = features_parsed
e.init_views()

Expand Down

0 comments on commit 11c33fd

Please sign in to comment.