Permalink
Browse files

Save id to instance

  • Loading branch information...
1 parent c0a1e30 commit 0694849761eacb61ef5c4f7adff524edeb518ea6 @andersjo andersjo committed Apr 28, 2014
Showing with 36 additions and 31 deletions.
  1. +1 −1 rungsted/input.pxd
  2. +5 −3 rungsted/input.pyx
  3. +1 −3 rungsted/runner.py
  4. +8 −6 rungsted/setup.py
  5. +21 −18 rungsted/struct_perceptron.pyx
View
@@ -24,7 +24,7 @@ cdef class DataBlock(object):
cdef class Example(object):
cdef:
- char *id
+ char * id_
DataBlock block
public int[:] index
public double[:] val
View
@@ -13,6 +13,7 @@ from cpython cimport array
cimport cython
import sys
+import hashing
from hashing cimport hash_feat
cnp.import_array()
@@ -29,6 +30,7 @@ cdef extern from "string.h":
char * strchr ( char *, int )
char * strtok(char *, char *)
char * strsep(char **, char *)
+ char * strdup(const char *)
char * strcpy(char *, char *)
char * strncpy(char *, char *, size_t)
int strlen(char *)
@@ -148,11 +150,11 @@ cdef class Example(object):
def __dealloc__(self):
- PyMem_Free(self.id)
+ if self.id_: free(self.id_)
def __repr__(self):
return "<Example id={} with " \
- "{} features.>".format(self.id, self.length)
+ "{} features.>".format(self.id_, self.length)
cdef int parse_header(char* header, Example e) except -1:
@@ -169,7 +171,7 @@ cdef int parse_header(char* header, Example e) except -1:
pass
#constraints.append(label)
elif first_char == '\'':
- tag = &header_elem[1]
+ e.id_ = strdup(&header_elem[1])
elif isdigit(first_char):
# Tokens starting with a digit can be either
# - a label with optional cost, e.g. 3 and 3:0.4
View
@@ -10,9 +10,7 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
-parser = argparse.ArgumentParser(description="""Discriminatory tagger using structured perceptron.
-Supports cost-sensitive learning where it is allowed to have multiple gold labels per token.
-""")
+parser = argparse.ArgumentParser(description="""Structured perceptron tagger""")
parser.add_argument('--train', help="Training data (vw format)")
parser.add_argument('--test', help="Test data (vw format)")
parser.add_argument('--hash-bits', '-b', help="Size of feature vector in bits (2**b)", type=int, default=20)
View
@@ -28,6 +28,8 @@
extra_compile_args=['-Wno-deprecated', '-Wno-unused-function', '-Wno-#warnings', '-Wno-deprecated-writable-strings']
+# extra_compile_args=['-w -ffast-math -O3 -fopenmp'],
+# extra_link_args=['-fopenmp'])]
setup(
name='Structured perceptron',
@@ -38,13 +40,13 @@
# ext_modules=cythonize("rungsted/*.pyx", sources=['rungsted/MurmurHash3.cpp'])
ext_modules=[
- Extension('rungsted.struct_perceptron', sources=['rungsted/struct_perceptron.pyx'],
- extra_compile_args=extra_compile_args, language='c++',),
- Extension('rungsted.input', sources=['rungsted/input.pyx',
- 'rungsted/MurmurHash3.cpp'],
+ Extension('struct_perceptron', sources=['struct_perceptron.pyx'],
extra_compile_args=extra_compile_args, language='c++'),
- Extension('rungsted.hashing', sources=['rungsted/hashing.pyx',
- 'rungsted/MurmurHash3.cpp'],
+ Extension('input', sources=['input.pyx',
+ 'MurmurHash3.cpp'],
+ extra_compile_args=extra_compile_args, language='c++'),
+ Extension('hashing', sources=['hashing.pyx',
+ 'MurmurHash3.cpp'],
extra_compile_args=extra_compile_args, language='c++')
],
include_dirs = [np.get_include()]
@@ -8,8 +8,8 @@ import cython
import numpy as np
cimport numpy as cnp
-from input cimport Example, Dataset, DataBlock
-from hashing cimport hash_ints
+from .input cimport Example, Dataset, DataBlock
+from .hashing cimport hash_ints
cdef extern from "math.h":
float INFINITY
@@ -73,8 +73,9 @@ cdef class Weights:
def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n_updates, double alpha, int n_labels):
- cdef int word_i
+ cdef int word_i, i
cdef Example cur
+ cdef int pred_label, gold_label
# Update emission features
for word_i in range(len(pred_seq)):
@@ -92,15 +93,15 @@ def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n
# Transition from initial state
if word_i == 0:
- w.update_t(n_labels, gold_label, alpha, n_updates)
- w.update_t(n_labels, pred_label, -alpha, n_updates)
+ w.update_t(n_labels, gold_label - 1, alpha, n_updates)
+ w.update_t(n_labels, pred_label - 1, -alpha, n_updates)
# Transition features
for word_i in range(1, len(pred_seq)):
# If current or previous prediction is not correct
if gold_seq[word_i] != pred_seq[word_i] or gold_seq[word_i-1] != pred_seq[word_i-1]:
- w.update_t(gold_seq[word_i], gold_seq[word_i-1], alpha, n_updates)
- w.update_t(pred_seq[word_i], pred_seq[word_i-1], -alpha, n_updates)
+ w.update_t(gold_seq[word_i] - 1, gold_seq[word_i-1] - 1, alpha, n_updates)
+ w.update_t(pred_seq[word_i] - 1, pred_seq[word_i-1] - 1, -alpha, n_updates)
@cython.wraparound(True)
@@ -117,7 +118,7 @@ def viterbi(list sent, int n_labels, Weights w):
for word_i in reversed(range(1, len(path))):
best_seq.append(path[word_i, best_seq[-1]])
- return list(reversed(best_seq))
+ return [label + 1 for label in reversed(best_seq)]
cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int[:, ::1] path):
@@ -127,6 +128,8 @@ cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int
int min_prev
double e_score
+ # Zero-based labels
+ int cur_label_0, prev_label_0
int feat_i, i, j
int word_i = 0
double feat_val
@@ -135,8 +138,8 @@ cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int
for word_i in range(len(seq)):
cur = seq[word_i]
# Current label
- for label_i in range(n_labels):
- if trellis[word_i, label_i] == -INFINITY:
+ for cur_label_0 in range(n_labels):
+ if trellis[word_i, cur_label_0] == -INFINITY:
continue
min_score = -1E9
@@ -145,20 +148,20 @@ cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int
# Emission score
for i in range(cur.index.shape[0]):
- feat_i = hash_ints(cur.index[i], label_i, w.hash_bits)
+ feat_i = hash_ints(cur.index[i], cur_label_0 + 1, w.hash_bits)
e_score += w.e[feat_i] * cur.val[i]
# Previous label
if word_i == 0:
- trellis[word_i, label_i] = e_score + w.t[n_labels, label_i]
- path[word_i, label_i] = label_i
+ trellis[word_i, cur_label_0] = e_score + w.t[n_labels, cur_label_0]
+ path[word_i, cur_label_0] = cur_label_0
else:
- for label_j in range(n_labels):
- score = e_score + w.t[label_i, label_j] + trellis[word_i-1, label_j]
+ for prev_label_0 in range(n_labels):
+ score = e_score + w.t[cur_label_0, prev_label_0] + trellis[word_i-1, prev_label_0]
if score >= min_score:
min_score = score
- min_prev = label_j
- trellis[word_i, label_i] = min_score
- path[word_i, label_i] = min_prev
+ min_prev = prev_label_0
+ trellis[word_i, cur_label_0] = min_score
+ path[word_i, cur_label_0] = min_prev

0 comments on commit 0694849

Please sign in to comment.