
Move the latest predicted label onto the Example class

Commit 479a4eca3e21fd54fe62b163be41998183fd378c (parent c727c62), committed by andersjo on May 8, 2014
Showing with 80 additions and 66 deletions.
  1. +2 −1 rungsted/input.pxd
  2. +9 −7 rungsted/input.pyx
  3. +19 −43 rungsted/runner.py
  4. +50 −15 rungsted/struct_perceptron.pyx
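
In short: viterbi() now records its prediction directly on each Example (pred_label), parse_header() records the gold label (gold_label), and update_weights() reads both straight off the examples instead of taking parallel label arrays. A minimal sketch of the resulting training call pattern, following the runner.py hunk below:

    # Sketch only; the names (train, w, feat_map, n_labels, n_updates,
    # learning_rate) follow the diffs below.
    for sent in train:
        viterbi(sent, n_labels, w, feat_map)   # stores e.pred_label on every token
        update_weights(sent, w, n_updates, learning_rate, n_labels, feat_map)
        n_updates += 1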
rungsted/input.pxd (3 changed lines)
@@ -23,6 +23,7 @@ cdef class Example(object):
         public double[:] cost
         vector[Feature] features
         vector[int] constraints
+        public int pred_label
+        public int gold_label

-    cpdef int flat_label(self)
     cdef inline int add_feature(self, int, double)
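
Declaring the new fields public is what lets pure Python code (runner.py below) read them without accessor methods. A small illustration, assuming sent is a parsed sentence of Example objects:

    e = sent[0]
    print e.gold_label, e.pred_label   # both are -1 until parsed/decoded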
rungsted/input.pyx (16 changed lines)
@@ -60,6 +60,8 @@ cdef class Example(object):
     def __init__(self, Dataset dataset):
         self.dataset = dataset
         self.cost = array.clone(array.array("d"), dataset.n_labels, False)
+        self.pred_label = -1
+        self.gold_label = -1
         # Initialize cost array with 1.0
         cdef int i
         for i in range(dataset.n_labels):
@@ -68,12 +70,6 @@ cdef class Example(object):
     def __dealloc__(self):
         if self.id_: free(self.id_)

-    cpdef int flat_label(self):
-        cdef int i
-        for i in range(self.dataset.n_labels):
-            if self.cost[i] == 0:
-                return i + 1
-
     cdef inline int add_feature(self, int index, double val):
         cdef Feature feat = Feature(index, val)
         self.features.push_back(feat)
@@ -86,7 +82,7 @@ cdef class Example(object):

 cdef int parse_header(char* header, Example e) except -1:
     cdef:
-        int label
+        int label, label_0
         char* header_elem = strsep(&header, " ")
         double cost
@@ -129,6 +125,12 @@ cdef int parse_header(char* header, Example e) except -1:
         header_elem = strsep(&header, " ")

+    # Set the gold label if one exists
+    for label_0 in range(e.dataset.n_labels):
+        if e.cost[label_0] == 0.0:
+            e.gold_label = label_0 + 1
+            break
+
     return 0
cdef double separate_and_parse_val(char* string_with_value) except -1:
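
The gold label convention above: labels are 1-based, and the gold label is the first label whose cost is exactly 0.0; it stays at -1 when no zero-cost label exists. A pure-Python sketch of the same rule, with derive_gold_label as a hypothetical stand-in for the loop added to parse_header:

    def derive_gold_label(cost):
        # 1-based index of the first zero-cost label, or -1 if none.
        for i, c in enumerate(cost):
            if c == 0.0:
                return i + 1
        return -1

    assert derive_gold_label([1.0, 0.0, 1.0]) == 2
    assert derive_gold_label([1.0, 1.0]) == -1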
rungsted/runner.py (62 changed lines)
@@ -1,6 +1,7 @@
 # coding: utf-8
 import argparse
 import logging
+import os
 import random
 import cPickle
 import numpy as np
@@ -9,7 +10,7 @@
 from feat_map import HashingFeatMap, DictFeatMap
 from input import read_vw_seq

-from struct_perceptron import Weights, viterbi, update_weights
+from struct_perceptron import Weights, viterbi, update_weights, avg_loss, accuracy

 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
@@ -58,10 +59,11 @@
 if args.test:
     test = read_vw_seq(args.test, args.n_labels, ignore=args.ignore, quadratic=args.quadratic, feat_map=feat_map)
     logging.info("Test data {} sentences".format(len(test)))
-    logging.info("Weight vector size {}".format(feat_map.n_feats()))
+
 # Loading weights
 w = Weights(n_labels, feat_map.n_feats())
+logging.info("Weight vector size {}".format(feat_map.n_feats()))
 if args.initial_model:
     w.load(open(args.initial_model))
@@ -72,61 +74,35 @@
 # Training loop
 if args.train:
+    epoch_msg = ""
     for epoch in range(1, args.passes+1):
         learning_rate = 0.1 if epoch < args.decay_delay else epoch**args.decay_exp * 0.1
         if args.shuffle:
             random.shuffle(train)

         for sent in train:
-            flattened_labels = [e.flat_label() for e in sent]
-
-            gold_seq = np.array(flattened_labels, dtype=np.int32)
-            pred_seq = np.array(viterbi(sent, n_labels, w, feat_map), dtype=np.int32)
-
-            assert len(gold_seq) == len(pred_seq)
-
-            update_weights(pred_seq, gold_seq, sent, w, n_updates, learning_rate, n_labels, feat_map)
+            viterbi(sent, n_labels, w, feat_map)
+            update_weights(sent, w, n_updates, learning_rate, n_labels, feat_map)
             n_updates += 1
-
             if n_updates % 1000 == 0:
-                print >>sys.stderr, '\r{} k sentences total'.format(n_updates / 1000),
+                print >>sys.stderr, '\r{}\t{} k sentences total'.format(epoch_msg, n_updates / 1000),
+
+        epoch_msg = "[{}] train loss={:.4f} ".format(epoch, avg_loss(train))
+        print >>sys.stderr, "\r{}{}".format(epoch_msg, " "*72)
-        if args.average:
-            w.average_weights(n_updates)
+    if args.average:
+        w.average_weights(n_updates)
 # Testing
 if args.test:
-    y_gold = []
-    y_pred = []
-
-    out = None
-    if args.predictions:
-        out = open(args.predictions, 'w')
-
-    for sent in test:
-        y_pred_sent = viterbi(sent, n_labels, w, feat_map)
-        y_gold_sent = [e.flat_label() for e in sent]
-
-        if out:
-            for example, pred in zip(sent, y_pred_sent):
-                print >>out, "{}\t{}\t{}".format(example.id_, example.flat_label(), pred)
-
+    with open(args.predictions or os.devnull, 'w') as out:
+        for sent in test:
+            viterbi(sent, n_labels, w, feat_map)
+            for example in sent:
+                print >>out, "{}\t{}\t{}".format(example.id_, example.gold_label, example.pred_label)
             print >>out, ""
-        y_gold += y_gold_sent
-        y_pred += y_pred_sent
-
-    if out:
-        out.close()
-
-    assert len(y_gold) == len(y_pred)
-
-    correct = np.array(y_gold) == np.array(y_pred)
-
-    accuracy = correct.sum() / float(len(correct))
-
-    print >>sys.stderr, ''
-    logging.info("Accuracy: {:.3f}".format(accuracy))
+    logging.info("Accuracy: {:.3f}".format(accuracy(test)))
# Save model
if args.final_model:
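
One detail of the rewritten test block worth noting: opening os.devnull when --predictions is not given keeps a single write path, so the loop needs no None checks, and the with statement replaces the manual open/close pair. A self-contained sketch of the pattern (write_predictions is a hypothetical name):

    import os

    def write_predictions(path, sents):
        # Falls back to os.devnull so the loop below needs no None checks.
        with open(path or os.devnull, 'w') as out:
            for sent in sents:
                for example in sent:
                    print >>out, "{}\t{}\t{}".format(example.id_, example.gold_label, example.pred_label)
                print >>out, ""  # blank line separates sentences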
rungsted/struct_perceptron.pyx (65 changed lines)
@@ -79,18 +79,18 @@ cdef class Weights:
     def save(self, file):
         np.savez(file, e=self.e, t=self.t)

-def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n_updates, double alpha, int n_labels,
+def update_weights(list sent, Weights w, int n_updates, double alpha, int n_labels,
                    FeatMap feat_map):
     cdef int word_i, i
-    cdef Example cur
+    cdef Example cur, prev
     cdef int pred_label, gold_label
     cdef Feature feat

-    # Update emission features
-    for word_i in range(len(pred_seq)):
+    # Update emission features
+    for word_i in range(len(sent)):
         cur = sent[word_i]
-        pred_label = pred_seq[word_i]
-        gold_label = gold_seq[word_i]
+        pred_label = cur.pred_label
+        gold_label = cur.gold_label

         # Update if prediction is not correct
         if gold_label != pred_label:
@@ -106,19 +106,50 @@ def update_weights(int[:] pred_seq, int[:] gold_seq, list sent, Weights w, int n
         w.update_t(n_labels, pred_label - 1, -alpha, n_updates)

     # Transition features
-    for word_i in range(1, len(pred_seq)):
+    for word_i in range(1, len(sent)):
+        cur = sent[word_i]
+        prev = sent[word_i - 1]
         # If current or previous prediction is not correct
-        if gold_seq[word_i] != pred_seq[word_i] or gold_seq[word_i-1] != pred_seq[word_i-1]:
-            w.update_t(gold_seq[word_i] - 1, gold_seq[word_i-1] - 1, alpha, n_updates)
-            w.update_t(pred_seq[word_i] - 1, pred_seq[word_i-1] - 1, -alpha, n_updates)
+        if cur.gold_label != cur.pred_label or prev.gold_label != prev.pred_label:
+            w.update_t(cur.gold_label - 1, prev.gold_label - 1, alpha, n_updates)
+            w.update_t(cur.pred_label - 1, prev.pred_label - 1, -alpha, n_updates)
+
+
+cpdef double avg_loss(list sents):
+    cdef:
+        list sent
+        Example e
+        double total_cost = 0
+        int n = 0
+
+    for sent in sents:
+        for e in sent:
+            if e.pred_label > 0:
+                n += 1
+                total_cost += e.cost[e.pred_label - 1]
+    return total_cost / n
+
+
+cpdef double accuracy(list sents):
+    cdef:
+        list sent
+        Example e
+        int n = 0, correct = 0
+
+    for sent in sents:
+        for e in sent:
+            if e.pred_label > 0:
+                n += 1
+                if e.cost[e.pred_label - 1] == 0.0:
+                    correct += 1
+
+    return float(correct) / n
 @cython.wraparound(True)
 def viterbi(list sent, int n_labels, Weights w, FeatMap feat_map):
     """Returns best predicted sequence"""
     cdef Example e
     cdef Feature feat
-    cdef int word_0, label_0, label
+    cdef int word_0, label_0, label, i

     # Allocate back pointers
     cdef int[:, ::1] path = np.zeros((len(sent), n_labels), dtype=np.int32)*-1
@@ -137,10 +168,14 @@ def viterbi(list sent, int n_labels, Weights w, FeatMap feat_map):
     # Find best sequence from the trellis
     best_seq = [np.asarray(trellis)[-1].argmax()]
-    for word_i in reversed(range(1, len(path))):
-        best_seq.append(path[word_i, <int> best_seq[-1]])
-
-    return [label + 1 for label in reversed(best_seq)]
+    for i in reversed(range(1, len(path))):
+        best_seq.append(path[i, <int> best_seq[-1]])
+    best_seq = [label + 1 for label in reversed(best_seq)]
+
+    for e, pred_label in zip(sent, best_seq):
+        e.pred_label = pred_label
+
+    return best_seq

 cdef viterbi_path(list seq, int n_labels, Weights w, double[:, ::1] trellis, int[:, ::1] path, FeatMap feat_map):
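
Both new helpers treat the per-example cost vector as ground truth: avg_loss averages the cost of each predicted label, accuracy counts predictions whose cost is zero, and tokens that were never decoded (pred_label <= 0) are skipped by both. A hedged usage sketch, assuming test, w, feat_map and n_labels are set up as in runner.py:

    for sent in test:
        viterbi(sent, n_labels, w, feat_map)   # fills in e.pred_label
    print "loss={:.4f} acc={:.3f}".format(avg_loss(test), accuracy(test))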
