Implemented instance weighting. Currently, only L-BFGS can support instance weighting. Debugging later.
chokkan · May 2, 2012 · a6f144b · a6f144b
1 parent d284755
commit a6f144b
Show file tree

Hide file tree

Showing 10 changed files with 33 additions and 10 deletions.
diff --git a/frontend/reader.c b/frontend/reader.c
@@ -34,6 +34,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include <crfsuite.h>
 #include "iwa.h"
@@ -106,7 +107,13 @@ int read_data(FILE *fpi, FILE *fpo, crfsuite_data_t* data, int group)
             break;
         case IWA_ITEM:
             if (lid == -1) {
-                lid = labels->get(labels, token->attr);
+                if (strncmp(token->attr, "@weight:", 8) == 0) {
+                    /* Instance weighting. */
+                    inst.weight = atof(token->attr+8);
+                } else {
+                    /* Label. */
+                    lid = labels->get(labels, token->attr);
+                }
             } else {
                 crfsuite_attribute_init(&cont);
                 cont.aid = attrs->get(attrs, token->attr);
@@ -124,6 +131,7 @@ int read_data(FILE *fpi, FILE *fpo, crfsuite_data_t* data, int group)
             crfsuite_data_append(data, &inst);
             crfsuite_instance_finish(&inst);
             inst.group = group;
+            inst.weight = 1.;
             ++n;
             break;
         }

diff --git a/include/crfsuite.h b/include/crfsuite.h
@@ -54,7 +54,7 @@ extern "C" {
  */
 
 /** Version number of CRFSuite library. */
-#define CRFSUITE_VERSION    "0.12"
+#define CRFSUITE_VERSION    "0.12.1"
 
 /** Copyright string of CRFSuite library. */
 #define CRFSUITE_COPYRIGHT  "Copyright (c) 2007-2011 Naoaki Okazaki"
@@ -160,6 +160,8 @@ typedef struct {
     crfsuite_item_t  *items;
     /** Array of the label sequence. */
     int         *labels;
+    /** Instance weight. */
+    floatval_t  weight;
     /** Group ID of the instance. */
 	int         group;
 } crfsuite_instance_t;

diff --git a/lib/crf/src/crf1d_encode.c b/lib/crf/src/crf1d_encode.c
@@ -834,7 +834,7 @@ static int encoder_objective_and_gradients_batch(encoder_t *self, dataset_t *ds,
         logl += logp;
 
         /* Update the model expectations of features. */
-        crf1de_model_expectation(crf1de, seq, g, 1.);
+        crf1de_model_expectation(crf1de, seq, g, seq->weight);
     }
 
     *f = -logl;

diff --git a/lib/crf/src/crf1d_feature.c b/lib/crf/src/crf1d_feature.c
@@ -199,7 +199,7 @@ crf1df_feature_t* crf1df_generate(
                 f.type = FT_TRANS;
                 f.src = prev;
                 f.dst = cur;
-                f.freq = 1;
+                f.freq = seq->weight;
                 featureset_add(set, &f);
             }
 
@@ -208,7 +208,7 @@ crf1df_feature_t* crf1df_generate(
                 f.type = FT_STATE;
                 f.src = item->contents[c].aid;
                 f.dst = cur;
-                f.freq = item->contents[c].value;
+                f.freq = seq->weight * item->contents[c].value;
                 featureset_add(set, &f);
 
                 /* Generate state features connecting attributes with all

diff --git a/lib/crf/src/crfsuite.c b/lib/crf/src/crfsuite.c
@@ -155,6 +155,7 @@ int  crfsuite_item_empty(crfsuite_item_t* item)
 void crfsuite_instance_init(crfsuite_instance_t* inst)
 {
     memset(inst, 0, sizeof(*inst));
+    inst->weight = 1.;
 }
 
 void crfsuite_instance_init_n(crfsuite_instance_t* inst, int num_items)
@@ -186,6 +187,7 @@ void crfsuite_instance_copy(crfsuite_instance_t* dst, const crfsuite_instance_t*
     dst->cap_items = src->cap_items;
     dst->items = (crfsuite_item_t*)calloc(dst->num_items, sizeof(crfsuite_item_t));
     dst->labels = (int*)calloc(dst->num_items, sizeof(int));
+    dst->weight = src->weight;
     dst->group = src->group;
     for (i = 0;i < dst->num_items;++i) {
         crfsuite_item_copy(&dst->items[i], &src->items[i]);
@@ -200,11 +202,13 @@ void crfsuite_instance_swap(crfsuite_instance_t* x, crfsuite_instance_t* y)
     x->cap_items = y->cap_items;
     x->items = y->items;
     x->labels = y->labels;
+    x->weight = y->weight;
     x->group = y->group;
     y->num_items = tmp.num_items;
     y->cap_items = tmp.cap_items;
     y->items = tmp.items;
     y->labels = tmp.labels;
+    y->weight = tmp.weight;
     y->group = tmp.group;
 }
 

diff --git a/lib/crf/src/train_arow.c b/lib/crf/src/train_arow.c
@@ -308,8 +308,8 @@ int crfsuite_train_arow(
             d = diff(inst->labels, viterbi, inst->num_items);
             if (0 < d) {
                 floatval_t alpha, frac;
-                floatval_t sc, norm2;
-                floatval_t tau, cost;
+                floatval_t sc;
+                floatval_t cost;
 
                 /*
                     Compute the cost of this instance.

diff --git a/lib/crf/src/train_l2sgd.c b/lib/crf/src/train_l2sgd.c
@@ -289,7 +289,7 @@ l2sgd_calibration(
     const training_option_t* opt
     )
 {
-    int i, s;
+    int i;
     int dec = 0, ok, trials = 1;
     int num = opt->calibration_candidates;
     clock_t clk_begin = clock();

diff --git a/win32/liblbfgs/lbfgs.h b/win32/liblbfgs/lbfgs.h
@@ -573,7 +573,7 @@ Among the various ports of L-BFGS, this library provides several features:
   The library is thread-safe, which is the secondary gain from the callback
   interface.
 - <b>Cross platform.</b> The source code can be compiled on Microsoft Visual
-  Studio 2005, GNU C Compiler (gcc), etc.
+  Studio 2010, GNU C Compiler (gcc), etc.
 - <b>Configurable precision</b>: A user can choose single-precision (float)
   or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro.
 - <b>SSE/SSE2 optimization</b>:
@@ -592,12 +592,20 @@ This library is used by:
 
 @section download Download
 
-- <a href="http://www.chokkan.org/software/dist/liblbfgs-1.9.tar.gz">Source code</a>
+- <a href="https://github.com/downloads/chokkan/liblbfgs/liblbfgs-1.10.tar.gz">Source code</a>
+- <a href="https://github.com/chokkan/liblbfgs">GitHub repository</a>
 
 libLBFGS is distributed under the terms of the
 <a href="http://opensource.org/licenses/mit-license.php">MIT license</a>.
 
 @section changelog History
+- Version 1.10 (2010-12-22):
+    - Fixed compiling errors on Mac OS X; this patch was kindly submitted by
+      Nic Schraudolph.
+    - Reduced compiling warnings on Mac OS X; this patch was kindly submitted
+      by Tamas Nepusz.
+    - Replaced memalign() with posix_memalign().
+    - Updated solution and project files for Microsoft Visual Studio 2010.
 - Version 1.9 (2010-01-29):
     - Fixed a mistake in checking the validity of the parameters "ftol" and
       "wolfe"; this was discovered by Kevin S. Van Horn.
@@ -718,6 +726,7 @@ Special thanks go to:
     - Yoshimasa Tsuruoka and Daisuke Okanohara for technical information about
       OWL-QN
     - Takashi Imamichi for the useful enhancements of the backtracking method
+    - Kevin S. Van Horn, Nic Schraudolph, and Tamas Nepusz for bug fixes
 
 Finally I would like to thank the original author, Jorge Nocedal, who has been
 distributing the efficient and explanatory implementation in an open source

diff --git a/win32/liblbfgs/lbfgs.lib b/win32/liblbfgs/lbfgs.lib
diff --git a/win32/liblbfgs/lbfgs_debug.lib b/win32/liblbfgs/lbfgs_debug.lib