Use char-ngram embedding for out-of-vocabulary words

Summary: **Description** Provide the DeepText model with the functionality to load a secondary index (pre-trained char-ngram embeddings, e.g. FastText) during training/test. Embeddings of out-of-vocabulary words will be computed on the fly during training/test by averaging their char-ngram embeddings. **Approach** This diff provides two custom operators to accomplish this task – ConditionalOp and IndexCharNgramGetOp. We first use IndexCharNgramGetOp to perform the char-ngram index lookup and return a sparse tensor segmented by lengths for each token. The sparse tensor is then used to compute the average embedding provided by the char-ngram index. Finally, we use a ConditionalOp to replace the embeddings of those tokens that were not found in the original index during the feature apply stage. Please refer to the documentation in the code for more details. Reviewed By: jamesr66a Differential Revision: D5666924 fbshipit-source-id: f76605d093154a014d5b9ebf9510de9d79874eee
facebookarchive · Sep 2, 2017 · 067f704 · 067f704
1 parent 725a099
commit 067f704
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 0 deletions.
diff --git a/caffe2/operators/conditional_op.cc b/caffe2/operators/conditional_op.cc
@@ -0,0 +1,64 @@
+#include "caffe2/operators/conditional_op.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+
+// CPU implementation of the Conditional operator.
+//
+// Inputs:
+//   0: Condition - 1-D tensor read as bool, one entry per row of DataT/DataF.
+//   1: DataT     - tensor of rank >= 1; row i is copied to the output when
+//                  Condition[i] is true.
+//   2: DataF     - tensor with the same shape and element type as DataT;
+//                  row i is copied to the output when Condition[i] is false.
+// Output:
+//   0: tensor resized like DataT and allocated with DataT's type, holding the
+//      selected rows.
+//
+// A "row" is the slice along the first dimension, i.e. size_from_dim(1)
+// elements. Any shape or type inconsistency fails a CAFFE_ENFORCE check.
+template <>
+bool ConditionalOp<CPUContext>::RunOnDevice() {
+  const auto& condition = Input(0);
+  const auto& dataT = Input(1);
+  const auto& dataF = Input(2);
+
+  // Verify the input shapes: one condition entry per row, and identical
+  // dimensions for the two candidate tensors.
+  CAFFE_ENFORCE_EQ(condition.ndim(), 1);
+  CAFFE_ENFORCE(dataT.ndim() >= 1);
+  CAFFE_ENFORCE(dataT.dims()[0] == condition.dims()[0]);
+  CAFFE_ENFORCE_EQ(dataT.ndim(), dataF.ndim());
+  for (size_t i = 0; i < dataT.dims().size(); i++) {
+    CAFFE_ENFORCE(dataT.dims().at(i) == dataF.dims().at(i));
+  }
+
+  // Verify the element types. Rows from both inputs are copied into a single
+  // buffer typed as DataT, so matching item sizes alone are not sufficient:
+  // e.g. float and int32 rows would otherwise be mixed in one output.
+  CAFFE_ENFORCE(
+      dataT.meta() == dataF.meta(),
+      "DataT and DataF must have the same type, got ",
+      dataT.meta().name(),
+      " and ",
+      dataF.meta().name());
+
+  const auto innerSize = dataT.size_from_dim(1);
+  const auto innerSizeBytes = innerSize * dataT.meta().itemsize();
+
+  // Initialize the output with DataT's shape and type.
+  auto* dataOut = Output(0);
+  const auto* condPtr = condition.template data<bool>();
+  dataOut->ResizeLike(dataT);
+  auto* outPtr = static_cast<char*>(dataOut->raw_mutable_data(dataT.meta()));
+
+  // Select row by row along the first dimension. Byte pointers are used so
+  // the row offsets can be computed without knowing the element type.
+  const auto* ptrT = static_cast<const char*>(dataT.raw_data());
+  const auto* ptrF = static_cast<const char*>(dataF.raw_data());
+  for (TIndex i = 0; i < condition.size(); i++) {
+    const auto rowOffset = i * innerSizeBytes;
+    const auto* src = condPtr[i] ? ptrT + rowOffset : ptrF + rowOffset;
+    context_.template CopyItems<CPUContext, CPUContext>(
+        dataT.meta(), innerSize, src, outPtr + rowOffset);
+  }
+  return true;
+}
+
+// Bind the operator name "Conditional" to the CPU specialization of
+// ConditionalOp.
+REGISTER_CPU_OPERATOR(Conditional, ConditionalOp<CPUContext>);
+
+// Schema for Conditional: exactly three inputs (Condition, DataT, DataF) and
+// one output (DataO). The strings below are the operator's documentation.
+OPERATOR_SCHEMA(Conditional)
+    .NumInputs(3)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a 1-D tensor of boolean values, apply conditional operator along the first
+dimension of DataT and DataF and return DataO.  Note, DataT and DataF must
+have the exact same shape and type.
+)DOC")
+    .Input(0, "Condition", "Boolean tensor to select DataT or DataF")
+    .Input(1, "DataT", "Data to use when True")
+    .Input(2, "DataF", "Data to use when False")
+    .Output(0, "DataO", "Output data after applying ConditionalOp");
+
+// No gradient is registered for this operator.
+NO_GRADIENT(Conditional);
+
+} // caffe2
diff --git a/caffe2/operators/conditional_op.h b/caffe2/operators/conditional_op.h
@@ -0,0 +1,24 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+#ifndef CONDITIONAL_OP_H
+#define CONDITIONAL_OP_H
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+
+// Operator that builds its output by choosing, for each index along the first
+// dimension, between two equally shaped data inputs according to a boolean
+// condition input.
+//
+// Only the declaration lives here; RunOnDevice is defined per Context as an
+// explicit specialization (the CPUContext one is in conditional_op.cc).
+template <class Context>
+class ConditionalOp final : public Operator<Context> {
+ public:
+  // NOTE(review): presumably brings the Operator<Context> helpers
+  // (e.g. context_) into scope for this dependent base — confirm against
+  // caffe2/core/operator.h.
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  // Forwards the operator definition and workspace to the base Operator;
+  // this class reads no arguments of its own.
+  ConditionalOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  // Runs the selection; returns true on success.
+  bool RunOnDevice() override;
+};
+
+} // caffe2
+
+#endif
diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py
@@ -0,0 +1,29 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from hypothesis import given
+import hypothesis.strategies as st
+from caffe2.python import core
+import caffe2.python.hypothesis_test_util as hu
+
+
+class TestConditionalOp(hu.HypothesisTestCase):
+    @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only)
+    def test_conditional(self, rows_num, gc, dc):
+        op = core.CreateOperator(
+            "Conditional", ["condition", "data_t", "data_f"], "output"
+        )
+        data_t = np.random.random((rows_num, 10, 20)).astype(np.float32)
+        data_f = np.random.random((rows_num, 10, 20)).astype(np.float32)
+        condition = np.random.choice(a=[True, False], size=rows_num)
+
+        def ref(condition, data_t, data_f):
+            output = [
+                data_t[i] if condition[i] else data_f[i]
+                for i in range(rows_num)
+            ]
+            return (output,)
+
+        self.assertReferenceChecks(gc, op, [condition, data_t, data_f], ref)