Make output scores optional

Additional optimizations are possible when the output scores are not required. For example, we can skip the final LogSoftMax during greedy search.
guillaumekln · Apr 1, 2020 · f644971 · f644971
1 parent 1e5d549
commit f644971
Show file tree

Hide file tree

Showing 12 changed files with 97 additions and 34 deletions.
diff --git a/cli/translate.cc b/cli/translate.cc
@@ -87,6 +87,7 @@ int main(int argc, char* argv[]) {
   options.min_decoding_length = args["min_sent_length"].as<size_t>();
   options.num_hypotheses = args["n_best"].as<size_t>();
   options.use_vmap = args["use_vmap"].as<bool>();
+  options.return_scores = args["with_score"].as<bool>();
 
   std::istream* in = &std::cin;
   std::ostream* out = &std::cout;

diff --git a/docs/python.md b/docs/python.md
@@ -49,6 +49,7 @@ output = translator.translate_batch(
     max_decoding_length=250,   # Maximum prediction length.
     min_decoding_length=1,     # Minimum prediction length.
     use_vmap=False,            # Use the vocabulary mapping file saved in this model.
+    return_scores=True,        # Include the prediction scores in the output.
     return_attention=False,    # Include the attention vectors in the output.
     return_alternatives=False, # Return alternatives at the first unconstrained decoding position.
     sampling_topk=1,           # Randomly sample predictions from the top K candidates (with beam_size=1).

diff --git a/include/ctranslate2/decoding.h b/include/ctranslate2/decoding.h
@@ -20,7 +20,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const std::vector<size_t>* output_ids_map,
            std::vector<std::vector<std::vector<size_t>>>& sampled_ids,
-           std::vector<std::vector<float>>& scores,
+           std::vector<std::vector<float>>* scores = nullptr,
            std::vector<std::vector<std::vector<std::vector<float>>>>* attention = nullptr,
            const size_t num_hypotheses = 1) const = 0;
   };
@@ -40,7 +40,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const std::vector<size_t>* output_ids_map,
            std::vector<std::vector<std::vector<size_t>>>& sampled_ids,
-           std::vector<std::vector<float>>& scores,
+           std::vector<std::vector<float>>* scores = nullptr,
            std::vector<std::vector<std::vector<std::vector<float>>>>* attention = nullptr,
            const size_t num_hypotheses = 1) const override;
 
@@ -62,7 +62,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const std::vector<size_t>* output_ids_map,
            std::vector<std::vector<std::vector<size_t>>>& sampled_ids,
-           std::vector<std::vector<float>>& scores,
+           std::vector<std::vector<float>>* scores = nullptr,
            std::vector<std::vector<std::vector<std::vector<float>>>>* attention = nullptr,
            const size_t num_hypotheses = 1) const override;
   };
@@ -86,6 +86,7 @@ namespace ctranslate2 {
          const dim_t min_length,
          const size_t num_hypotheses,
          const bool return_alternatives,
+         const bool return_scores,
          const bool return_attention);
 
 }
diff --git a/include/ctranslate2/translation_result.h b/include/ctranslate2/translation_result.h
@@ -11,20 +11,23 @@ namespace ctranslate2 {
   class GenerationResult {
   public:
     GenerationResult(const size_t num_hypotheses, const bool with_attention);  // Empty result.
-    GenerationResult(std::vector<std::vector<T>> hypotheses,
-                     std::vector<float> scores);
+    GenerationResult(std::vector<std::vector<T>> hypotheses);
     GenerationResult(std::vector<std::vector<T>> hypotheses,
                      std::vector<float> scores,
                      std::vector<std::vector<std::vector<float>>> attention);
 
-    const std::vector<T>& output() const;
-    float score() const;
-
     size_t num_hypotheses() const;
+
+    const std::vector<T>& output() const;
     const std::vector<std::vector<T>>& hypotheses() const;
+
+    float score() const;
     const std::vector<float>& scores() const;
+    void set_scores(std::vector<float> scores);
+    bool has_scores() const;
 
     const std::vector<std::vector<std::vector<float>>>& attention() const;
+    void set_attention(std::vector<std::vector<std::vector<float>>> attention);
     bool has_attention() const;
 
     friend GenerationResult<std::string>

diff --git a/include/ctranslate2/translator.h b/include/ctranslate2/translator.h
@@ -37,6 +37,8 @@ namespace ctranslate2 {
     // beam_size unless return_alternatives is set).
     size_t num_hypotheses = 1;
 
+    // Store scores in the TranslationResult class.
+    bool return_scores = true;
     // Store attention vectors in the TranslationResult class.
     bool return_attention = false;
 

diff --git a/python/tests/test.py b/python/tests/test.py
@@ -113,6 +113,14 @@ def test_return_attention():
     assert len(attention) == 6  # Target length.
     assert len(attention[0]) == 6  # Source length.
 
+def test_ignore_scores():
+    translator = _get_transliterator()
+    output = translator.translate_batch(
+        [["آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"]],
+        beam_size=1,
+        return_scores=False)
+    assert "scores" not in output[0][0]
+
 def test_return_alternatives():
     translator = _get_transliterator()
     output = translator.translate_batch(

diff --git a/python/translator.cc b/python/translator.cc
@@ -106,6 +106,7 @@ class TranslatorWrapper
       options.min_decoding_length = min_decoding_length;
       options.num_hypotheses = num_hypotheses;
       options.use_vmap = use_vmap;
+      options.return_scores = with_scores;
 
       if (read_batch_size == 0)
         read_batch_size = max_batch_size;
@@ -128,6 +129,7 @@ class TranslatorWrapper
                            size_t max_decoding_length,
                            size_t min_decoding_length,
                            bool use_vmap,
+                           bool return_scores,
                            bool return_attention,
                            bool return_alternatives,
                            size_t sampling_topk,
@@ -154,6 +156,7 @@ class TranslatorWrapper
       options.min_decoding_length = min_decoding_length;
       options.num_hypotheses = num_hypotheses;
       options.use_vmap = use_vmap;
+      options.return_scores = return_scores;
       options.return_attention = return_attention;
       options.return_alternatives = return_alternatives;
 
@@ -165,8 +168,10 @@ class TranslatorWrapper
       py::list batch;
       for (size_t i = 0; i < result.num_hypotheses(); ++i) {
         py::dict hyp;
-        hyp["score"] = result.scores()[i];
         hyp["tokens"] = std_vector_to_py_list(result.hypotheses()[i]);
+        if (result.has_scores()) {
+          hyp["score"] = result.scores()[i];
+        }
         if (result.has_attention()) {
           py::list attn;
           for (const auto& attn_vector : result.attention()[i])
@@ -278,6 +283,7 @@ PYBIND11_MODULE(translator, m)
          py::arg("max_decoding_length")=250,
          py::arg("min_decoding_length")=1,
          py::arg("use_vmap")=false,
+         py::arg("return_scores")=true,
          py::arg("return_attention")=false,
          py::arg("return_alternatives")=false,
          py::arg("sampling_topk")=1,

diff --git a/src/decoding.cc b/src/decoding.cc
@@ -80,7 +80,7 @@ namespace ctranslate2 {
                      const dim_t min_length,
                      const std::vector<size_t>* output_ids_map,
                      std::vector<std::vector<std::vector<size_t>>>& sampled_ids,
-                     std::vector<std::vector<float>>& scores,
+                     std::vector<std::vector<float>>* scores,
                      std::vector<std::vector<std::vector<std::vector<float>>>>* attention,
                      const size_t num_hypotheses) const {
     PROFILE("beam_search");
@@ -107,8 +107,10 @@ namespace ctranslate2 {
     hypotheses.resize(batch_size);
     sampled_ids.clear();
     sampled_ids.resize(batch_size);
-    scores.clear();
-    scores.resize(batch_size);
+    if (scores) {
+      scores->clear();
+      scores->resize(batch_size);
+    }
     if (attention) {
       attention->clear();
       attention->resize(batch_size);
@@ -119,7 +121,8 @@ namespace ctranslate2 {
     for (dim_t i = 0; i < batch_size; ++i) {
       batch_offset[i] = i;
       sampled_ids[i].reserve(num_hypotheses);
-      scores[i].reserve(num_hypotheses);
+      if (scores)
+        (*scores)[i].reserve(num_hypotheses);
       if (attention)
         (*attention)[i].reserve(num_hypotheses);
     }
@@ -258,8 +261,10 @@ namespace ctranslate2 {
           for (auto& pair : hypotheses[batch_id]) {
             if (sampled_ids[batch_id].size() >= num_hypotheses)
               break;
-            scores[batch_id].push_back(-pair.first);
             sampled_ids[batch_id].emplace_back(std::move(pair.second.first));
+            if (scores) {
+              (*scores)[batch_id].push_back(-pair.first);
+            }
             if (attention) {
               (*attention)[batch_id].emplace_back(std::move(pair.second.second));
             }
@@ -331,7 +336,7 @@ namespace ctranslate2 {
                        const dim_t min_length,
                        const std::vector<size_t>* output_ids_map,
                        std::vector<std::vector<std::vector<size_t>>>& sampled_ids,
-                       std::vector<std::vector<float>>& scores,
+                       std::vector<std::vector<float>>* scores,
                        std::vector<std::vector<std::vector<std::vector<float>>>>* attention,
                        const size_t) const {
     PROFILE("greedy_search");
@@ -343,8 +348,10 @@ namespace ctranslate2 {
 
     sampled_ids.clear();
     sampled_ids.resize(batch_size);
-    scores.clear();
-    scores.resize(batch_size);
+    if (scores) {
+      scores->clear();
+      scores->resize(batch_size);
+    }
     if (attention) {
       attention->clear();
       attention->resize(batch_size);
@@ -358,7 +365,8 @@ namespace ctranslate2 {
     for (dim_t i = 0; i < batch_size; ++i) {
       batch_offset[i] = i;
       sampled_ids[i].resize(1);
-      scores[i].resize(1);
+      if (scores)
+        (*scores)[i].resize(1);
       if (attention)
         (*attention)[i].resize(1);
     }
@@ -374,7 +382,13 @@ namespace ctranslate2 {
               state,
               &logits,
               attention ? &attention_step_device : nullptr);
-      ops::LogSoftMax()(logits, log_probs);
+
+      // Compute log probs only if scores should be returned.
+      if (scores) {
+        ops::LogSoftMax()(logits, log_probs);
+      } else {
+        log_probs.shallow_copy(logits);
+      }
 
       // Penalize end_id, if configured.
       if (step < min_length)
@@ -399,8 +413,10 @@ namespace ctranslate2 {
         } else {
           sample_from.at<int32_t>(i) = true_id;
           sampled_ids[batch_id][0].push_back(true_id);
-          scores[batch_id][0] += best_probs.scalar_at<float>({i});
           ++count_alive;
+          if (scores) {
+            (*scores)[batch_id][0] += best_probs.scalar_at<float>({i});
+          }
           if (attention) {
             const auto* attn = attention_step.index<float>({i});
             (*attention)[batch_id][0].emplace_back(attn, attn + attention_step.dim(-1));
@@ -482,6 +498,7 @@ namespace ctranslate2 {
          const dim_t min_length,
          const size_t num_hypotheses,
          const bool return_alternatives,
+         const bool return_scores,
          const bool return_attention) {
     dim_t start_step = 0;
 
@@ -518,7 +535,7 @@ namespace ctranslate2 {
                                         /*min_length=*/1,
                                         output_ids_map,
                                         expanded_ids,
-                                        expanded_scores,
+                                        return_scores ? &expanded_scores : nullptr,
                                         return_attention ? &expanded_attention : nullptr,
                                         num_hypotheses);
 
@@ -542,7 +559,7 @@ namespace ctranslate2 {
                            min_length,
                            output_ids_map,
                            sampled_ids,
-                           scores,
+                           return_scores ? &scores : nullptr,
                            return_attention ? &attention : nullptr,
                            return_alternatives ? 1 : num_hypotheses);
 
@@ -571,7 +588,7 @@ namespace ctranslate2 {
             ids.insert(ids.begin(), prefix_ids->at(i).begin(), prefix_ids->at(i).end());
 
           // Finalize the score.
-          if (!expanded_scores.empty())
+          if (return_scores && !expanded_scores.empty())
             scores[i][h] += expanded_scores[i][h];
 
           // Finalize the attention.
@@ -587,13 +604,12 @@ namespace ctranslate2 {
         }
       }
 
+      GenerationResult<size_t> result(std::move(sampled_ids[i]));
+      if (return_scores)
+        result.set_scores(std::move(scores[i]));
       if (return_attention)
-        results.emplace_back(std::move(sampled_ids[i]),
-                             std::move(scores[i]),
-                             std::move(attention[i]));
-      else
-        results.emplace_back(std::move(sampled_ids[i]),
-                             std::move(scores[i]));
+        result.set_attention(std::move(attention[i]));
+      results.emplace_back(std::move(result));
     }
 
     return results;

diff --git a/src/translation_result.cc b/src/translation_result.cc
@@ -10,10 +10,8 @@ namespace ctranslate2 {
   }
 
   template <typename T>
-  GenerationResult<T>::GenerationResult(std::vector<std::vector<T>> hypotheses,
-                                        std::vector<float> scores)
-    : _hypotheses(std::move(hypotheses))
-    , _scores(std::move(scores)) {
+  GenerationResult<T>::GenerationResult(std::vector<std::vector<T>> hypotheses)
+    : _hypotheses(std::move(hypotheses)) {
   }
 
   template <typename T>
@@ -50,11 +48,26 @@ namespace ctranslate2 {
     return _scores;
   }
 
+  template <typename T>
+  void GenerationResult<T>::set_scores(std::vector<float> scores) {
+    _scores = std::move(scores);
+  }
+
+  template <typename T>
+  bool GenerationResult<T>::has_scores() const {
+    return !_scores.empty();
+  }
+
   template <typename T>
   const std::vector<std::vector<std::vector<float>>>& GenerationResult<T>::attention() const {
     return _attention;
   }
 
+  template <typename T>
+  void GenerationResult<T>::set_attention(std::vector<std::vector<std::vector<float>>> attention) {
+    _attention = std::move(attention);
+  }
+
   template <typename T>
   bool GenerationResult<T>::has_attention() const {
     return !_attention.empty();

diff --git a/src/translator.cc b/src/translator.cc
@@ -329,6 +329,7 @@ namespace ctranslate2 {
       options.min_decoding_length,
       options.num_hypotheses,
       options.return_alternatives,
+      options.return_scores,
       options.return_attention);
 
     // Convert generated ids to tokens.

diff --git a/src/translator_pool.cc b/src/translator_pool.cc
@@ -118,7 +118,7 @@ namespace ctranslate2 {
       stats.num_tokens += hypotheses[0].size();
       for (size_t n = 0; n < hypotheses.size(); ++n) {
         if (with_scores)
-          out << scores[n] << " ||| ";
+          out << (result.has_scores() ? scores[n] : 0) << " ||| ";
         for (size_t i = 0; i < hypotheses[n].size(); ++i) {
           if (i > 0)
             out << ' ';

diff --git a/tests/translator_test.cc b/tests/translator_test.cc
@@ -327,3 +327,14 @@ TEST(TranslatorTest, InvalidNumHypotheses) {
   std::vector<std::string> input = {"آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"};
   EXPECT_THROW(translator.translate(input, options), std::invalid_argument);
 }
+
+TEST(TranslatorTest, IgnoreScore) {
+  Translator translator = default_translator();
+  TranslationOptions options;
+  options.beam_size = 1;
+  options.return_scores = false;
+  const std::vector<std::string> input = {"آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"};
+  const TranslationResult result = translator.translate(input, options);
+  EXPECT_FALSE(result.has_scores());
+  EXPECT_EQ(result.output(), (std::vector<std::string>{"a", "t", "z", "m", "o", "n"}));
+}