Adding Google SentencePiece as a Tokenizer (#1106)
Summary:
This diff adds SentencePiece as a pip requirement and a tokenizer shell for PyText.
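
For reference, a minimal usage sketch of the new tokenizer (the model path below is the test fixture added in this diff; any trained SentencePiece `.model` file works):

```python
from pytext.data.tensorizers import SentencePieceTokenizer

# Build the tokenizer from its config, pointing at a trained
# SentencePiece model file.
tokenizer = SentencePieceTokenizer.from_config(
    SentencePieceTokenizer.Config(
        sp_model_path="tests/models/sentencepiece.model"
    )
)

# tokenize() returns Token objects whose values are subword pieces;
# "▁" marks a word boundary in SentencePiece output.
pieces = [token.value for token in tokenizer.tokenize("Testing out sentencepiece")]
print(pieces)  # ['▁T', 'est', 'ing', '▁out', '▁sen', 't', 'ence', 'p', 'i', 'e', 'ce']
```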

## Motivation and Context

We need SentencePiece to support modern cross-lingual models.

## How Has This Been Tested

A unit test has been added.

## Types of changes

- [ ] Docs change / refactoring / dependency upgrade
- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)

## Checklist

- [x] My code follows the code style of this project.
- [x] My change requires a change to the documentation.
- [x] I have updated the documentation accordingly.
- [x] I have read the **CONTRIBUTING** document.
- [x] I have completed my CLA (see **CONTRIBUTING**)
- [x] I have added tests to cover my changes.
- [x] All new and existing tests passed.
Pull Request resolved: #1106

Test Plan:
Imported from GitHub, without a `Test Plan:` line.

Tests are a non-issue here because the TARGETS file was missing a dependency that cannot be filled in for OSS. See the diff stacked on top of this one for tests.

Reviewed By: hudeven

Differential Revision: D18309626

Pulled By: snisarg

fbshipit-source-id: a0d16417023e237eb29a9355c22e203e380efbb5
snisarg authored and facebook-github-bot committed Nov 8, 2019
1 parent 5882620 commit 322fc47
Showing 6 changed files with 77 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs_requirements.txt
@@ -8,6 +8,7 @@ numpy
onnx
pytorch-pretrained-bert
requests
sentencepiece
torchtext
tensorboard==1.14
pandas
41 changes: 41 additions & 0 deletions pytext/data/tensorizers.py
@@ -5,6 +5,7 @@

import torch
from pytext.common import Padding
from pytext.config import ConfigBase
from pytext.config.component import Component, ComponentType, create_component
from pytext.data.data_structures.annotation import (
    REDUCE,
@@ -20,6 +21,7 @@
from pytext.torchscript.tensorizer import VectorNormalizer
from pytext.utils import cuda, precision
from pytext.utils.data import Slot
from sentencepiece import SentencePieceProcessor

from .utils import (
    BOL,
@@ -1420,6 +1422,45 @@ def tensorize(self, batch):
        return cuda.tensor(batch, torch.float)


class CppProcessorMixin:
    """Cpp processors like SentencePiece don't pickle well; reload them."""

    def _load_processor(self):
        raise NotImplementedError

    def __getstate__(self):
        # Drop the unpicklable C++ processor; __setstate__ rebuilds it.
        state = dict(vars(self))
        state.pop("processor")
        return state

    def __setstate__(self, state):
        vars(self).update(state)
        self._load_processor()


class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        # Wrap each subword piece in a Token with unknown (-1) offsets.
        pieces = self.processor.EncodeAsPieces(input_str)
        return [Token(piece, -1, -1) for piece in pieces]

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(self.sp_model_path)

def initialize_tensorizers(tensorizers, data_source, from_scratch=True):
    """A utility function to stream a data source to the initialize functions
    of a dict of tensorizers."""
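Since `CppProcessorMixin` is the subtle part of this diff, here is a minimal sketch of the pickle round-trip it enables (again assuming the test model file shipped with this diff):

```python
import pickle

from pytext.data.tensorizers import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer(sp_model_path="tests/models/sentencepiece.model")

# __getstate__ drops the unpicklable C++ SentencePieceProcessor before
# serialization, and __setstate__ reloads it from sp_model_path afterwards.
restored = pickle.loads(pickle.dumps(tokenizer))

sentence = "Testing out sentencepiece"
assert [t.value for t in restored.tokenize(sentence)] == [
    t.value for t in tokenizer.tokenize(sentence)
]
```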
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,3 +12,4 @@ scipy
torchtext
tensorboard==1.14
torch
sentencepiece
2 changes: 2 additions & 0 deletions tests/models/__init__.py
@@ -0,0 +1,2 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
Binary file added tests/models/sentencepiece.model
32 changes: 32 additions & 0 deletions tests/tensorizer_test.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import unittest

from pytext.data.tensorizers import SentencePieceTokenizer


class SentencePieceTokenizerTest(unittest.TestCase):
    def test_tokenize(self):
        sentence = "Testing out sentencepiece"
        expected = [
            "▁T",
            "est",
            "ing",
            "▁out",
            "▁sen",
            "t",
            "ence",
            "p",
            "i",
            "e",
            "ce",
        ]
        sp_tokenizer = SentencePieceTokenizer.from_config(
            SentencePieceTokenizer.Config(
                sp_model_path="tests/models/sentencepiece.model"
            )
        )
        tokens = sp_tokenizer.tokenize(sentence)
        tokens = [token.value for token in tokens]
        self.assertEqual(tokens, expected)
