diff --git a/pytext/data/bert_tensorizer.py b/pytext/data/bert_tensorizer.py
index 41526ac70..7fb6bb068 100644
--- a/pytext/data/bert_tensorizer.py
+++ b/pytext/data/bert_tensorizer.py
@@ -7,7 +7,7 @@
 from fairseq.data.legacy.masked_lm_dictionary import BertDictionary
 from pytext.config.component import ComponentType, create_component
 from pytext.data.tensorizers import Tensorizer, TokenTensorizer, lookup_tokens
-from pytext.data.tokenizers import Gpt2Tokenizer, Tokenizer, WordPieceTokenizer
+from pytext.data.tokenizers import GPT2BPETokenizer, Tokenizer, WordPieceTokenizer
 from pytext.data.utils import BOS, EOS, MASK, PAD, UNK, Vocabulary, pad_and_tensorize
 from pytext.torchscript.tensorizer import ScriptRoBERTaTensorizer
 from pytext.torchscript.vocab import ScriptVocabulary
@@ -121,7 +121,7 @@ def tensorize(self, batch):
 class RoBERTaTensorizer(BERTTensorizer):
     class Config(Tensorizer.Config):
         columns: List[str] = ["text"]
-        tokenizer: Gpt2Tokenizer.Config = Gpt2Tokenizer.Config()
+        tokenizer: GPT2BPETokenizer.Config = GPT2BPETokenizer.Config()
         max_seq_len: int = 256
 
     @classmethod
diff --git a/pytext/data/test/data/gpt2_dict.txt b/pytext/data/test/data/gpt2_dict.txt
new file mode 100644
index 000000000..89f10af8f
--- /dev/null
+++ b/pytext/data/test/data/gpt2_dict.txt
@@ -0,0 +1,2 @@
+19703 850314647
+8690 800385005
diff --git a/pytext/data/test/data/gpt2_encoder.json b/pytext/data/test/data/gpt2_encoder.json
new file mode 100644
index 000000000..423357bcc
--- /dev/null
+++ b/pytext/data/test/data/gpt2_encoder.json
@@ -0,0 +1 @@
+{"otype": 8690, "Prot": 19703}
\ No newline at end of file
diff --git a/pytext/data/test/data/gpt2_vocab.bpe b/pytext/data/test/data/gpt2_vocab.bpe
new file mode 100644
index 000000000..46556de51
--- /dev/null
+++ b/pytext/data/test/data/gpt2_vocab.bpe
@@ -0,0 +1,9 @@
+#version: 0.2
+ĠProt otype
+r o
+o t
+p e
+P ro
+y pe
+ot ype
+Pro t
diff --git a/pytext/data/test/tokenizers_test.py b/pytext/data/test/tokenizers_test.py
index c78066268..ecbf10ce5 100644
--- a/pytext/data/test/tokenizers_test.py
+++ b/pytext/data/test/tokenizers_test.py
@@ -3,7 +3,8 @@
 
 import unittest
 
-from pytext.data.tokenizers import Tokenizer
+from pytext.data.tokenizers import GPT2BPETokenizer, Tokenizer
+from pytext.data.tokenizers.tokenizer import Token
 
 
 class TokenizeTest(unittest.TestCase):
@@ -47,3 +48,19 @@ def test_split_with_regex(self):
         expected = "please buy me a coffee he implored in vain".split()
         tokens = tokenizer.tokenize(sentence)
         self.assertListEqual(expected, [t.value for t in tokens])
+
+
+class GPT2BPETest(unittest.TestCase):
+    def test_gpt2_bpe_tokenizer(self):
+        text = "Prototype"
+        expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
+        tokenizer = GPT2BPETokenizer.from_config(
+            GPT2BPETokenizer.Config(
+                token_dictionary_path="pytext/data/test/data/gpt2_dict.txt",
+                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
+                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
+            )
+        )
+        tokens = tokenizer.tokenize(text)
+        print(tokens)
+        self.assertEqual(tokens, expected)
diff --git a/pytext/data/tokenizers/__init__.py b/pytext/data/tokenizers/__init__.py
index b05bba243..81fe97195 100644
--- a/pytext/data/tokenizers/__init__.py
+++ b/pytext/data/tokenizers/__init__.py
@@ -3,7 +3,7 @@
 
 from .tokenizer import (
     DoNothingTokenizer,
-    Gpt2Tokenizer,
+    GPT2BPETokenizer,
     Token,
     Tokenizer,
     WordPieceTokenizer,
@@ -11,7 +11,7 @@
 
 
 __all__ = [
-    "Gpt2Tokenizer",
+    "GPT2BPETokenizer",
     "Token",
     "Tokenizer",
     "DoNothingTokenizer",
diff --git a/pytext/data/tokenizers/tokenizer.py b/pytext/data/tokenizers/tokenizer.py
index a6860e5e4..352a3ae0a 100644
--- a/pytext/data/tokenizers/tokenizer.py
+++ b/pytext/data/tokenizers/tokenizer.py
@@ -176,7 +176,7 @@ def __setstate__(self, state):
         self.re = regex
 
 
-class Gpt2Tokenizer(Tokenizer):
+class GPT2BPETokenizer(Tokenizer):
     """Tokenizer for gpt-2 and RoBERTa."""
 
     class Config(ConfigBase):
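For reference, a minimal usage sketch of the renamed tokenizer, mirroring the new GPT2BPETest above. It only assumes the toy two-entry fixtures added in this diff and that it runs from the repository root; a real setup would point the three paths at a full GPT-2 BPE encoder, merges file, and fairseq-style token dictionary.

```python
# Sketch mirroring GPT2BPETest; paths are the tiny test fixtures from this
# diff (assumed to be resolved relative to the repo root), not a real vocab.
from pytext.data.tokenizers import GPT2BPETokenizer

tokenizer = GPT2BPETokenizer.from_config(
    GPT2BPETokenizer.Config(
        token_dictionary_path="pytext/data/test/data/gpt2_dict.txt",
        bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
        bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
    )
)

# Each Token carries the BPE id as a string plus character start/end offsets:
# "Prototype" -> [Token("19703", 0, 4), Token("8690", 4, 9)]  ("Prot" + "otype")
print(tokenizer.tokenize("Prototype"))
```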