Unify GPT2BPE Tokenizer (#1110)
Summary:
Pull Request resolved: #1110

1. rename Gpt2Tokenizer to GPT2BPETokenizer
2. remove GPTBPETokenizer since it is a duplicate
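
For downstream code, the only visible change from this diff is the class name: imports and tensorizer configs that previously referenced Gpt2Tokenizer now reference GPT2BPETokenizer. A minimal usage sketch of the renamed class follows; the file paths are illustrative, while the Config fields and the from_config call mirror the new test added below.

from pytext.data.tokenizers import GPT2BPETokenizer  # formerly Gpt2Tokenizer

# Illustrative resource paths; point these at real GPT-2 BPE files.
tokenizer = GPT2BPETokenizer.from_config(
    GPT2BPETokenizer.Config(
        token_dictionary_path="path/to/gpt2_dict.txt",
        bpe_vocab_path="path/to/gpt2_vocab.bpe",
        bpe_encoder_path="path/to/gpt2_encoder.json",
    )
)
tokens = tokenizer.tokenize("Prototype")  # returns Token objects with values and offsets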

Reviewed By: m3rlin45

Differential Revision: D18337568

fbshipit-source-id: a59d0c3e53933d2819c8c4e3224a02bc589b8c65
chenyangyu1988 authored and facebook-github-bot committed Nov 6, 2019
1 parent 8eee775 commit 783b0c7
Showing 7 changed files with 35 additions and 6 deletions.
4 changes: 2 additions & 2 deletions pytext/data/bert_tensorizer.py
@@ -7,7 +7,7 @@
from fairseq.data.legacy.masked_lm_dictionary import BertDictionary
from pytext.config.component import ComponentType, create_component
from pytext.data.tensorizers import Tensorizer, TokenTensorizer, lookup_tokens
-from pytext.data.tokenizers import Gpt2Tokenizer, Tokenizer, WordPieceTokenizer
+from pytext.data.tokenizers import GPT2BPETokenizer, Tokenizer, WordPieceTokenizer
from pytext.data.utils import BOS, EOS, MASK, PAD, UNK, Vocabulary, pad_and_tensorize
from pytext.torchscript.tensorizer import ScriptRoBERTaTensorizer
from pytext.torchscript.vocab import ScriptVocabulary
@@ -121,7 +121,7 @@ def tensorize(self, batch):
class RoBERTaTensorizer(BERTTensorizer):
    class Config(Tensorizer.Config):
        columns: List[str] = ["text"]
-        tokenizer: Gpt2Tokenizer.Config = Gpt2Tokenizer.Config()
+        tokenizer: GPT2BPETokenizer.Config = GPT2BPETokenizer.Config()
        max_seq_len: int = 256

    @classmethod
2 changes: 2 additions & 0 deletions pytext/data/test/data/gpt2_dict.txt
@@ -0,0 +1,2 @@
19703 850314647
8690 800385005
1 change: 1 addition & 0 deletions pytext/data/test/data/gpt2_encoder.json
@@ -0,0 +1 @@
{"otype": 8690, "Prot": 19703}
9 changes: 9 additions & 0 deletions pytext/data/test/data/gpt2_vocab.bpe
@@ -0,0 +1,9 @@
#version: 0.2
ĠProt otype
r o
o t
p e
P ro
y pe
ot ype
Pro t
19 changes: 18 additions & 1 deletion pytext/data/test/tokenizers_test.py
@@ -3,7 +3,8 @@

import unittest

-from pytext.data.tokenizers import Tokenizer
+from pytext.data.tokenizers import GPT2BPETokenizer, Tokenizer
+from pytext.data.tokenizers.tokenizer import Token


class TokenizeTest(unittest.TestCase):
@@ -47,3 +48,19 @@ def test_split_with_regex(self):
expected = "please buy me a coffee he implored in vain".split()
tokens = tokenizer.tokenize(sentence)
self.assertListEqual(expected, [t.value for t in tokens])


class GPT2BPETest(unittest.TestCase):
    def test_gpt2_bpe_tokenizer(self):
        text = "Prototype"
        expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
        tokenizer = GPT2BPETokenizer.from_config(
            GPT2BPETokenizer.Config(
                token_dictionary_path="pytext/data/test/data/gpt2_dict.txt",
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            )
        )
        tokens = tokenizer.tokenize(text)
        print(tokens)
        self.assertEqual(tokens, expected)
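
The fixture files and the expected token IDs above fit together as follows: the merges listed in gpt2_vocab.bpe are applied greedily to the input, lowest-ranked adjacent pair first, until no listed pair remains, and the resulting pieces are mapped to IDs through gpt2_encoder.json. The sketch below reproduces that merge loop on the test input; it is a simplified illustration (the helper names are made up, and GPT-2's byte-to-unicode mapping is skipped, which does not matter for this ASCII example), not PyText's implementation.

import json

def load_merge_ranks(bpe_vocab_path):
    # Each non-header line is "first second"; earlier lines merge with higher priority.
    ranks = {}
    with open(bpe_vocab_path, encoding="utf-8") as f:
        lines = f.read().splitlines()
    for rank, line in enumerate(lines[1:]):  # skip the "#version: 0.2" header
        if not line.strip():
            continue
        first, second = line.split()
        ranks[(first, second)] = rank
    return ranks

def bpe_merge(word, ranks):
    # Start from single characters and repeatedly merge the lowest-ranked adjacent pair.
    parts = list(word)
    while len(parts) > 1:
        pairs = [(parts[i], parts[i + 1]) for i in range(len(parts) - 1)]
        best = min(pairs, key=lambda p: ranks.get(p, float("inf")))
        if best not in ranks:
            break
        merged, i = [], 0
        while i < len(parts):
            if i < len(parts) - 1 and (parts[i], parts[i + 1]) == best:
                merged.append(parts[i] + parts[i + 1])
                i += 2
            else:
                merged.append(parts[i])
                i += 1
        parts = merged
    return parts

ranks = load_merge_ranks("pytext/data/test/data/gpt2_vocab.bpe")
with open("pytext/data/test/data/gpt2_encoder.json", encoding="utf-8") as f:
    encoder = json.load(f)
pieces = bpe_merge("Prototype", ranks)
print(pieces)                        # ['Prot', 'otype']
print([encoder[p] for p in pieces])  # [19703, 8690]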
4 changes: 2 additions & 2 deletions pytext/data/tokenizers/__init__.py
@@ -3,15 +3,15 @@

from .tokenizer import (
    DoNothingTokenizer,
-    Gpt2Tokenizer,
+    GPT2BPETokenizer,
    Token,
    Tokenizer,
    WordPieceTokenizer,
)


__all__ = [
"Gpt2Tokenizer",
"GPT2BPETokenizer",
"Token",
"Tokenizer",
"DoNothingTokenizer",
2 changes: 1 addition & 1 deletion pytext/data/tokenizers/tokenizer.py
@@ -176,7 +176,7 @@ def __setstate__(self, state):
        self.re = regex


-class Gpt2Tokenizer(Tokenizer):
+class GPT2BPETokenizer(Tokenizer):
    """Tokenizer for gpt-2 and RoBERTa."""

    class Config(ConfigBase):
