From fd31ea1889bab046bf7fed16034fd609f06c2618 Mon Sep 17 00:00:00 2001
From: Fan Wang
Date: Mon, 4 Nov 2019 20:36:52 -0800
Subject: [PATCH] Change model input tokens to optional (#1099)

Summary:
Pull Request resolved: https://github.com/facebookresearch/pytext/pull/1099

The Byte LSTM does not need the input of tokens, which it inherits from
the LSTM language model. Making it optional and passing it as None will
allow the model to skip the build-vocab part, which otherwise reports
confusing OOV problems.

Reviewed By: kmalik22

Differential Revision: D18253210

fbshipit-source-id: 4de4a726c713b8f1d9b3f7991cef5118bf8d13c6
---
 pytext/models/language_models/lmlstm.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pytext/models/language_models/lmlstm.py b/pytext/models/language_models/lmlstm.py
index fc2160e3c..b79cc3fd4 100644
--- a/pytext/models/language_models/lmlstm.py
+++ b/pytext/models/language_models/lmlstm.py
@@ -52,7 +52,7 @@ class LMLSTM(BaseModel):
 
     class Config(BaseModel.Config):
         class ModelInput(Model.Config.ModelInput):
-            tokens: TokenTensorizer.Config = TokenTensorizer.Config(
+            tokens: Optional[TokenTensorizer.Config] = TokenTensorizer.Config(
                 add_bos_token=True, add_eos_token=True
             )
 
@@ -67,8 +67,19 @@ class ModelInput(Model.Config.ModelInput):
         stateful: bool = False
         caffe2_format: ExporterType = ExporterType.PREDICTOR
 
+    @classmethod
+    def check_token_config(cls, tokens: Optional[Tensorizer]):
+        # Fail fast with an actionable message when "tokens" was left unset
+        # (now allowed by the Optional config) instead of a confusing error.
+        if tokens is None:
+            raise ValueError(
+                "Tokens cannot be None. Please set it to TokenTensorizer in "
+                "the config file."
+            )
+
     @classmethod
     def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
+        cls.check_token_config(tensorizers["tokens"])
         embedding = create_module(config.embedding, tensorizer=tensorizers["tokens"])
         representation = create_module(
             config.representation, embed_dim=embedding.embedding_dim