This repository has been archived by the owner on Nov 22, 2022. It is now read-only.

Change model input tokens to optional (#1099)
Summary:
Pull Request resolved: #1099

The Byte LSTM, which inherits from the LSTM language model, does not need token input. Making the `tokens` field optional and passing it as None lets the model skip the vocab-building step instead of reporting confusing OOV problems.
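The pattern in this commit can be sketched in isolation. The classes below are simplified, hypothetical stand-ins for PyText's config types (not the real `TokenTensorizer.Config` or `LMLSTM.Config`), showing how an optional config field plus an eager validation check lets token-free models pass None while token-based models still fail fast with a clear error:

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class TokenTensorizerConfig:
    # Hypothetical stand-in for TokenTensorizer.Config.
    add_bos_token: bool = True
    add_eos_token: bool = True


@dataclass
class ModelInput:
    # After this change, tokens may be set to None in the config file,
    # which skips vocab building for models (like the Byte LSTM) that
    # do not consume token input.
    tokens: Optional[TokenTensorizerConfig] = field(
        default_factory=TokenTensorizerConfig
    )


def check_token_config(tokens: Optional[TokenTensorizerConfig]) -> None:
    # Models that do require tokens (like LMLSTM) validate eagerly,
    # failing with a clear message instead of a confusing OOV report.
    if tokens is None:
        raise ValueError(
            "Tokens cannot be None. Please set it to a TokenTensorizer "
            "in the config file."
        )


check_token_config(ModelInput().tokens)  # default config: passes
try:
    check_token_config(None)  # Byte LSTM-style config reused by LMLSTM
except ValueError as e:
    print(e)  # clear validation error, not an OOV warning
```

The key design point is that the default value is unchanged, so existing configs keep working; only configs that explicitly set the field to null opt out of vocab building.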

Reviewed By: kmalik22

Differential Revision: D18253210

fbshipit-source-id: 42080fae1c22bf86648892bb18d01a1ad07b6a9c
Fan Wang authored and facebook-github-bot committed Nov 6, 2019
1 parent 783b0c7 commit 7f167a7
11 changes: 10 additions & 1 deletion pytext/models/language_models/lmlstm.py
@@ -52,7 +52,7 @@ class LMLSTM(BaseModel):

class Config(BaseModel.Config):
class ModelInput(Model.Config.ModelInput):
tokens: TokenTensorizer.Config = TokenTensorizer.Config(
tokens: Optional[TokenTensorizer.Config] = TokenTensorizer.Config(
add_bos_token=True, add_eos_token=True
)

@@ -67,8 +67,17 @@ class ModelInput(Model.Config.ModelInput):
stateful: bool = False
caffe2_format: ExporterType = ExporterType.PREDICTOR

@classmethod
def checkTokenConfig(cls, tokens: Optional[Tensorizer]):
if tokens is None:
raise ValueError(
"Tokens cannot be None. Please set it to a TokenTensorizer "
"in the config file."
)

@classmethod
def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
cls.checkTokenConfig(tensorizers["tokens"])
embedding = create_module(config.embedding, tensorizer=tensorizers["tokens"])
representation = create_module(
config.representation, embed_dim=embedding.embedding_dim
