Merge pull request #50 from gretelai/aw/syn-14-large-datasets
Aw/syn 14 large datasets
zredlined committed Sep 3, 2020
2 parents 81c6ad0 + c3a03cc commit b237bc4
Showing 3 changed files with 9 additions and 10 deletions.
7 changes: 6 additions & 1 deletion src/gretel_synthetics/config.py
@@ -69,6 +69,10 @@ class BaseConfig:
             will be replaced with the <unk> tag. Good defaults are ``0.995`` for languages with rich
             character sets like Japanese or Chinese, and 1.0 for other languages or machine data.
             Default is ``1.0``.
+        pretrain_sentences (optional): The number of lines spm_train loads first; any remaining lines
+            are discarded. Since spm_train loads the entire corpus into memory, a suitable value
+            depends on the memory size of the machine. It also affects training time.
+            Default is ``100000``.
         dp (optional): If ``True``, train model with differential privacy enabled. This setting provides
             assurances that the models will encode general patterns in data rather than facts
             about specific training examples. These additional guarantees can usefully strengthen
@@ -116,7 +120,6 @@ class BaseConfig:
     rnn_units: int = 256
     dropout_rate: float = 0.2
     rnn_initializer: str = "glorot_uniform"
-    max_line_len: int = 2048

     # Input data configs
     field_delimiter: Optional[str] = None
@@ -125,6 +128,8 @@ class BaseConfig:
     # Tokenizer settings
     vocab_size: int = 20000
     character_coverage: float = 1.0
+    pretrain_sentences: int = 100000
+    max_line_len: int = 2048

     # Diff privacy configs
     dp: bool = False
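For context, here is how the new field would be set alongside the other tokenizer settings. This is a minimal sketch, not code from this commit: the LocalConfig constructor is assumed from the package's public API, and the input/checkpoint paths and the 50000 value are illustrative.

from gretel_synthetics.config import LocalConfig

# Sketch: bound spm_train's memory use on a large corpus by sampling
# at most pretrain_sentences lines for tokenizer training.
config = LocalConfig(
    input_data_path="large_dataset.csv",  # assumed path, not from this commit
    checkpoint_dir="./checkpoints",       # assumed path, not from this commit
    vocab_size=20000,
    character_coverage=1.0,
    max_line_len=2048,
    pretrain_sentences=50000,  # spm_train loads at most this many lines; the rest are discarded
)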
11 changes: 2 additions & 9 deletions src/gretel_synthetics/train.py
@@ -166,17 +166,10 @@ def _train_tokenizer(store: BaseConfig) -> spm.SentencePieceProcessor:
         vocab_size=store.vocab_size,
         hard_vocab_limit=False,
         max_sentence_length=store.max_line_len,
+        input_sentence_size=store.pretrain_sentences,
+        shuffle_input_sentence=True,
         character_coverage=store.character_coverage
     )
-    """
-    spm.SentencePieceTrainer.Train(
-        f'--input={store.training_data} '
-        f'--model_prefix={store.tokenizer_prefix} '
-        f'--user_defined_symbols=<n>,{store.field_delimiter_token} '
-        f'--vocab_size={store.vocab_size} '
-        f'--hard_vocab_limit=false '
-        f'--character_coverage={store.character_coverage}')
-    """
    _move_tokenizer_model(store)

    sp = spm.SentencePieceProcessor()
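The two new keyword arguments are standard SentencePiece trainer options. A standalone sketch of the same sampling behavior outside gretel-synthetics, assuming an illustrative corpus path and model prefix:

import sentencepiece as spm

# input_sentence_size caps how many lines spm_train loads into memory,
# and shuffle_input_sentence=True draws that sample randomly instead of
# taking the first N lines of the file.
spm.SentencePieceTrainer.Train(
    input="training_data.txt",  # assumed path, not from this commit
    model_prefix="tokenizer",   # assumed prefix, not from this commit
    vocab_size=20000,
    hard_vocab_limit=False,
    character_coverage=1.0,
    input_sentence_size=100000,
    shuffle_input_sentence=True,
)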
1 change: 1 addition & 0 deletions tests/test_config.py
@@ -40,6 +40,7 @@ def test_local_config_settings(mkdir):
         "rnn_initializer": "glorot_uniform",
         "vocab_size": 20000,
         "character_coverage": 1.0,
+        "pretrain_sentences": 100000,
         "dp": False,
         "dp_learning_rate": 0.015,
         "dp_noise_multiplier": 1.1,
