Merge pull request #50 from gretelai/aw/syn-14-large-datasets
Aw/syn 14 large datasets
zredlined committed Sep 3, 2020
2 parents 81c6ad0 + c3a03cc commit b237bc4
Showing 3 changed files with 9 additions and 10 deletions.
7 changes: 6 additions & 1 deletion src/gretel_synthetics/config.py
@@ -69,6 +69,10 @@ class BaseConfig:
             will be replaced with the <unk> tag. Good defaults are ``0.995`` for languages with rich
             character sets like Japanese or Chinese, and 1.0 for other languages or machine data.
             Default is ``1.0``.
+        pretrain_sentences (optional): The number of lines spm_train loads first; any remaining lines
+            are discarded. Since spm_train loads the entire corpus into memory, a suitable value
+            depends on the memory size of the machine. It also affects training time.
+            Default is ``100000``.
         dp (optional): If ``True``, train model with differential privacy enabled. This setting provides
             assurances that the models will encode general patterns in data rather than facts
             about specific training examples. These additional guarantees can usefully strengthen
@@ -116,7 +120,6 @@ class BaseConfig:
     rnn_units: int = 256
     dropout_rate: float = 0.2
     rnn_initializer: str = "glorot_uniform"
-    max_line_len: int = 2048

     # Input data configs
     field_delimiter: Optional[str] = None
@@ -125,6 +128,8 @@ class BaseConfig:
     # Tokenizer settings
     vocab_size: int = 20000
     character_coverage: float = 1.0
+    pretrain_sentences: int = 100000
+    max_line_len: int = 2048

     # Diff privacy configs
     dp: bool = False
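For context, here is how the new field would be set alongside the other tokenizer settings. This is a minimal sketch, not code from this commit: the LocalConfig constructor is assumed from the package's public API, and the input/checkpoint paths and the 50000 value are illustrative.

from gretel_synthetics.config import LocalConfig

# Sketch: bound spm_train's memory use on a large corpus by sampling
# at most pretrain_sentences lines for tokenizer training.
config = LocalConfig(
    input_data_path="large_dataset.csv",  # assumed path, not from this commit
    checkpoint_dir="./checkpoints",       # assumed path, not from this commit
    vocab_size=20000,
    character_coverage=1.0,
    max_line_len=2048,
    pretrain_sentences=50000,  # spm_train loads at most this many lines; the rest are discarded
)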
11 changes: 2 additions & 9 deletions src/gretel_synthetics/train.py
@@ -166,17 +166,10 @@ def _train_tokenizer(store: BaseConfig) -> spm.SentencePieceProcessor:
         vocab_size=store.vocab_size,
         hard_vocab_limit=False,
         max_sentence_length=store.max_line_len,
+        input_sentence_size=store.pretrain_sentences,
+        shuffle_input_sentence=True,
         character_coverage=store.character_coverage
     )
-    """
-    spm.SentencePieceTrainer.Train(
-        f'--input={store.training_data} '
-        f'--model_prefix={store.tokenizer_prefix} '
-        f'--user_defined_symbols=<n>,{store.field_delimiter_token} '
-        f'--vocab_size={store.vocab_size} '
-        f'--hard_vocab_limit=false '
-        f'--character_coverage={store.character_coverage}')
-    """
    _move_tokenizer_model(store)

    sp = spm.SentencePieceProcessor()
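The two new keyword arguments are standard SentencePiece trainer options. A standalone sketch of the same sampling behavior outside gretel-synthetics, assuming an illustrative corpus path and model prefix:

import sentencepiece as spm

# input_sentence_size caps how many lines spm_train loads into memory,
# and shuffle_input_sentence=True draws that sample randomly instead of
# taking the first N lines of the file.
spm.SentencePieceTrainer.Train(
    input="training_data.txt",  # assumed path, not from this commit
    model_prefix="tokenizer",   # assumed prefix, not from this commit
    vocab_size=20000,
    hard_vocab_limit=False,
    character_coverage=1.0,
    input_sentence_size=100000,
    shuffle_input_sentence=True,
)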
1 change: 1 addition & 0 deletions tests/test_config.py
@@ -40,6 +40,7 @@ def test_local_config_settings(mkdir):
         "rnn_initializer": "glorot_uniform",
         "vocab_size": 20000,
         "character_coverage": 1.0,
+        "pretrain_sentences": 100000,
         "dp": False,
         "dp_learning_rate": 0.015,
         "dp_noise_multiplier": 1.1,
