In [1]:
import onmt
import torch
import torch.nn as nn

In [5]:
from collections import defaultdict, Counter
from onmt.inputters.inputter import _load_vocab, _build_fields_vocab, get_fields, IterOnDevice
from onmt.dynamic.corpus import ParallelCorpus
from onmt.dynamic.iterator import DynamicDatasetIter
from argparse import Namespace
import yaml

In [31]:
# enable logging
from onmt.utils.logging import init_logger, logger
init_logger()

<RootLogger root (INFO)>

### Retrieve data

In [2]:
!wget https://s3.amazonaws.com/opennmt-trainingdata/toy-ende.tar.gz

--2020-09-22 12:55:36--  https://s3.amazonaws.com/opennmt-trainingdata/toy-ende.tar.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.88.237
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.88.237|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1662081 (1,6M) [application/x-gzip]
Saving to: ‘toy-ende.tar.gz’


2020-09-22 12:55:37 (2,55 MB/s) - ‘toy-ende.tar.gz’ saved [1662081/1662081]



In [3]:
!tar xf toy-ende.tar.gz

In [4]:
ls toy-ende

src-test.txt   src-val.txt   tgt-train.txt
src-train.txt  tgt-test.txt  tgt-val.txt


### Prepare data and vocab

In [16]:
yaml_config = """
## Where the vocab(s) will be written
save_data: toy-ende/run/example
# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: data/src-val.txt
        path_tgt: data/tgt-val.txt
        transforms: []
"""
config = yaml.safe_load(yaml_config)
with open("toy-ende/config.yaml", "w") as f:
    f.write(yaml_config)

In [11]:
from onmt.dynamic.parse import DynamicArgumentParser
parser = DynamicArgumentParser(description='build_vocab.py')

In [12]:
from onmt.dynamic.opts import dynamic_prepare_opts
dynamic_prepare_opts(parser)

In [18]:
base_args = (["-config", "toy-ende/config.yaml", "-src_vocab", "toto"])
parser.parse_known_args(base_args)

(Namespace(config='toy-ende/config.yaml', data="{'corpus': {'path_src': 'toy-ende/src-train.txt', 'path_tgt': 'toy-ende/tgt-train.txt', 'transforms': [], 'weight': 1}, 'valid': {'path_src': 'data/src-val.txt', 'path_tgt': 'data/tgt-val.txt', 'transforms': []}}", dynamic_dict=False, fast_align_model=None, fast_align_root=None, insert_ratio=0.0, lemmatize_target=50, mask_length='subword', mask_ratio=0.0, max_num_tags=3, max_tag_id=10, max_term_tokens=3, max_terms=3, n_sample=-1, onmttok_kwargs="{'mode': 'none'}", overlap='nested', overwrite=False, permute_sent_ratio=0.0, poisson_lambda=0.0, random_ratio=0.0, replace_length=-1, rotate_ratio=0.5, save_config=None, save_data='toy-ende/run/example', share_vocab=False, src_seq_length=200, src_seq_length_trunc=None, src_subword_model=None, src_subword_type='none', src_vocab='toto', src_vocab_size=50000, src_words_min_frequency=0, subword_alpha=0, subword_nbest=1, switchout_temperature=1.0, tags_ratio=0.1, terminology_ratio=0.1, tgt_seq_length=

We need to build the vocab from the text files.

In [3]:
src_vocab_path = "../../toy-ende/run/example.vocab.src"
tgt_vocab_path = "../../toy-ende/run/example.vocab.tgt"

In [4]:
# initialize the frequency counter
counters = defaultdict(Counter)
# load source vocab
_src_vocab, _src_vocab_size = _load_vocab(
    src_vocab_path,
    'src',
    counters,
    min_freq=1)
# load target vocab
_tgt_vocab, _tgt_vocab_size = _load_vocab(
    tgt_vocab_path,
    'tgt',
    counters,
    min_freq=1)

In [5]:
# initialize fields
src_nfeats, tgt_nfeats = 0, 0 # do not support word features for now
fields = get_fields(
    'text', src_nfeats, tgt_nfeats)

In [6]:
fields

{'src': <onmt.inputters.text_dataset.TextMultiField at 0x7fb9cf548cc0>,
 'tgt': <onmt.inputters.text_dataset.TextMultiField at 0x7fb9cebe4898>,
 'indices': <torchtext.data.field.Field at 0x7fb9cebe4978>}

In [7]:
# build fields vocab
share_vocab = False
vocab_size_multiple = 1
src_vocab_size = 30000
tgt_vocab_size = 30000
src_words_min_frequency = 1
tgt_words_min_frequency = 1
vocab_fields = _build_fields_vocab(
    fields, counters, 'text', share_vocab,
    vocab_size_multiple,
    src_vocab_size, src_words_min_frequency,
    tgt_vocab_size, tgt_words_min_frequency)

An alternative way of creating these fields is to run `onmt_train` without actually training, to just output the necessary files.

In [8]:
src_text_field = vocab_fields["src"].base_field
src_vocab = src_text_field.vocab
src_padding = src_vocab.stoi[src_text_field.pad_token]

tgt_text_field = vocab_fields['tgt'].base_field
tgt_vocab = tgt_text_field.vocab
tgt_padding = tgt_vocab.stoi[tgt_text_field.pad_token]

Next we specify the core model itself. Here we will build a small model with an encoder and an attention based input feeding decoder. Both models will be RNNs and the encoder will be bidirectional

In [20]:
emb_size = 100
rnn_size = 500
# Specify the core model.

encoder_embeddings = onmt.modules.Embeddings(emb_size, len(src_vocab),
                                             word_padding_idx=src_padding)

encoder = onmt.encoders.RNNEncoder(hidden_size=rnn_size, num_layers=1,
                                   rnn_type="LSTM", bidirectional=True,
                                   embeddings=encoder_embeddings)

decoder_embeddings = onmt.modules.Embeddings(emb_size, len(tgt_vocab),
                                             word_padding_idx=tgt_padding)
decoder = onmt.decoders.decoder.InputFeedRNNDecoder(
    hidden_size=rnn_size, num_layers=1, bidirectional_encoder=True, 
    rnn_type="LSTM", embeddings=decoder_embeddings)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = onmt.models.model.NMTModel(encoder, decoder)
model.to(device)

# Specify the tgt word generator and loss computation module
model.generator = nn.Sequential(
    nn.Linear(rnn_size, len(tgt_vocab)),
    nn.LogSoftmax(dim=-1)).to(device)

loss = onmt.utils.loss.NMTLossCompute(
    criterion=nn.NLLLoss(ignore_index=tgt_padding, reduction="sum"),
    generator=model.generator)

Now we set up the optimizer. This could be a core torch optim class, or our wrapper which handles learning rate updates and gradient normalization automatically.

In [21]:
lr = 1
torch_optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optim = onmt.utils.optimizers.Optimizer(
    torch_optimizer, learning_rate=lr, max_grad_norm=2)

Now we need to create the dynamic dataset iterator.

In [22]:
src_train = "../../toy-ende/src-train.txt"
tgt_train = "../../toy-ende/tgt-train.txt"
src_val = "../../toy-ende/src-val.txt"
tgt_val = "../../toy-ende/tgt-val.txt"

# build the ParallelCorpus
corpus = ParallelCorpus(src_train, tgt_train)
valid = ParallelCorpus(src_val, tgt_val)

In [23]:
corpora = {"corpus": corpus}
transforms = {}
opts = Namespace()
opts.batch_size = 4096
opts.batch_type = "tokens"
opts.valid_batch_size = 8
opts.batch_size_multiple = 1
opts.data_type = "text"
opts.bucket_size = 4096
opts.pool_factor = 100
opts.data = {"corpus": {"weight": 1}}

In [24]:
# build the dataset iterator
is_train = True
train_iter = DynamicDatasetIter(
    corpora, transforms, vocab_fields, opts, is_train,
    stride=1, offset=0)

### corpora_info {'corpus': {'weight': 1}}


In [25]:
train_iter = iter(IterOnDevice(train_iter, 0))

In [26]:
corpora = {"valid": valid}
transforms = {}
opts = Namespace()
opts.batch_size = 4096
opts.batch_type = "tokens"
opts.valid_batch_size = 8
opts.batch_size_multiple = 1
opts.data_type = "text"
opts.bucket_size = 4096
opts.pool_factor = 100
opts.data = {"valid": {"weight": 1}}

In [27]:
# build the dataset iterator
is_train = False
valid_iter = DynamicDatasetIter(
    corpora, transforms, vocab_fields, opts, is_train,
    stride=1, offset=0)

### corpora_info {'valid': {'weight': 1}}


In [28]:
valid_iter = IterOnDevice(valid_iter, 0)

Finally we train.

In [29]:


report_manager = onmt.utils.ReportMgr(
    report_every=50, start_time=None, tensorboard_writer=None)

trainer = onmt.Trainer(model=model,
                       train_loss=loss,
                       valid_loss=loss,
                       optim=optim,
                       report_manager=report_manager)

trainer.train(train_iter=train_iter,
              train_steps=400,
              valid_iter=valid_iter,
              valid_steps=200)

[2020-09-22 12:52:42,213 INFO] Start training loop and validate every 200 steps...
[2020-09-22 12:52:42,214 INFO] corpus's transforms: TransformPipe()
[2020-09-22 12:52:42,215 INFO] Loading ParallelCorpus(../../toy-ende/src-train.txt, ../../toy-ende/tgt-train.txt, align=None)...
[2020-09-22 12:52:48,315 INFO] Step 50/  400; acc:   7.77; ppl: 4517.68; xent: 8.42; lr: 1.00000; 19806/19563 tok/s;      6 sec
[2020-09-22 12:52:51,717 INFO] Loading ParallelCorpus(../../toy-ende/src-train.txt, ../../toy-ende/tgt-train.txt, align=None)...
[2020-09-22 12:52:54,251 INFO] Step 100/  400; acc:   9.42; ppl: 1884.87; xent: 7.54; lr: 1.00000; 18663/18757 tok/s;     12 sec
[2020-09-22 12:53:00,160 INFO] Step 150/  400; acc:   9.94; ppl: 1438.19; xent: 7.27; lr: 1.00000; 19698/19626 tok/s;     18 sec
[2020-09-22 12:53:01,289 INFO] Loading ParallelCorpus(../../toy-ende/src-train.txt, ../../toy-ende/tgt-train.txt, align=None)...
[2020-09-22 12:53:06,249 INFO] Step 200/  400; acc:  11.20; ppl: 1121.74; xe

<onmt.utils.statistics.Statistics at 0x7fb9c0241588>

To use the model, we need to load up the translation functions 

In [8]:
import onmt.translate

In [12]:
translator = onmt.translate.Translator(beam_size=10, fields=data.fields, model=model)
builder = onmt.translate.TranslationBuilder(data=valid_data, fields=data.fields)

valid_data.src_vocabs
for batch in valid_iter:
    trans_batch = translator.translate_batch(batch=batch, data=valid_data)
    translations = builder.from_batch(trans_batch)
    for trans in translations:
        print(trans.log(0))
    break

PRED SCORE: -4.0690

SENT 0: ('The', 'competitors', 'have', 'other', 'advantages', ',', 'too', '.')
PRED 0: .

PRED SCORE: -4.2736

SENT 0: ('The', 'company', '&apos;s', 'durability', 'goes', 'back', 'to', 'its', 'first', 'boss', ',', 'a', 'visionary', ',', 'Thomas', 'J.', 'Watson', 'Sr.')
PRED 0: .

PRED SCORE: -4.0144

SENT 0: ('&quot;', 'From', 'what', 'we', 'know', 'today', ',', 'you', 'have', 'to', 'ask', 'how', 'I', 'could', 'be', 'so', 'wrong', '.', '&quot;')
PRED 0: .

PRED SCORE: -4.1361

SENT 0: ('Boeing', 'Co', 'shares', 'rose', '1.5%', 'to', '$', '67.94', '.')
PRED 0: .

PRED SCORE: -4.1382

SENT 0: ('Some', 'did', 'not', 'believe', 'him', ',', 'they', 'said', 'that', 'he', 'got', 'dizzy', 'even', 'in', 'the', 'truck', ',', 'but', 'always', 'wanted', 'to', 'fulfill', 'his', 'dream', ',', 'that', 'of', 'becoming', 'a', 'pilot', '.')
PRED 0: .

PRED SCORE: -3.8881

SENT 0: ('In', 'your', 'opinion', ',', 'the', 'council', 'should', 'ensure', 'that', 'the', 'band', 'immediately

  return self.add_(other)
