### What in this notebook

- Running the code in `demo.ipynb` with the vulnerabilities dataset

### Loading the library

In [248]:
import onmt
from onmt.inputters.inputter import _load_vocab, _build_fields_vocab, get_fields, IterOnDevice
from onmt.inputters.corpus import ParallelCorpus
from onmt.inputters.dynamic_iterator import DynamicDatasetIter
from onmt.translate import GNMTGlobalScorer, Translator, TranslationBuilder
from onmt.utils.misc import set_random_seed

In [249]:
import yaml
import torch
import torch.nn as nn
from argparse import Namespace
from collections import defaultdict, Counter

In [250]:
# enable logging
from onmt.utils.logging import init_logger, logger
import os

# Defining log path to keep track of the experiment
LOG_PATH = 'log/log_100k_steps'
FULL_LOG_PATH = os.getcwd() + '/' + LOG_PATH
if os.path.exists(FULL_LOG_PATH):
    os.remove(FULL_LOG_PATH)
    
init_logger(log_file=LOG_PATH)

<RootLogger root (INFO)>

In [251]:
init_logger?

[0;31mSignature:[0m [0minit_logger[0m[0;34m([0m[0mlog_file[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mlog_file_level[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mrotate[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mlog_level[0m[0;34m=[0m[0;36m20[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/miniconda3/envs/vrepair/lib/python3.10/site-packages/onmt/utils/logging.py
[0;31mType:[0m      function


In [252]:
BATCH_SIZE=4 # data iterater definition
VALID_BATCH_SIZE=1 # data iterator definition
SRC_VOCAB_SIZE=2000 # fields definition
TGT_VOCAB_SIZE=2000 # fields definition
SRC_SEQ_LENGTH=1000 # currently not needed as we train a dummy on preprocessed data
TGT_SEQ_LENGTH=100 # currently not needed as we train a dummy on preprocessed data
LEARNING_RATE=0.0005 # loss definition
LABEL_SMOOTHING=0.1 # loss definition
ADAM_DECAY=0.9 # loss definition
RNN_HIDDEN=256 # model definition
EMBEDDING=256 # model definition
WORD_VEC=256 # model definition
DROP_OUT=0.1 # model definition
TRAIN_STEPS=100000
VALID_STEPS=20000

### Build fields

Use the original data to bulid processing field

In [253]:
src_vocab_path = "vul_data/data.vocab.src"
tgt_vocab_path = "vul_data/data.vocab.tgt"

In [254]:
# initialize the frequency counter
counters = defaultdict(Counter)
# load source vocab
_src_vocab, _src_vocab_size = _load_vocab(
    src_vocab_path,
    'src',
    counters)
# load target vocab
_tgt_vocab, _tgt_vocab_size = _load_vocab(
    tgt_vocab_path,
    'tgt',
    counters)

[2022-06-22 02:59:19,965 INFO] Loading src vocabulary from vul_data/data.vocab.src
[2022-06-22 02:59:20,020 INFO] Loaded src vocab has 36352 tokens.
[2022-06-22 02:59:20,031 INFO] Loading tgt vocabulary from vul_data/data.vocab.tgt
[2022-06-22 02:59:20,038 INFO] Loaded tgt vocab has 5924 tokens.


**RQ 1**: Compairing the generated dictionary with the one created from the original source code of VRepair

In [255]:
# initialize fields
src_nfeats, tgt_nfeats = 0, 0 # do not support word features for now
fields = get_fields(
    'text', src_nfeats, tgt_nfeats)

Sample output of VRepair log
> [2022-05-17 20:29:10,536 INFO] Loading src vocabulary from /home/lgm/VRepair2.0/param_sweep_tgt/10_parameter_sweep/data.vocab.src \
 [2022-05-17 20:29:10,579 INFO] Loaded src vocab has 36352 tokens. \
[2022-05-17 20:29:10,588 INFO] Loading tgt vocabulary from /home/lgm/VRepair2.0/param_sweep_tgt/10_parameter_sweep/data.vocab.tgt \
[2022-05-17 20:29:10,594 INFO] Loaded tgt vocab has 5924 tokens. \
[2022-05-17 20:29:10,596 INFO] Building fields with vocab in counters... \
[2022-05-17 20:29:10,599 INFO]  * tgt vocab size: 5004. \
[2022-05-17 20:29:10,631 INFO]  * src vocab size: 5002. \
[2022-05-17 20:29:10,632 INFO]  * src vocab size = 5002 \
[2022-05-17 20:29:10,632 INFO]  * tgt vocab size = 5004

**ANS RQ1:** The vocab generated is the same with previous experiments

In [256]:
share_vocab = False
vocab_size_multiple = 1
src_vocab_size = SRC_VOCAB_SIZE
tgt_vocab_size = TGT_VOCAB_SIZE
src_words_min_frequency = 1
tgt_words_min_frequency = 1
vocab_fields = _build_fields_vocab(
    fields, counters, 'text', share_vocab,
    vocab_size_multiple,
    src_vocab_size, src_words_min_frequency,
    tgt_vocab_size, tgt_words_min_frequency)

[2022-06-22 02:59:22,450 INFO]  * tgt vocab size: 2004.
[2022-06-22 02:59:22,470 INFO]  * src vocab size: 2002.


In [257]:
_build_fields_vocab?

[0;31mSignature:[0m
[0m_build_fields_vocab[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfields[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcounters[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshare_vocab[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvocab_size_multiple[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msrc_vocab_size[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msrc_words_min_frequency[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtgt_vocab_size[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtgt_words_min_frequency[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msrc_specials[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtgt_specials[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/miniconda3/envs/vrepair/lib/python3.10/site-packages/onmt/inputters/inputter.py
[0;31mType:[

### Model and optimizer creation

From this point on, the field vocab is used instead of the origial vocab 

In [258]:
src_text_field = vocab_fields["src"].base_field
src_vocab = src_text_field.vocab 
src_padding = src_vocab.stoi[src_text_field.pad_token]

tgt_text_field = vocab_fields['tgt'].base_field
tgt_vocab = tgt_text_field.vocab
tgt_padding = tgt_vocab.stoi[tgt_text_field.pad_token]

In [259]:
emb_size = EMBEDDING
rnn_size = RNN_HIDDEN
# Specify the core model.

encoder_embeddings = onmt.modules.Embeddings(emb_size, len(src_vocab),
                                             word_padding_idx=src_padding)

encoder = onmt.encoders.RNNEncoder(hidden_size=rnn_size, num_layers=1,
                                   rnn_type="LSTM", bidirectional=True,
                                   embeddings=encoder_embeddings)

decoder_embeddings = onmt.modules.Embeddings(emb_size, len(tgt_vocab),
                                             word_padding_idx=tgt_padding)
decoder = onmt.decoders.decoder.InputFeedRNNDecoder(
    hidden_size=rnn_size, num_layers=1, bidirectional_encoder=True, 
    rnn_type="LSTM", embeddings=decoder_embeddings)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = onmt.models.model.NMTModel(encoder, decoder)
model.to(device)

# Specify the tgt word generator and loss computation module
model.generator = nn.Sequential(
    nn.Linear(rnn_size, len(tgt_vocab)),
    nn.LogSoftmax(dim=-1)).to(device)

loss = onmt.utils.loss.NMTLossCompute(
    criterion=onmt.utils.loss.LabelSmoothingLoss(ignore_index=tgt_padding, label_smoothing=LABEL_SMOOTHING, tgt_vocab_size=len(tgt_vocab)),
    generator=model.generator)
loss.to(device)

NMTLossCompute(
  (criterion): LabelSmoothingLoss()
  (generator): Sequential(
    (0): Linear(in_features=256, out_features=2004, bias=True)
    (1): LogSoftmax(dim=-1)
  )
)

```
from onmt.opts import dynamic_prepare_opts
from onmt.utils.parse import ArgumentParser
from onmt.constants import ModelTask
parser = ArgumentParser(description='build_loss_compute')

base_args = (["-copy_attn", "True" , "-label_smoothing", str(LABEL_SMOOTHING), "-model_task", ModelTask.SEQ2SEQ])
opts, unknown = parser.parse_known_args(base_args)
loss = onmt.utils.loss.build_loss_compute(model, tgt_field=tgt_text_field, opt=opts, train=True)
valid_loss = onmt.utils.loss.build_loss_compute(model, tgt_field=tgt_text_field, opt=opts, train=False)
```

In [260]:
lr = LEARNING_RATE
torch_optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=ADAM_DECAY)
optim = onmt.utils.optimizers.Optimizer(
    torch_optimizer, learning_rate=lr)

In [261]:
print(model)

NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(2002, 256, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(256, 128, bidirectional=True)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(2004, 256, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.0, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(512, 256)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=256, out_features=256, bias=False)
      (linear_out): Linear(in_features=512, out_features=256, bias=False)
    )
  )
  (generator): Sequential(
    (0): Linear(in_features=256, out_features=2004, bias=True)
    (1): LogSoftmax(dim=-1)
  )
)


### Create data iterator

In [262]:
src_train = "vul_data/random_fine_tune_train.src.txt"
tgt_train = "vul_data/random_fine_tune_train.tgt.txt"
src_val = "vul_data/random_fine_tune_valid.src.txt"
tgt_val = "vul_data/random_fine_tune_valid.tgt.txt"

# build the ParallelCorpus
corpus = ParallelCorpus("corpus", src_train, tgt_train)
valid = ParallelCorpus("valid", src_val, tgt_val)

In [263]:
# build the training iterator
train_iter = DynamicDatasetIter(
    corpora={"corpus": corpus},
    corpora_info={"corpus": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=True,
    batch_type="sents",
    batch_size=BATCH_SIZE,
    batch_size_multiple=1,
    data_type="text")

In [264]:
DynamicDatasetIter?

[0;31mInit signature:[0m
[0mDynamicDatasetIter[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mcorpora[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorpora_info[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransforms[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfields[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mis_train[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size_multiple[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_type[0m[0;34m=[0m[0;34m'text'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbucket_size[0m[0;34m=[0m[0;36m2048[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpool_factor[0m[0;34m=[0m[0;36m8192[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstride[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moffset[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31

In [265]:
# make sure the iteration happens on GPU 0 (-1 for CPU, N for GPU N)
train_iter = iter(IterOnDevice(train_iter, 0))

In [266]:
# build the validation iterator
valid_iter = DynamicDatasetIter(
    corpora={"valid": valid},
    corpora_info={"valid": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=False,
    batch_type="sents",
    batch_size=VALID_BATCH_SIZE,
    batch_size_multiple=1,
    data_type="text")

In [267]:
valid_iter = IterOnDevice(valid_iter, 0)

### Training

In [None]:
%%capture output
report_manager = onmt.utils.ReportMgr(
    report_every=50, start_time=None, tensorboard_writer=None)

trainer = onmt.Trainer(model=model,
                       train_loss=loss,
                       valid_loss=loss,
                       optim=optim,
                       report_manager=report_manager,
                       dropout=DROP_OUT)

trainer.train(train_iter=train_iter,
              train_steps=TRAIN_STEPS,
              valid_iter=valid_iter,
              valid_steps=VALID_STEPS)

[2022-06-22 02:59:37,722 INFO] Start training loop and validate every 20000 steps...
[2022-06-22 02:59:37,723 INFO] corpus's transforms: TransformPipe()
[2022-06-22 02:59:37,723 INFO] Weighted corpora loaded so far:
			* corpus: 1
[2022-06-22 02:59:40,291 INFO] Step 50/100000; acc:  11.37; ppl: 210.14; xent: 5.35; lr: 0.00050; 21252/2398 tok/s;      3 sec
[2022-06-22 02:59:42,465 INFO] Step 100/100000; acc:  11.70; ppl: 52.43; xent: 3.96; lr: 0.00050; 28169/2575 tok/s;      5 sec
[2022-06-22 02:59:44,599 INFO] Step 150/100000; acc:  11.63; ppl: 49.46; xent: 3.90; lr: 0.00050; 28154/2604 tok/s;      7 sec
[2022-06-22 02:59:46,659 INFO] Step 200/100000; acc:  11.84; ppl: 46.34; xent: 3.84; lr: 0.00050; 26642/2641 tok/s;      9 sec
[2022-06-22 02:59:48,841 INFO] Step 250/100000; acc:  11.62; ppl: 47.70; xent: 3.86; lr: 0.00050; 27344/2624 tok/s;     11 sec
[2022-06-22 02:59:50,944 INFO] Step 300/100000; acc:  11.14; ppl: 48.47; xent: 3.88; lr: 0.00050; 25877/2667 tok/s;     13 sec
[2022-0