# Educativo

In [13]:
!python training.py --name_file=Educativo/educativo  --dir_files=ParallelCorpora --dir_results=results_educativo_5/bpe_5000 --evaluate=True --bpe=False

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4768it [00:03, 1310.11it/s]
607it [00:00, 1394.52it/s]
607it [00:00, 1353.53it/s]
100%|████████████████████████████████████| 5375/5375 [00:00<00:00, 81738.23it/s]
^C
Traceback (most recent call last):
  File "training.py", line 150, in <module>
    fire.Fire(main)
  File "/home/krivas/anaconda3/lib/python3.7/site-packages/fire/core.py", line 127, in Fire
    component_trace = _Fire(component, args, context, name)
  File "/home/krivas/anaconda3/lib/python3.7/site-packages/fire/core.py", line 366, in _Fire
    component, remaining_args)
  File "/home/krivas/anaconda3/lib/python3.7/site-packages/fire/core.py", line 542, in _CallCallable
    result = fn(*varargs, **kwargs)
  File "training.py", line 141, in main
    hypothesis = (' '.join(predictor.predict_instance(instance)['predicted_tokens'])).replace('@@UNKNOWN@@', '').replace('@@ ', '').split()
  File "/home/krivas/anaconda3/lib/python3.7/site-pa

In [20]:
# coding: utf-8

import itertools
from typing import Iterator, List, Dict
import fire
from allennlp.common.util import prepare_environment
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.fields import TextField, IndexField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data import Instance

from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.activations import Activation
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.models import DecomposableAttention
from allennlp.modules.attention import LinearAttention, BilinearAttention, DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.training.metrics.bleu import BLEU
from allennlp.common.params import Params

prepare_environment(Params({}))

import random
import torch
import torch.optim as optim
import nltk
import pandas as pd
import numpy as np
import os 
from pathlib import Path
import warnings

# Width of the source-side token embedding; main() also uses it as the GRU encoder input size.
# NOTE(review): the EN_/ZH_ prefixes look inherited from an English-Chinese example — confirm
# they still describe the language pair of the corpora used here.
EN_EMBEDDING_DIM = 128
# Width of the target-side embedding; main() passes it to SimpleSeq2Seq as target_embedding_dim.
ZH_EMBEDDING_DIM = 128
# NOTE(review): not referenced anywhere in this cell; main() receives the GPU index via cuda_id.
CUDA_DEVICE = 0

def main(name_file='Educativo/educativo', dir_files='ParallelCorpora', dir_results='results_educativo_4/bpe_5000', cuda_id=0, hidden_dim=1024, evaluate=True, bpe=False):
    """Optionally train, then evaluate, a GRU encoder / attention seq2seq model.

    The tab-separated parallel files ``<dir_files>/<name_file>_{train,val,test}.tsv``
    (or ``.bpe.tsv`` when ``bpe`` is true) are read, a ``SimpleSeq2Seq`` model is
    built, and the weights stored in ``<dir_results>/best.th`` are scored on the
    test split with the mean NLTK sentence-level BLEU. Each test source,
    hypothesis and reference is printed, followed by the final score.

    Args:
        name_file: Path prefix of the corpus files, relative to ``dir_files``.
        dir_files: Root directory of the parallel corpora.
        dir_results: Serialization directory. It is created if missing, passed to
            the trainer, and expected to contain ``best.th`` at evaluation time.
            The vocabulary is stored in its ``vocabulary`` sub-directory.
        cuda_id: GPU index for the model and the trainer; a negative value keeps
            everything on the CPU.
        hidden_dim: Hidden size of each direction of the 2-layer bidirectional GRU.
        evaluate: When true, training is skipped and only the existing checkpoint
            is scored. When false, the model is trained for 40 epochs first.
        bpe: When true, read the ``*.bpe.tsv`` variants of the corpus files.

    Raises:
        FileNotFoundError: If ``<dir_results>/best.th`` does not exist.
        RuntimeError: If the checkpoint shapes do not match the model (for example
            a checkpoint trained on another corpus without a saved vocabulary).
    """
    os.makedirs(dir_results, exist_ok=True)

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        delimiter='\t',
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

    # The three splits only differ by their suffix, so build the paths in one place.
    extension = '.bpe.tsv' if bpe else '.tsv'

    def split_path(split):
        return os.path.join(dir_files, name_file + '_' + split + extension)

    train_dataset = reader.read(split_path('train'))
    validation_dataset = reader.read(split_path('val'))
    # Read once and reuse for evaluation (it used to be read a second time below).
    test_dataset = reader.read(split_path('test'))

    # The embedding and projection shapes depend on the vocabulary, so an
    # evaluation-only run must reuse the vocabulary the checkpoint was trained
    # with; rebuilding it from the data can yield different sizes and make
    # load_state_dict fail with a size mismatch.
    vocab_dir = os.path.join(dir_results, 'vocabulary')
    if evaluate and os.path.isdir(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                          min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    print('size of vocab:', vocab)

    encoder = PytorchSeq2SeqWrapper(torch.nn.GRU(EN_EMBEDDING_DIM, hidden_dim, dropout=0.25, num_layers=2, bidirectional=True, batch_first=True))

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    # LinearAttention / BilinearAttention are imported above and can be swapped in here.
    attention = DotProductAttention()

    max_decoding_steps = 100   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=1,
                          use_bleu=True)

    # Honour cuda_id instead of always moving the model to the default GPU.
    if cuda_id >= 0:
        device = torch.device('cuda', cuda_id)
        model = model.cuda(cuda_id)
    else:
        device = torch.device('cpu')

    if not evaluate:
        # Persist the vocabulary next to the checkpoints so that later
        # evaluation-only runs rebuild a model with identical shapes.
        vocab.save_to_files(vocab_dir)

        optimizer = optim.Adam(model.parameters())
        iterator = BucketIterator(batch_size=100, sorting_keys=[("source_tokens", "num_tokens")])
        iterator.index_with(vocab)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_dataset,
                          validation_dataset=validation_dataset,
                          num_epochs=40,
                          serialization_dir=dir_results,
                          cuda_device=cuda_id)
        trainer.train()

    # map_location lets a checkpoint saved on another device be loaded here.
    with open(os.path.join(dir_results, "best.th"), 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=device))
    model.eval()
    predictor = SimpleSeq2SeqPredictor(model, reader)

    total_bleu = 0
    len_test = 0
    for instance in test_dataset:
        # [1:-1] drops the first and last token of each field (the start/end
        # symbols added by the reader) before printing and scoring.
        source_tokens = instance['source_tokens'].tokens[1:-1]
        predicted_tokens = predictor.predict_instance(instance)['predicted_tokens']

        # Drop unknown-token placeholders and undo the '@@ ' BPE continuation markers.
        hypothesis = (' '.join(predicted_tokens)).replace('@@UNKNOWN@@', '').replace('@@ ', '').split()
        reference = (' '.join([token.text for token in instance['target_tokens'].tokens[1:-1]]).replace('@@ ', '')).split()
        print(source_tokens)
        print(hypothesis)
        print(reference)
        print()

        # Pairs with an empty source or reference are not scored.
        if len(reference) and len(source_tokens):
            len_test += 1
            total_bleu += nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)

    print('=================================')
    # Guard against a test split with no scorable pair instead of dividing by zero.
    print('BLEU test:', total_bleu / len_test if len_test else float('nan'))
                                                                            

In [21]:
# NOTE(review): name_file is left at its default ('Educativo/educativo') while dir_results
# points at a directory named after a Flashcards run. The vocabulary rebuilt from the default
# data presumably differs from the one the checkpoint was trained with, which would explain
# the size-mismatch RuntimeError recorded below — confirm and pass the matching name_file.
main(dir_results='results_flashcards_6/bpe_5000')

4768it [00:03, 1250.56it/s]
607it [00:00, 1412.75it/s]
607it [00:00, 1376.78it/s]
100%|██████████| 5375/5375 [00:00<00:00, 81650.01it/s]


size of vocab: Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 1671 
 	Namespace: target_tokens, Size: 1417 



607it [00:00, 906.28it/s] 


RuntimeError: Error(s) in loading state_dict for SimpleSeq2Seq:
	size mismatch for _source_embedder.token_embedder_tokens.weight: copying a param with shape torch.Size([866, 128]) from checkpoint, the shape in current model is torch.Size([1671, 128]).
	size mismatch for _target_embedder.weight: copying a param with shape torch.Size([770, 128]) from checkpoint, the shape in current model is torch.Size([1417, 128]).
	size mismatch for _output_projection_layer.weight: copying a param with shape torch.Size([770, 2048]) from checkpoint, the shape in current model is torch.Size([1417, 2048]).
	size mismatch for _output_projection_layer.bias: copying a param with shape torch.Size([770]) from checkpoint, the shape in current model is torch.Size([1417]).

In [1]:
!python training.py --name_file=Educativo/all/educativo_5000  --dir_files=ParallelCorpora --dir_results=results_educativo_2/bpe_5000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4768it [00:04, 1159.86it/s]
607it [00:00, 1251.29it/s]
607it [00:00, 1185.72it/s]
100%|████████████████████████████████████| 5375/5375 [00:00<00:00, 72537.09it/s]
BLEU test: 0.03533705687076492


In [2]:
# NOTE(review): `{folder}` is IPython variable-interpolation syntax, but no `folder` variable is
# defined in the visible notebook, and the sibling cells write to `results_educativo/...`.
# Confirm which results directory this run was meant to use.
!python training.py --name_file=Educativo/all/educativo_10000  --dir_files=ParallelCorpora --dir_results=results_educativo_{folder}/bpe_10000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4768it [00:03, 1246.85it/s]
607it [00:00, 1346.29it/s]
607it [00:00, 1260.95it/s]
100%|████████████████████████████████████| 5375/5375 [00:00<00:00, 74385.74it/s]
BLEU test: 0.029993427458531466


In [3]:
!python training.py --name_file=Educativo/all/educativo_15000  --dir_files=ParallelCorpora --dir_results=results_educativo/bpe_15000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4768it [00:03, 1251.04it/s]
607it [00:00, 1350.28it/s]
607it [00:00, 1264.09it/s]
100%|████████████████████████████████████| 5375/5375 [00:00<00:00, 75290.25it/s]
BLEU test: 0.005327791414768956


In [4]:
!python training.py --name_file=Educativo/all/educativo_20000  --dir_files=ParallelCorpora --dir_results=results_educativo/bpe_20000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4768it [00:03, 1266.83it/s]
607it [00:00, 1370.95it/s]
607it [00:00, 1280.84it/s]
100%|████████████████████████████████████| 5375/5375 [00:00<00:00, 76347.88it/s]
BLEU test: 0.03230557350632636


# Religioso

In [5]:
!python training.py --name_file=Religioso/all/religioso_5000  --dir_files=ParallelCorpora --dir_results=results_religioso/bpe_5000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
10021it [00:15, 653.98it/s]
1263it [00:01, 680.90it/s]
1263it [00:02, 611.34it/s]
100%|██████████████████████████████████| 11284/11284 [00:00<00:00, 42879.48it/s]
BLEU test: 0.006334807962230557


In [6]:
!python training.py --name_file=Religioso/all/religioso_10000  --dir_files=ParallelCorpora --dir_results=results_religioso/bpe_10000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
10021it [00:14, 713.46it/s]
1263it [00:01, 744.85it/s]
1263it [00:01, 725.51it/s]
100%|██████████████████████████████████| 11284/11284 [00:00<00:00, 44782.97it/s]
BLEU test: 0.008752968583175471


In [7]:
!python training.py --name_file=Religioso/all/religioso_15000  --dir_files=ParallelCorpora --dir_results=results_religioso/bpe_15000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
10021it [00:13, 756.17it/s]
1263it [00:01, 728.26it/s]
1263it [00:01, 768.26it/s]
100%|██████████████████████████████████| 11284/11284 [00:00<00:00, 47345.48it/s]
BLEU test: 0.007384567420679714


In [8]:
!python training.py --name_file=Religioso/all/religioso_20000  --dir_files=ParallelCorpora --dir_results=results_religioso/bpe_20000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
10021it [00:12, 781.68it/s]
1263it [00:01, 750.26it/s]
1263it [00:01, 789.79it/s]
100%|██████████████████████████████████| 11284/11284 [00:00<00:00, 47823.16it/s]
BLEU test: 0.0023481668446239013


# Flashcards

In [9]:
!python training.py --name_file=Flashcards/all/flashcards_5000  --dir_files=ParallelCorpora --dir_results=results_flashcards/bpe_5000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4876it [00:02, 2234.44it/s]
774it [00:00, 2413.06it/s]
2090it [00:00, 2170.95it/s]
100%|███████████████████████████████████| 5650/5650 [00:00<00:00, 127262.57it/s]
BLEU test: 0.014600467404777016


In [10]:
!python training.py --name_file=Flashcards/all/flashcards_10000  --dir_files=ParallelCorpora --dir_results=results_flashcards/bpe_10000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4876it [00:02, 2368.69it/s]
774it [00:00, 2690.60it/s]
2090it [00:00, 2298.32it/s]
100%|███████████████████████████████████| 5650/5650 [00:00<00:00, 128957.19it/s]
BLEU test: 0.012546480745815363


In [11]:
!python training.py --name_file=Flashcards/all/flashcards_15000  --dir_files=ParallelCorpora --dir_results=results_flashcards/bpe_15000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4876it [00:02, 2411.90it/s]
774it [00:00, 2733.44it/s]
2090it [00:00, 2326.05it/s]
100%|███████████████████████████████████| 5650/5650 [00:00<00:00, 133234.11it/s]
BLEU test: 1.0945440478630852e-233


In [12]:
!python training.py --name_file=Flashcards/all/flashcards_20000  --dir_files=ParallelCorpora --dir_results=results_flashcards/bpe_20000

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
4876it [00:01, 2633.18it/s]
774it [00:00, 2716.21it/s]
2090it [00:00, 2313.81it/s]
100%|███████████████████████████████████| 5650/5650 [00:00<00:00, 127421.32it/s]
BLEU test: 0.012360495449667473


| Corpus     | 5000   | 10000  | 15000   | 20000  |
|------------|--------|--------|---------|--------|
| Educativo  | 0.0353 | 0.0299 | 0.0053  | 0.0323 |
| Religioso  | 0.0063 | 0.0087 | 0.0073  | 0.0023 |
| Flashcards | 0.0146 | 0.0125 | 0.0000  | 0.0123 |