# Assignment 7

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [1]:
pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |▎                               | 10kB 21.2MB/s eta 0:00:01[K     |▋                               | 20kB 3.0MB/s eta 0:00:01[K     |█                               | 30kB 4.4MB/s eta 0:00:01[K     |█▎                              | 40kB 2.9MB/s eta 0:00:01[K     |█▋                              | 51kB 3.6MB/s eta 0:00:01[K     |██                              | 61kB 4.2MB/s eta 0:00:01[K     |██▏                             | 71kB 4.9MB/s eta 0:00:01[K     |██▌                             | 81kB 5.5MB/s eta 0:00:01[K     |██▉                             | 92kB 6.1MB/s eta 0:00:01[K     |███▏                            | 102kB 4.7MB/s eta 0:00:01[K     |███▌                            | 112kB 4.7MB/s eta 0:00:01[K     |███▉                     

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd

from torchtext import datasets, data
from tqdm import tqdm
from tqdm import tqdm_notebook

import sentencepiece as spm


import math, copy, time

import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
os.chdir('gdrive/My Drive/Colab Notebooks')

In [5]:
# tokenize english 
with open('news-commentary-v13.ru-en.en') as f:
    with open('text.en', 'w') as out:
            out.write(f.read().lower())
        
spm.SentencePieceTrainer.Train('--input=text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

True

In [6]:
# tokenize russian

with open('news-commentary-v13.ru-en.ru') as f:
    with open('text.ru', 'w') as out:
            out.write(f.read().lower())
        
spm.SentencePieceTrainer.Train('--input=text.ru --model_prefix=bpe_ru --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

True

In [0]:
tok_ru = spm.SentencePieceProcessor()
tok_ru.load('bpe_ru.model')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

SRC = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_ru.encode_as_pieces(x),
    batch_first=True,
)

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

fields = (('src', SRC), ('tgt', TGT))

In [8]:
with open('text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

235159it [01:15, 3100.81it/s]


In [9]:
print('src: ' + " ".join(train.examples[50].src))
print('tgt: ' + " ".join(train.examples[50].tgt))

src: ▁высокомерие ▁приводит ▁к ▁возникновению ▁ « бе лых ▁пят ен » .
tgt: ▁hubris ▁creates ▁blind ▁spots .


In [10]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [0]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

# Model

In [0]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [0]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [0]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [0]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [0]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [0]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [0]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [0]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [0]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [0]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [0]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [0]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [0]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [0]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [0]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)

In [0]:
def make_model(src_vocab, tgt_vocab, N=1, 
               d_model=512, d_ff=512, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

# Batches and masking

In [0]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

# Iterator and criterion

In [0]:
class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.tgt, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion(nn.Module):
    def __init__(self, generator, pad_idx):
        super(MyCriterion, self).__init__()
        self.generator = generator
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        self.criterion.cuda()
        
    def forward(self, x, target):
        ntokens = (target != self.pad_idx).data.sum()
        x = self.generator(x)

        return self.criterion(x.reshape(-1, x.size(-1)), 
                              target.reshape(-1))  / ntokens

# Continue

In [31]:
torch.cuda.empty_cache()

batch_size = 128
num_epochs = 1
pad_idx = SRC.vocab.stoi["<pad>"]

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(batch_size, batch_size, batch_size), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=DEVICE,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

src_len = len(SRC.vocab)
tgt_len = len(TGT.vocab)
model = make_model(src_len, tgt_len, N=1)
model = model.to(DEVICE)

criterion = MyCriterion(model.generator, pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

# share weights
#<TODO>



In [0]:
def train_epoch(data_iter, train_len, model, criterion, optimizer):
    total_loss = 0
    data_iter = tqdm(data_iter, total=train_len)
    counter = 0

    for batch in data_iter:
        optimizer.zero_grad()

        pred = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = criterion(pred, batch.trg_y)
        loss.backward()
        optimizer.step()

        curr_loss = loss.detach().item()

        total_loss += curr_loss
        data_iter.set_postfix(loss = curr_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, valid_len, model, criterion):
    total_loss = 0
    data_iter = tqdm(data_iter, total=valid_len)
    counter = 0

    for batch in data_iter:
        pred = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = criterion(pred, batch.trg_y)
        
        curr_loss = loss.detach().item()
        
        total_loss += curr_loss
        data_iter.set_postfix(loss = curr_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

In [32]:
train_len = len(list(iter(train_iter)))
valid_len = len(list(iter(valid_iter)))

train_len, valid_len

(1647, 183)

In [40]:
for epoch in range(num_epochs):
    model.train()
    loss = train_epoch(train_iter, train_len, model, criterion, optimizer)
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, valid_len, model, criterion)
        scheduler.step(loss)
        print('valid', loss)

100%|██████████| 1647/1647 [8:32:11<00:00, 15.81s/it, loss=5.77]
  0%|          | 0/183 [00:00<?, ?it/s]

train 6.071489456862338


100%|██████████| 183/183 [15:47<00:00,  5.08s/it, loss=6.22]

valid 5.646349437901231





In [41]:
#from sklearn.externals import joblib
#joblib.dump(model, "gen_model.pkl")



['gen_model.pkl']

In [33]:
from sklearn.externals import joblib
model = joblib.load("gen_model.pkl")



In [0]:
def beam_search(model, src, src_mask, max_len=10, k=5):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(TGT.vocab.stoi["<s>"]).type_as(src.data)
    beam = [(ys, 0)]

    for i in range(max_len):
        candidates= []
        candidates_probs = []
        prev_prob = None

        for sent_pred, sent_prob in beam:
            if sent_pred[0][-1] == '</s>':
                candidates.append(send_pred)
                candidates_probs.append(sent_prob)
            else:
                pr = model.decode(memory, src_mask, sent_pred,
                                        subsequent_mask(sent_pred.size(1)).type_as(src.data))[0][i]
                top_k = torch.argsort(-pr)[:k].tolist()
                prev_prob = pr.tolist()
                for t in top_k:
                  candidates.append(torch.cat([sent_pred, torch.ones(1, 1).type_as(src.data).fill_(t)], dim=1))
                  candidates_probs.append(sent_prob + np.log(pr.tolist()[t])) 
         
        top_candidates = np.argsort(-np.array(candidates_probs))[:k]
        beam = []
        for cand in top_candidates:
          beam.append((candidates[cand], candidates_probs[cand]))

    return beam

In [60]:
model.eval()
with torch.no_grad():
    for i, batch in enumerate(valid_iter):
        src = batch.src[:1]
        src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
        beam = beam_search(model, src, src_key_padding_mask)
        
        seq = []
        for i in range(1, src.size(1)):
            sym = SRC.vocab.itos[src[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_ru.decode_pieces(seq)
        print("\nSource:", seq)
        
        print("Translation:")
        for pred, pred_proba in beam:                
            seq = []
            for i in range(1, pred.size(1)):
                sym = TGT.vocab.itos[pred[0, i]]
                if sym == "</s>": break
                seq.append(sym)
            seq = tok_en.decode_pieces(seq)
            print(f"pred {pred_proba:.2f}:", seq)
                
        seq = []
        for i in range(1, batch.trg.size(1)):
            sym = TGT.vocab.itos[batch.trg[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_en.decode_pieces(seq)
        print("Target:", seq)
        break


Source: рост
Translation:
pred -7.44: between right there technology recogni see millionzz power
pred -7.71: between right there technology recogni see millionz between right
pred -7.78: between right there technology recogni see millionz power technology
pred -7.85: between right there technology recogni see millionzzz
pred -7.95: between right there technology recogni see millionz power point
Target: inflation


In [67]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_iter):
        for sent in range(len(batch.src)):
            src = batch.src[sent:sent+1]
            src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
            beam = beam_search(model, src, src_key_padding_mask, max_len = 20)

            for pred, pred_proba in beam[:1]:                
                seq = []
                for i in range(1, pred.size(1)):
                    sym = TGT.vocab.itos[pred[0, i]]
                    if sym == "</s>":
                      break
                    seq.append(sym)
                seq = tok_en.decode_pieces(seq)

                new_trg = batch.trg[sent:sent+1].tolist()[0]
                refs = []
                for i in range(1, batch.trg.size(1)):
                     new_trg_2 =  TGT.vocab.itos[new_trg[i]]
                     if new_trg_2 == "</s>" or new_trg_2  == "<pad>":
                       break
                     refs.append(new_trg_2)
                refs = tok_en.decode_pieces(refs)

                hypotheses.append(seq.split())
                references.append(refs.split())


  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [01:14<08:38, 74.07s/it][A
 25%|██▌       | 2/8 [02:28<07:24, 74.15s/it][A
 38%|███▊      | 3/8 [03:41<06:08, 73.72s/it][A
 50%|█████     | 4/8 [04:54<04:54, 73.61s/it][A
 62%|██████▎   | 5/8 [06:07<03:40, 73.41s/it][A
 75%|███████▌  | 6/8 [07:23<02:28, 74.13s/it][A
 88%|████████▊ | 7/8 [08:37<01:14, 74.24s/it][A
100%|██████████| 8/8 [09:37<00:00, 69.98s/it][A
[A

In [0]:
from nltk import translate
from nltk.translate.bleu_score import corpus_bleu

In [69]:
corpus_bleu(references, hypotheses, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

5.726114449927518e-05