In [414]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt

from data import ParallelCorpus
from model import EmbedAlign
from util import Timer, AnnealKL, align, predict_alignments, eval_alignments

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x10974bd90>

In [135]:
e_train_path = 'hansards/train/train.e'
e_dev_path = 'hansards/dev/dev.e'
e_test_path = 'hansards/test/test.e'
f_train_path = 'hansards/train/train.f'
f_dev_path = 'hansards/dev/dev.f'
f_test_path = 'hansards/test/test.f'

l1_vocab_size = 10000
l2_vocab_size = 10000
max_lines = None
length_ordered = False
emb_dim = 50
hidden_dim = 50
z_dim = 50
batch_size = 32
num_epochs = 5
learning_rate = 1e-3
print_every = 10
save_every = 1000
write_every = 100
mean_sent = True

corpus = ParallelCorpus(e_train_path, e_dev_path, e_test_path,
                          f_train_path, f_dev_path, f_test_path,
                          l1_vocab_size, l2_vocab_size,
                          max_lines, ordered=length_ordered)

Loaded parallel corpus with 231164 lines.


In [455]:
model = torch.load('models/model.pt')

In [456]:
def make_sent(x, l1=True):
    if l1:
        sent = [corpus.l1.dictionary.i2w[i] for i in x.data.numpy()]
    else:
        sent = [corpus.l2.dictionary.i2w[i] for i in x.data.numpy()]
    return ' '.join(sent)

In [457]:
batch_size = 2
batches = corpus.batches(batch_size)
x, y = next(batches)
print(x.shape)
print(y.shape)

print("\nEnglish:")
for k in range(batch_size):
    e = make_sent(x[k])
    print(k, ':', e)
    
print('\nFrench:')
for k in range(batch_size):
    f = make_sent(y[k], l1=False)
    print(k, ':', f)

torch.Size([2, 40])
torch.Size([2, 44])

English:
0 : i should point out that mr. shortliffe explicitly recommended that the new city be designated as bilingual by the ontario legislature and not by ottawa ' s new municipal council . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
1 : moreover , mr. shortliffe noted that it will be up to the city of ottawa ' s senior council to determine the scope and nature of the services that will be available in both official languages of the country .

French:
0 : je dois noter quem. de a , de façon explicite , recommandé que la nouvelle cité soit désignée bilingue par la législature de le ontario et non pas par le nouveau conseil de la municipalité de ottawa . <pad> <pad> <pad> <pad> <pad> <pad>
1 : de plus , m. de a noté que il sera de le ressort de le haut conseil de la cité de ottawa de déterminer la portée et la nature de les services qui seront disponibles dans les deux langues officielles de le pays .


In [458]:
mu, sigma = model.encoder(x)
z = model.sample(mu, sigma)

px = model.f(z)
py = model.g(z)          

In [459]:
def make_alignment(x, y, py, e, f):
    a = align(x, y, py)
    e = e.split()
    f = f.split()
    n = len(f)
    pairs = list(zip(range(n), a[0,:n]))
    return [(f[i], e[j]) for i, j in pairs]

In [460]:
pairs = make_alignment(x, y, py, e, f)
pairs

[('de', 'moreover'),
 ('plus', 'moreover'),
 (',', ','),
 ('m.', 'be'),
 ('de', 'the'),
 ('a', 'to'),
 ('noté', 'senior'),
 ('que', 'the'),
 ('il', "'"),
 ('sera', 'will'),
 ('de', 'senior'),
 ('le', 'be'),
 ('ressort', 'up'),
 ('de', 'nature'),
 ('le', 'the'),
 ('haut', 'nature'),
 ('conseil', 'of'),
 ('de', 'ottawa'),
 ('la', 's'),
 ('cité', 'senior'),
 ('de', 'nature'),
 ('ottawa', 'determine'),
 ('de', 'the'),
 ('déterminer', 'to'),
 ('la', 'to'),
 ('portée', 'the'),
 ('et', 'scope'),
 ('la', 'scope'),
 ('nature', 'senior'),
 ('de', 'to'),
 ('les', 'services'),
 ('services', 'will'),
 ('qui', 'the'),
 ('seront', 'nature'),
 ('disponibles', 'city'),
 ('dans', 'the'),
 ('les', 'nature'),
 ('deux', 'be'),
 ('langues', 'it'),
 ('officielles', 'it'),
 ('de', 'it'),
 ('le', 'it'),
 ('pays', 'it'),
 ('.', 'it')]

In [461]:
for i in range(batch_size):
    s = px[i]
    _, pred = s.max(dim=-1)
    print_sent(pred)
    print_sent(x[i])
    print()

i should point out that mr. shortliffe explicitly recommended that the new city be designated as bilingual by the ontario legislature and not by ottawa ' s new municipal council . talking no. make advisable has federal thousands has monitoring 

i should point out that mr. shortliffe explicitly recommended that the new city be designated as bilingual by the ontario legislature and not by ottawa ' s new municipal council . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 


moreover , mr. shortliffe noted that it will be up to the city of ottawa ' s senior council to determine the scope and nature of the services that will be available in both official languages of the country . 

moreover , mr. shortliffe noted that it will be up to the city of ottawa ' s senior council to determine the scope and nature of the services that will be available in both official languages of the country . 




In [462]:
for i in range(batch_size):
    s = py[i]
    _, pred = s.max(dim=-1)
    print_sent(pred, l1=False)
    print_sent(y[i], l1=False)
    print()

je devrait règlement de que . glen explicitement recommandé que de nouveau ville de conséquences comme bilingue par de ontario législative et ne par ottawa nisga' de nouveau municipalités conseil . il . les civiles mike yvon bob mme vanclief 

je dois noter quem. de a , de façon explicite , recommandé que la nouvelle cité soit désignée bilingue par la législature de le ontario et non pas par le nouveau conseil de la municipalité de ottawa . <pad> <pad> <pad> <pad> <pad> <pad> 


plus , . glen de que il va être de à de la de ottawa le de de conseil à déterminer le portée et la de le services que de de les dans deux officielles langues de de pays . 

de plus , m. de a noté que il sera de le ressort de le haut conseil de la cité de ottawa de déterminer la portée et la nature de les services qui seront disponibles dans les deux langues officielles de le pays . 




In [463]:
zero, one = Variable(torch.zeros(z_dim)), Variable(torch.ones(z_dim))
normal = torch.distributions.Normal(zero, one)
z = normal.sample()

px = model.f(z)
py = model.g(z)

px = px.data.numpy()
py = py.data.numpy()

ix = np.argsort(px)[::-1]
p = np.sort(px)[::-1]
for i in range(10):
    print('{}: {} ({:.3f})'.format(i, corpus.l1.dictionary.i2w[ix[i]], p[i]))

0: solutions (0.837)
1: a (0.039)
2: has (0.028)
3: issue (0.016)
4: fact (0.014)
5: make (0.011)
6: did (0.007)
7: those (0.005)
8: unanimity (0.005)
9: 1997 (0.004)


In [464]:
def homotopy(start, end, steps=10, k=10):
    lmbdas = np.linspace(0, 1, steps)
    for k, lmbda in enumerate(lmbdas):
        lmbda = float(lmbda)
        z = lmbda*end + (1-lmbda)*start
        px = model.f(z)
        py = model.g(z)

        px = px.data.numpy()
        py = py.data.numpy()

        ix = np.argsort(px)[::-1]
        px = np.sort(px)[::-1]
        py = np.sort(py)[::-1]
        print('Step {}/{}'.format(k, steps))
        for i in range(10):
            print('{}: {} ({:.3f}) || {} ({:.3f})'.format(i, 
                    corpus.l1.dictionary.i2w[ix[i]], px[i],
                    corpus.l2.dictionary.i2w[ix[i]], py[i]))
        print()

In [465]:
homotopy(zero, 100*one)

Step 0/10
0: parent (0.000) || maritime (0.001)
1: desks (0.000) || réalisent (0.001)
2: sixteen (0.000) || cohérent (0.000)
3: armenian (0.000) || donateur (0.000)
4: morrison (0.000) || bennett (0.000)
5: 449 (0.000) || élémentaires (0.000)
6: termed (0.000) || perquisitions (0.000)
7: dump (0.000) || justifiées (0.000)
8: museums (0.000) || malheur (0.000)
9: 458 (0.000) || drapeaux (0.000)

Step 1/10
0: nault (0.541) || rouge (0.640)
1: 47 (0.396) || touristique (0.162)
2: going (0.042) || ici (0.068)
3: caroline (0.021) || formulé (0.066)
4: disagrees (0.000) || distances (0.037)
5: indicated (0.000) || définitif (0.024)
6: association (0.000) || seraient (0.001)
7: stating (0.000) || désaccord (0.000)
8: disputes (0.000) || regrettable (0.000)
9: arguments (0.000) || divorce (0.000)

Step 2/10
0: nault (0.517) || rouge (0.885)
1: 47 (0.471) || touristique (0.094)
2: going (0.011) || ici (0.013)
3: caroline (0.001) || formulé (0.004)
4: disagrees (0.000) || distances (0.004)
5: in