In [1]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions.kl as kl
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

In [2]:
# Hyperparameters and settings for the Embed-Align experiment.
min_count = 5            # not referenced by any of the cells visible in this notebook
embedding_dim = 100      # embedding / latent size handed to EmbedAlign
det_embedding_dim = 128  # not referenced by any of the cells visible in this notebook
batch_size = 100         # forwarded to create_batches_EA
epochs = 10              # number of training epochs (assigned again, to the same value, before training)
learning_rate = 0.01     # learning rate given to the Adam optimizer
window_size = 5          # not referenced by any of the cells visible in this notebook

In [3]:
# Load the English and French corpora (a 10,000-sentence subset each).
# Judging by the names and the later cell output, every sentence is presumably a
# list of integer token ids - verify against the script that produced the pickles.
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open('corpus2id_hansard_en_subs.pickle', 'rb') as f:
    corpus2id_en = pickle.load(f)
    
with open('corpus2id_hansard_fr_subs.pickle', 'rb') as f:
    corpus2id_fr = pickle.load(f)

In [4]:
# Sanity check: print the number of sentences on each side of the corpus.
print(len(corpus2id_en), len(corpus2id_fr))

10000 10000


In [5]:
# Load the per-language unigram tables; their lengths are used as vocabulary sizes.
# Presumably each table maps a word id to its unigram probability (inferred from the
# file names only) - TODO confirm.
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open('unigram_probs_hansard_en_subs.p', 'rb') as f:
    unigram_en = pickle.load(f)
    
with open('unigram_probs_hansard_fr_subs.p', 'rb') as f:
    unigram_fr = pickle.load(f)
    
# One vocabulary entry per item in the unigram table.
vocabulary_size_en = len(unigram_en)
vocabulary_size_fr = len(unigram_fr)
print(vocabulary_size_en, vocabulary_size_fr)

6330 7199


In [6]:
# Track the longest sentence seen on each side of the parallel corpus.
max_s_len_en = 0
max_s_len_fr = 0

# Sentences are visited by English index, so the French corpus is read in
# lockstep with the English one.
for s in range(len(corpus2id_en)):
    max_s_len_en = max(max_s_len_en, len(corpus2id_en[s]))
    max_s_len_fr = max(max_s_len_fr, len(corpus2id_fr[s]))

print('Maximum sentence length EN:', max_s_len_en)
print('Maximum sentence length FR:', max_s_len_fr)


Maximum sentence length EN: 185
Maximum sentence length FR: 275


In [7]:
def create_batches_EA(corpus_en, vocabulary_size_en, corpus_fr, vocabulary_size_fr, max_sentence_length_en,max_sentence_length_fr, batch_size):
    """Return the parallel corpus as shuffled, still-aligned sentence pairs.

    Every entry of the two returned lists is ONE sentence (a sequence of token
    ids), and entry ``k`` of the English list is the translation partner of
    entry ``k`` of the French list. Each sentence pair of the corpus appears
    exactly once.

    The previous implementation appended only the last sentence of each
    ``batch_size`` slice and kept iterating long after the data was exhausted,
    so most returned entries were copies of a single stale sentence. The shape
    of the result (one sentence per entry, ``len(corpus_en)`` entries) is kept
    so existing callers that index ``batches[k]`` as a sentence keep working.

    Parameters
    ----------
    corpus_en, corpus_fr : sequence of sequences of int
        Parallel corpora; ``corpus_fr[k]`` must exist for every ``k`` that is a
        valid index of ``corpus_en``.
    vocabulary_size_en, vocabulary_size_fr : int
        Unused; kept for interface compatibility (no padding is applied).
    max_sentence_length_en, max_sentence_length_fr : int
        Only printed, as before.
    batch_size : int
        Unused; kept for interface compatibility (sentences are not grouped).

    Returns
    -------
    (list, list)
        Shuffled English sentences and the matching French sentences. The
        sentence objects are the ones stored in the corpora, not copies.
    """
    print(max_sentence_length_en, max_sentence_length_fr)

    no_sentences = len(corpus_en)

    # A single shared permutation keeps each English sentence paired with its
    # French counterpart after shuffling.
    order = np.arange(0, no_sentences)
    np.random.shuffle(order)

    batches_en = [corpus_en[d] for d in order]
    batches_fr = [corpus_fr[d] for d in order]

    return batches_en, batches_fr

In [8]:
# Build the shuffled English/French training lists from the loaded corpora.
batches_en, batches_fr = create_batches_EA(corpus2id_en, vocabulary_size_en, corpus2id_fr, vocabulary_size_fr, max_s_len_en,max_s_len_fr, batch_size)

185 275


In [82]:
# Inspect the first entry returned by create_batches_EA.
# NOTE(review): the saved output is a flat list of token ids, i.e. one sentence
# rather than a group of batch_size sentences.
batches_en[0]

[2064, 434, 3741, 887, 3977, 4157, 1198, 3152, 4162, 4349]

In [None]:
class EmbedAlign(nn.Module):
    """Latent-variable model over one English/French sentence pair.

    Inference side: English token ids are embedded, encoded with a bidirectional
    LSTM, and two small MLPs turn every encoder state into the mean and scale of
    a diagonal Gaussian over a latent vector z_i.

    Generative side: a reparameterised sample z_i is decoded by two MLPs into a
    log-categorical over the English vocabulary and one over the French
    vocabulary.
    """

    def __init__(self, vocab_size_en,vocab_size_fr, embedding_dim):
        """Create all layers.

        Parameters
        ----------
        vocab_size_en, vocab_size_fr : int
            Number of word ids in each language.
        embedding_dim : int
            Size of the word embeddings, LSTM hidden state and latent vector.
        """
        super(EmbedAlign, self).__init__()

        self.vocab_size_en = vocab_size_en
        self.vocab_size_fr = vocab_size_fr
        # Kept on the module so forward() does not depend on a notebook global.
        self.embedding_dim = embedding_dim

        #for the inference model
        self.w_embeddings = nn.Embedding(vocab_size_en, embedding_dim)
        #encoder
        self.bidirLSTM = nn.LSTM(embedding_dim, embedding_dim, bidirectional=True)
        #h_i = hi< + hi>

        self.mu_1 = nn.Linear(embedding_dim, embedding_dim)
        self.mu_2 = nn.Linear(embedding_dim, embedding_dim)

        self.sigma_1 = nn.Linear(embedding_dim, embedding_dim)
        self.sigma_2 = nn.Linear(embedding_dim, embedding_dim)

        #for the generative model
        self.affine1L1 = nn.Linear(embedding_dim, embedding_dim)
        self.affine2L1 = nn.Linear(embedding_dim, vocab_size_en)

        self.affine1L2 = nn.Linear(embedding_dim, embedding_dim)
        self.affine2L2 = nn.Linear(embedding_dim, vocab_size_fr)

        # NOTE(review): not used by forward(); its mean is a vector of ones, not
        # zeros - confirm the intent before relying on it as a prior.
        self.dist_norm = torch.distributions.multivariate_normal.MultivariateNormal(torch.ones(embedding_dim),torch.diag(torch.ones(embedding_dim)))

    def forward(self, batch_en, batch_fr, mu_i, sigma_i, z_i):
        """Compute the training loss for one sentence pair.

        Parameters
        ----------
        batch_en : 1-D LongTensor
            Token ids of the English sentence.
        batch_fr : 1-D LongTensor
            Token ids of the French sentence.
        mu_i, sigma_i, z_i : dict
            Updated in place: for every English token id (a plain ``int``) the
            detached mean, scale and sampled latent of its last occurrence.

        Returns
        -------
        tuple
            ``(loss, mu_i, sigma_i, xi, yi, z_i)`` where ``loss`` has shape
            ``(1,)`` and ``xi`` / ``yi`` are the English / French
            log-probability vectors decoded for the last English token.

        Raises
        ------
        ValueError
            If either sentence is empty.
        """
        m = len(batch_en)
        if m == 0 or len(batch_fr) == 0:
            raise ValueError('EmbedAlign.forward needs a non-empty English and French sentence')

        # Encode the sentence as ONE sequence so that each state carries left
        # and right context. (Feeding tokens one at a time as length-1
        # sequences would make the bidirectional LSTM context-free.)
        embeddings = self.w_embeddings(batch_en)                   # (m, dim)
        output, _ = self.bidirLSTM(embeddings.unsqueeze(1))        # (m, 1, 2 * dim)

        hidden_size = self.bidirLSTM.hidden_size
        hid_f = output[:, 0, :hidden_size]
        hid_b = output[:, 0, hidden_size:]
        conc_hids = hid_f + hid_b                                  # h_i = hi< + hi>

        # Variational location and (strictly positive) scale for every token.
        mu = self.mu_2(F.relu(self.mu_1(conc_hids)))
        sigma = F.softplus(self.sigma_2(F.relu(self.sigma_1(conc_hids))))

        #reparameterize
        epsilon = torch.randn_like(mu)
        z = mu + epsilon * sigma

        # Generative model: the same z_i is decoded into both languages.
        log_px = F.log_softmax(self.affine2L1(F.relu(self.affine1L1(z))), dim=-1)
        log_py = F.log_softmax(self.affine2L2(F.relu(self.affine1L2(z))), dim=-1)

        # log p(x_i | z_i), summed over the English sentence.
        sent_logx = log_px.gather(1, batch_en.reshape(-1, 1)).sum()

        # KL(q(z_i | x) || N(0, I)) in closed form, summed over tokens and dims.
        kl_score = (-(1 + torch.log(sigma**2) - mu ** 2 - sigma**2)/2).sum()

        # For each English position take the best-scoring French token of the
        # sentence and weight it by the uniform alignment probability 1/m.
        # The maximum is taken on the live log-probabilities (they are <= 0, so
        # comparing against an initial value of 0 could never select anything)
        # and it is not detached, so the French decoder receives gradients.
        best_logy = log_py[:, batch_fr].max(dim=1)[0]              # (m,)
        sent_logy = (best_logy - float(np.log(m))).sum()

        # Shape (1,), as callers received before.
        final_out = (-sent_logx - sent_logy + kl_score).reshape(1)

        # Key the bookkeeping dicts by plain ints and store detached tensors:
        # tensors hash by identity, so tensor keys would add a new entry for
        # every token occurrence and keep each step's autograd graph alive.
        for position, token in enumerate(batch_en.tolist()):
            mu_i[token] = mu[position].detach()
            sigma_i[token] = sigma[position].detach()
            z_i[token] = z[position].detach()

        xi = log_px[-1]
        yi = log_py[-1]

        return final_out, mu_i, sigma_i, xi,yi,z_i


# ---- Train EmbedAlign, one sentence pair per optimisation step ----
epochs = 10
model = EmbedAlign(vocabulary_size_en, vocabulary_size_fr, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

losses = []      # summed loss of each epoch
avg_losses = []  # mean loss per trained sentence pair, one value per epoch

# Per-token statistics that are passed to the model and handed back on every call.
z_i = defaultdict()
mu_i = defaultdict()
sigma_i = defaultdict()

# Decoder outputs of the most recent step; they stay None if no step ever runs,
# so the pickling below cannot hit an undefined name.
xi = None
yi = None

# Cap on the sentence pairs visited per epoch, clipped to the available data so
# the loop below can never index past the end of either list.
portion = min(10000, len(batches_en), len(batches_fr))

print('epoch, total loss, average loss, duration')
for e in range(epochs):

    then = datetime.now()

    total_loss = 0.0
    trained_pairs = 0
    for b in range(portion):

        # Pairs with an empty side are skipped.
        if len(batches_en[b]) > 0 and len(batches_fr[b]) > 0:
            batch_en = torch.tensor(np.asarray(batches_en[b]), dtype= torch.long)

            batch_fr = torch.tensor(np.asarray(batches_fr[b]), dtype= torch.long)

            optimizer.zero_grad()

            loss, mu_i, sigma_i, xi,yi,z_i = model(batch_en, batch_fr,mu_i, sigma_i, z_i)

            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            trained_pairs += 1

            if b % 1000 == 0:
                print('st', b)

    now = datetime.now()

    losses.append(total_loss)

    # Average of THIS epoch only. (Taking np.mean(losses) would blend in every
    # earlier epoch and hide the per-epoch trend.)
    avg_loss = total_loss / trained_pairs if trained_pairs else 0.0

    print(e, total_loss, avg_loss, now-then)

    avg_losses.append(avg_loss)

# Persist the statistics as they stand after the final epoch; `e` is the index
# of the last epoch that ran.
with open('mu_' + str(e) + '.pickle', 'wb') as file:
    pickle.dump(mu_i, file)
with open('sigma_' + str(e) + '.pickle', 'wb') as file:
    pickle.dump(sigma_i, file)
with open('xi_' + str(e) + '.pickle', 'wb') as file:
    pickle.dump(xi, file)
with open('yi_' + str(e) + '.pickle', 'wb') as file:
    pickle.dump(yi, file)

iteration= list(range(len(losses)))

# NOTE(review): `plt` is not imported in any cell visible here; add
# `import matplotlib.pyplot as plt` to the import cell or this raises NameError.
# `losses` holds per-epoch totals, so the axis is labelled accordingly.
plt.plot(iteration, losses)
plt.xlabel("Iterations for Embed-Align")
plt.ylabel('Total loss')
plt.title('Evolution of the loss as a function of the iteration')
plt.savefig("embed.png")
plt.show()




epoch, total loss, average loss, duration
st 0
st 1000
st 2000
st 3000
st 4000
st 5000
st 6000
st 7000
st 8000


In [113]:
# Sanity check of torch's KL helper: two identical multivariate normals
# (mean vector of ones, identity covariance) should have zero divergence.
dist1 = torch.distributions.multivariate_normal.MultivariateNormal(torch.ones(embedding_dim),torch.diag(torch.ones(embedding_dim)))
dist2 = torch.distributions.multivariate_normal.MultivariateNormal(torch.ones(embedding_dim),torch.diag(torch.ones(embedding_dim)))

kl_score = kl.kl_divergence(dist1,dist2)
print(kl_score)

tensor(0.)


In [None]:
def calculate_kl_divergence(m1, m2, s1, s2, embedding_dim):
    """Return KL( N(m1, diag(s1^2)) || N(m2, diag(s2^2)) ).

    The previous body called ``kl.kl_divergence()`` without arguments, which
    always raised ``TypeError``, and drew a random sample it never used.

    Parameters
    ----------
    m1, m2 : tensor-like, ``embedding_dim`` elements
        Means of the first and second Gaussian.
    s1, s2 : tensor-like, ``embedding_dim`` elements
        Per-dimension standard deviations (scales), i.e. the same quantity the
        model multiplies the noise by when it reparameterises; must be positive.
    embedding_dim : int
        Dimensionality of both distributions.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the divergence of the first Gaussian from the second.
    """
    def _diagonal_gaussian(mean, scale):
        # Build a multivariate normal whose covariance is diag(scale ** 2).
        mean = torch.as_tensor(mean, dtype=torch.float).reshape(embedding_dim)
        scale = torch.as_tensor(scale, dtype=torch.float).reshape(embedding_dim)
        return torch.distributions.multivariate_normal.MultivariateNormal(mean, torch.diag(scale ** 2))

    kl_score = kl.kl_divergence(_diagonal_gaussian(m1, s1), _diagonal_gaussian(m2, s2))
    return kl_score

In [None]:
# Plot the per-epoch average loss recorded in avg_losses and save it to disk.
# NOTE(review): `plt` is not imported in any cell visible here; it needs
# `import matplotlib.pyplot as plt` in the import cell.
iteration= list(range(len(avg_losses)))

plt.plot(iteration, avg_losses)
plt.xlabel("Iterations for Embed-Align")
plt.ylabel('Average loss')
plt.title('Evolution of average loss as a function of the iteration')
plt.savefig("embedalign.png")
plt.show()


In [None]:
def convert_to_one_hot(index, vocab_size):
    """Build a one-hot vector of length ``vocab_size`` as a torch tensor.

    The vector starts as float64 zeros (NumPy's default dtype) and the entry
    at ``index`` is raised by one before the array is wrapped as a tensor.
    """
    encoded = np.zeros(vocab_size)
    encoded[index] = encoded[index] + 1
    return torch.from_numpy(encoded)

In [None]:
# Example: one-hot vector of length 10 with position 2 set.
convert_to_one_hot(2,10)

In [None]:
# Scratch check: element-wise sign of the integers 0..8 (0 for zero, 1 for positives).
torch.sign(torch.tensor([i for i in range(9)], dtype=torch.long))