In [None]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

In [None]:
# Hyper-parameters for the notebook.

# Model size: embedding_dim is handed to EmbedAlign further down.
embedding_dim = 100
det_embedding_dim = 128

# Optimisation: batch_size feeds create_batches_EA, learning_rate feeds Adam.
# NOTE: the training cell further down reassigns `epochs` before using it.
batch_size = 50
epochs = 10
learning_rate = 0.01

# Not referenced by any of the visible cells (presumably leftovers from a
# skip-gram style preprocessing step — TODO confirm before removing).
min_count = 2
window_size = 5

In [None]:
# Load the pickled English and French corpora; they are indexed in parallel
# by the cells below (sentence s of EN is paired with sentence s of FR).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('corpus2id_hansard_en.pickle', 'rb') as en_file:
    corpus2id_en = pickle.load(en_file)

with open('corpus2id_hansard_fr.pickle', 'rb') as fr_file:
    corpus2id_fr = pickle.load(fr_file)

In [None]:
# Number of entries in each corpus (EN first, then FR).
print(*map(len, (corpus2id_en, corpus2id_fr)))

In [None]:
# Load the pickled unigram tables; their lengths define the vocabulary sizes.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('unigram_probs_hansard_en.p', 'rb') as en_file:
    unigram_en = pickle.load(en_file)

with open('unigram_probs_hansard_fr.p', 'rb') as fr_file:
    unigram_fr = pickle.load(fr_file)

vocabulary_size_en, vocabulary_size_fr = len(unigram_en), len(unigram_fr)
print(vocabulary_size_en, vocabulary_size_fr)

In [None]:
# Longest sentence (in tokens) on each side of the corpus; 0 for an empty corpus.
max_s_len_en = max(map(len, corpus2id_en), default=0)

# The FR side is indexed with the EN sentence positions, so only the first
# len(corpus2id_en) FR sentences are inspected.
max_s_len_fr = max(
    (len(corpus2id_fr[position]) for position in range(len(corpus2id_en))),
    default=0,
)

print('Maximum sentence length EN:', max_s_len_en)
print('Maximum sentence length FR:', max_s_len_fr)

# Values recorded for the HANSARDS data:
# Maximum sentence length EN: 266
# Maximum sentence length FR: 352

In [None]:
def create_batches_EA(corpus_en, vocabulary_size_en, corpus_fr, vocabulary_size_fr, max_sentence_length_en,max_sentence_length_fr, batch_size):
    """Shuffle a parallel corpus and split it into padded mini-batches.

    Sentences are drawn in a random order (global NumPy RNG) and grouped into
    ``len(corpus_en) // batch_size`` batches; the leftover
    ``len(corpus_en) % batch_size`` sentences are dropped. Inside a batch every
    sentence is right-padded to the longest sentence of that batch, separately
    for each language.

    Parameters
    ----------
    corpus_en, corpus_fr : sequence of sequences of int
        Parallel corpora; ``corpus_en[i]`` is paired with ``corpus_fr[i]``.
        They are NOT modified (padding is applied to copies).
    vocabulary_size_en, vocabulary_size_fr : int
        Used as the padding token id for the respective language.
    max_sentence_length_en, max_sentence_length_fr : int
        Unused; kept so existing callers keep working (padding is per batch).
    batch_size : int
        Number of sentence pairs per batch.

    Returns
    -------
    (batches_en, batches_fr) : tuple of lists
        ``batches_en[b][k]`` and ``batches_fr[b][k]`` are the padded EN / FR
        token-id lists of the k-th sentence pair of batch ``b``.
    """
    batches_en = []
    batches_fr = []

    no_sentences = len(corpus_en)
    batch_number = no_sentences // batch_size
    print(batch_number)

    # Random visiting order over the sentence pairs.
    indices = np.arange(0, no_sentences)
    np.random.shuffle(indices)

    for bn in range(batch_number):
        b_indices = indices[bn * batch_size:(bn + 1) * batch_size]

        # Pad only up to the longest sentence in this batch.
        batch_max_en = max(len(corpus_en[d]) for d in b_indices)
        batch_max_fr = max(len(corpus_fr[d]) for d in b_indices)

        sentence_b_en = []
        sentence_b_fr = []

        for d in b_indices:
            # Copy before padding so the caller's corpus is left untouched
            # (the function can then be re-run safely on the same data).
            sent_en = list(corpus_en[d])
            sent_en.extend([vocabulary_size_en] * (batch_max_en - len(sent_en)))

            sent_fr = list(corpus_fr[d])
            sent_fr.extend([vocabulary_size_fr] * (batch_max_fr - len(sent_fr)))

            sentence_b_en.append(sent_en)
            sentence_b_fr.append(sent_fr)

        # Store the whole batch (not just its last sentence).
        batches_en.append(sentence_b_en)
        batches_fr.append(sentence_b_fr)

    return batches_en, batches_fr

In [None]:
# Build the shuffled, padded mini-batches for both languages.
batches_en, batches_fr = create_batches_EA(
    corpus_en=corpus2id_en,
    vocabulary_size_en=vocabulary_size_en,
    corpus_fr=corpus2id_fr,
    vocabulary_size_fr=vocabulary_size_fr,
    max_sentence_length_en=max_s_len_en,
    max_sentence_length_fr=max_s_len_fr,
    batch_size=batch_size,
)

In [None]:
class EmbedAlign(nn.Module):
    """Latent-variable model that embeds L1 (EN) words and aligns L2 (FR) words to them.

    Inference network: EN token embeddings -> bidirectional LSTM -> per-token
    Gaussian parameters (mu, sigma). One latent vector z_i is sampled per EN
    token with the reparameterisation trick.

    Generative network: the same z_i is used to (a) reconstruct the EN token
    x_i through ``affine1L1``/``affine2L1`` and (b) predict FR tokens through
    ``affine1L2``/``affine2L2``; each FR token is aligned to an EN position
    with uniform probability 1/m, m being the EN sentence length.

    The token id ``vocab_size_en`` / ``vocab_size_fr`` is treated as padding
    (this is the id ``create_batches_EA`` pads with), so the embedding table
    has one extra row for it. Padding is assumed to be trailing.

    Parameters
    ----------
    vocab_size_en, vocab_size_fr : int
        Number of real (non-padding) token ids per language.
    embedding_dim : int
        Size of the word embeddings, LSTM hidden states and latent vectors.
    """

    def __init__(self, vocab_size_en,vocab_size_fr, embedding_dim):
        super(EmbedAlign, self).__init__()

        # Padding ids; tokens equal to these are masked out of the loss.
        self.pad_en = vocab_size_en
        self.pad_fr = vocab_size_fr

        #for the inference model
        # +1 row so the padding id is a valid index (its embedding stays zero).
        self.w_embeddings = nn.Embedding(vocab_size_en + 1, embedding_dim, padding_idx=vocab_size_en)
        #encoder; batch_first because batches arrive as (batch, sentence_length)
        self.bidirLSTM = nn.LSTM(embedding_dim, embedding_dim, bidirectional=True, batch_first=True)
        #h_i = hi< + hi>

        self.mu_1 = nn.Linear(embedding_dim, embedding_dim)
        self.mu_2 = nn.Linear(embedding_dim, embedding_dim)

        self.sigma_1 = nn.Linear(embedding_dim, embedding_dim)
        self.sigma_2 = nn.Linear(embedding_dim, embedding_dim)

        #for the generative model
        self.affine1L1 = nn.Linear(embedding_dim, embedding_dim)
        self.affine2L1 = nn.Linear(embedding_dim, vocab_size_en)

        self.affine1L2 = nn.Linear(embedding_dim, embedding_dim)
        self.affine2L2 = nn.Linear(embedding_dim, vocab_size_fr)

    def forward(self, batch_en, batch_fr):
        """Return the negative ELBO (a scalar loss to minimise), averaged over the batch.

        Parameters
        ----------
        batch_en : LongTensor, (batch, m) or (m,)
            EN token ids, right-padded with ``vocab_size_en``.
        batch_fr : LongTensor, (batch, n) or (n,)
            FR token ids, right-padded with ``vocab_size_fr``.

        Returns
        -------
        Tensor (0-dim)
            ``-(log p(x|z) + log p(y|z) - KL(q(z|x) || N(0, I)))`` summed over
            the tokens of each sentence pair and averaged over the batch.
        """
        # Accept a single un-batched sentence pair as well.
        if batch_en.dim() == 1:
            batch_en = batch_en.unsqueeze(0)
        if batch_fr.dim() == 1:
            batch_fr = batch_fr.unsqueeze(0)

        mask_en = batch_en != self.pad_en                      # (B, m)
        mask_fr = batch_fr != self.pad_fr                      # (B, n)
        m = batch_en.size(1)

        # Real sentence lengths; clamped so an all-padding row still has one
        # position (packing and the 1/m alignment prior need length >= 1).
        lengths_en = mask_en.sum(dim=1).clamp(min=1)           # (B,)
        positions = torch.arange(m, device=batch_en.device).unsqueeze(0)
        align_mask = positions < lengths_en.unsqueeze(1)       # (B, m)

        # ---- inference model: q(z_i | x) ----
        embeddings = self.w_embeddings(batch_en)               # (B, m, D)

        # Pack so the backward LSTM direction does not start on padding.
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths_en.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.bidirLSTM(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=m)      # (B, m, 2D)

        # h_i = forward state + backward state at position i
        hidden_size = self.bidirLSTM.hidden_size
        h = output[..., :hidden_size] + output[..., hidden_size:]

        mu = self.mu_2(F.relu(self.mu_1(h)))
        sigma = F.softplus(self.sigma_2(F.relu(self.sigma_1(h))))

        #reparameterize: epsilon ~ N(0, I), one draw per latent dimension
        epsilon = torch.randn_like(mu)
        zi = mu + epsilon * sigma                              # (B, m, D)

        # ---- generative model, same zi for x and y ----
        # log p(x_i | z_i): categorical over the EN vocabulary
        log_px = F.log_softmax(self.affine2L1(F.relu(self.affine1L1(zi))), dim=-1)
        targets_en = batch_en.masked_fill(~mask_en, 0)         # keep gather in range
        logx_tok = log_px.gather(2, targets_en.unsqueeze(2)).squeeze(2)
        logx = (logx_tok * mask_en.to(logx_tok.dtype)).sum(dim=1)

        # log p(y_j | z) = log sum_i (1/m) p(y_j | z_i): categorical over the
        # FR vocabulary, marginalised over a uniform alignment to EN positions
        log_py = F.log_softmax(self.affine2L2(F.relu(self.affine1L2(zi))), dim=-1)
        targets_fr = batch_fr.masked_fill(~mask_fr, 0)
        scores = log_py.gather(2, targets_fr.unsqueeze(1).expand(-1, m, -1))  # (B, m, n)
        scores = scores.masked_fill(~align_mask.unsqueeze(2), float('-inf'))
        log_m = torch.log(lengths_en.to(scores.dtype)).unsqueeze(1)
        logy_tok = torch.logsumexp(scores, dim=1) - log_m      # (B, n)
        logy = (logy_tok * mask_fr.to(logy_tok.dtype)).sum(dim=1)

        # KL(N(mu, sigma^2) || N(0, I)), closed form for diagonal Gaussians;
        # the small constant keeps log() finite if softplus underflows to 0.
        kl_tok = 0.5 * (sigma.pow(2) + mu.pow(2) - 1.0 - 2.0 * torch.log(sigma + 1e-8)).sum(dim=-1)
        KL = (kl_tok * mask_en.to(kl_tok.dtype)).sum(dim=1)

        # ELBO = logx + logy - KL; the training loop minimises, so negate it.
        final_out = -(logx + logy - KL).mean()
        return final_out

In [None]:
# Train EmbedAlign with Adam; one optimiser step per mini-batch.
epochs = 150
model = EmbedAlign(vocabulary_size_en, vocabulary_size_fr, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

losses = []       # summed batch losses, one entry per epoch
avg_losses = []   # see avg_loss below, one entry per epoch

# Number of mini-batches per epoch.
no_batch = len(batches_en)

print('epoch, total loss, average loss, duration')
for e in range(epochs):

    then = datetime.now()

    total_loss = 0.0
    for b in range(no_batch):

        # Build integer id tensors directly (no float round-trip).
        batch_en = torch.tensor(batches_en[b], dtype=torch.long)
        batch_fr = torch.tensor(batches_fr[b], dtype=torch.long)

        optimizer.zero_grad()

        # Feed the current batch tensors, not the full lists of batches.
        loss = model(batch_en, batch_fr)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    now = datetime.now()

    losses.append(total_loss)

    # Running mean of the epoch totals seen so far, per batch
    # (guarded so an empty batch list does not divide by zero).
    avg_loss = np.mean(losses) / max(no_batch, 1)

    print(e, total_loss, avg_loss, now-then)

    avg_losses.append(avg_loss)

In [None]:
def convert_to_one_hot(index, vocab_size):
    """Return a float64 torch tensor of length ``vocab_size`` that is 1 at ``index`` and 0 elsewhere.

    ``index`` is used as a NumPy index, so anything NumPy accepts there
    (an int, a list of ints, a slice) marks the corresponding positions.
    """
    vector = np.zeros(vocab_size)
    vector[index] = 1.0
    return torch.from_numpy(vector)

In [None]:
# Quick check: one-hot vector of length 10 with position 2 set.
convert_to_one_hot(index=2, vocab_size=10)

In [None]:
# Scratch check: sign() of the integers 0..8 (0 for zero, 1 for the positives).
torch.sign(torch.tensor(list(range(9)), dtype=torch.long))