In [15]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

import string
puncs = set(string.punctuation)

In [16]:
class EmbedAlign(nn.Module):
    def __init__(self, vocab_size_en,vocab_size_fr, embedding_dim):
        super(EmbedAlign, self).__init__()
        
        self.vocab_size_en = vocab_size_en
        self.vocab_size_fr = vocab_size_fr
        self.embedding_dim = embedding_dim
        #for the inference model
        self.w_embeddings = nn.Embedding(self.vocab_size_en, self.embedding_dim)
        #encoder        
        self.bidirLSTM = nn.LSTM(self.embedding_dim, self.embedding_dim, bidirectional=True)
        #h_i = hi< + hi>
        
        self.mu_1 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.mu_2 = nn.Linear(self.embedding_dim, self.embedding_dim)
            
        self.sigma_1 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.sigma_2 = nn.Linear(self.embedding_dim, self.embedding_dim)
        
        #for the generative model
        self.affine1L1 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.affine2L1 = nn.Linear(self.embedding_dim, self.vocab_size_en)
        
        self.affine1L2 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.affine2L2 = nn.Linear(self.embedding_dim, self.vocab_size_fr)
       
        self.dist_norm = torch.distributions.multivariate_normal.MultivariateNormal(torch.ones(self.embedding_dim),torch.diag(torch.ones(self.embedding_dim)))
            
    def forward(self, batch_en, batch_fr, mu_i, sigma_i, z_i, best_alignments):
        
        kl_score = 0.0
        sent_logx = 0.0
        sent_logy = 0.0
        
        m = len(batch_en)
        
        for x in range(len(batch_en)):
            
            word_x = batch_en[x]
            embeddings = self.w_embeddings(word_x)
            #view_shape = embeddings.shape[0]
            output, (hidden, cell) = self.bidirLSTM(embeddings.view(1, 1, -1)) 

            hid_f = hidden[0]
            hid_b = hidden[1]

            conc_hids = hid_f + hid_b
            
            mu = self.mu_1(conc_hids.squeeze())
            mu = F.relu(mu)
            mu = self.mu_2(mu)

            sigma = self.sigma_1(conc_hids.squeeze())
            sigma = F.relu(sigma)
            sigma = self.sigma_2(sigma)
            sigma = F.softplus(sigma)

            mu_i.append((word_x, mu))
            sigma_i.append((word_x,sigma))
            
            epsilon = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(self.embedding_dim),torch.diag(torch.ones(self.embedding_dim))).sample()

            #reparameterize
            zi = mu + epsilon * sigma
            z_i.append((word_x, zi))
            
            #generative using sampled zi
            #variational location and scale
            #same zi for x and y

            xi = self.affine1L1(zi)
            xi = F.relu(xi)
            xi = self.affine2L1(xi)
            xi = F.log_softmax(xi, dim=0) #cat generation - target

            yi = self.affine1L2(zi)
            yi = F.relu(yi)
            yi = self.affine2L2(yi)
            yi = F.log_softmax(yi, dim=0) #cat generation - source

            sent_logx += xi[word_x]
            
            #mu[torch.isnan(mu)] = 0
            #print(mu_i[x], sigma_i[x])
            
            kl_loss = -(1 + torch.log(sigma**2) - mu ** 2 - sigma**2)/2
            
            kl_score += kl_loss
            
            best_j = 0
            best_prob = 0
            for y in range(len(batch_fr)):
                word_y = batch_fr[y]
                y_prob = yi[word_y].data
                exp_y_prob = np.exp(y_prob)
                
                if exp_y_prob > best_prob:
                    best_prob = exp_y_prob
                    best_j = y 
                    best_y_prob = y_prob
                    
            best_alignments.append((x,best_j))
                    
            sent_logy += (-torch.Tensor([np.log(m)])) + best_y_prob
                
        final_out = -sent_logx - sent_logy + torch.sum(kl_score)
            
        return final_out, mu_i, sigma_i, xi,yi,z_i, best_alignments

In [17]:
with open('mu_0.pickle', 'rb') as f:
    mu = pickle.load(f)
    
with open('sigma_0.pickle', 'rb') as f:
    sigma = pickle.load(f)
    
with open('xi_0.pickle', 'rb') as f:
    xi = pickle.load(f)
    
with open('yi_0.pickle', 'rb') as f:
    yi = pickle.load(f)

In [38]:
portion = 1000

with open('model_embed' + str(portion) + '.pickle', 'rb') as f:
     model = pickle.load(f)

In [39]:
with open('tokens2id_hansard_en_subs.pickle', 'rb') as f:
    tokens2id_en = pickle.load(f)
    
with open('tokens2id_hansard_fr_subs.pickle', 'rb') as f:
    tokens2id_fr = pickle.load(f)
    
with open('id2tokens_hansard_en_subs.pickle', 'rb') as f:
    id2tokens_en = pickle.load(f)
    
with open('id2tokens_hansard_fr_subs.pickle', 'rb') as f:
    id2tokens_fr = pickle.load(f)

In [40]:
len(yi)

7199

In [41]:
with open('lst/lst.gold.candidates') as file:
    whole_gold = file.read().splitlines() 
    
    gold_cands_l = [l.split('::') for l in whole_gold]

In [42]:
gold_cands = defaultdict(dict)

for g in gold_cands_l:
    word_pos = g[0]
    
    word, postag = word_pos.split('.')
    
    candidates = g[1].split(';')
    
    gold_cands[word] = {'postag':postag, 'candidates':candidates}

In [43]:
with open('lst/lst_test.preprocessed') as file:
    
    lst_sentences = [l.split() for l in file.readlines()]

In [44]:
lst_test_preprocessed = defaultdict(dict)

for l in lst_sentences:
    word, postag = l[0].split('.')
    sentence_no = int(l[1])
    word_position = int(l[2])
    sentence_tokens = l[3:]
    
    processed_tokens = []
    
    #remove punctuations TODO position might change
    punc_inds = []
    for s in range(len(sentence_tokens)):
        
        if sentence_tokens[s] not in puncs:
            processed_tokens.append(sentence_tokens[s])
        else:
            punc_inds.append(s)
                
    temp_word_position = word_position
    
    for d in punc_inds:
        if d < temp_word_position:
            word_position -= 1
        
    lst_test_preprocessed[(word,sentence_no)] = {'postag':postag, 'word_position':word_position,'sentence':processed_tokens}

In [45]:
def get_context_window(sentence, central_word_index, window_size):
    
    context = []
    word = sentence[central_word_index]
    
    for w in range(1,window_size+1):
        
        left_cont = central_word_index - w
        right_cont = central_word_index + w
        
        #find the window words to the left and right
        #add as pair if they are inside sentence boundaries
        
        if left_cont > -1:
            context.append(sentence[left_cont])
            
        if right_cont < len(sentence):
            context.append(sentence[right_cont])
            
    return context

In [46]:
def out_ranked_results(word, postag, sentence_id, sorted_d):
    
    word_pos = str(word+'.'+postag)
    res = 'RANKED\t' + word_pos + ' '+ str(sentence_id)
    
    for d in sorted_d:
        res += '\t' + d[0]+' '+ str(d[1])
    
    return res

In [75]:
results_to_write_cos = []
results_to_write_KL = []

len_w_embeds = 6330
embedding_dim = 100

for ls in lst_test_preprocessed:

    #GET WORD AND SENTENCE RELATED INFO
    item = lst_test_preprocessed[ls]
    central_word = ls[0]
    sentence_id = ls[1]

    postag = item['postag']
    word_position = item['word_position']
    sentence = item['sentence']
    
    sentence = torch.tensor(sentence2id(sentence, tokens2id_en), dtype = torch.long)
    
    sent_f = sentence #dummy

    #get the list of candidate gold annotations
    cands = gold_cands[central_word]['candidates']
    
    z_i = []
    mu_i = []
    sigma_i = []
    best_alignments = []
    

    loss, mu_i, sigma_i, xi,yi,z_i, best_alignments = model(sentence, sent_f,mu_i, sigma_i, z_i, best_alignments)

    cos_sims = defaultdict(float)
    KL_scores = defaultdict(float)
  
    if central_word in tokens2id_en:
        id_token = tokens2id_en[central_word]

        if id_token < len_w_embeds:
            #get the parameters of the word
            embed1_mu = mu_i[word_position][1]
            embed1_sigma = sigma_i[word_position][1]

            #just mean at test time - confirmed with Miguel
            z_central = (embed1_mu).detach().numpy()
                        
            z_central = z_central/np.linalg.norm(z_central.data)

            for c in cands:
                #for each candidate find the cosine similarity between target and central

                if c in tokens2id_en:
                    id_for_cand = tokens2id_en[c]

                    if id_for_cand < len_w_embeds:

                        #sentences with the candidate word
                        sentence[word_position] = id_for_cand
                        loss, mu_i, sigma_i, xi,yi,z_i, best_alignments = model(sentence, sent_f,mu_i, sigma_i, z_i, best_alignments)

                        #get the parameters of the word
                        embed2_mu = mu_i[word_position][1]
                        embed2_sigma = sigma_i[word_position][1]

                        #just mean at test time - confirmed with Miguel
                        z_context = (embed2_mu).detach().numpy()
                        z_context = z_context / np.linalg.norm(z_context.data)

                        cosine_similarity = 1 - spatial.distance.cosine(z_central, z_context)
                        cos_sims[c] = cosine_similarity

                        loss1 = torch.log(embed1_sigma/embed2_sigma) 
                        numerator = (embed2_sigma.pow(2) + (embed1_mu - embed1_sigma).pow(2))
                        KL_scores[c] = (loss1 + numerator / (2*embed1_sigma.pow(2)) - 0.5).sum()
                        KL_scores[c] = KL_scores[c].data.numpy()
                else:
                    cos_sims[c] = 0
                    KL_scores[c] = float('inf')
                
        sorted_d_cos = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)
        sorted_d_KL = sorted(KL_scores.items(), key=lambda x: x[1], reverse=True)

        ranked_res_cos = out_ranked_results(central_word, postag, sentence_id, sorted_d_cos)
        ranked_res_kl = out_ranked_results(central_word, postag, sentence_id, sorted_d_KL)

        results_to_write_cos.append(ranked_res_cos)
        results_to_write_KL.append(ranked_res_kl)
    else:

        for c in cands:

            id_for_cand = tokens2id_en[c]

            cos_sims[c] = 0

            KL_scores[c] = float('inf')
            
        sorted_d_cos = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

        sorted_d_KL = sorted(KL_scores.items(), key=lambda x: x[1], reverse=True)

        ranked_res_cos = out_ranked_results(central_word, postag, sentence_id, sorted_d_cos)
        
        ranked_res_kl = out_ranked_results(central_word, postag, sentence_id, sorted_d_KL)

        results_to_write_cos.append(ranked_res_cos)
        results_to_write_KL.append(ranked_res_kl)

        
file_name = 'embedalign_lst_cos.txt'

results_to_write = results_to_write_cos
with open (file_name, 'w') as f:

    for r in range(len(results_to_write)):

        if r == len(results_to_write) - 1:
            print(r)
            f.write(results_to_write[r])
        else:
            f.write(results_to_write[r])
            f.write('\n')
            
file_name = 'embedalign_lst_kl.txt'

results_to_write = results_to_write_KL


with open (file_name, 'w') as f:

    for r in range(len(results_to_write)):

        if r == len(results_to_write) - 1:
            f.write(results_to_write[r])
        else:
            f.write(results_to_write[r])
            f.write('\n')
            
        

1709


In [50]:
cd ..

/home/ece/Desktop/ulllab/ULL-lab/Untitled Folder/ULL-labs/ULL-2


In [51]:
cd lst

/home/ece/Desktop/ulllab/ULL-lab/Untitled Folder/ULL-labs/ULL-2/lst


In [76]:
! python lst_gap.py lst_test.gold embedalign_lst_kl.txt out no-mwe
! python lst_gap.py lst_test.gold embedalign_lst_cos.txt out no-mwe


MEAN_GAP	0.267197711765367


MEAN_GAP	0.29614730845081483



In [30]:
test_e = 'testing/test/test.e'
test_f = 'testing/test/test.f'

with open(test_e) as e:
    test_sentences_e = [l.split() for l in e.readlines()]
with open(test_f) as f:
    test_sentences_f = [l.split() for l in f.readlines()]

num_test_sentences = len(test_sentences_e)
print(num_test_sentences)

447


In [31]:
dev_e = 'validation/dev.e'
dev_f = 'validation/dev.f'

with open(dev_e) as e:
    val_sentences_e = [l.split() for l in e.readlines()]
with open(dev_e) as f:
    val_sentences_f = [l.split() for l in f.readlines()]

num_val_sentences = len(val_sentences_e)
print(num_val_sentences)

37


In [28]:
#convert sentence tokens to ids
def sentence2id(sentence, tokens2id_lang):
    
    s_ids = []
    
    for w in sentence:
        
        token_id = tokens2id_lang[w.lower()]

        s_ids.append(token_id)
        
    return s_ids

In [29]:
#alignment

def write_test_results(filename, model, sentences_e, sentences_f):
    
    with open(filename,"w") as naaclfile:
        
        for sent in range(len(sentences_e)):
            
            z_i = []
            mu_i = []
            sigma_i = []
            
            sent_e = sentences_e[sent]
            sent_f = sentences_f[sent]

            l = len(sent_e) #includes null
            m = len(sent_f)

            sent_e = torch.tensor(sentence2id(sent_e, tokens2id_en), dtype=torch.long)
            sent_f = torch.tensor(sentence2id(sent_f, tokens2id_fr), dtype=torch.long)
            
            best_alignments = []
            
            loss, mu_i, sigma_i, xi,yi,z_i, best_alignments = model(sent_e, sent_f,mu_i, sigma_i, z_i, best_alignments)

            for b_pair in best_alignments:
             
                naaclfile.write(str(sent+1) + " " + str(b_pair[0]+1) + " " + str(b_pair[1]+1) + " S" + "\n")                

                

In [18]:
with open('model_embed.pickle', 'rb') as f:
    model = pickle.load(f)
    
write_test_results('naacl_embed_test', model,test_sentences_e, test_sentences_f)

write_test_results('naacl_embed_val', model,val_sentences_e, val_sentences_f)