In [80]:
import numpy as np
import collections
from datetime import datetime

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mutual_info_score


from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)
import torch.distributions as distb


import time
import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle



In [2]:
class BayesianSG(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(BayesianSG, self).__init__()

        self.w_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.dense_weights = nn.Linear(embedding_size*2, embedding_size)
        self.muLinear = nn.Linear(embedding_size, embedding_size)
        self.sigmaLinear = nn.Linear(embedding_size, embedding_size)

        self.Location = nn.Embedding(vocab_size, embedding_size)
        self.Scale = nn.Embedding(vocab_size, embedding_size)
        self.fLinear = nn.Linear(embedding_size, vocab_size)

        self.embedding_size = embedding_size

    def forward(self, center_words, context_words):
    
        context_words = context_words.view(1, -1)
        context_size = context_words.size(1)

        # Inference model
        x_stacked = center_words.repeat(context_size, 1).transpose(0, 1)
        center_embedding = self.w_embeddings(x_stacked)
        context_embedding = self.w_embeddings(context_words)
        out = self.dense_weights(torch.cat([center_embedding, context_embedding], -1))
        emb = F.relu(out)
        emb_sum = emb.sum(1)

        mu_posterior = self.muLinear(emb_sum)
        sigma_posterior = F.softplus(self.sigmaLinear(emb_sum))

        dims = mu_posterior.shape
        #epsilon = torch.distributions.multivariate_normal.MultivariateNormal \
        #(torch.zeros(dims),torch.diag(torch.ones(dims)))
       
   
        smean = torch.zeros(self.embedding_size)
        scov = torch.diag(torch.ones(self.embedding_size))

        
  
        epsilon = distb.MultivariateNormal(smean, scov)
        epsilon = epsilon.sample()

                
        
        z = mu_posterior +  epsilon *torch.sqrt(torch.exp(sigma_posterior))


        logprobs = F.log_softmax(self.fLinear(z), dim=-1).squeeze(0)

        mu = self.Location(center_words)
        sigma = F.softplus(self.Scale(center_words))

        loss_probs = torch.zeros_like(context_words).type(torch.FloatTensor)
 
        sum_probs = 0
        for context_word in context_words:
            sum_probs = sum_probs + logprobs[context_word]

        sum_probs = loss_probs.sum(-1)
        
        
        loss1 = torch.log(sigma/sigma_posterior)
        numerator = (sigma_posterior.pow(2) + (mu_posterior - sigma).pow(2))
        total_loss = (loss1 + numerator / (2*sigma.pow(2)) - 0.5).sum()


        final_loss = (total_loss - sum_probs)
        
        return final_loss, mu_posterior, sigma_posterior, mu, sigma

In [3]:
import string

def stop_words(file_name, punctuation=True):
    with open(file_name) as f:
        stop_word_list = [line.strip() for line in f.readlines()]
    
    for p in list(string.punctuation):
        stop_word_list.append(p)
    
    return stop_word_list

stop_word_list = stop_words("data/en_stopwords.txt")

In [4]:
def sentences_reader(dataset_path, stop_word_list):
    sentence_list = []
    with open(dataset_path) as f:
        for line in f:
            line = line.split()
            line = [word.lower() for word in line]
            line = [word for word in line if word not in stop_word_list]
            sentence_list.append(line)
    
    return sentence_list[:100000]

sentences = sentences_reader("data/training.en", stop_word_list )

In [49]:
sentences[10]

['people', 'disabilities']

In [5]:
from collections import defaultdict


def UnigramTable(sentences, max_size):
    table = {}
    frequency = collections.defaultdict(int)
    n = 0
    
    
    for sentence in sentences:
        for word in sentence:
            table[n] = word
            frequency[word] += 1
            n+= 1
    
    return table, frequency

In [6]:

def vocabulary_creation(sentences, max_size = 10000):
    special_tokens = {"$UNK$", "$EOS$", "$SOS$", "$PAD"}
    index  = {}

    sentence_count = len(sentences)

    table, frequency = UnigramTable(sentences, max_size)
    
    num_tokens =  sum(frequency.values())

    counts = list(frequency.items())
    
    counts.sort(key=lambda _: -_[1])
    
    most_freq = [w[0] for w in counts[: max_size - len(special_tokens)]]


    index = dict([ (w, i) for i, w in enumerate(most_freq)])
    
    for special_token in special_tokens:
        assert special_token not in index
        index[special_token] = len(index)

    inverse_index = dict([(v, k) for (k, v) in index.items()])

    N = len(index)
    
    return index, inverse_index, N

index, inverse_index, N = vocabulary_creation(sentences, max_size = 10000)
        

def one_hot_vector(word, index, N):
    if word not in index:
        word = "$UNK$"
    vector = np.zeros(N)
    vector[index[word]] = 1
    
    return vector
    
def unknown_check(sentence, index):
    output = []
    for word in sentence:
        if word not in index:
            word = "$UNK$"
        output.append(word)
    return output
    

def retrieve_contexts(sentence, index, context_window):
    n_ = context_window
    context = set()
    for i in range(index - n_, index + n_ + 1):
        if i == index or i < 0 or i >= len(sentence):
            continue
        context.add(sentence[i])
    return context

In [8]:
sentences = sentences_reader("data/training.en", stop_word_list)

index, inverse_index, N = vocabulary_creation(sentences, max_size = vocab_size)



NameError: name 'vocab_size' is not defined

In [56]:
index['$UNK$']

9999

In [30]:
embedding_dim = 100
vocab_size = 10000
window_size = 5
epochs = 3
dataset_path = "data/training.en"
model_name = "bayesian_skipgram"

sentences = sentences_reader(dataset_path, stop_word_list)

index, inverse_index, N = vocabulary_creation(sentences, max_size = vocab_size)
sentences1 = sentences[:10000]

model = BayesianSG(N, embedding_dim)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

epoch_losses = []
for epoch in range(1, epochs + 1):
    print("Running epoch: ", epoch)
    then = time.time()

    epoch_loss = 0
    count = 0
    s = 0
    t = time.time()
    for sentence in sentences1: 

        center_index = 0
        for center_word in sentence:
            if center_word not in index:
                continue
            center_vec = one_hot_vector(center_word, index, N)
            context_words = []
            context_idx = []
            for word in retrieve_contexts(unknown_check(sentence, index), center_index, window_size):
                if word not in index:
                    continue
                context_idx.append(index[word])
                context_words.append(one_hot_vector(word, index, N))
            if len(context_words) == 0:
                continue
            optimizer.zero_grad()
            
            center_w = torch.LongTensor(np.array([index[center_word]]))
            context_w = torch.LongTensor(context_idx)
            loss, mu, sigma, mu_prior, sigma_prior = model(center_w, context_w )
         
            epoch_loss = epoch_loss + loss.item()
            count = count+1

            loss.backward()
            optimizer.step()
            center_index += 1
   
        s = s+1
        if s == 1000:
            tt = time.time()
            print("time for 1000 sentence: ", tt-t)
            s = 0
            t = time.time()



    
    now = time.time()
    epoch_loss_avg = epoch_loss / count
    epoch_losses.append(epoch_loss_avg)
    print("average loss: ", epoch_loss_avg, "time: ",now-then)

    
#TODO save model 
file_name = "model-bayesian"
torch.save(model, 'bayesian.pt')
    

Running epoch:  1
time for 1000 sentence:  414.78591775894165
time for 1000 sentence:  602.4315011501312
time for 1000 sentence:  733.8026208877563
time for 1000 sentence:  830.7863399982452
time for 1000 sentence:  885.4374361038208
time for 1000 sentence:  944.0311789512634
time for 1000 sentence:  934.6640110015869
time for 1000 sentence:  1014.0258810520172
time for 1000 sentence:  1034.146152973175
time for 1000 sentence:  1111.5670111179352
average loss:  113.60854385108631 time:  8505.683596134186
Running epoch:  2
time for 1000 sentence:  1095.3764300346375
time for 1000 sentence:  1058.7119207382202
time for 1000 sentence:  1105.174332857132
time for 1000 sentence:  1109.101114988327
time for 1000 sentence:  1101.592008113861
time for 1000 sentence:  1102.4495029449463
time for 1000 sentence:  1043.7281131744385
time for 1000 sentence:  1090.6386618614197
time for 1000 sentence:  1077.657147884369
time for 1000 sentence:  1124.9353158473969
average loss:  56.288616923450654 ti

  "type " + obj.__name__ + ". It won't be checked "


In [171]:
epoch_losses

[113.60854385108631, 56.288616923450654, 46.511655645123305]

In [16]:
#TODO save model 
file_name = "model-bayesian"
torch.save(model, 'filename.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [18]:
embedding_dim = 100
vocab_size = 10000
window_size = 5
epochs = 5
dataset_path = "data/training.en"
model_name = "bayesian_skipgram"

sentences = sentences_reader(dataset_path, stop_word_list)

index, inverse_index, N = vocabulary_creation(sentences, max_size = vocab_size)


model = BayesianSG(N, embedding_dim)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

epoch_losses = []
for epoch in range(1, epochs + 1):
    print("Running epoch: ", epoch)
    then = time.time()

    epoch_loss = 0
    count = 0
    
    s = 0
    t = time.time()
    for sentence in sentences: 
        
        center_index = 0
        for center_word in sentence:
            if center_word not in index:
                continue
            center_vec = one_hot_vector(center_word, index, N)
            context_words = []
            context_idx = []
            for word in retrieve_contexts(unknown_check(sentence, index), center_index, window_size):
                if word not in index:
                    continue
                context_idx.append(index[word])
                context_words.append(one_hot_vector(word, index, N))
            if len(context_words) == 0:
                continue
            optimizer.zero_grad()
            
            center_w = torch.LongTensor(np.array([index[center_word]]))
            context_w = torch.LongTensor(context_idx)
            loss, mu, sigma, mu_prior, sigma_prior = model(center_w, context_w )
         
            epoch_loss = epoch_loss + loss.item()
            count = count+1

            loss.backward()
            optimizer.step()
            center_index += 1
        
        s = s+1
        if s == 20:
            tt = time.time()
            print("time for 20 sentence: ", tt-t)

    
    now = time.time()
    epoch_loss_avg = epoch_loss / count
    epoch_losses.append(epoch_loss_avg)
    print("average loss: ", epoch_loss_avg, "time: ",now-then)

    
#TODO save model 
file_name = "model-bayesian"
torch.save(model, 'filename.pt')
    

Running epoch:  1
time:  0.04199337959289551
time:  0.006774187088012695
time:  0.029964685440063477
time:  0.024204015731811523
time:  0.035803794860839844
time:  0.012015104293823242
time:  0.029833078384399414
time:  0.02626800537109375
time:  0.04402899742126465
time:  0.008504867553710938
time:  0.04642200469970703
time:  0.04704999923706055
time:  0.026363849639892578
time:  0.03599405288696289
time:  0.02994513511657715
time:  0.018034934997558594
time:  0.022803068161010742
time:  0.02317214012145996
time:  0.024409770965576172
time:  0.02239513397216797
time:  0.009506940841674805
time:  0.0206451416015625
time:  0.02037215232849121
time:  0.011932849884033203
time:  0.021779298782348633
time:  0.0321650505065918
time:  0.03277325630187988
time:  0.03829002380371094
time:  0.023352861404418945
time:  0.04120993614196777
time:  0.010403871536254883
time:  0.04748892784118652
time:  0.026027917861938477
time:  0.0295102596282959
time:  0.030609130859375
time:  0.0291562080383300

KeyboardInterrupt: 

In [31]:
model = torch.load('bayesian.pt')


In [32]:
#model.muLinear.weight
len(model.Location.weight[22])

100

In [33]:
with open('data/lst/lst.gold.candidates') as file:
    whole_gold = file.read().splitlines() 
    
    gold_cands_l = [l.split('::') for l in whole_gold]

In [34]:
gold_cands = defaultdict(dict)

for g in gold_cands_l:
    word_pos = g[0]
    
    word, postag = word_pos.split('.')
    
    candidates = g[1].split(';')
    
    gold_cands[word] = {'postag':postag, 'candidates':candidates}

In [35]:
with open('data/lst/lst_test.preprocessed') as file:
    
    lst_sentences = [l.split() for l in file.readlines()]

In [36]:
lst_test_preprocessed = defaultdict(dict)
puncs = set(string.punctuation)

for l in lst_sentences:
    word, postag = l[0].split('.')
    sentence_no = int(l[1])
    word_position = int(l[2])
    sentence_tokens = l[3:]
    
    processed_tokens = []
    
    #remove punctuations TODO position might change
    punc_inds = []
    for s in range(len(sentence_tokens)):
        
        if sentence_tokens[s] not in puncs:
            processed_tokens.append(sentence_tokens[s])
        else:
            punc_inds.append(s)
                
    temp_word_position = word_position
    
    for d in punc_inds:
        if d < temp_word_position:
            word_position -= 1
        
    lst_test_preprocessed[(word,sentence_no)] = {'postag':postag, 'word_position':word_position,'sentence':processed_tokens}

In [37]:
def out_ranked_results(word, postag, sentence_id, sorted_d):
    
    word_pos = str(word+'.'+postag)
    res = 'RANKED\t' + word_pos + ' '+ str(sentence_id)
    
    for d in sorted_d:
        res += '\t' + d[0]+' '+ str(d[1])
    
    return res

In [70]:
muLinear = model.muLinear.weight.detach().numpy()
sigmaLinear = model.sigmaLinear.weight.detach().numpy()

Location = model.Location.weight.detach().numpy()
Scale = model.Scale.weight.detach().numpy()
        

In [75]:
def get_embed(word):
    idx = index[word]
    
    mu = Location[idx]
    
    sample = mu
    
    return sample

In [145]:
index["animal"]

6226

In [170]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def kl(p, q):
    V = p - min(p)
    W = V/np.linalg.norm(V)
    
    Q = q - min(q)
    Z = Q/(np.linalg.norm(Q))

    
    W[W == 0] = 0.0000001
    Z[Z == 0] = 0.0000001
    return np.sum(np.where(p != 0, p * np.log(W / Z), 0))
                  
                  
p = get_embed("wind")
q = get_embed("air")

print(kl(p,q))

cosine_similarity = 1 - spatial.distance.cosine(p, q)

print(cosine_similarity)



97.587654
-0.15386894345283508


In [134]:

window_size = 5

# #embedding parameters
# weights = model.w_embeddings.weight.detach()

results_to_write = []

for ls in lst_test_preprocessed:
    
    #GET WORD AND SENTENCE RELATED INFO
    item = lst_test_preprocessed[ls]
    central_word = ls[0]
    sentence_id = ls[1]
    
    postag = item['postag']
    word_position = item['word_position']
    sentence = item['sentence']
    
    #get the list of candidate gold annotations
    cands = gold_cands[central_word]['candidates']

    cos_sims = defaultdict(float)
    add_heur = defaultdict(float)
    mul_heur = defaultdict(float)

    if central_word in index:
        id_token = index[central_word]

        #get the embedding of the word
        embed1 = get_embed(central_word)
        
        # GET CONTEXT WORDS HERE
        context_words = retrieve_contexts(unknown_check(sentence, index), word_position, window_size)

        context_ids = []
        context_vectors = []

        for cw in context_words:
            ctx_id = index[cw]
            context_ids.append(ctx_id)
            context_vectors.append(Location[ctx_id])

        for c in cands:

            #for each candidate find the cosine similarity between target and central
            if c in index.keys():
                id_for_cand = index[c]

                if id_for_cand < len(Scale):
                    embed2 = get_embed(c)

                    #cosine_similarity = 1 - spatial.distance.cosine(embed1, embed2)
                    KL_sim = kl(embed1, embed2)
                    #print(cosine_similarity, KL_sim)
                    cos_sims[c] = KL_sim
                else:
                    cos_sims[c] = 0

            #USING FORMULAS IN LEVY - LEXICAL SUBS.
    #         add_res = 
    #         add_heur.append(add_res)

    #         mul_res = 
    #         mul_heur.append(mul_res)

        sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

        ranked_res = out_ranked_results(central_word, postag, sentence_id, sorted_d)

        results_to_write.append(ranked_res)
    else:
        
        for c in cands:

            #for each candidate find the cosine similarity between target and central
            if c in index.keys():
                id_for_cand = index[c]

                cos_sims[c] = 0


        sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

        ranked_res = out_ranked_results(central_word, postag, sentence_id, sorted_d)

        results_to_write.append(ranked_res)

file_name = 'bayesian_eval' + '.txt'

with open (file_name, 'w') as f:
 
    for r in range(len(results_to_write)):

        if r == len(results_to_write) - 1:
            f.write(results_to_write[r])
        else:
            f.write(results_to_write[r])
            f.write('\n')

In [135]:
! python data/lst/lst_gap.py data/lst/lst_test.gold bayesian_eval.txt out no-mwe


MEAN_GAP	0.2467359311070365



In [6]:
#### Evaluation Reading #####

class Sentence:
    def __init__(self, target,complete,sent_id,position,tokens):
        self.target = target
        self.complete = complete
        self.sent_id = sent_id
        self.position = position
        self.tokens = tokens

        
        
with open('data/lst/lst_test.preprocessed', 'r') as myfile:
    data = myfile.readlines()

    
sentence_list = []
    
for line in data:
    line = line.split()
    target = line[0].split(".")[0]
    complete = line[0]
    sent_id = line[1]
    position = line[2]
    tokens = line[3:]
    sent_example = Sentence(target,complete,sent_id,position,tokens)
    
    sentence_list.append(sent_example)



In [2]:
with open('data/lst/lst.gold.candidates', 'r') as myfile:
    data = myfile.readlines()

    
word_candidates = {}
    
for line in data:
    
    word = line.split(":")[0][:-2]
    #print(word[:-2])
    candidates = line[:-1].split(":")[2].split(";")
    #print(candidates)
    
    word_candidates[word] = candidates


In [3]:
word_candidates

{'about': ['here and there',
  'regarding',
  'around',
  'of',
  'concerning',
  'arise',
  'discussed',
  'dealing with',
  'approximately',
  'roughly',
  'cope with',
  'nearly',
  'somewhat',
  'more or less',
  'occur',
  'happen',
  'consider',
  'round',
  'concerned with'],
 'account': ['access',
  'balance',
  'description',
  'chronicle',
  'facility',
  'bank balance',
  'explanation',
  'ledger',
  'finance',
  'banking facility',
  'subscriber',
  'fund',
  'synopsis',
  'asset',
  'statement',
  'narrative',
  'report',
  'consideration',
  'subscription',
  'logon',
  'banking arrangement'],
 'acquire': ['amass',
  'purchase',
  'buy',
  'secure',
  'get',
  'receive',
  'gather',
  'procure',
  'obtain',
  'collect',
  'bring in',
  'gain',
  'learn',
  'find',
  'achieve'],
 'acute': ['heightened',
  'emergency',
  'sensitive',
  'sudden',
  'urgent',
  'severe',
  'critical',
  'serious',
  'sharp',
  'keen',
  'pn',
  'intense',
  'grave'],
 'apparently': ['supposed

In [152]:
label = '_hansard_en' 
#label = '_hansards_fr'

with open('tokens2id' +label +'_smaller'+'.pickle', 'rb') as f:
    tokens2id = pickle.load(f)
    
    
with open('wordvecs_skipgram_context_300_hansard_en_smaller_120.pickle', 'rb') as file:
    c_embeds = pickle.load(file)

with open('wordvecs_skipgram_word_300_hansard_en_smaller_120.pickle', 'rb') as file:
    w_embeds = pickle.load(file)

In [156]:
def get_similarity(central_word, context_words, candidate,window_size):
    central_w = get_embed(central_word)
    
    sum_ = central_w
    for c in context_words:
        embed_c = get_embed(c)
        sum_ = sum_ + embed_c
        
    sum_ = sum_ / window_size
    cand_em = get_embed(candidate)
    cosine_similarity = 1 - spatial.distance.cosine(central_w, cand_em)
    return cosine_similarity
    

In [158]:

window_size = 5

# #embedding parameters
# weights = model.w_embeddings.weight.detach()

results_to_write = []

for ls in lst_test_preprocessed:
    
    #GET WORD AND SENTENCE RELATED INFO
    item = lst_test_preprocessed[ls]
    central_word = ls[0]
    sentence_id = ls[1]
    
    postag = item['postag']
    word_position = item['word_position']
    sentence = item['sentence']
    
    #get the list of candidate gold annotations
    cands = gold_cands[central_word]['candidates']

    cos_sims = defaultdict(float)
    add_heur = defaultdict(float)
    mul_heur = defaultdict(float)

    if central_word in index:
        id_token = index[central_word]

        #get the embedding of the word
        embed1 = get_embed(central_word)
        
        # GET CONTEXT WORDS HERE
        context_words = retrieve_contexts(unknown_check(sentence, index), word_position, window_size)

        context_ids = []
        context_vectors = []

        for cw in context_words:
            ctx_id = index[cw]
            context_ids.append(ctx_id)
            context_vectors.append(cw)

        for c in cands:

            #for each candidate find the cosine similarity between target and central
            if c in index.keys():
                id_for_cand = index[c]

                if id_for_cand < len(Scale):
                    embed2 = get_embed(c)

                    #cosine_similarity = 1 - spatial.distance.cosine(embed1, embed2)
                    #KL_sim = kl(embed1, embed2)
                    #print(cosine_similarity, KL_sim)
                    
                    sim = get_similarity(central_word, context_words, c,window_size)
                    cos_sims[c] = sim
                else:
                    cos_sims[c] = 0

            #USING FORMULAS IN LEVY - LEXICAL SUBS.
    #         add_res = 
    #         add_heur.append(add_res)

    #         mul_res = 
    #         mul_heur.append(mul_res)

        sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

        ranked_res = out_ranked_results(central_word, postag, sentence_id, sorted_d)

        results_to_write.append(ranked_res)
    else:
        
        for c in cands:

            #for each candidate find the cosine similarity between target and central
            if c in index.keys():
                id_for_cand = index[c]

                cos_sims[c] = 0


        sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

        ranked_res = out_ranked_results(central_word, postag, sentence_id, sorted_d)

        results_to_write.append(ranked_res)

file_name = 'skipgram' + '.txt'

with open (file_name, 'w') as f:
 
    for r in range(len(results_to_write)):

        if r == len(results_to_write) - 1:
            f.write(results_to_write[r])
        else:
            f.write(results_to_write[r])
            f.write('\n')

In [159]:
! python data/lst/lst_gap.py data/lst/lst_test.gold skipgram.txt out no-mwe


MEAN_GAP	0.23732634081680476

