In [1]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

import string
puncs = set(string.punctuation)

In [76]:
label = '_hansard_en'
#label = '_hansards_fr'

# Both vocabulary mappings share the same corpus label and "_smaller" suffix.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
vocab_suffix = label + '_smaller' + '.pickle'

with open('tokens2id' + vocab_suffix, 'rb') as vocab_file:
    tokens2id = pickle.load(vocab_file)

with open('id2tokens' + vocab_suffix, 'rb') as vocab_file:
    id2tokens = pickle.load(vocab_file)

In [77]:
len(tokens2id)

2949

In [72]:
# Word ("w") and context ("c") skip-gram embeddings are stored in two pickles
# that differ only in their filename prefix.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
embeds_suffix = label + '_smaller' + str(100) + '.pickle'

with open('wordvecs_skipgram_word' + embeds_suffix, 'rb') as embeds_file:
    w_embeds = pickle.load(embeds_file)

with open('wordvecs_skipgram_context' + embeds_suffix, 'rb') as embeds_file:
    c_embeds = pickle.load(embeds_file)

In [53]:
w_embeds[1].shape

(100,)

In [54]:
# Read the gold candidate file; each line is split on '::' into its fields.
with open('lst/lst.gold.candidates') as gold_file:
    whole_gold = gold_file.read().splitlines()

gold_cands_l = [gold_line.split('::') for gold_line in whole_gold]

In [55]:
# Map each target word to its POS tag and its list of gold substitution candidates.
# The first field of an entry is "<word>.<postag>", the second is a ';'-separated list.
gold_cands = defaultdict(dict)

for entry in gold_cands_l:
    word, postag = entry[0].split('.')
    gold_cands[word] = {'postag': postag, 'candidates': entry[1].split(';')}

In [56]:
gold_cands['bright']

{'candidates': ['alight',
  'skilled',
  'deep',
  'good',
  'sharp',
  'luminous',
  'colourful',
  'optimisitc',
  'vivid',
  'capable',
  'positive',
  'hopeful',
  'shining',
  'intelligent',
  'smart',
  'clever',
  'motivated',
  'vibrant',
  'up-and-coming',
  'well-lit',
  'gleam',
  'most talented',
  'great',
  'talented',
  'most able',
  'brilliant',
  'light',
  'clear',
  'gifted',
  'promising'],
 'postag': 'a'}

In [57]:
# Each line of the preprocessed LST test file becomes a list of whitespace-separated fields.
with open('lst/lst_test.preprocessed') as test_file:
    lst_sentences = [test_line.split() for test_line in test_file]

In [58]:
# Index the test sentences by (target word, sentence number).
# Fields per line: "<word>.<postag>", sentence number, target position, then the tokens.
lst_test_preprocessed = defaultdict(dict)

for fields in lst_sentences:
    target, target_pos = fields[0].split('.')
    sent_id = int(fields[1])
    raw_position = int(fields[2])
    raw_tokens = fields[3:]

    # Drop tokens that are a single punctuation character (members of `puncs`).
    kept_tokens = [tok for tok in raw_tokens if tok not in puncs]

    # Every punctuation token removed from before the target shifts the
    # target one slot to the left in the filtered sentence.
    removed_before = sum(
        1
        for idx, tok in enumerate(raw_tokens)
        if tok in puncs and idx < raw_position
    )

    lst_test_preprocessed[(target, sent_id)] = {
        'postag': target_pos,
        'word_position': raw_position - removed_before,
        'sentence': kept_tokens,
    }

In [59]:
lst_test_preprocessed[('bull',589)]

{'postag': 'n',
 'sentence': ['some',
  'darting',
  'terms',
  'originated',
  'outside',
  'the',
  'sport',
  'such',
  'as',
  'hat',
  'trick',
  'meaning',
  'all',
  'three',
  'darts',
  'of',
  'a',
  'player',
  "'s",
  'round',
  'landing',
  'in',
  'the',
  'bull'],
 'word_position': 23}

In [60]:
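# Planned substitute-ranking measures:
#   - baseline: word embeddings only
#   - word + context combinations
#     (the formulas are in Levy's lexical substitution paper):
#       - add
#       - mul
#       - baladd
#       - balmul
#       - kl
# Notes:
#   - the gold standard is lemmatized
#   - POS tags: noun, verb, adverb, adjective
#   - skip-gram is context-insensitive, but it can be made context-sensitive
#     by also using the context embeddings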

In [61]:
class SkipGramNeg(nn.Module):
    """Skip-gram model trained with negative sampling.

    Holds two embedding tables of identical shape: one for centre ("word")
    tokens and one for context tokens.
    """

    def __init__(self, vocabulary_size, embedding_dim):
        """Create the word and context embedding tables.

        Args:
            vocabulary_size: number of rows in each embedding table.
            embedding_dim: dimensionality of every embedding vector.
        """
        super(SkipGramNeg, self).__init__()

        # sparse embeddings for word and context vectors
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse=True)
        self.c_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse=True)

        # initialization of embeds
        # https://adoni.github.io/2017/11/08/word2vec-pytorch/
        initrange = 0.5 / embedding_dim
        self.w_embeddings.weight.data.uniform_(-initrange, initrange)
        # uniform_(-0, 0) leaves every context weight at exactly zero.
        self.c_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_words, pos_conts, neg_conts):
        """Return the mean negative-sampling loss over the batch.

        For every example the loss is
            -( log sigmoid(w . c_pos) + sum_k log sigmoid(-w . c_neg_k) )
        (Goldberg & Levy, "word2vec Explained").

        Args:
            pos_words: 1-D tensor of centre-word ids, shape (batch,).
            pos_conts: 1-D tensor of observed context ids, shape (batch,).
            neg_conts: 2-D tensor of sampled negative context ids,
                shape (batch, n_negatives).

        Returns:
            0-dim tensor holding the loss averaged over the batch.
        """
        w_out = self.w_embeddings(pos_words)      # (batch, dim)
        pos_out = self.c_embeddings(pos_conts)    # (batch, dim)
        neg_out = self.c_embeddings(neg_conts)    # (batch, n_negatives, dim)

        # Reduce over the embedding axis explicitly; a bare .squeeze() would
        # also drop the batch axis when the batch holds a single example.
        pos_val = torch.sum(w_out * pos_out, dim=-1)
        pos_loss = F.logsigmoid(pos_val)          # (batch,)

        # One score per (example, negative sample).
        neg_val = torch.bmm(neg_out, w_out.unsqueeze(2)).squeeze(2)
        # The log-sigmoid is applied to each negative score *before* summing,
        # and the sum runs only over that example's own negatives.
        neg_loss = F.logsigmoid(-neg_val).sum(dim=1)  # (batch,)

        # negate and average over the batch
        return -(pos_loss + neg_loss).mean()
    

In [86]:
def get_context_window(sentence, central_word_index, window_size):
    """Collect the tokens within `window_size` positions of the target token.

    Neighbours are gathered by increasing distance; at each distance the left
    neighbour comes before the right one. Positions that fall outside the
    sentence are skipped.

    Args:
        sentence: sequence of tokens.
        central_word_index: position of the target token in `sentence`.
        window_size: maximum distance (in tokens) on either side.

    Returns:
        List of neighbouring tokens (the target itself is not included).

    Raises:
        IndexError: if `central_word_index` does not address a token.
    """
    # Touch the target first so an invalid position fails loudly.
    _ = sentence[central_word_index]
    sentence_length = len(sentence)

    neighbours = []
    for distance in range(1, window_size + 1):
        before = central_word_index - distance
        after = central_word_index + distance

        if before >= 0:
            neighbours.append(sentence[before])
        if after < sentence_length:
            neighbours.append(sentence[after])

    return neighbours

In [87]:
def out_ranked_results(word, postag, sentence_id, sorted_d):
    """Format one ranking as a tab-separated 'RANKED' line.

    Args:
        word: target word (string).
        postag: POS tag of the target (string).
        sentence_id: identifier of the sentence; converted with str().
        sorted_d: iterable of (candidate, score) pairs, already in rank order.

    Returns:
        'RANKED\\t<word>.<postag> <sentence_id>' followed by one
        '\\t<candidate> <score>' field per pair.
    """
    header = 'RANKED\t' + word + '.' + postag + ' ' + str(sentence_id)
    fields = [header]
    fields.extend(pair[0] + ' ' + str(pair[1]) for pair in sorted_d)
    return '\t'.join(fields)

In [88]:
# Rank the gold candidates of every LST test item by cosine similarity between
# the target's and the candidate's word embedding, then write one 'RANKED'
# line per item to 'skipgram<label>.txt'.

window_size = 5


def _vector_or_none(embeddings, token_id):
    """Return ``embeddings[token_id]``, or None when the id is missing or out of range.

    Handles sequence-like containers (IndexError) and mappings (KeyError) alike.
    """
    if token_id is None or token_id < 0:
        return None
    try:
        return embeddings[token_id]
    except (KeyError, IndexError):
        return None


results_to_write = []

for ls in lst_test_preprocessed:

    # GET WORD AND SENTENCE RELATED INFO
    item = lst_test_preprocessed[ls]
    central_word = ls[0]
    sentence_id = ls[1]

    postag = item['postag']
    word_position = item['word_position']
    sentence = item['sentence']

    # get the list of candidate gold annotations
    cands = gold_cands[central_word]['candidates']

    # tokens2id.get() is used instead of tokens2id[...]: an out-of-vocabulary
    # token then yields None rather than a KeyError (plain dict) or a silently
    # inserted new key (defaultdict).
    # NOTE(review): assumes tokens2id is a dict-like mapping -- confirm.
    embed1 = _vector_or_none(w_embeds, tokens2id.get(central_word))

    cos_sims = {}

    if embed1 is not None:
        # Context vectors of the surrounding window. The cosine baseline does
        # not use them yet; they are gathered for the planned context-sensitive
        # measures (add / mul / baladd / balmul).
        context_words = get_context_window(sentence, word_position, window_size)
        context_vectors = []
        for cw in context_words:
            context_vector = _vector_or_none(c_embeds, tokens2id.get(cw))
            if context_vector is not None:
                context_vectors.append(context_vector)

    for c in cands:
        # A candidate scores 0 when either the target or the candidate has no
        # embedding (e.g. multi-word candidates, out-of-vocabulary words).
        embed2 = None
        if embed1 is not None:
            embed2 = _vector_or_none(w_embeds, tokens2id.get(c))

        if embed2 is None:
            cos_sims[c] = 0
        else:
            # Named `similarity` so the imported sklearn `cosine_similarity`
            # function is not overwritten in the notebook namespace.
            similarity = 1 - spatial.distance.cosine(embed1, embed2)
            cos_sims[c] = similarity

    # highest similarity first; ties keep the candidate-list order
    sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

    ranked_res = out_ranked_results(central_word, postag, sentence_id, sorted_d)

    results_to_write.append(ranked_res)

file_name = 'skipgram' + label + '.txt'

# One result per line, with no trailing newline after the last one.
with open(file_name, 'w') as f:
    f.write('\n'.join(results_to_write))

In [88]:
# Open issues:
#   - multi-word candidates
#   - words with no embeddings

# TODO: embed-align evaluation


In [68]:
id2tokens[0]

'division'

In [81]:
len(w_embeds)

2949