In [117]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

import string
puncs = set(string.punctuation)

In [98]:
with open('wordvecs_skipgram.pickle', 'rb') as file:
    w_embeds = pickle.load(file)

In [99]:
w_embeds[1].shape

(100,)

In [100]:
with open('lst/lst.gold.candidates') as file:
    whole_gold = file.read().splitlines() 
    
    gold_cands_l = [l.split('::') for l in whole_gold]

In [101]:
gold_cands = defaultdict(dict)

for g in gold_cands_l:
    word_pos = g[0]
    
    word, postag = word_pos.split('.')
    
    candidates = g[1].split(';')
    
    gold_cands[word] = {'postag':postag, 'candidates':candidates}

In [102]:
gold_cands['bright']

{'candidates': ['alight',
  'skilled',
  'deep',
  'good',
  'sharp',
  'luminous',
  'colourful',
  'optimisitc',
  'vivid',
  'capable',
  'positive',
  'hopeful',
  'shining',
  'intelligent',
  'smart',
  'clever',
  'motivated',
  'vibrant',
  'up-and-coming',
  'well-lit',
  'gleam',
  'most talented',
  'great',
  'talented',
  'most able',
  'brilliant',
  'light',
  'clear',
  'gifted',
  'promising'],
 'postag': 'a'}

In [103]:
with open('lst/lst_test.preprocessed') as file:
    
    lst_sentences = [l.split() for l in file.readlines()]

In [104]:
lst_test_preprocessed = defaultdict(dict)

for l in lst_sentences:
    word, postag = l[0].split('.')
    sentence_no = int(l[1])
    word_position = int(l[2])
    sentence_tokens = l[3:]
    
    processed_tokens = []
    
    #remove punctuations TODO position might change
    punc_inds = []
    for s in range(len(sentence_tokens)):
        
        min_deleted_ind = len(sentence_tokens)
        
        if sentence_tokens[s] not in puncs:
            processed_tokens.append(sentence_tokens[s])
        else:
            punc_inds.append(s)
            
            if s < min_deleted_ind:
                min_deleted_ind = s
    
    for d in punc_inds:
        if d < word_position:
            word_position -= 1
        
    lst_test_preprocessed[(word,sentence_no)] = {'postag':postag, 'word_position':word_position,'sentence':processed_tokens}

In [105]:
lst_test_preprocessed[('side',301)]

{'postag': 'n',
 'sentence': ['on',
  'sunday',
  'at',
  'craven',
  'cottage',
  'jose',
  'mourinho',
  'and',
  'his',
  'all',
  'stars',
  'exhibited',
  'all',
  'of',
  'the',
  'above',
  'symptoms',
  'and',
  'they',
  'were',
  'made',
  'to',
  'pay',
  'the',
  'price',
  'by',
  'a',
  'fulham',
  'side',
  'that',
  'had',
  'in',
  'previous',
  'weeks',
  'woken',
  'up',
  'after',
  'matches',
  'with',
  'their',
  'heads',
  'kicked',
  'in'],
 'word_position': 28}

In [106]:
# base word embed
#word+context combination
#levy lexical subs paper has the formulas

#add
# mul
# baladd
# balmul
# kl
#gold standard is lemmatized
#noun verb adverb adj
#skipgram context insensitive but we can make it sensitive via using context embeds

In [107]:
class SkipGram(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGram, self).__init__()
        
        #sparse embeddings for word and context vectors
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim) #, sparse = True
        self.lin1 = nn.Linear(embedding_dim, vocabulary_size, bias = False)
           
    def forward(self, pos_words):
        
        out = self.w_embeddings(pos_words)
        
        out = self.lin1(out)
        
        final_out = F.log_softmax(out, dim = 0)
        
        return final_out

In [108]:
with open('skipgram.pickle', 'rb') as file:
    model = pickle.load(file)

In [109]:
class SkipGramNeg(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGramNeg, self).__init__()
        
        #sparse embeddings for word and context vectors
        
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        self.c_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        
        # initialization of embeds
        # https://adoni.github.io/2017/11/08/word2vec-pytorch/

        initrange = 0.5 / embedding_dim
        self.w_embeddings.weight.data.uniform_(-initrange, initrange)
        self.c_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_words, pos_conts, neg_conts):
        
        #Loss calculation, Levy&Goldberg word2vec Explained
        #https://adoni.github.io/2017/11/08/word2vec-pytorch/
        
        w_out = self.w_embeddings(pos_words)
        
        pos_out = self.c_embeddings(pos_conts)
        neg_out = self.c_embeddings(neg_conts)
        
#         print(neg_conts)
#         print(pos_conts)
#         print(pos_words)
               
        pos_val = torch.mul(w_out, pos_out).squeeze()
        pos_val = torch.sum(pos_val, dim = 1)
        pos_loss = F.logsigmoid(pos_val)
        
        neg_val = torch.bmm(neg_out, w_out.unsqueeze(2)).squeeze()
        neg_val = torch.sum(neg_val, dim = 1)
        neg_loss = F.logsigmoid(-neg_val)
        
        final_out = pos_loss + neg_loss.sum()
        final_out = -final_out.sum()/len(pos_words) #neg and mean
         
        return final_out
    

In [110]:
with open('skipgram_neg.pickle', 'rb') as file:
    model_neg = pickle.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'skipgram_neg.pickle'

In [125]:
def get_context_window(sentence, central_word_index, window_size):
    
    context = []
    word = sentence[central_word_index]
    
    for w in range(1,window_size+1):
        
        left_cont = central_word_index - w
        right_cont = central_word_index + w
        
        #find the window words to the left and right
        #add as pair if they are inside sentence boundaries
        
        if left_cont > -1:
            context.append(sentence[left_cont])
            
        if right_cont < len(sentence):
            context.append(sentence[right_cont])
            
    return context

In [132]:
# lst_test_nouns = defaultdict(dict)
# lst_test_verbs = defaultdict(dict)
# lst_test_adjectives = defaultdict(dict)

# with open('tokens2id.pickle') as f:
#     tokens2id = pickle.load(f)
# with open('id2tokens.pickle') as f:
#     id2tokens = pickle.load(f)
    
window_size = 5

#embedding parameters
weights = model.w_embeddings.weight.detach()

for ls in lst_test_preprocessed:
    
    #GET WORD AND SENTENCE RELATED INFO
    item = lst_test_preprocessed[ls]
    central_word = ls[0]
    sentence_id = ls[1]
    
    postag = item['postag']
    word_position = item['word_position']
    sentence = item['sentence']
    
    # GET CONTEXT WORDS HERE
    context_words = get_context_window(sentence, word_position, window_size)
    
    context_ids = []
    context_vectors = []
    
    for cw in context_words:
        ctx_id = tokens2id[cw]
        context_ids.append(ctx_id)
        context_vectors.append(weights(ctx_id))
        
    #get the embedding of the word
    embed = model.w_embeddings.weight[tokens2id[central_word]]
    
    #get the list of candidate gold annotations
    cands = gold_cands[central_word]['candidates']
    
    cos_sims = defaultdict(float)
    add_heur = defaultdict(float)
    mul_heur = defaultdict(float)
    
    for c in cands:
        
        #for each candidate find the cosine similarity between target and central
        id_for_cand = tokens2id[c]
        
        embed2 = weights[id_for_cand]
    
        cosine_similarity = 1 - spatial.distance.cosine(embed1, embed2)
        cos_sims[c] = cosine_similarity
        
        #USING FORMULAS IN LEVY - LEXICAL SUBS.
#         add_res = 
#         add_heur.append(add_res)
        
#         mul_res = 
#         mul_heur.append(mul_res)

    sorted_d = sorted(cos_sims.items(), key=lambda x: x[1], reverse=True)

NameError: name 'tokens2id' is not defined