In [210]:
import numpy as np
import json
import os
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import scipy.stats as stats
import torch
import torch.nn.functional as F
from sklearn.preprocessing import normalize


In [211]:
# Probe once whether a CUDA device is usable. bsg() further down reads this
# flag to decide whether its input tensors are created on the GPU or the CPU.
CUDA = torch.cuda.is_available()
print("CUDA: %s" % CUDA)

CUDA: True


In [212]:
# Porter stemmer instance (not referenced by any of the cells visible here).
porter = PorterStemmer()
# English stop words held in a set; Lst_subs() uses it to drop stop words
# from the context window around the target word.
stop_words = set(stopwords.words("english"))

In [213]:
# Directory holding the saved models and vocabulary files; the *_lst.out
# ranking files produced by getout_skip()/getout_bsg() are written here too.
model_fld = "model"

In [214]:

def skipgram_sim(embedding, w2i, target, r_ids, context, type=None):
    """Score substitution candidates for a target word with skip-gram vectors.

    Every vector is L2-normalised first, so each dot product below is a
    cosine similarity.

    Parameters
    ----------
    embedding : indexable by word id; ``embedding[i]`` is the vector of word ``i``.
    w2i : not used by this function; kept so existing callers keep working.
    target : id of the word to be replaced.
    r_ids : ids of the replacement candidates.
    context : ids of the context words. Only read by ``'add'`` and ``'mult'``,
        where it is assumed to be non-empty (Lst_subs never emits an empty
        context).
    type : scoring scheme, one of
        ``'cosine'`` - cos(candidate, target);
        ``'add'``    - arithmetic mean of the candidate's cosine with the
                       target and with every context word;
        ``'mult'``   - geometric mean of the same cosines after mapping them
                       from [-1, 1] to [0, 1].
        (The name shadows the builtin but is part of the public signature.)

    Returns
    -------
    list with one score per entry of ``r_ids``, in the same order
    (empty if ``r_ids`` is empty).

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported schemes.
    """
    # Fail early and clearly instead of hitting an unbound ``scores`` later.
    if type not in ('cosine', 'add', 'mult'):
        raise ValueError(
            "unsupported similarity type: {!r}; "
            "expected 'cosine', 'add' or 'mult'".format(type)
        )

    # Nothing to rank; also avoids normalising an empty matrix.
    if len(r_ids) == 0:
        return []

    t = embedding[target]
    t = t / np.linalg.norm(t)

    candidates = np.array([embedding[a] for a in r_ids])
    candidates = normalize(candidates, axis=1, norm='l2')

    if type == 'cosine':
        # The context is irrelevant for the plain cosine score.
        return (candidates @ t).tolist()

    c_len = len(context)
    c = np.array([embedding[w] for w in context])
    c = normalize(c, axis=1, norm='l2')

    if type == 'add':
        return [
            (a @ t + np.sum(c @ a)) / (c_len + 1)
            for a in candidates
        ]

    # type == 'mult'
    return [
        (((t @ a + 1) / 2) * np.prod((c @ a + 1) / 2)) ** (1 / (c_len + 1))
        for a in candidates
    ]

In [222]:
# Lexical substitution task


def Lst_subs(w2i, win_size=2,
             candidates_path='data/lst/lst.gold.candidates',
             test_path='data/lst/lst_test.preprocessed',
             stopword_set=None):
    """Read the lexical-substitution data and convert it to vocabulary ids.

    Parameters
    ----------
    w2i : mapping from word to integer id.
    win_size : number of context words kept on each side of the target.
    candidates_path : file with one ``word.pos::cand1;cand2;...`` entry per line.
    test_path : tab-separated file with the columns
        ``target``, ``sentence id``, ``target position``, ``sentence``.
    stopword_set : words to drop from the context; defaults to the
        notebook-level ``stop_words`` set.

    Returns
    -------
    Six parallel lists with one entry per kept test instance:
    ``(target ids, candidate ids, context ids, sentence ids,
    candidate words, raw target strings)``.
    The candidate words are restricted to in-vocabulary words so that they
    line up one-to-one with the candidate ids (and with any scores computed
    from those ids).

    Instances are dropped when the target word is out of vocabulary or when
    no in-vocabulary context word is left.

    Raises
    ------
    KeyError
        If a test target has no entry in the candidates file.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # NOTE(review): candidates are keyed by the bare word, so if the same word
    # appears with several POS tags the last entry wins - confirm this is intended.
    candidates = {}
    with open(candidates_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            target, rest = line.split('.', maxsplit=1)
            pos_tag, rest = rest.split('::', maxsplit=1)
            candidates[target] = rest.split(';')

    t_all = []
    r_id_all = []
    c_all = []
    sent_all = []
    rep = []
    target_all = []

    with open(test_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            target, sent_id, t_pos, sentence = line.split('\t')

            target_word = target.split('.')[0]
            tokens = sentence.split()
            # Plain int(): the np.int alias no longer exists in current NumPy.
            t_pos = int(t_pos)

            if target_word not in w2i:
                continue
            target_id = w2i[target_word]

            # Context words around the target that are neither stop words nor
            # punctuation; the left side is reversed so the nearest word comes first.
            a_context = [w for w in tokens[t_pos + 1:]
                         if w.isalpha() and w not in stopword_set]
            b_context = [w for w in tokens[:t_pos][::-1]
                         if w.isalpha() and w not in stopword_set]
            context = b_context[:win_size] + a_context[:win_size]

            context_ids = [w2i[w] for w in context if w in w2i]
            if len(context_ids) == 0:
                continue

            # Filter the words and the ids together so they stay aligned.
            replacement = [w for w in candidates[target_word] if w in w2i]
            r_ids = [w2i[w] for w in replacement]

            t_all.append(target_id)
            r_id_all.append(r_ids)
            c_all.append(context_ids)
            sent_all.append(sent_id)
            rep.append(replacement)
            target_all.append(target)

    return t_all, r_id_all, c_all, sent_all, rep, target_all
    
#             scores = func(model_type, w2i, target_id, r_ids, context_ids, type="kl")
#             break
#         #             print(scores)
            
#             print('RANKED\t{} {}'.format(target, sent_id), file=f_out, end='')

#             # Sort alternative by their scores
#             words_and_scores = list(zip(replacement, scores))
            
#             words_and_scores.sort(key=lambda t: t[1], reverse=True)

#             # Write ranked replacement and their scores to file
#             for w, s in words_and_scores:
#                 print('\t{} {}'.format(w, s), file=f_out, end='')
#             print(file=f_out)
#     print("done!!")

## SkipGram

In [232]:
def getout_skip(embedding, target_all, rep_all, context_all, sent_all, rep, word, metrics="cosine"):
    """Rank the candidates of every instance with skip-gram scores and write them out.

    The result goes to ``<model_fld>/skip_<metrics>_lst.out``. Each instance
    becomes one line: ``RANKED<TAB>target sent_id`` followed by one
    ``<TAB>word score`` field per candidate, highest score first.

    The six list arguments are the parallel lists returned by Lst_subs();
    ``metrics`` is forwarded to skipgram_sim() as its ``type``.
    """
    out_path = os.path.join(model_fld, "skip_" + metrics + '_lst.out')
    instances = zip(target_all, rep_all, context_all, sent_all, rep, word)

    with open(out_path, 'w') as f_out:
        for target_id, r_ids, context_ids, sent_id, replacement, target in instances:
            scores = skipgram_sim(embedding, w2i_skip, target_id, r_ids, context_ids, type=metrics)

            f_out.write('RANKED\t{} {}'.format(target, sent_id))

            # Best-scoring candidate first; ties keep their original order.
            ranked = sorted(zip(replacement, scores), key=lambda pair: pair[1], reverse=True)

            f_out.write(''.join('\t{} {}'.format(cand, val) for cand, val in ranked))
            f_out.write('\n')

            

In [237]:
# Load the skip-gram vocabulary and embedding matrix, then rank the LST
# candidates with each of the three similarity schemes.
fl = "skipgram.npz"
with open(os.path.join(model_fld, "skipgram_w2i.txt"), 'rb') as f:
        w2i_skip = json.load(f)
# np.load on an .npz returns an archive object that keeps the file open;
# use it as a context manager so it is closed once the array is extracted.
with np.load(os.path.join(model_fld, fl)) as archive:
    embedding = archive["arr_0"]
target_all, rep_all, context_all, sent_all, rep, word = Lst_subs(w2i_skip, win_size=2)

for metric in ("cosine", "add", "mult"):
    getout_skip(embedding, target_all, rep_all, context_all, sent_all, rep, word, metric)

## Bayesian SkipGram

In [239]:
def kldiv(mu, sig, mu_rep, sig_rep):
    """KL divergence KL(N(mu, exp(sig)) || N(mu_rep, exp(sig_rep))) of diagonal Gaussians.

    ``sig`` and ``sig_rep`` are treated as log-variances (they are
    exponentiated to obtain the variances) - confirm against what the model
    actually outputs.

    Per dimension:
        0.5 * (sig_rep - sig)
        + (exp(sig) + (mu - mu_rep)**2) / (2 * exp(sig_rep))
        - 0.5

    Parameters
    ----------
    mu, sig : tensors with the dimensions on axis 1 (first distribution).
    mu_rep, sig_rep : same layout (second distribution).

    Returns
    -------
    NumPy scalar: the divergence summed over axis 1, for the first row only.
    """
    # Log-ratio of the standard deviations, written with the log-variances.
    log_ratio = 0.5 * sig_rep - 0.5 * sig
    # Variance of the first distribution plus the squared mean shift,
    # scaled by twice the variance of the second distribution.
    quad = ((mu - mu_rep) ** 2 + torch.exp(sig)) / (2 * torch.exp(sig_rep))
    kl_loss = torch.sum(log_ratio + quad - 0.5, dim=1)
    return kl_loss.cpu().detach().numpy()[0]
    

def bsg(model, w2i, target, r_ids, context, win_size = 2,type=None):
    """Score substitution candidates with the Bayesian skip-gram model.

    The model is run once on the target word in its context and once on every
    candidate in that same context; each candidate is then scored by
    ``kldiv(target, candidate)``.

    Parameters
    ----------
    model : object with ``eval()`` and ``forward(words, context, win_size)``;
        the 2nd and 3rd items of the 5-tuple it returns are passed to kldiv()
        as ``mu`` and ``sig``.
    w2i : word-to-id mapping; only ``w2i["<pad>"]`` is read.
    target : id of the word to be replaced.
    r_ids : ids of the replacement candidates.
    context : ids of the context words; padded with the pad id up to
        ``2 * win_size`` entries. The caller's list is left untouched.
    win_size : context window size per side, forwarded to the model.
    type : scoring scheme; only ``"kl"`` is implemented.

    Returns
    -------
    list with one KL divergence per candidate, in the order of ``r_ids``
    (a smaller divergence means the candidate is closer to the target).

    Raises
    ------
    ValueError
        If ``type`` is not ``"kl"``.
    """
    # Reject unsupported schemes up front instead of silently returning None.
    if type != "kl":
        raise ValueError("unsupported score type: {!r}; expected 'kl'".format(type))

    model.eval()
    pad_id = w2i["<pad>"]

    # Pad a *copy*: the same context lists are reused across calls, and
    # padding them in place would permanently alter the caller's data.
    padded = list(context)
    missing = win_size * 2 - len(padded)
    if missing > 0:
        padded += [pad_id] * missing

    device = torch.device("cuda" if CUDA else "cpu")
    target_t = torch.tensor([target], dtype=torch.long, device=device)
    context_t = torch.tensor(padded, dtype=torch.long, device=device).unsqueeze(0)
    cand_t = torch.tensor(r_ids, dtype=torch.long, device=device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Forward pass for the target word.
        _, mu, sig, _, _ = model.forward(target_t, context_t, win_size)

        # Forward pass for every candidate, each with the same context.
        context_rep = context_t.repeat(cand_t.size()[0], 1)
        _, mu_r, sig_r, _, _ = model.forward(cand_t, context_rep, win_size)

    return [
        kldiv(mu, sig, mu_rep.unsqueeze(0), sig_rep.unsqueeze(0))
        for mu_rep, sig_rep in zip(mu_r, sig_r)
    ]

In [234]:
def getout_bsg(model, target_all, rep_all, context_all, sent_all, rep, word,metrics=None, win_size=2):
    """Rank the candidates of every instance with the BSG model and write them out.

    The result goes to ``<model_fld>/bsg_<metrics>_lst.out``. Each instance
    becomes one line: ``RANKED<TAB>target sent_id`` followed by one
    ``<TAB>word score`` field per candidate, best candidate first.

    The six list arguments are the parallel lists returned by Lst_subs();
    ``metrics`` and ``win_size`` are forwarded to bsg(). The candidate words
    in ``rep`` are assumed to line up one-to-one with the returned scores.

    For ``metrics="kl"`` the written score is the *negated* divergence: KL is
    a distance (smaller = closer), so negating it makes "larger = better"
    hold, which is what the descending sort below relies on.

    NOTE(review): the vocabulary passed to bsg() is the notebook-level
    ``w2i_skip`` - confirm it matches the vocabulary the BSG model was
    trained with.

    Raises
    ------
    ValueError
        If ``metrics`` is not a string (the default ``None`` cannot be used
        to build the output file name).
    """
    if not isinstance(metrics, str):
        raise ValueError(
            "metrics must name a scoring method (e.g. 'kl'), got {!r}".format(metrics)
        )

    with open((os.path.join(model_fld,"bsg_"+metrics+'_lst.out')), 'w') as f_out:
            for target_id, r_ids, context_ids, sent_id, replacement, target in zip(target_all, rep_all, \
                                                                           context_all, sent_all, rep, word):
                scores =  bsg(model, w2i_skip, target_id, r_ids, context_ids, win_size, metrics)

                # Turn the divergence into a similarity (see docstring).
                if metrics == "kl":
                    scores = [-s for s in scores]

                print('RANKED\t{} {}'.format(target, sent_id), file=f_out, end='')

                # Sort alternatives by their scores, best first
                words_and_scores = list(zip(replacement, scores))

                words_and_scores.sort(key=lambda t: t[1], reverse=True)

                # Write ranked replacements and their scores to file
                for w, s in words_and_scores:
                    print('\t{} {}'.format(w, s), file=f_out, end='')
                print(file=f_out)

In [241]:
# Presumably required so that torch.load can rebuild the pickled model class.
from bayesian_skip import BayesianGram

fl = "bsg"
with open(os.path.join(model_fld, "bsg_w2i.txt"), 'rb') as f:
        w2i_bsg = json.load(f)

# The ids fed to the BSG model must come from the BSG vocabulary, not from
# the skip-gram one.
# NOTE(review): getout_bsg() still looks up "<pad>" in w2i_skip - confirm both
# vocabularies agree on that id.
target_all, rep_all, context_all, sent_all, rep, word = Lst_subs(w2i_bsg, win_size=5)
# NOTE: torch.load unpickles arbitrary Python objects - only load checkpoints
# from a trusted source.
model = torch.load(os.path.join(model_fld, fl))

getout_bsg(model, target_all, rep_all, context_all, sent_all, rep, word,metrics="kl", win_size = 5)


## EmbedAlign