In [55]:
# '''
# Steps: 
# -----:

# + get the word embeddings; for a keyphrase with more than one word, take the average of its word embeddings
# + compute similarity scores (e.g. cosine) and, for each predicted keyphrase, take its maximum similarity over all gold-standard keyphrases
# + GM(pred, gold) = sum of these maximum similarity scores / number of predicted keyphrases -- see Eq. (14) in https://arxiv.org/abs/1910.07897

# Further, we can compute a symmetric score:
# + T = GM (gold, pred) + GM (pred, gold)
# + symmGM(pred, gold)= T/2

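# >> Finally, embedding-based greedy matching with a length penalty
# (p = number of predicted keyphrases, g = number of gold-standard keyphrases; alpha and beta weight the penalty):

# GM(pred, gold) = sum of scores(pred_i, gold) / (p + alpha * max(0, g - p))
# GM(gold, pred) = sum of scores(gold_i, pred) / (g + beta * max(0, p - g))
# T = GM(pred, gold) + GM(gold, pred)
# G(pred, gold) = T / 2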

# '''

In [1]:
import torch
from sentence_transformers import SentenceTransformer, util

import glob
import csv
from pathlib import Path

model = SentenceTransformer('all-MiniLM-L6-v2')

In [52]:
def _as_embedding_matrix(embeddings):
    """Return ``embeddings`` as a floating-point tensor with one embedding per row."""
    matrix = embeddings if isinstance(embeddings, torch.Tensor) else torch.as_tensor(embeddings)
    if not matrix.is_floating_point():
        matrix = matrix.float()
    # A lone, non-empty vector is treated as a single embedding (1 x dim).
    if matrix.dim() == 1 and matrix.numel() > 0:
        matrix = matrix.unsqueeze(0)
    return matrix


def greedy_matching(list_A, List_B, x):
    """Directed embedding-based greedy-matching score GM(A, B).

    Each embedding in ``list_A`` is matched with its most similar embedding in
    ``List_B`` (cosine similarity). The best-match similarities are summed and
    divided by a length-penalised count:

        GM(A, B) = sum_i max_j cos(A_i, B_j) / (|A| + x * max(0, |B| - |A|))

    The penalty term is non-zero only when ``list_A`` has fewer entries than
    ``List_B``, so a short list cannot obtain a high score from a handful of
    good matches.

    Args:
        list_A: Embeddings being matched, as a 2-D tensor or array-like with
            one embedding per row. A 1-D input is treated as one embedding.
        List_B: Embeddings to match against, in the same layout.
        x: Weight of the length penalty (0 disables it).

    Returns:
        A 0-dim ``torch.Tensor`` holding the score. A zero tensor is returned
        when either side contains no embeddings.
    """
    emb_a = _as_embedding_matrix(list_A)
    emb_b = _as_embedding_matrix(List_B)

    # Nothing to match on one side: define the score as 0 rather than failing
    # on an empty max() or a zero denominator.
    if emb_a.numel() == 0 or emb_b.numel() == 0:
        return torch.zeros(())

    # Row-normalise both sides so a single matrix product yields every
    # pairwise cosine similarity (rows: list_A entries, columns: List_B entries).
    unit_a = torch.nn.functional.normalize(emb_a, p=2, dim=1)
    unit_b = torch.nn.functional.normalize(emb_b, p=2, dim=1)
    pairwise_cosine = unit_a @ unit_b.T

    sum_of_similarities = pairwise_cosine.max(dim=1).values.sum()

    count_a, count_b = emb_a.shape[0], emb_b.shape[0]
    GM = sum_of_similarities / (count_a + x * max(0, count_b - count_a))

    return GM


def similarty_score(list_A, List_B, alpha=1, beta=1):
    """Symmetric greedy-matching score between two sets of embeddings.

    Averages the two directed scores ``greedy_matching(list_A, List_B)`` and
    ``greedy_matching(List_B, list_A)``.

    Args:
        list_A: First set of embeddings.
        List_B: Second set of embeddings.
        alpha: Passed as ``x`` to the ``list_A`` -> ``List_B`` direction.
        beta: Passed as ``x`` to the ``List_B`` -> ``list_A`` direction.

    Returns:
        The averaged score as a Python number rounded to three decimals.
    """
    forward = greedy_matching(list_A, List_B, x=alpha)
    backward = greedy_matching(List_B, list_A, x=beta)
    symmetric = (forward + backward) / 2
    # .item() unwraps the single-element tensor before rounding.
    return round(symmetric.item(), 3)


In [46]:
# Keep a named handle to the results file: a file opened anonymously inside
# csv.writer(...) can never be closed explicitly. Call `output_file.close()`
# once all rows have been written so buffered rows are flushed to disk.
output_file = open('./Output/GreedyMatching.csv', 'w', encoding='UTF8', newline='')
file_writer = csv.writer(output_file, delimiter='\t')
file_writer.writerow(['id', 'greedMatching'])

18

In [53]:
# Score every document: embed its gold-standard and predicted keyphrases and
# compare the two sets with the symmetric greedy-matching similarity.

groundTruth_path= '../Inspec/keys/*.key'
predicted_path='./Output/Ranking/'


def _read_keyphrases(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
    with open(path) as fileIn:
        return [line.strip() for line in fileIn if line.strip()]


# glob returns paths in arbitrary order; sort so the CSV rows are reproducible.
fNames= sorted(glob.glob(groundTruth_path))

for file in fNames:

    # '<dir>/<id>.key' -> '<id>'; Path handles either path separator.
    doc_id = Path(file).stem
    fileName = doc_id + '.txt'

    # Both files are read the same way, so gold and predicted phrases are
    # cleaned identically before being embedded.
    groundtruth_keyphrases = _read_keyphrases(file)
    predicted_keyphrases = _read_keyphrases(predicted_path + fileName)

    if groundtruth_keyphrases and predicted_keyphrases:
        groundtruth_embedding = model.encode(groundtruth_keyphrases, convert_to_tensor=True)
        keyphrase_embedding = model.encode(predicted_keyphrases, convert_to_tensor=True)
        score = similarty_score(keyphrase_embedding, groundtruth_embedding, 1, 1) # set hyper-parameters alpha=1, beta=1
    else:
        # Nothing to compare on one side: record a zero score instead of
        # failing while encoding or scoring an empty list.
        score = 0.0

    #save evaluation results into csv file
    file_writer.writerow([doc_id, score])
    


