In [1]:
import pandas as pd 
import glob

from pathlib import Path

In [2]:
# Please specify the path of the input dataset:
input_dataset= './data/krapivin2009/test/'

# Create every output folder up front.
# The folders live under "./Output/" because that is where the cells below
# read and write their files (e.g. './Output/PKE/', './Output/entities-URIs/').
# Creating them under "../Output/" left those writes without a target folder.
Path("./Output/").mkdir(parents=True, exist_ok=True)
Path("./Output/PKE/").mkdir(parents=True,
                            exist_ok=True)  # present keyphrase output
Path("./Output/AKE/").mkdir(parents=True,
                            exist_ok=True)  # absent keyphrase output
Path("./Output/entities-URIs/").mkdir(parents=True,
                                      exist_ok=True)  # for saving output of MAG (linked entities URIs)

Path("./Output/AKE-babelnet/").mkdir(parents=True,
                                     exist_ok=True)  # absent keyphrase from BabelNet
Path("./Output/AKE-DBpedia/").mkdir(parents=True,
                                    exist_ok=True)  # absent keyphrase from DBpedia


## Present Keyphrase Generation (PKE):

In [3]:
import os

from keybert import KeyBERT # for present keyphrase extraction
import requests

from nltk.stem import WordNetLemmatizer
from py_babelnet.calls import BabelnetAPI

In [10]:
kw_model = KeyBERT()

# Iterate over all files in the dataset ...
fNames= glob.glob(input_dataset+'*.txt')

for file in fNames:

    # Map the dataset file (<id>.txt) to its abstract (<id>.abstr).
    doc_id = os.path.splitext(os.path.basename(file))[0]
    abstract_name = doc_id+'.abstr'

    # Read the content of the input document; the context manager closes the
    # handle even if reading fails.
    with open('./Krapivin_Abstracts/'+abstract_name, mode='r') as inFile:
        input_doc = inFile.read()
    # NOTE(review): newlines are removed without inserting a space, so words at
    # line breaks are glued together -- confirm this is intended.
    input_doc=input_doc.replace('\t', ' ').replace('\n', '')

    # Extract present keyphrases (2- to 4-grams, English stop words removed).
    keywords = kw_model.extract_keywords(input_doc, keyphrase_ngram_range=(2, 4),
                                         stop_words='english')

    # Save the keyphrases, one per line, without their relevance scores.
    final_keywords = "\n".join(keyword[0] for keyword in keywords)

    # NOTE(review): the result is stored as <id>.abstr, but the entity-linking
    # cell further down globs './Output/PKE/*.txt' -- confirm which extension
    # is the intended one.
    with open('./Output/PKE/'+abstract_name, 'w') as outFile:
        outFile.write(final_keywords.rstrip())

## Absent Keyphrase Generation:

### helper functions:

In [5]:
# process keyphrases to link with DBpedia based on ngram matching
def preprocess_keywords_ngrams(inputFile):
    """Build the ``<entity>``-tagged text for all keyphrases in a file.

    The file is read with ``pd.read_csv`` (no header) and the first column is
    taken as the list of keyphrases. For every keyphrase, single quotes are
    removed and the following candidates are emitted, in this order:

    1. the full keyphrase,
    2. every contiguous word n-gram of the keyphrase, from the longest
       (one word shorter than the keyphrase) down to single words, each
       length scanned left to right.

    Parameters
    ----------
    inputFile : str or path-like
        Path of the file holding one keyphrase per line.

    Returns
    -------
    str
        Every candidate wrapped as ``<entity>candidate</entity> `` (note the
        trailing space) and concatenated. An empty string is returned when the
        file contains no keyphrases.
    """
    try:
        present_keyphrase = pd.read_csv(inputFile, header= None)
    except pd.errors.EmptyDataError:
        # An empty keyphrase file has nothing to link; report that instead of
        # letting pandas abort the whole run.
        return ""

    candidates = []
    for keyphrase in present_keyphrase[0].tolist():
        keyphrase = keyphrase.replace("'", "")
        words = keyphrase.split(" ")

        candidates.append(keyphrase)
        # Sliding window over the words: window sizes n-1, n-2, ..., 1.
        for size in range(len(words) - 1, 0, -1):
            for start in range(len(words) - size + 1):
                candidates.append(" ".join(words[start:start + size]))

    return "".join("<entity>" + candidate + "</entity> "
                   for candidate in candidates)

# save named entities URIs from DBpedia into file
def save_dict_to_file(dic, fName):
    """Write ``str(dic)`` to ``./Output/entities-URIs/<fName>``.

    Parameters
    ----------
    dic : object
        Content to store; it is converted with ``str()`` before writing.
    fName : str
        Name of the file to create inside ``./Output/entities-URIs/``.
        An existing file with that name is overwritten.
    """
    # Make sure the target folder exists so the write cannot fail on a
    # missing directory.
    Path('./Output/entities-URIs/').mkdir(parents=True, exist_ok=True)

    # UTF-8 is written explicitly because these files are read back with
    # pd.read_csv, which decodes UTF-8 by default. The context manager closes
    # the handle even if the write raises.
    with open('./Output/entities-URIs/'+fName, 'w', encoding='utf-8') as f:
        f.write(str(dic))

#### Finding Entities URIs using MAG system

In [7]:
fNames= glob.glob("./Output/PKE/*.txt")

for file in fNames:
    # Skip empty keyphrase files: there is nothing to link, and pandas cannot
    # parse an empty file.
    if os.stat(file).st_size == 0:
        continue

    keywords = preprocess_keywords_ngrams(file)

    # Request body expected by the local MAG/AGDISTIS service.
    # NOTE(review): the body is assembled by string concatenation, so a
    # keyphrase containing a double quote, backslash, '&' or '%' would corrupt
    # it -- confirm such characters cannot occur, or build it with json.dumps.
    mydata= 'text={"agstring":"'+keywords+'","maxkeywords":10,"topics":[]}&type=json'
    resp=requests.post("http://localhost:8080/AGDISTIS",data=mydata)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    resp.raise_for_status()

    json_data= resp.json()

    # One linked entity URL per line.
    linked_entities = "".join(node['entityURL']+"\n"
                              for node in json_data['topNodes'])

    save_dict_to_file(linked_entities, os.path.basename(file))

### Absent Keyphrase Generation with BabelNet

### Setup BabelNet API

Please replace the API key in the code below with your own BabelNet access token. More details can be found here:
https://babelnet.org/guide

In [8]:
# The BabelNet access token is taken from the BABELNET_API_KEY environment
# variable when it is set, so a personal key never has to be typed into the
# notebook.
# FIXME(security): the literal fallback below is a credential committed to the
# source; it is kept only so existing runs keep working. Remove it (and revoke
# the key) once every user supplies BABELNET_API_KEY.
api = BabelnetAPI(os.environ.get('BABELNET_API_KEY',
                                 '6a01c7b8-50a2-4a18-9385-635ab5e8e489'))

# Shared lemmatizer used to normalise both the looked-up words and the terms
# returned by BabelNet.
lemmatizer = WordNetLemmatizer()

def babelNet_linking(word):
    """Return the set of BabelNet terms related to ``word``.

    The word is lemmatized and its English noun senses are requested through
    the module-level ``api`` client. For every sense, the ``fullLemma``
    property is lower-cased and lemmatized.

    Parameters
    ----------
    word : str
        Term to look up.

    Returns
    -------
    set of str
        The distinct normalised lemmas of all returned senses.
    """
    lookup_lemma = lemmatizer.lemmatize(word)
    found_senses = api.get_senses(lemma = lookup_lemma, pos="NOUN", searchLang = "EN")

    # Lower-case first so that all terms are aligned before lemmatizing;
    # the set removes duplicates.
    return {
        lemmatizer.lemmatize(sense['properties']['fullLemma'].lower())
        for sense in found_senses
    }

def save_to_file(list_related_terms, fName):
    """Write ``str(list_related_terms)`` to ``./Output/AKE-babelnet/<fName>``.

    Parameters
    ----------
    list_related_terms : object
        Collection of related terms; its ``str()`` representation is what gets
        stored (for a set, text such as ``{'term a', 'term b'}``).
    fName : str
        Name of the file to create inside ``./Output/AKE-babelnet/``.
        An existing file with that name is overwritten.
    """
    # Make sure the target folder exists so the write cannot fail on a
    # missing directory.
    Path('./Output/AKE-babelnet/').mkdir(parents=True, exist_ok=True)

    # The context manager closes the handle even if the write raises.
    with open('./Output/AKE-babelnet/'+fName,'w') as f:
        f.write(str(list_related_terms))

In [10]:
# The linked-entity files are written to './Output/entities-URIs/', so that is
# the folder to read them from.
fNames = glob.glob("./Output/entities-URIs/*.txt")

# For each document which may contain linked entities:
for file in fNames:
    if os.stat(file).st_size == 0:  # skip empty documents
        continue

    linking_df = pd.read_csv(file, header= None, on_bad_lines='skip')

    # Keep only the last path segment of every entity URI.
    linked_entities=[x.split('/')[-1] for x in linking_df[0].tolist()]

    # Collect the related terms of ALL entities of this document. Starting
    # from a fresh set per file also prevents terms of a previous document
    # from leaking into this one.
    related_terms = set()
    for entity in linked_entities:
        related_terms.update(babelNet_linking(entity))

    # Save the output of BabelNet linking. Nothing is written when no term was
    # found: an empty set would be stored as the literal text 'set()', which
    # is not a usable keyphrase list.
    if related_terms:
        save_to_file(related_terms, os.path.basename(file))
        

### Absent Keyphrase Generation with DBpedia

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Folder for the absent keyphrases obtained from DBpedia. It is created under
# "./Output/" because that is where the DBpedia results are written and read.
Path("./Output/AKE-DBpedia/").mkdir(parents=True,
                                    exist_ok=True)  # absent keyphrase from DBpedia


#### Linking with DBpedia to get related terms (dct:subject, gold:hypernym)

In [None]:
# Client for the public DBpedia SPARQL endpoint, shared by all lookups.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

def dbpedia_linking(uri):
    """Return the DBpedia terms related to ``uri``.

    Two SPARQL queries are sent through the module-level ``sparql`` client:
    one for the ``dct:subject`` values of the resource and one for its
    ``gold:hypernym`` values. Each returned value is reduced to the text after
    its last ``/`` and, within that, after its last ``:``.

    Parameters
    ----------
    uri : str
        Resource reference exactly as it must appear in the query
        (the caller passes it wrapped in angle brackets).

    Returns
    -------
    list of str
        Subject terms first, then hypernym terms, in the order returned.
    """
    # NOTE(review): the query declares only the rdfs prefix; dct: and gold:
    # are presumably predefined by the endpoint -- confirm.
    query_head = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?subject
        WHERE { """
    query_tail = """ ?subject }
        """

    related_terms = []
    # Same query shape for both predicates; subjects are collected before
    # hypernyms.
    for predicate in ("dct:subject", "gold:hypernym"):
        sparql.setQuery(query_head + uri + " " + predicate + query_tail)
        sparql.setReturnFormat(JSON)
        bindings = sparql.query().convert()["results"]["bindings"]

        related_terms.extend(
            binding['subject']['value'].split('/')[-1].split(':')[-1]
            for binding in bindings)

    return related_terms

def save_to_file(list_related_terms, fName):
    """Write the related terms to ``./Output/AKE-DBpedia/<fName>``, one per line.

    Parameters
    ----------
    list_related_terms : iterable of str
        Terms to store; each one is written followed by a newline.
    fName : str
        Name of the file to create inside ``./Output/AKE-DBpedia/``.
        An existing file with that name is overwritten.
    """
    # Make sure the target folder exists so the write cannot fail on a
    # missing directory.
    Path('./Output/AKE-DBpedia/').mkdir(parents=True, exist_ok=True)

    # The context manager closes the handle even if a write raises.
    with open('./Output/AKE-DBpedia/'+fName,'w') as f:
        f.writelines(word+"\n" for word in list_related_terms)

In [None]:
fNames = glob.glob("./Output/entities-URIs/*.txt")

# For each document which may contain linked entities:
for file in fNames:
    if os.stat(file).st_size == 0:  # skip empty documents
        continue

    linking_df = pd.read_csv(file, header=None, on_bad_lines='skip')

    # Collect the related terms of ALL URIs of this document. Starting from a
    # fresh list per file also prevents terms of a previous document from
    # leaking into this one.
    related_terms = []
    for uri in linking_df[0].tolist():
        # The SPARQL query needs the URI wrapped in angle brackets.
        related_terms.extend(dbpedia_linking("<"+uri+">"))

    # Save the output of DBpedia linking.
    save_to_file(related_terms, os.path.basename(file))


# Final Keyphrase Generation (Semantic Matching):  

In [12]:
# Get the embedding of the document (abstract) from the BERT model.
# Get the embeddings of the candidate keyphrases (present and absent) from the same model.
# Compute the cosine similarity between the document embedding and each keyphrase embedding,
# then return the keyphrases sorted by that similarity.

from sentence_transformers import SentenceTransformer, util
# Make sure the folder that receives the ranked keyphrases is available
# (nothing happens if it is already there).
Path("./Output/AKE/").mkdir(exist_ok=True, parents=True)

# Sentence-embedding model used to encode the abstract and the candidate keyphrases.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [13]:
def _parse_set_repr(raw):
    """Turn the text produced by ``str(a_set_of_terms)`` back into a list of terms."""
    import ast  # local import: only needed by this helper

    raw = raw.strip()
    if raw in ('', 'set()', '{}'):
        # Nothing stored, or the textual form of an empty set/dict.
        return []

    try:
        # Re-read the content between the braces as a list literal. This keeps
        # the stored order and handles commas and quotes inside a term.
        terms = ast.literal_eval('[' + raw[1:-1] + ']')
    except (ValueError, SyntaxError):
        # Not a valid literal: fall back to plain comma splitting.
        return [part[1:-1].replace('\'', '') for part in raw[1:-1].split(',')]

    return [str(term) for term in terms]


def get_Keyphrases(fileName):
    """Collect the absent keyphrase candidates stored for one document.

    Candidates come from two optional files that share the same name:

    * ``./Output/AKE-DBpedia/<fileName>``: one term per line;
    * ``./Output/AKE-babelnet/<fileName>``: the ``str()`` form of a set of terms.

    A file that does not exist simply contributes no candidates.

    Parameters
    ----------
    fileName : str
        Name of the document's result file (without directory).

    Returns
    -------
    list of str
        DBpedia terms followed by BabelNet terms.
    """
    # check if there are DBpedia keyphrases for the input file:
    dbpedia_keyphrases = []

    DBpedia_AKE = Path('./Output/AKE-DBpedia/'+fileName)

    if DBpedia_AKE.is_file():
        with open(DBpedia_AKE, mode='r') as handle:
            # Blank lines carry no keyphrase and are dropped.
            dbpedia_keyphrases = [line.replace('\n', '')
                                  for line in handle if line.strip()]

    # check if there are BabelNet keyphrases for the input file:
    babelNet_keyphrases = []

    babelNet_AKE = Path('./Output/AKE-babelnet/'+fileName)

    if babelNet_AKE.is_file():
        with open(babelNet_AKE, mode='r') as handle:
            babelNet_keyphrases = _parse_set_repr(handle.read())

    # return list
    absent_keyphrases = dbpedia_keyphrases+babelNet_keyphrases

    return absent_keyphrases

In [14]:
# Iterate over all files in the dataset ...
fNames= glob.glob(input_dataset+'*.abstr')

for file in fNames:

    # Read the content of the input document; the context manager closes the
    # handle even if reading fails.
    with open(file, mode='r') as inFile:
        input_doc = inFile.read()
    input_doc=input_doc.replace('\t', ' ').replace('\n', '')

    #---- get the predicted keyphrases ---#
    fileName= os.path.basename(file)

    predicted_keyphrases = get_Keyphrases(fileName)

    final_keyphrases = []

    # Only rank when there is at least one candidate; with no candidates there
    # is nothing to encode or compare, and an empty result file is written.
    if predicted_keyphrases:
        doc_embedding = model.encode(input_doc, convert_to_tensor=True)
        keyphrase_embedding = model.encode(predicted_keyphrases, convert_to_tensor=True)

        #----- Compute cosine similarities -----#
        cosine_scores = util.pytorch_cos_sim(doc_embedding, keyphrase_embedding)

        # Row 0 holds the similarity of the document to every candidate;
        # plain floats are used as sort keys.
        scores = cosine_scores[0].tolist()

        #--- Pair every keyphrase with its score ----#
        # A keyphrase that occurs more than once keeps its last score.
        similar_keyphrases = dict(zip(predicted_keyphrases, scores))

        sorted_keyphrase=sorted(similar_keyphrases.items(), key=lambda x: x[1], reverse=True)

        final_keyphrases= [keyphrase[0] for keyphrase in sorted_keyphrase]

    #--- save ranked keyphrases into file, most similar first ----#
    with open('./Output/AKE/'+fileName, 'w') as outFile:
        outFile.writelines("%s\n" % keyphrase for keyphrase in final_keyphrases)

