In [None]:
%pip install -U sentence-transformers
%pip install hnswlib
%pip install SPARQLWrapper
%pip install rdflib
%pip install spacy
# python -m spacy download en_core_web_sm

In [18]:
import os
import sys
import hnswlib
import pickle
import rdflib
import SPARQLWrapper
import numpy as np
import pandas as pd
import spacy
from io import StringIO
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, RDF, XML 
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L12-v2')
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

data_path = os.getcwd()+'\\data'

In [32]:
with open(os.path.join(data_path, 'id2pred.pickle'), "rb") as input_file:
    id2pred = pickle.load(input_file) 
    
with open(os.path.join(data_path, 'pred2id.pickle'), "rb") as input_file:
    pred2id = pickle.load(input_file)    
    
with open(os.path.join(data_path, 'pred2emb.pickle'), "rb") as input_file:
    pred2emb = pickle.load(input_file)  

with open(os.path.join(data_path, 'id2res.pickle'), "rb") as input_file:
    id2res = pickle.load(input_file)
    
# with open(os.path.join(data_path, 'res2id.pickle'), "rb") as input_file:
#     res2id = pickle.load(input_file)    

# with open(os.path.join(data_path, 'res2emb.pickle'), "rb") as input_file:   
#     res2emb = pickle.load(input_file)

with open(os.path.join(data_path, 'contrib2emb.pickle'), "rb") as input_file: 
    contrib2emb = pickle.load(input_file) 

with open(os.path.join(data_path, 'contrib2id.pickle'), "rb") as input_file: 
    contrib2id = pickle.load(input_file)

with open(os.path.join(data_path, 'contrib2emb.pickle'), "rb") as input_file:    
    contrib2emb = pickle.load(input_file)    
    
      
with open(os.path.join(data_path, 'id2paper.pickle'), "rb") as input_file:     
    id2paper = pickle.load(input_file)
    
with open(os.path.join(data_path, 'paper2id.pickle'), "rb") as input_file: 
    paper2id = pickle.load(input_file)    
    
with open(os.path.join(data_path, 'paper2emb.pickle'), "rb") as input_file:    
    paper2emb = pickle.load(input_file)
    

with open(os.path.join(data_path, 'pred_index.pickle'), "rb") as input_file:     
    pred_index = pickle.load(input_file) 

# with open(os.path.join(data_path, 'res_index.pickle'), "rb") as input_file:    
#     res_index = pickle.load(input_file)  

with open(os.path.join(data_path, 'contrib_index.pickle'), "rb") as input_file:    
    contrib_index = pickle.load(input_file)     

with open(os.path.join(data_path, 'paper_index.pickle'), "rb") as input_file: 
    paper_index = pickle.load(input_file)    

In [67]:
def filter_q(question:str):
    """
    filter out unnecessary words from a question
    """
    res = []
    tok_pos = []
    for token in nlp(question):
        tok_pos.append((token.lemma_, token.pos_))
        if token.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"]: # keep only nouns, adjectives and verbs
            if token.lemma_ not in stopwords:
                res.append(str(token))
    return " ".join(res)

def create_n_grams(sentence, n=2):
    """
    create n_grams for a sentence after its filtering
    """
    filtered_sentence = filter_q(sentence)
    words = filtered_sentence.split(" ")
    n_grams = []
    text_l = len(words)
    for i in range(2, n+1):
        for j in range(text_l-i):
            n_gram = words[j:j+i]
            n_gram_text = " ".join(n_gram)
            n_grams.append(n_gram_text)
    return n_grams

def set_prefixes(graph, prefixes=[]):
    """
    set all prefixes from a list for a graph
    """
    for prefix in prefixes:
        graph.bind(prefix[0], prefix[1])

In [68]:
def n_closest(question, source, n, n_gram):
    """
    return n elements of orkg relevant for a question
    """
    if n_gram:
        filteted_text = filter_q(question)
        texts = create_n_grams(filteted_text)
    else:
        texts = [filter_q(question)]
    if source=="pred":
        tuples = []
        for text in texts:
            out, scores = pred_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_preds = list(pred2emb.keys())
        output = [all_preds[pair[0]] for pair in sorted_out]
        return output
    elif source=="res":
        tuples = []
        for text in texts:
            out, scores = res_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_res = list(res2emb.keys())
        output = [all_res[pair[0]] for pair in sorted_out]
        return output
    elif source=="paper":
        tuples = []
        for text in texts:
            out, scores = paper_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_papers = list(paper2emb.keys())
        papers = [all_papers[pair[0]] for pair in sorted_out]
        output = [str(uri) for uri in papers]
        return output
    elif source=="contrib":
        tuples = []
        for text in texts:
            out, scores = contrib_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_contribs = list(contrib2emb.keys())
        contribs = [all_contribs[pair[0]] for pair in sorted_out]
        output = [str(uri) for uri in contribs]
        return output
    
def sort_tuples(l):
    out = []
    black_list = []
    tuples = sorted(l, key=lambda x: x[1], reverse=False)
    for tup in tuples:
        if tup[0] in black_list:
            continue
        else:
            out.append(tup)
            black_list.append(tup[0])
    return out

In [69]:
def construct_subgraph(triples=[], predicates_list=[]):
    """
    return a graph for a list of triples
    """
    res_graph = Graph()
    set_prefixes(res_graph, prefixes)
    for ind, triple in enumerate(triples):
        pred_id = str(triple[1]).split("/")[-1]
        try:
            pred = id2pred[pred_id]
        except Exception as e:
            continue
        if pred not in predicates_list:
            continue
        else:
            text_triple = swap_prefixes(triple)
            res_graph.add(text_triple) 
    return res_graph

def get_subgraph_string(graph):
    """
    return string representation of a graph
    """
    tmp = sys.stdout
    my_result = StringIO()
    sys.stdout = my_result
    graph.print() 
    sys.stdout = tmp
    return my_result.getvalue()

def swap_prefixes(triple):
    """
    replace ids in a triple with textual representation
    """
    subj_id = str(triple[0]).split("/")[-1]
    subj_base = str(triple[0]).split("/")[0:-1]
    if subj_id in id2res.keys():
        subj = id2res[subj_id].replace(" ", "_").replace(":", "_")
        subj_base.append(subj)
        triple[0] = rdflib.term.URIRef("/".join(subj_base))
    

        pred_id = str(triple[1]).split("/")[-1]
        pred_base = str(triple[1]).split("/")[0:-1]
    if pred_id in id2pred.keys():
        pred = id2pred[pred_id].replace(" ", "_").replace(":", "_")
        pred_base.append(pred)
        triple[1] = rdflib.term.URIRef("/".join(pred_base))
    
    obj_id = str(triple[2]).split("/")[-1]
    obj_base = str(triple[2]).split("/")[0:-1]
    if obj_id in id2res.keys():
        obj = id2res[obj_id].replace(" ", "_").replace(":", "_")
        obj_base.append(obj)
        triple[2] = rdflib.term.URIRef("/".join(obj_base))
    return triple    

In [70]:
def process_paper(paper_title, predicates_list, graph):
    """
    return a subgraph of orkg for a paper using only relevant predicates
    """
    paper_id = paper2id[paper_title]
    left = '{'
    right = '}'
    middle = f'orkgr:{paper_id} ?x ?y. ?y ?pred ?label'
    template = f'PREFIX orkgp: <http://orkg.org/orkg/predicate/> PREFIX orkgc: <http://orkg.org/orkg/class/> PREFIX orkgr: <http://orkg.org/orkg/resource/> SELECT ?pred ?label WHERE {left}{middle}{right}'
    result = graph.query(template)
    triples = []
    for triple in result:
        triples.append([rdflib.term.URIRef(f'http://orkg.org/orkg/resource/{paper_id }'), triple[0], triple[1]])
    graph = construct_subgraph(triples, predicates_list)
    return graph

def process_contrib(contrib_title, predicates_list, graph):
    """
    return a subgraph of orkg for a contribution using only relevant predicates
    """
    contrib_id = contrib2id[contrib_title]
    left = '{'
    right = '}'
    middle = f'orkgr:{contrib_id} ?x ?y'
    template = f'PREFIX orkgp: <http://orkg.org/orkg/predicate/> PREFIX orkgc: <http://orkg.org/orkg/class/> PREFIX orkgr: <http://orkg.org/orkg/resource/> SELECT ?x ?y WHERE {left}{middle}{right}'
    result = graph.query(template)
    triples = []
    for triple in result:
        triples.append([rdflib.term.URIRef(f'http://orkg.org/orkg/resource/{contrib_id }'), triple[0], triple[1]])
    graph = construct_subgraph(triples, predicates_list)
    return graph

In [58]:
orkg = Graph()
os.path.join(data_path, 'rdf-export-orkg.nt')
orkg.parse(os.path.join(data_path, 'rdf-export-orkg.nt'))
prefixes = [
['orkgp', 'http://orkg.org/orkg/predicate/'],
['orkgc', 'http://orkg.org/orkg/class/'],
['orkgr', 'http://orkg.org/orkg/resource/'],
['rdfs', 'http://www.w3.org/2000/01/rdf-schema#>'],
['rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'],
 ['xsd', 'http://www.w3.org/2001/XMLSchema#']   
]
set_prefixes(g, prefixes)

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#integer, Converter=<class 'int'>
Traceback (most recent call last):
  File "c:\Users\Dan\.conda\envs\spike\lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
ValueError: invalid literal for int() with base 10: '2.0'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#decimal, Converter=<class 'decimal.Decimal'>
Traceback (most recent call last):
  File "c:\Users\Dan\.conda\envs\spike\lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
decimal.InvalidOperation: [<class 'decimal.ConversionSyntax'>]
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#integer, Converter=<class 'int'>
Traceback (most recent call last):
  File "c:\Users\Dan\.conda\envs\spike\lib\site-packages\rdf

In [62]:
def retrieve_subgraph(question:str, main_graph:rdflib.Graph, n_preds:int=1000, n_papers:int=10, n_contribs:int=10):
    """
    return a number of orkg subrgraphs related to a question
    """
    subgraphs = []
    predicates_list = n_closest(question, 'pred', n_preds, True)
    papers_list = n_closest(question, 'paper', n_papers, True)
    contribs_list = n_closest(question, 'contrib', n_contribs, True)
    for paper in papers_list:
        graph = process_paper(paper, predicates_list, main_graph)
        if len(graph) > 0:
            graph_text = get_subgraph_string(graph)
            subgraphs.append(graph_text) 
    for contrib in contribs_list:
        graph = process_contrib(contrib, predicates_list, main_graph)
        if len(graph) > 0:
            graph_text = get_subgraph_string(graph)
            subgraphs.append(graph_text) 
    return subgraphs

In [40]:
q1 = "Which model has achieved the highest Accuracy score on the Story Cloze Test benchmark dataset?"
q2 = "List the title and ID of research papers that contain a benchmark over the Penn Treebank (Word Level) dataset?"
q3 = "What models are being evaluated on the UrbanSound8k dataset?"
q4 = "Provide a list of research paper titles and IDs that have benchmarked models on the Penn Treebank dataset?"
q5 = "What models are being evaluated on the TDMSci dataset?"
q6 = "What is the mean capacity of a carbon-based fuel?"
q7 = "Give me a list of research papers along with their titles and IDs, that have performed benchmarks on the MLDoc Zero-Shot English-to-Russian dataset?"
q8 = "Indicate the model that performed best in terms of Accuracy metric on the Kuzushiji-MNIST benchmark dataset?"
q9 = "Which model has achieved the highest BLEU score score on the WMT2016 Romanian-English benchmark dataset?"
q10 = "What is the highest benchmark result achieved on the Ball in cup, catch (DMControl500k) dataset, including the metric and its value?"

In [66]:
retrieve_subgraph(q1, orkg)

http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rdf-schema#> does not look like a valid URI, trying to serialize this will break.
http://www.w3.org/2000/01/rd

['@prefix orkgp: <http://orkg.org/orkg/predicate/> .\n@prefix orkgr: <http://orkg.org/orkg/resource/> .\n\norkgr:A_Simple_and_Effective_Approach_to_the_Story_Cloze_Test orkgp:P1004 orkgr:Val-ls-skip ;\n    orkgp:PWC_HAS_BENCHMARK orkgr:Benchmark_Story_Cloze_Test .\n\n\n',
 '@prefix orkgp: <http://orkg.org/orkg/predicate/> .\n@prefix orkgr: <http://orkg.org/orkg/resource/> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n\norkgr:A_benchmarking_method_for_information_systems orkgp:description "Covers design tools, software metrics, testing and debugging, programming environments, etc"^^xsd:string .\n\n\n',
 '@prefix orkgp: <http://orkg.org/orkg/predicate/> .\n@prefix orkgr: <http://orkg.org/orkg/resource/> .\n\norkgr:Towards_Better_Accuracy-efficiency_Trade-offs__Divide_and_Co-training orkgp:P1004 orkgr:Densenet-bc-190_s_4,\n        orkgr:Pyramidnet-272_s_4,\n        orkgr:Resnext-101_64x4d_s_2_224px,\n        orkgr:Se-resnext-101_64x4d_s_2_416px,\n        orkgr:Shake-shake_26_2x96