In [1]:
%pip install -U sentence-transformers
%pip install hnswlib
%pip install SPARQLWrapper
%pip install rdflib
%pip install spacy
# python -m spacy download en_core_web_sm

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=7a3f23e6462c7a6dca3d30dd70ca0b20330d234781313cf1685bfa5da2b32128
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
Note: you may need to restart the kernel to use updated packages.
Collecting hnswlib
  Downloading hnswlib-0.7.0.tar.gz (33 kB)
  Installing build dependencies ... [?

In [75]:
import os
import re
import sys
import hnswlib
import pickle
import rdflib
import SPARQLWrapper
import numpy as np
import pandas as pd
import spacy
from io import StringIO
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, RDF, XML 
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L12-v2')
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

data_path = os.getcwd()+'\\data\'

In [3]:
with open(os.path.join(data_path, 'id2pred.pickle'), "rb") as input_file:
    id2pred = pickle.load(input_file) 
    
with open(os.path.join(data_path, 'pred2id.pickle'), "rb") as input_file:
    pred2id = pickle.load(input_file)    
    
with open(os.path.join(data_path, 'pred2emb.pickle'), "rb") as input_file:
    pred2emb = pickle.load(input_file)  

with open(os.path.join(data_path, 'id2res.pickle'), "rb") as input_file:
    id2res = pickle.load(input_file)
    
# with open(os.path.join(data_path, 'res2id.pickle'), "rb") as input_file:
#     res2id = pickle.load(input_file)    

# with open(os.path.join(data_path, 'res2emb.pickle'), "rb") as input_file:   
#     res2emb = pickle.load(input_file)

with open(os.path.join(data_path, 'contrib2emb.pickle'), "rb") as input_file: 
    contrib2emb = pickle.load(input_file) 

with open(os.path.join(data_path, 'contrib2id.pickle'), "rb") as input_file: 
    contrib2id = pickle.load(input_file)

with open(os.path.join(data_path, 'contrib2emb.pickle'), "rb") as input_file:    
    contrib2emb = pickle.load(input_file)    
    
      
with open(os.path.join(data_path, 'id2paper.pickle'), "rb") as input_file:     
    id2paper = pickle.load(input_file)
    
with open(os.path.join(data_path, 'paper2id.pickle'), "rb") as input_file: 
    paper2id = pickle.load(input_file)    
    
with open(os.path.join(data_path, 'paper2emb.pickle'), "rb") as input_file:    
    paper2emb = pickle.load(input_file)
    

with open(os.path.join(data_path, 'pred_index.pickle'), "rb") as input_file:     
    pred_index = pickle.load(input_file) 

# with open(os.path.join(data_path, 'res_index.pickle'), "rb") as input_file:    
#     res_index = pickle.load(input_file)  

with open(os.path.join(data_path, 'contrib_index.pickle'), "rb") as input_file:    
    contrib_index = pickle.load(input_file)     

with open(os.path.join(data_path, 'paper_index.pickle'), "rb") as input_file: 
    paper_index = pickle.load(input_file)    

In [5]:
def filter_q(question:str):
    """
    filter out unnecessary words from a question
    """
    res = []
    tok_pos = []
    for token in nlp(question):
        tok_pos.append((token.lemma_, token.pos_))
        if token.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"]: # keep only nouns, adjectives and verbs
            if token.lemma_ not in stopwords:
                res.append(str(token))
    return " ".join(res)

def create_n_grams(sentence, n=2):
    """
    create n_grams for a sentence after its filtering
    """
    filtered_sentence = filter_q(sentence)
    words = filtered_sentence.split(" ")
    n_grams = []
    text_l = len(words)
    for i in range(2, n+1):
        for j in range(text_l-i):
            n_gram = words[j:j+i]
            n_gram_text = " ".join(n_gram)
            n_grams.append(n_gram_text)
    return n_grams

def set_prefixes(graph, prefixes=[]):
    """
    set all prefixes from a list for a graph
    """
    for prefix in prefixes:
        graph.bind(prefix[0], prefix[1])

In [6]:
def n_closest(question, source, n, n_gram):
    """
    return n elements of orkg relevant for a question
    """
    if n_gram:
        filteted_text = filter_q(question)
        texts = create_n_grams(filteted_text)
    else:
        texts = [filter_q(question)]
    if source=="pred":
        tuples = []
        for text in texts:
            out, scores = pred_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_preds = list(pred2emb.keys())
        output = [all_preds[pair[0]] for pair in sorted_out]
        return output
    elif source=="res":
        tuples = []
        for text in texts:
            out, scores = res_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_res = list(res2emb.keys())
        output = [all_res[pair[0]] for pair in sorted_out]
        return output
    elif source=="paper":
        tuples = []
        for text in texts:
            out, scores = paper_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_papers = list(paper2emb.keys())
        papers = [all_papers[pair[0]] for pair in sorted_out]
        output = [str(uri) for uri in papers]
        return output
    elif source=="contrib":
        tuples = []
        for text in texts:
            out, scores = contrib_index.knn_query(model.encode(text, show_progress_bar=False), k=n)
            tuples.extend(list(zip(out[0], scores[0])))
        sorted_out = sort_tuples(tuples)[0:n]
        all_contribs = list(contrib2emb.keys())
        contribs = [all_contribs[pair[0]] for pair in sorted_out]
        output = [str(uri) for uri in contribs]
        return output
    
def sort_tuples(l):
    out = []
    black_list = []
    tuples = sorted(l, key=lambda x: x[1], reverse=False)
    for tup in tuples:
        if tup[0] in black_list:
            continue
        else:
            out.append(tup)
            black_list.append(tup[0])
    return out

In [141]:
def construct_subgraph(triples=[], predicates_list=[]):
    """
    return a graph for a list of triples
    """
    res_graph = Graph()
    set_prefixes(res_graph, prefixes)
    for ind, triple in enumerate(triples):
        pred_id = str(triple[1]).split("/")[-1]
        try:
            pred = id2pred[pred_id]
        except Exception as e:
            continue
        if pred not in predicates_list:
            continue
        else:
            text_triple = swap_prefixes(triple)
            res_graph.add(text_triple) 
    return res_graph

def get_subgraph_string(graph):
    """
    return string representation of a graph
    """
    tmp = sys.stdout
    my_result = StringIO()
    sys.stdout = my_result
    graph.print()
    sys.stdout = tmp
    res = my_result.getvalue()
    my_result.close()
    return res

def swap_prefixes(triple):
    """
    replace ids in a triple with textual representation
    """
    subj_id = str(triple[0]).split("/")[-1]
    subj_base = str(triple[0]).split("/")[0:-1]
    if subj_id in id2res.keys():
        subj = re.sub(r"[^a-zA-Z\d]+", "_", id2res[subj_id])
        subj_base.append(subj)
        triple[0] = rdflib.term.URIRef("/".join(subj_base))
    
    pred_id = str(triple[1]).split("/")[-1]
    pred_base = str(triple[1]).split("/")[0:-1]
    if pred_id in id2pred.keys():
        pred = re.sub(r"[^a-zA-Z\d]+", "_", id2pred[pred_id])
        pred_base.append(pred)
        triple[1] = rdflib.term.URIRef("/".join(pred_base))
    
    obj_id = str(triple[2]).split("/")[-1]
    obj_base = str(triple[2]).split("/")[0:-1]
    if obj_id in id2res.keys():
        obj = re.sub(r"[^a-zA-Z\d]+", "_", id2res[obj_id])
        obj_base.append(obj)
        triple[2] = rdflib.term.URIRef("/".join(obj_base))
    return triple    

In [199]:
def process_paper(paper_title, predicates_list, graph):
    """
    return a subgraph of orkg for a paper using only relevant predicates
    """
    paper_id = paper2id[paper_title]
    left = '{'
    right = '}'
    middle = f'orkgr:{paper_id} ?x ?y. ?y ?pred ?label'
    template = f'PREFIX orkgp: <http://orkg.org/orkg/predicate/> PREFIX orkgc: <http://orkg.org/orkg/class/> PREFIX orkgr: <http://orkg.org/orkg/resource/> SELECT ?pred ?label WHERE {left}{middle}{right}'
    try:
        result = graph.query(template)
        triples = []
        for triple in result:
            triples.append([rdflib.term.URIRef(f'http://orkg.org/orkg/resource/{paper_id }'), triple[0], triple[1]])
        graph = construct_subgraph(triples, predicates_list)
        return graph
    except Exception:
        return Graph()

def process_contrib(contrib_title, predicates_list, graph):
    """
    return a subgraph of orkg for a contribution using only relevant predicates
    """
    contrib_id = contrib2id[contrib_title]
    left = '{'
    right = '}'
    middle = f'orkgr:{contrib_id} ?x ?y'
    template = f'PREFIX orkgp: <http://orkg.org/orkg/predicate/> PREFIX orkgc: <http://orkg.org/orkg/class/> PREFIX orkgr: <http://orkg.org/orkg/resource/> SELECT ?x ?y WHERE {left}{middle}{right}'
    try:
        result = graph.query(template)
        triples = []
        for triple in result:
            triples.append([rdflib.term.URIRef(f'http://orkg.org/orkg/resource/{contrib_id }'), triple[0], triple[1]])
        graph = construct_subgraph(triples, predicates_list)
        return graph
    except Exception:
        return Graph()

def graph_text_postprocessing(graph_text:str):
    """
    postprocess to improve graph serialization
    """
    postprocessed_graph = graph_text.replace("\n", "").replace("  ", "").replace(" .", ". ").replace(" ;", "; ").replace(",", ", ")  
    return postprocessed_graph

def merge_subgraphs(subgraphs_list):
    for idx, subgraph in enumerate(subgraphs_list):
        if idx==0:
            res_graph = subgraph
        else:
            res_graph += subgraph
    try:
        graph_text = get_subgraph_string(res_graph) 
        graph_text_postprocessed = graph_text_postprocessing(graph_text)
        return graph_text_postprocessed
    except Exception:
        return ''

In [9]:
orkg = Graph()
orkg.parse(os.path.join(data_path, 'rdf-export-orkg.nt'))
prefixes = [
['orkgp', 'http://orkg.org/orkg/predicate/'],
['orkgc', 'http://orkg.org/orkg/class/'],
['orkgr', 'http://orkg.org/orkg/resource/'],
['rdfs', 'http://www.w3.org/2000/01/rdf-schema#>'],
['rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'],
['xsd', 'http://www.w3.org/2001/XMLSchema#']   
]
set_prefixes(orkg, prefixes)

In [73]:
def retrieve_subgraph(question:str, main_graph:rdflib.Graph, n_preds:int=1000, n_papers:int=10, n_contribs:int=10):
    """
    return a number of orkg subrgraphs related to a question
    """
    subgraphs = []
    predicates_list = n_closest(question, 'pred', n_preds, True)
    papers_list = n_closest(question, 'paper', n_papers, True)
    contribs_list = n_closest(question, 'contrib', n_contribs, True)
    for paper in papers_list:
        graph = process_paper(paper, predicates_list, main_graph)
        if len(graph) > 0:
            subgraphs.append(graph) 
    for contrib in contribs_list:
        graph = process_contrib(contrib, predicates_list, main_graph)
        if len(graph) > 0:
            subgraphs.append(graph) 
    res_graph = merge_subgraphs(subgraphs)
    return res_graph

In [12]:
q1 = "Which model has achieved the highest Accuracy score on the Story Cloze Test benchmark dataset?"
q2 = "List the title and ID of research papers that contain a benchmark over the Penn Treebank (Word Level) dataset?"
q3 = "What models are being evaluated on the UrbanSound8k dataset?"
q4 = "Provide a list of research paper titles and IDs that have benchmarked models on the Penn Treebank dataset?"
q5 = "What models are being evaluated on the TDMSci dataset?"
q6 = "What is the mean capacity of a carbon-based fuel?"
q7 = "Give me a list of research papers along with their titles and IDs, that have performed benchmarks on the MLDoc Zero-Shot English-to-Russian dataset?"
q8 = "Indicate the model that performed best in terms of Accuracy metric on the Kuzushiji-MNIST benchmark dataset?"
q9 = "Which model has achieved the highest BLEU score score on the WMT2016 Romanian-English benchmark dataset?"
q10 = "What is the highest benchmark result achieved on the Ball in cup, catch (DMControl500k) dataset, including the metric and its value?"

In [214]:
q1

'Which model has achieved the highest Accuracy score on the Story Cloze Test benchmark dataset?'

In [215]:
retrieve_subgraph(q1, orkg)

'@prefix orkgp: <http://orkg.org/orkg/predicate/>. @prefix orkgr: <http://orkg.org/orkg/resource/>. @prefix xsd: <http://www.w3.org/2001/XMLSchema#>. orkgr:A_Simple_and_Effective_Approach_to_the_Story_Cloze_Test orkgp:Benchmark orkgr:Benchmark_Story_Cloze_Test; orkgp:model orkgr:Val_ls_skip. orkgr:A_benchmarking_method_for_information_systems orkgp:description "Covers design tools,  software metrics,  testing and debugging,  programming environments,  etc"^^xsd:string. orkgr:Automatic_Diagnosis_of_Attention_Deficit_Hyperactivity_Disorder_Using_Machine_Learning orkgp:contribution orkgr:Decision_tree, orkgr:K_nearest_Neighbour, orkgr:Naive_Bayes, orkgr:Random_forest, orkgr:Support_vector_machine, orkgr:logistic_regression. orkgr:Collaboration_of_Experts_Achieving_80_Top_1_Accuracy_on_ImageNet_with_100M_FLOPs orkgp:Benchmark orkgr:Benchmark_ImageNet; orkgp:model orkgr:Coe_large_194_mflops, orkgr:Coe_large_214_mflops, orkgr:Coe_small_100_mflops. orkgr:Contribution_1 orkgp:Algorithm_s_ orkg

In [216]:
q2

'List the title and ID of research papers that contain a benchmark over the Penn Treebank (Word Level) dataset?'

In [217]:
retrieve_subgraph(q2, orkg)

'@prefix orkgp: <http://orkg.org/orkg/predicate/>. @prefix orkgr: <http://orkg.org/orkg/resource/>. @prefix xsd: <http://www.w3.org/2001/XMLSchema#>. orkgr:A_benchmarking_method_for_information_systems orkgp:Data_analysis orkgr:analysis; orkgp:description "Covers design tools,  software metrics,  testing and debugging,  programming environments,  etc"^^xsd:string; orkgp:research_paradigm orkgr:exploratory; orkgp:research_problem orkgr:empirical_research_in_requirements_engineering; orkgp:research_question_ orkgr:Research_Questions_in_RE_Contribution; orkgp:research_question_answer orkgr:hidden_in_text. orkgr:Bio_ID_track_overview orkgp:Best_score "0.65 micro-F1 for normalized cell type"^^xsd:string, "0.76 micro-F1 for normalized species"^^xsd:string, "0.8 or better for cell type,  species and gene-or-protein at mention-level F1"^^xsd:string, "below 0.6 micro-F1 for other normalized entity types"^^xsd:string; orkgp:Evaluation_metrics orkgr:F1, orkgr:Precision, orkgr:Recall; orkgp:descri

In [218]:
q3 

'What models are being evaluated on the UrbanSound8k dataset?'

In [219]:
retrieve_subgraph(q3, orkg)

'@prefix orkgp: <http://orkg.org/orkg/predicate/>. @prefix orkgr: <http://orkg.org/orkg/resource/>. @prefix xsd: <http://www.w3.org/2001/XMLSchema#>. orkgr:A_Topic_Coverage_Approach_to_Evaluation_of_Topic_Models orkgp:Benchmark orkgr:Benchmark_Topic_modeling_topic_coverage_dataset, orkgr:Benchmark_Topic_modeling_topic_coverage_dataset_bio, orkgr:Benchmark_Topic_modeling_topic_coverage_dataset_news; orkgp:model orkgr:Aucdc, orkgr:Nmf_200, orkgr:Pyp. orkgr:Automatic_Diagnosis_of_Attention_Deficit_Hyperactivity_Disorder_Using_Machine_Learning orkgp:contribution orkgr:Decision_tree, orkgr:K_nearest_Neighbour, orkgr:Naive_Bayes, orkgr:Random_forest, orkgr:Support_vector_machine, orkgr:logistic_regression. orkgr:Contribution_1 orkgp:Capital_enviromental_impact_of_cities_reduction "no"^^xsd:string; orkgp:Data_analysis "No statistical analysis to verify whether the differences in subjective responses were caused by exposure to different indoor conditions"^^xsd:string; orkgp:Data_processing "T"

In [220]:
q4 

'Provide a list of research paper titles and IDs that have benchmarked models on the Penn Treebank dataset?'

In [221]:
retrieve_subgraph(q4, orkg)

'@prefix orkgp: <http://orkg.org/orkg/predicate/>. @prefix orkgr: <http://orkg.org/orkg/resource/>. @prefix xsd: <http://www.w3.org/2001/XMLSchema#>. orkgr:A_benchmarking_method_for_information_systems orkgp:Data_analysis orkgr:analysis; orkgp:Data_collection_method orkgr:case_study, orkgr:study; orkgp:description "Covers design tools,  software metrics,  testing and debugging,  programming environments,  etc"^^xsd:string; orkgp:research_paradigm orkgr:exploratory; orkgp:research_problem orkgr:empirical_research_in_requirements_engineering; orkgp:research_question_ orkgr:Research_Questions_in_RE_Contribution; orkgp:research_question_answer orkgr:hidden_in_text. orkgr:Benchmarking_Graph_Neural_Networks orkgp:Benchmark orkgr:Benchmark_CIFAR10_100k, orkgr:Benchmark_CLUSTER, orkgr:Benchmark_COLLAB, orkgr:Benchmark_MNIST, orkgr:Benchmark_PATTERN, orkgr:Benchmark_TSP_HCP_Benchmark_set, orkgr:Benchmark_ZINC_100k, orkgr:Benchmark_ZINC_500k; orkgp:model orkgr:Gatedgcn, orkgr:Gatedgcn_e, orkgr:G

In [222]:
q5

'What models are being evaluated on the TDMSci dataset?'

In [223]:
retrieve_subgraph(q5, orkg)

'@prefix orkgp: <http://orkg.org/orkg/predicate/>. @prefix orkgr: <http://orkg.org/orkg/resource/>. @prefix xsd: <http://www.w3.org/2001/XMLSchema#>. orkgr:A_Topic_Coverage_Approach_to_Evaluation_of_Topic_Models orkgp:Benchmark orkgr:Benchmark_Topic_modeling_topic_coverage_dataset, orkgr:Benchmark_Topic_modeling_topic_coverage_dataset_bio, orkgr:Benchmark_Topic_modeling_topic_coverage_dataset_news; orkgp:model orkgr:Aucdc, orkgr:Nmf_200, orkgr:Pyp. orkgr:Contribution_1 orkgp:Has_evaluation_metrics "PPL"^^xsd:string; orkgp:Test_Data_Languages "English"^^xsd:string; orkgp:Test_data """negativepolarity items"""^^xsd:string, "reflexive anaphora "^^xsd:string, "subject-verb agreement"^^xsd:string; orkgp:Training_data "Wikipedia"^^xsd:string; orkgp:evaluation orkgr:Task_Dataset_Metric_Score. orkgr:Dynamic_Evaluation_of_Neural_Sequence_Models orkgp:Benchmark orkgr:Benchmark_Hutter_Prize, orkgr:Benchmark_Penn_Treebank_Word_Level_, orkgr:Benchmark_Text8, orkgr:Benchmark_WikiText_2; orkgp:model 

In [133]:
import json
with open("/kaggle/input/sciqa-dataset/SciQA-dataset/test/questions.json", 'rb') as f:
    test_file = f.read()

with open("/kaggle/input/sciqa-dataset/SciQA-dataset/train/questions.json", 'rb') as f:
    train_file = f.read()    

with open("/kaggle/input/sciqa-dataset/SciQA-dataset/valid/questions.json", 'rb') as f:
    valid_file = f.read()  

test = json.loads(test_file)
train = json.loads(train_file)
valid = json.loads(valid_file)

test_q = test['questions']
train_q = train['questions']
valid_q = valid['questions']

In [197]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def measure_num_tokens(datasets:list, tokenizer):
    res = []
    for dataset in datasets:
        for q in dataset:
            out = retrieve_subgraph(q['question']['string'], orkg)
            tokens = tokenizer(out)["input_ids"]
            res.append(len(tokens))  
    return pd.DataFrame(res)

In [200]:
measure_num_tokens([test_q, train_q, valid_q], tokenizer)

Unnamed: 0,0
count,2565.0
mean,1871.264327
std,1359.458444
min,0.0
25%,1115.0
50%,1528.0
75%,1906.0
max,17454.0


In [271]:
def aug_dataset(dataset):
    res = []
    for q in dataset:
        out = retrieve_subgraph(q['question']['string'], orkg)
        res.append(out) 
    return res

def write_json(dataset, filename):
    with open(filename) as file: 
        file_data = json.load(file)
    subgraphs = aug_dataset(dataset)
    for idx, q in enumerate(file_data['questions']):
        q['subgraph'] = subgraphs[idx]
    return file_data
#     with open(filename,'r+') as file:  
#         json.dump(file_data, file, indent = 4)