In [1]:
import json
import os
import re

import pandas as pd
import spacy

In [2]:
def extract_data(file_path):
    r"""Collect annotated sentences from every file in a directory.

    Each regular file directly inside ``file_path`` is read as UTF-8 text. For
    every annotation type (``researchproblem``, ``objective``, ``method``,
    ``result``, ``conclusion``) the first span matching the annotation pattern
    (text without ``\`` or ``.``, the ``\<annotation>{...}`` command, more such
    text, and a closing ``.``) is taken, its markup is stripped, and the target
    of the first ``\uri{...}`` command in the raw span is recorded.

    Args:
        file_path: Directory containing the annotated source files.

    Returns:
        pandas.DataFrame with the columns ``Annotation``, ``Sentence`` and
        ``URI`` (``None`` when the span has no ``\uri`` command). The columns
        are present even when nothing could be extracted.
    """
    base_pattern = r'([^\\.]*\\{}{{[^}}]*}}[^\\.]*\.)'
    annotations = ['researchproblem', 'objective', 'method', 'result', 'conclusion']
    columns = ['Annotation', 'Sentence', 'URI']
    dataset = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
    for file_name in sorted(os.listdir(file_path)):
        full_path = os.path.join(file_path, file_name)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        for annotation in annotations:
            matches = re.findall(base_pattern.format(annotation), file_content)
            if not matches:
                continue
            # Only the first match per file and annotation type is used.
            sentence = matches[0]
            cleaned_sentence = re.sub(r'^begin\{abstract\}\\?\n?', '', sentence).strip()
            # \uri{target}{label} -> label
            cleaned_sentence = re.sub(r'\\uri{[^{}]*}{([^{}]*)}', r'\1', cleaned_sentence)
            # \<annotation>{content} -> content
            cleaned_sentence = re.sub(r'\\' + annotation + r'{([^{}]*)}', r'\1', cleaned_sentence).strip()

            # The URI is looked up in the raw span because the cleaned one no longer contains it.
            uri_match = re.search(r'\\uri{([^}]*)}', sentence)
            dataset.append({
                'Annotation': annotation,
                'Sentence': cleaned_sentence,
                'URI': uri_match.group(1) if uri_match else None,
            })
    # Passing the columns explicitly keeps the schema stable for an empty result,
    # so downstream code such as df['Annotation'] does not fail with a KeyError.
    return pd.DataFrame(dataset, columns=columns)

# Build the annotation table from the files in 'annotated_papers' and display it.
df = extract_data('annotated_papers')
df

Unnamed: 0,researchproblem,objective,method,result,conclusion,metatitle,metaauthor,researchfield
0,A shortcoming of these existing assessment mod...,a graded maturity model for scholarly knowledg...,For developing and realizing the KGMM we follo...,Our model comprises 5 maturity stages with 20 ...,We demonstrate the implementation of our model...,KGMM - A Maturity Model for Scholarly Knowledg...,"Hassan Hussein, Allard Oelen, Oliver Karras, S...",(\uri{https://www.orkg.org/orkg/resource/R1121...
1,crowd-sourcing for scientific knowledge graphs,workflow for authors of scientific documents t...,"latex, luatex",score of 79 out of 100 on the System Usability...,SciKGTeX simplifies the process of manual sema...,\title{SciKGTeX - A \LaTeX{,"\uri{https://orcid.org/0000-0001-9778-8495, \u...","(\uri{https://orkg.org/resource/R278, Informat..."


In [None]:
# Assume the entity-linking step returns its ORKG results in the following format:
# "entities_orkg": [
#    {
#        "URI": "https://orkg.org/resource/R4322",
#        "surface form": "machine learning"
#    }
# ]

# Sample text that contains the surface form "machine learning".
sample_text = "this is a sample text for the example to add an entity, in this case machine learning, to an text."

print(sample_text)

def entity_linking(text, linking_results):
    r"""Wrap every linked surface form in ``text`` with a ``\uri{URI}{surface form}`` command.

    Args:
        text: The text to annotate.
        linking_results: Dict with an ``"entities_orkg"`` list whose items are
            dicts holding a ``"URI"`` and a ``"surface form"``.

    Returns:
        The text with each (case-sensitive) occurrence of a surface form
        replaced by its ``\uri`` markup. When the same surface form is listed
        more than once, the first URI is used; empty surface forms are ignored.

    Raises:
        KeyError: If ``"entities_orkg"`` or one of the entity keys is missing.
    """
    uri_by_surface = {}
    for entity in linking_results["entities_orkg"]:
        surface = entity['surface form']
        # An empty surface form would match between every pair of characters.
        if surface:
            uri_by_surface.setdefault(surface, entity['URI'])
    if not uri_by_surface:
        return text

    # All surface forms are substituted in one pass over the original text, so
    # markup inserted for one entity (including its URI) is never rewritten by
    # another entity. Longer forms come first so that "machine learning" wins
    # over "learning" at the same position.
    ordered = sorted(uri_by_surface, key=len, reverse=True)
    pattern = "|".join(re.escape(surface) for surface in ordered)

    def _wrap(match):
        surface = match.group(0)
        uri = uri_by_surface[surface]
        return f"\\uri{{{uri}}}{{{surface}}}"

    return re.sub(pattern, _wrap, text)

# Example linking result: the first surface form occurs in sample_text, the second one does not.
linking_results = {
    "entities_orkg": [
        {
            "URI": "https://orkg.org/resource/R4322",
            "surface form": "machine learning"
        },
        {
            "URI": "https://orkg.org/resource/R214",
            "surface form": "Chemical Engineering"
        }
    ]
}

# Display the annotated sample text.
entity_linking(sample_text, linking_results)

In [None]:
def calculate_probability(sentence, keywords):
    """Return the share of ``keywords`` contained in the lowercased ``sentence``.

    Matching is a plain substring test against ``sentence.lower()``; the
    keywords themselves are used as given.

    Args:
        sentence: Text to inspect.
        keywords: Sized collection of keyword strings.

    Returns:
        Number of matching keywords divided by ``len(keywords)``, or ``0.0``
        when there are no keywords.
    """
    hits = 0
    for keyword in keywords:
        if keyword in sentence.lower():
            hits += 1

    total = len(keywords)
    if total > 0:
        return hits / total
    return 0.0

In [None]:
def extract_keywords(df):
    """Collect the distinct lowercased nouns per annotation type.

    Every sentence is parsed with the spaCy ``en_core_web_sm`` pipeline; tokens
    tagged ``NOUN`` or ``PROPN`` are added, lowercased, to the keyword list of
    the row's annotation type.

    Args:
        df: DataFrame with the columns ``Annotation`` and ``Sentence``.

    Returns:
        Dict mapping each of the five annotation types to a list of unique
        lowercased keywords in order of first appearance.

    Raises:
        KeyError: If a row carries an annotation type other than the five
            known ones.
    """
    nlp = spacy.load("en_core_web_sm")

    section_keywords = {
        'researchproblem': [],
        "objective": [],
        "method": [],
        "result": [],
        'conclusion': []
    }
    for annotation, sentence in zip(df['Annotation'], df['Sentence']):
        known_keywords = section_keywords[annotation]
        for token in nlp(sentence):
            if token.pos_ not in ("NOUN", "PROPN"):
                continue
            keyword = token.text.lower()
            # The list stores lowercased keywords, so the duplicate check must
            # use the lowercased form too; otherwise "Model" is added again
            # every time it occurs.
            if keyword not in known_keywords:
                known_keywords.append(keyword)

    return section_keywords

In [None]:
def annotate_text(text):
    r"""Pick one sentence of ``text`` per annotation type and wrap it in markup.

    Keyword lists are derived from the annotated files in ``annotated_papers``
    (via ``extract_data`` and ``extract_keywords``). ``text`` is split into
    sentences with the spaCy ``en_core_web_sm`` pipeline, and for every
    annotation type the sentence with the highest ``calculate_probability``
    score is selected. Among equally scored sentences the last one is chosen.

    Args:
        text: The plain text to annotate.

    Returns:
        A string of the form ``\researchproblem{...} \objective{...} ...`` with
        one command per annotation type; the braces are empty when ``text``
        contains no sentences.
    """
    # The keywords must come from the corpus loaded here, not from whatever
    # `df` happens to exist in the notebook's global state.
    data = extract_data('annotated_papers')
    annotation_keywords = extract_keywords(data)

    # Load the spaCy English model and split the text into sentences once;
    # the sentences are re-scored for every annotation type.
    nlp = spacy.load("en_core_web_sm")
    sentence_texts = [sent.text for sent in nlp(text).sents]

    labeled_sections = {
        "researchproblem": [],
        "objective": [],
        "method": [],
        "result": [],
        "conclusion": []
    }

    for annotation, keywords in annotation_keywords.items():
        best_sent = None
        best_prob = None
        for sentence in sentence_texts:
            probability = calculate_probability(sentence, keywords)
            # Compare the scores themselves; ">=" lets the last of several
            # equally scored sentences win.
            if best_prob is None or probability >= best_prob:
                best_prob = probability
                best_sent = sentence
        # Nothing to label when the text has no sentences.
        if best_sent is not None:
            labeled_sections[annotation].append(best_sent)

    # Format the annotated text
    return " ".join(
        f"\\{section}{{{'. '.join(sentences)}}}"
        for section, sentences in labeled_sections.items()
    )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reload the annotated reference sentences used to fit the TF-IDF models below.
df = extract_data('annotated_papers')

# Earlier variant, kept for reference: one TF-IDF model fitted on all sentences.
# The code further down fits one model per annotation type instead.
# Tokenize and vectorize sentences using TF-IDF
#vectorizer = TfidfVectorizer(stop_words='english')
#sentence_vectors = vectorizer.fit_transform(df['Sentence'])

def get_score(input_sentence, vectorizer, sentence_vectors):
    """Return the highest cosine similarity between a sentence and reference vectors.

    Args:
        input_sentence: The sentence to score.
        vectorizer: Fitted vectorizer; its ``transform`` is called with a
            one-element list containing ``input_sentence``.
        sentence_vectors: Matrix of reference sentence vectors produced by the
            same vectorizer (one row per sentence).

    Returns:
        The largest entry of the cosine-similarity matrix between the input
        vector and ``sentence_vectors``, as a numpy scalar (so callers that
        apply ``.mean()`` to the result keep working).
    """
    input_vector = vectorizer.transform([input_sentence])
    similarity_scores = cosine_similarity(input_vector, sentence_vectors)
    # Return the similarity itself rather than the stored vector of the closest
    # reference sentence: the score has to depend on how well the input matches.
    return similarity_scores.max()

# Example input abstracts (each entry is one multi-sentence text)
input_texts = ["""
Feature models provide an effective way to organize and reuse requirements in a specific domain. A feature model 
consists of a feature tree and cross-tree constraints. Identifying features and then building a feature tree takes a 
lot of effort, and many semi-automated approaches have been proposed to help the situation. However, 
finding cross-tree constraints is often more challenging which still lacks the help of automation.
In this paper, we propose an approach to mining cross-tree binary constraints in the construction of feature models. 
Binary constraints are the most basic kind of cross-tree constraints that involve exactly two features and can be further
classified into two sub-types, i.e. requires and excludes. Given these two sub-types, a pair of any two features in a
feature model falls into one of the following classes: no constraints between them, a requires between them,
or an excludes between them. Therefore we perform a 3-class classification on feature pairs to mine binary
constraints from features. We incorporate a support vector machine as the classifier and utilize a genetic algorithm to
optimize it. We conduct a series of experiments on two feature models constructed by third parties, to 
evaluate the effectiveness of our approach under different conditions that might occur in practical use. 
Results show that we can mine binary constraints at a high recall (near 100\% in most cases),
which is important because finding a missing constraint is very costly in real, often large, feature models.
""", 
               
"""Modern requirements tracing tools employ information retrieval methods to automatically generate candidate links.
Due to the inherent trade-off between recall and precision, such methods cannot achieve a high coverage without also 
retrieving a great number of false positives, causing a significant drop in result accuracy.
In this paper, we propose an approach to improving the quality of candidate link generation for the requirements tracing
process. We base our research on the cluster hypothesis which suggests that correct and incorrect links can be
grouped in high-quality and low-quality clusters respectively.Result accuracy can thus be enhanced by identifying and
filtering out low-quality clusters. We describe our approach by investigating three open-source datasets, and further
evaluate our work through an industrial study. The results show that our approach outperforms a baseline pruning strategy
and that improvements are still possible""",
               
"""Context-aware applications monitor changes in
their operating environment and switch their behaviour to
keep satisfying their requirements. Therefore, they must be
equipped with the capability to detect variations in their
operating context and to switch behaviour in response to
such variations. However, specifying monitoring and
switching in such applications can be difficult due to their
dependence on varying contextual properties which need to
be made explicit. In this paper, we present a problem-
oriented approach to represent and reason about contextual
variability and assess its impact on requirements; to elicit
and specify concerns facing monitors and switchers, such as
initialisation and interference; and to specify monitoring and
switching behaviours that can detect changes and adapt in
response. We illustrate our approach by applying it to a
published case study.""",
               
"""Because of intense collaborative needs,
requirements engineering is a challenge in global
software development. How do distributed teams
manage the development of requirements in
environments that require significant cross-site
collaboration and coordination? In this paper, we
report research that used social network analysis to
explore collaboration and awareness among team
members during requirements management in an
industrial distributed software team. Using the lens of
a requirements-centred social network to group team
members who work on a particular requirement, we
collected data to characterize requirements-centric
collaborations in a project, and to examine aspects of
awareness of requirements changes within these
networks. Our findings indicate organic patterns of
collaboration involving considerable cross-site
interaction, in which communication of changes was
the most predominant reason for interaction. Although
we did not find evidence that distance affects
developers’ awareness of remote team members who
work on the same requirements, distance affected how
accessible the remote colleagues were. We discuss
implications for knowledge sharing and coordination
of work on a requirement in distributed teams, and
propose directions for the design of collaboration tools
that support awareness in distributed requirements
management."""
]

# spaCy pipeline used to split each input text into sentences.
nlp = spacy.load("en_core_web_sm")

annotations = ['researchproblem', 'objective', 'method', 'result', 'conclusion']
# Maps annotation type -> (fitted TfidfVectorizer, TF-IDF matrix of its reference sentences).
annotation_vectors = {}

# Fit a separate TF-IDF model per annotation type, using only that type's sentences from df.
for annot in annotations:
    vectorizer = TfidfVectorizer(stop_words='english')
    sentence_vectors = vectorizer.fit_transform(df[df['Annotation'] == annot]['Sentence'])
    annotation_vectors[annot] = (vectorizer, sentence_vectors)


# Best-scoring sentence per annotation type; every entry is overwritten for each input text.
best_sentences = {
    'researchproblem': None,
    'objective': None,
    'method': None,
    'result': None,
    'conclusion': None
}

for text in input_texts:
    doc = nlp(text)
    for annot in annotations:
        vectorizer, sentence_vectors = annotation_vectors[annot]
        best_sent = None
        # Only sentences scoring strictly above 0 can be selected; otherwise the entry stays None.
        best_score = 0
        for s in doc.sents:
            # The value returned by get_score is reduced to a single number with .mean().
            score = get_score(s.text, vectorizer, sentence_vectors).mean()
            if score > best_score:
                best_score = score
                best_sent = s.text

        best_sentences[annot] = best_sent
    
    # One dict per input text, followed by an empty line.
    print(best_sentences)
    print()

### Fetch properties and resources from ORKG and create JSON to use with FALCON 2.0

In [None]:
import sys
import ssl
from SPARQLWrapper import SPARQLWrapper, JSON

def get_results(endpoint_url, query):
    """Run a SPARQL query against an endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: The SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper(...).query().convert()`` yields for the JSON
        return format.
    """
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # SECURITY: HTTPS certificate verification is switched off for this request.
    # The override is a process-wide setting, so it is limited to the duration
    # of the query and the previous factory is restored afterwards, even when
    # the query fails. NOTE(review): confirm whether the endpoint still needs
    # unverified TLS at all.
    previous_context_factory = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        return sparql.query().convert()
    finally:
        ssl._create_default_https_context = previous_context_factory

def get_properties():
    """Query the ORKG triplestore for every predicate and its label.

    Returns:
        The ``results -> bindings`` list of the SPARQL response obtained
        through ``get_results``; the query selects ``?property`` and ``?label``.
    """
    sparql_query = """
        PREFIX orkgp: <http://orkg.org/orkg/predicate/>
        PREFIX orkgc: <http://orkg.org/orkg/class/>
        PREFIX orkgr: <http://orkg.org/orkg/resource/>

        SELECT ?property, ?label
        WHERE {
            ?property rdf:type orkgc:Predicate ;
                rdfs:label ?label.
        }
    """
    response = get_results("https://orkg.org/triplestore", sparql_query)
    return response['results']['bindings']


def get_entities(lower_bound, upper_bound):
    """Query the ORKG triplestore for labelled resources in a numeric ID range.

    Only subjects whose URI starts with ``http://orkg.org/orkg/resource/R`` and
    whose number after that prefix lies between the two bounds (both inclusive)
    are selected.

    Args:
        lower_bound: Smallest resource number to include.
        upper_bound: Largest resource number to include.

    Returns:
        The ``results -> bindings`` list of the SPARQL response obtained
        through ``get_results``; the query selects ``?entity`` and ``?label``.
    """
    # Both bounds are interpolated verbatim into the FILTER expression.
    sparql_query = f"""
        PREFIX orkgp: <http://orkg.org/orkg/predicate/>
        PREFIX orkgc: <http://orkg.org/orkg/class/>
        PREFIX orkgr: <http://orkg.org/orkg/resource/>

        SELECT ?entity, ?label
        WHERE {{
            ?entity rdfs:label ?label .
            FILTER(STRSTARTS(STR(?entity), "http://orkg.org/orkg/resource/R") && 
            xsd:integer(STRAFTER(STR(?entity), "http://orkg.org/orkg/resource/R")) >= {lower_bound} &&
            xsd:integer(STRAFTER(STR(?entity), "http://orkg.org/orkg/resource/R")) <= {upper_bound})
        }}
    """
    response = get_results("https://orkg.org/triplestore", sparql_query)
    return response['results']['bindings']

In [None]:
# Fetch the URI/label bindings of all ORKG predicates.
properties = get_properties()

In [None]:
# SPARQL queries are limited to 100000 results, so the entities have to be fetched in batches.
# Each tuple is an inclusive (lower, upper) bound on the number following ".../resource/R".
# NOTE(review): resources numbered above 700000 are not covered by these ranges - confirm this is sufficient.
ranges = [
    (0, 100000), 
    (100001, 200000),
    (200001, 300000), 
    (300001, 400000),
    (400001, 500000), 
    (500001, 600000), 
    (600001, 700000)
]

# Concatenate the bindings of all batches and show how many were retrieved.
entities = []
for r in ranges:
    entities.extend(get_entities(r[0], r[1]))
len(entities)

In [None]:
def create_json(data, data_type):
    """Append one index document per binding to the JSON file for ``data_type``.

    Entities go to ``orkgentity.json`` (index ``orkgentityindex``), properties
    to ``orkgpropertyindex.json`` (index ``orkgpropertyindex``). Every document
    is written as an indented JSON object followed by a newline. The file is
    opened in append mode, so calling this again with the same data adds the
    documents a second time.

    Args:
        data: SPARQL bindings; each entry needs ``entry[data_type]['value']``
            (the URI) and ``entry['label']['value']``.
        data_type: Either ``"entity"`` or ``"property"``.

    Raises:
        ValueError: If ``data_type`` is neither ``"entity"`` nor ``"property"``.
    """
    targets = {
        "entity": ("orkgentity.json", "orkgentityindex"),
        "property": ("orkgpropertyindex.json", "orkgpropertyindex"),
    }
    if data_type not in targets:
        # Previously this surfaced as an UnboundLocalError inside the loop.
        raise ValueError(
            f"data_type must be 'entity' or 'property', got {data_type!r}"
        )
    file_path, index = targets[data_type]

    # Nothing to write: leave the file system untouched.
    if not data:
        return

    # Open the file once instead of once per entry.
    with open(file_path, 'a') as json_file:
        for entry in data:
            new_json = {
                "_index": index,
                "_type": "doc",
                "_score": 1,
                "_source": {
                    "uri": "<" + entry[data_type]['value'] + ">",
                    "label": entry['label']['value']
                }
            }
            json.dump(new_json, json_file, indent=4)
            json_file.write('\n')

In [None]:
# Write the property documents. NOTE: create_json appends to its output file, so re-running this cell adds them again.
create_json(properties, 'property')

In [None]:
# Write the entity documents. NOTE: create_json appends to its output file, so re-running this cell adds them again.
create_json(entities, 'entity')