In [2]:
import pandas as pd

# Load the complete NLS EB dataframe whose rows already carry a "uri" column.
# orient="index" means each top-level JSON key becomes one row label.
eb_total_nls_df_with_uris = pd.read_json("../dataframe_with_uris/eb_total_lq_dataframe_uris", orient="index")

In [18]:
# Spot-check: URI of the first row whose MMSID is 992277653804341 and whose id is 0.
# NOTE(review): the ValueError recorded as this cell's output does not look like
# something `.iloc[0]` would raise — presumably stale output from an earlier
# version of the cell; re-run to confirm.
eb_total_nls_df_with_uris.loc[
    (eb_total_nls_df_with_uris["MMSID"] == 992277653804341)
    & (eb_total_nls_df_with_uris["id"] == 0),
    "uri",
].iloc[0]

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [56]:
import re
def remove_extra_spaces(text):
    """Normalise the whitespace of ``text`` and return the result.

    After trimming both ends, the following rewrites are applied in order:

    1. whitespace directly before ``,``, ``.``, ``;`` or ``:`` is dropped;
    2. whitespace on either side of every ``-`` is dropped;
    3. each remaining run of whitespace becomes a single space.

    Slashes are deliberately left as they are.
    """
    substitutions = (
        (r'\s+([,.;:])', r'\1'),
        (r'\s*-\s*', '-'),
        (r'\s+', ' '),
    )
    cleaned = text.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [45]:
def get_uri_cleaned_definition(clean_dataframe):
    """Pair every cleaned definition with the URI of its term record.

    Row label ``i`` of ``clean_dataframe`` is matched against the module-level
    ``eb_total_nls_df_with_uris`` on ``MMSID == clean_dataframe.loc[i, "MMSID"]``
    and ``id == i``; the first matching row supplies the URI.

    Parameters
    ----------
    clean_dataframe : pandas.DataFrame
        Needs "MMSID" and "definition" columns and the row labels
        ``0 .. len(clean_dataframe) - 1`` (rows are addressed with ``.loc``).

    Returns
    -------
    list of dict
        One ``{"uri": ..., "definition": ...}`` entry per row, in label order,
        with the definition passed through ``remove_extra_spaces``.

    Raises
    ------
    IndexError
        If no row of ``eb_total_nls_df_with_uris`` matches a (MMSID, id) pair.
    """
    # Build the (MMSID, id) -> uri lookup once, instead of scanning the whole
    # URI dataframe with a boolean mask for every single row.
    uri_lookup = {}
    for mmsid, term_id, uri in zip(
        eb_total_nls_df_with_uris["MMSID"],
        eb_total_nls_df_with_uris["id"],
        eb_total_nls_df_with_uris["uri"],
    ):
        # setdefault keeps the first occurrence, i.e. the row a positional
        # "first match" selection would have picked.
        uri_lookup.setdefault((mmsid, term_id), uri)

    cleaned_definitions = []
    for index in range(0, len(clean_dataframe)):
        MMSID = clean_dataframe.loc[index, "MMSID"]
        key = (MMSID, index)
        if key not in uri_lookup:
            # Same exception type as an out-of-bounds positional lookup, but
            # with enough context to find the offending row.
            raise IndexError(
                "no uri found for MMSID=%r, id=%r" % (MMSID, index)
            )
        cleaned_definitions.append({
            "uri": uri_lookup[key],
            "definition": remove_extra_spaces(clean_dataframe.loc[index, "definition"])
        })
    return cleaned_definitions

In [10]:
# Load the spell-corrected dataframe, one row per JSON key (orient="index").
# NOTE(review): the file name suggests this is the output of an ELMo SC-LSTM
# spell checker — TODO confirm.
elmo_cleaned_eb_2_df = pd.read_json("../../source_dataframes/eb/final_eb_2_dataframe_ElmosclstmChecker", orient="index")

In [57]:
# Build the list of {"uri", "definition"} records for the cleaned dataframe.
clean_definitions = get_uri_cleaned_definition(elmo_cleaned_eb_2_df)

In [73]:
# Spot-check a single cleaned record (its uri and whitespace-normalised definition).
clean_definitions[10]

{'uri': 'https://w3id.org/hto/ArticleTermRecord/997902523804341_144850370_9798985120_0',
 'definition': 'the eleventh month of the civil year of the Hebrews, and the fifth of their ecclesiastical year, which begins with the month Nations. It answers to the moon of July; that is, to part of our month of the same name, and to the beginning of August: it consists of thirty days. The Jews face on the first of this month, in memory of Aaron its death; and on the ninth, because on that day both the temple of Solomon, and that erected after the captivity, were burnt; the former by the Chaldeans, and the latter by the Romans. The same day is also remarkable among the people for the publication of Adrian is left, where they were forbid to continue in Judea, or even to lookback when at a distance from Jerusalem in order to lament the desolation of that city. The 18th of the same month is also a fast among the Jews; because the lamp in the sanctuary was that night extinguiftied, in the time of Ah

In [34]:
from rdflib import Graph, URIRef, Namespace

# Load the existing graph that holds the NLS EB definitions.
graph = Graph()

ontology_file = "../../results/hto_eb_total.ttl"
# parse() fills `graph` in place from the Turtle file.
graph.parse(ontology_file, format="turtle")
# Namespace used to build hto: terms (e.g. hto.text, hto.OriginalDescription).
hto = Namespace("https://w3id.org/hto#")

In [60]:
# Size of the graph: len() of an rdflib Graph counts the triples it holds.
len(graph)

2779538

In [61]:
from rdflib import RDF
# URI standing for the Neuspell tool; it is typed as an hto:SoftwareAgent so
# that descriptions can be attributed to it.
Neuspell = URIRef("https://github.com/neuspell/neuspell")
graph.add((Neuspell, RDF.type, hto.SoftwareAgent))

<Graph identifier=N4ae0ed11ad05435b955f2327b8ff9051 (<class 'rdflib.graph.Graph'>)>

In [26]:
from rdflib import RDF, Literal, XSD, PROV
def add_definition_and_source_to_graph(clean_definitions):
    """Add one Neuspell description node per record to the module-level ``graph``.

    ``clean_definitions`` is an iterable of dicts with the keys ``"uri"`` and
    ``"definition"``. For each record a description node ``<uri>Neuspell`` is
    created and the following triples are added:

    * the node is an ``hto:OriginalDescription`` of ``hto:Moderate`` text quality;
    * ``hto:text`` holds the definition as an ``xsd:string`` literal;
    * the node is attributed (``prov:wasAttributedTo``) to ``Neuspell``;
    * the term ``<uri>`` points at the node via ``hto:hasOriginalDescription``;
    * the node is derived (``prov:wasDerivedFrom``) from ``<uri>NLS``.

    Nothing is returned; ``graph`` is modified in place.
    """
    for record in clean_definitions:
        base_uri = record["uri"]
        text_literal = Literal(record["definition"], datatype=XSD.string)

        term_ref = URIRef(base_uri)
        neuspell_description_ref = URIRef(base_uri + "Neuspell")
        nls_description_ref = URIRef(base_uri + "NLS")

        triples = (
            (neuspell_description_ref, RDF.type, hto.OriginalDescription),
            (neuspell_description_ref, hto.hasTextQuality, hto.Moderate),
            (neuspell_description_ref, hto.text, text_literal),
            (neuspell_description_ref, PROV.wasAttributedTo, Neuspell),
            (term_ref, hto.hasOriginalDescription, neuspell_description_ref),
            (neuspell_description_ref, PROV.wasDerivedFrom, nls_description_ref),
        )
        for triple in triples:
            graph.add(triple)


In [62]:
# Add the cleaned descriptions and their provenance triples to `graph` (in place).
add_definition_and_source_to_graph(clean_definitions)

In [76]:
from rdflib.plugins.sparql import prepareQuery
# term_uri = "<https://w3id.org/hto/ArticleTermRecord/997902523804341_144850370_6414251593_0>"
term_uri = "<https://w3id.org/hto/ArticleTermRecord/997902523804341_144850370_9798985120_0>"

# Verification query: list every predicate/object of each description attached
# to one term. The term is supplied through initBindings rather than spliced
# into the query text, so the prepared query can be reused for any term.
q1 = prepareQuery('''
    SELECT * WHERE {
        ?term hto:hasOriginalDescription ?desc;
            hto:name ?name.
        ?desc hto:text ?text;
            ?pre ?obj.
}
  ''',
  initNs = { "hto": hto}
)

# term_uri is kept in its "<...>" SPARQL form; drop the brackets for the URIRef.
term_ref = URIRef(term_uri.strip("<>"))
for r in graph.query(q1, initBindings={"term": term_ref}):
    print("%s %s %s" % (r.name, r.pre, r.obj))

AB http://www.w3.org/1999/02/22-rdf-syntax-ns#type https://w3id.org/hto#OriginalDescription
AB http://www.w3.org/ns/prov#wasAttributedTo https://github.com/defoe-code/defoe
AB https://w3id.org/hto#hasTextQuality https://w3id.org/hto#Low
AB https://w3id.org/hto#text the eleventh month of the civil year of the Hebrews, and the fifth of their ecclesiastical year, which begins with the month Nifan. It answers to the moon of July; that is, to part of our month of the same name, and to the beginning of August : it consists of thirty days. The Jews faff on the first of this month, in memory of Aaron’s death; and on the ninth, because on that day both the temple of Solomon, and that erefted after the captivity, were burnt; the former by the Chaldeans, and the latter by the Romans. The same day is also remarkable among that people for the publication of Adrian’s edift, wherein they were forbid to continue in Judea, or even to lookback when at a distance from Jerusalem in order to lament the des

In [77]:
# Write the updated graph out as Turtle under a new file name.
graph.serialize(format="turtle", destination="../../results/hto_eb_total_elmo2.ttl")

<Graph identifier=N4ae0ed11ad05435b955f2327b8ff9051 (<class 'rdflib.graph.Graph'>)>