In [6]:
# Load the graph
from rdflib import Graph, URIRef, Namespace

# Create a new RDFLib Graph
graph = Graph()

# Load hto ontology file into the graph
ontology_file = "../../results/hto_eb_7th_hq.ttl"
graph.parse(ontology_file, format="turtle")
hto = Namespace("https://w3id.org/hto#")

In [2]:
len(graph)

417418

In [8]:
import pandas as pd
# Get all original description with the highest text quality of terms, along with the uri of the term.
from rdflib.plugins.sparql import prepareQuery
q1 = prepareQuery('''
    SELECT ?term ?text WHERE {
    ?term a ?termType;
        hto:hasOriginalDescription ?desc.
    ?desc hto:text ?text.
	FILTER NOT EXISTS {
          ?term hto:hasOriginalDescription [hto:hasTextQuality [hto:isTextQualityHigherThan ?textQuality]].
        }
    FILTER (?termType = hto:ArticleTermRecord || ?termType = hto:TopicTermRecord)
  }
  ''',
  initNs = { "hto": hto}
)

uri_description_list = []
descriptions = []
for r in graph.query(q1):
    term_uri = r.term
    description = r.text
    MAX_LENGTH = 10000
    if len(description) > MAX_LENGTH:
        description = description[:MAX_LENGTH]
        print(f"----\n{description}\n")
    uri_description = {
        "term_uri": term_uri,
        "description": description,
    }
    descriptions.append(description)
    uri_description_list.append(uri_description)
    #print("%s %s" % (term_uri, description))

----
a preparation from the fish known by the name of huso. The word is Greek, formed of 7χSυj, fish, and %oλλα, glue. The method of making isinglass was long a secret in the hands of the Russians. The following account of it was published by Mr Humphrey Jackson, in the 63d volume of the Philosophical Transactions.
“All authors who have hitherto delivered processes for making ichthyocolla, fish-glue, or isinglass, have greatly mistaken both its constituent matter and preparation. To prove this assertion, it may not be improper to recite what Pomet says upon the subject, as he appears to be the principal author, whom the rest have copied. After describing the fish, and referring to a cut engraved from an original in his custody, he says, ‘ As to the manner of making the isinglass, the sinewy parts of the fish are boiled in water till all of them be dissolved that will dissolve; then the gluey liquor is strained, and set to cool. Being cold, the fat is carefully taken off, and the liquor

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [15]:
len(uri_description_list)

23965

In [18]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')
paraphrases = util.paraphrase_mining(model, descriptions, corpus_chunk_size=len(descriptions), top_k=20)

In [20]:
for paraphrase in paraphrases:
    score, i, j = paraphrase
    threshold = 0.5
    if i != j and score > threshold:
        graph.add((URIRef(uri_description_list[i]["uri"]), hto.similarTo, URIRef(uri_description_list[j]["uri"])))
        #print("{} \t\t {} \t\t Score: {:.4f}".format(descriptions[i], descriptions[j], score))

a city of the Austrian government of Laybach, in the circle of Adelsburg. It is on the river Idrizza, in a mountainous district, where are some of the richest mines of quicksilver in Europe, which have received greater activity from the neglected state of the similar mines at Almaden in Spain. The city contains 320 houses, with 3650 inhabitants, who are chiefly dependent on the mines for employment. Long. 15. 3. 45. E. Lat. 46. 0. 48. N. 		 a circle of the Austrian province of Moravia, extending over 1110 square miles, comprehending thirty-five cities and towns, 469 villages and hamlets, 23,312 houses, and 146,189 inhabitants. The chief place is a city of the same name, situated on the river Iglawa. It is well built, and surrounded with walls; contains 1200 houses, with 10,986 inhabitants. It is a great manufacturing place, producing yearly from 40,000 to 50,000 pieces of cloth, besides much paper, leather, and other goods. Its situation, on the chief road through the province, furnish

In [None]:
# Save the Graph in the RDF Turtle format
graph.serialize(format="turtle", destination="../results/hto_eb_7th_hq_similar.ttl")