This notebook generates summaries of the descriptions in the HTO knowledge graph, creates triples for these summaries, and adds them to the graph.

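We use transformer models for summarization. Two approaches are tried: extractive summarization (XLNet) and abstractive summarization (the `Falconsai/text_summarization` model). The abstractive summarizer is the one used to generate the summaries for all descriptions.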

In [3]:
# Load the graph
from rdflib import Graph, URIRef, Namespace

# Location of the Turtle file that holds the HTO knowledge graph
ontology_file = "../results/hto_eb_7th_hq.ttl"

# Start from an empty RDFLib graph and read the Turtle file into it
graph = Graph()
graph.parse(source=ontology_file, format="turtle")

# Namespace used to build HTO terms (e.g. hto.Summary) and as the "hto:" prefix in SPARQL queries
hto = Namespace("https://w3id.org/hto#")

In [2]:
# Show the size of the loaded graph (for an rdflib Graph, len() is its number of triples)
len(graph)

417418

In [6]:
import pandas as pd
# Get all original description of topic terms
from rdflib.plugins.sparql import prepareQuery
# SPARQL query: for every topic term record, fetch its original description
# node together with the text attached to that description.
q1 = prepareQuery('''
    SELECT ?description ?text WHERE {
        ?term a hto:TopicTermRecord;
            hto:hasOriginalDescription ?description.
        ?description hto:text ?text.
    }
  ''',
  initNs = { "hto": hto}
)

# One record per query result; "summary" is a placeholder filled in later.
uri_description_list = []
for row in graph.query(q1):
    uri_description_list.append({
        "description_uri": row.description,
        "description": str(row.text),
        "summary": None,
    })
    # Log the description URI and the character length of its text
    print(row.description, len(row.text))


# Tabular view of the same records
df_uri_description = pd.DataFrame(
    uri_description_list,
    columns=["description_uri", "description", "summary"],
)

https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_HYGROMETRY_0NCKP 167459
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_ICELAND_0NCKP 53301
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_INDEPENDENTS_0NCKP 34153
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_INK_0NCKP 35103
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_INQUISITION_0NCKP 52745
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_INSCRIPTION_0NCKP 25113
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_INTEREST_0NCKP 68974
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_IRELAND_1NCKP 38531
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_IRELAND_2NCKP 40674
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_IRRIGATION_0NCKP 84625
https://w3id.org/hto/OriginalDescription/9910796273804340_192693199_ITALY_1NCKP 186289
https://w3id.org

In [7]:
# Inspect the row with index label 1 of the descriptions DataFrame
print(df_uri_description.loc[1])

description_uri    https://w3id.org/hto/OriginalDescription/99107...
description        one of the largest islands in Europe (being li...
summary                                                         None
Name: 1, dtype: object


## Define functions for summarizing text

In [15]:
import nltk

def reduce_text_size(text):
    """Limit ``text`` to its first 100 sentences.

    The text is split into sentences with ``nltk.sent_tokenize`` and the
    number of sentences found is printed.

    Returns:
        ``text`` unchanged when it has at most 100 sentences; otherwise the
        first 100 sentences joined with single spaces.
    """
    sentence_limit = 100
    sentence_list = nltk.sent_tokenize(text)
    print(len(sentence_list))
    # Short enough already: hand back the original string untouched
    if len(sentence_list) <= sentence_limit:
        return text
    return ' '.join(sentence_list[:sentence_limit])

In [16]:
from summarizer import TransformerSummarizer
import os
# Set TOKENIZERS_PARALLELISM to "false" before the summarizer is constructed
# (presumably to avoid the Hugging Face tokenizers parallelism warning - TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Extractive summarizer built on the "xlnet-base-cased" XLNet model; used by summarize_text_extractive.
extractive_model = TransformerSummarizer(transformer_type="XLNet",transformer_model_key="xlnet-base-cased")

def summarize_text_extractive(text):
    """Produce an extractive summary of ``text`` with ``extractive_model``.

    The input is first shortened by ``reduce_text_size``; the shortened text
    is passed to the model with ``min_length=60`` and ``max_length=300``.

    Returns:
        The model output joined into a single string.
    """
    shortened = reduce_text_size(text)
    model_output = extractive_model(shortened, min_length=60, max_length=300)
    return ''.join(model_output)

In [9]:
import nltk
from transformers import pipeline

# Hugging Face model used for abstractive summarization
model_name = "Falconsai/text_summarization"
# Summarization pipeline shared by summarise_text_abstractive (model and tokenizer are loaded here once)
summarizer = pipeline("summarization", model=model_name)

def _chunk_token_ids(token_id_lists, max_token_length):
    """Greedily pack per-sentence token-id lists into chunks shorter than ``max_token_length``.

    Sentences are kept whole whenever they fit. A sentence that on its own is
    too long for one chunk is cut into consecutive pieces so that no chunk can
    ever reach ``max_token_length`` tokens. Empty chunks are never produced.

    Args:
        token_id_lists: iterable of lists of token ids, one list per sentence.
        max_token_length: exclusive upper bound on the number of tokens per chunk.

    Returns:
        A list of chunks, each a non-empty list of token ids.
    """
    # Every chunk must stay strictly below the budget, so the largest piece an
    # over-long sentence can be cut into is max_token_length - 1 tokens.
    piece_size = max(1, max_token_length - 1)
    chunks = []
    current_chunk = []
    for token_ids in token_id_lists:
        # For a normal sentence this inner loop runs exactly once with the
        # whole sentence; only over-long sentences are split.
        for start in range(0, len(token_ids), piece_size):
            piece = list(token_ids[start:start + piece_size])
            if len(current_chunk) + len(piece) < max_token_length:
                current_chunk.extend(piece)
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def summarise_text_abstractive(text, max_sentences=100, max_summary_length=100, min_summary_length=5):
    """Summarise ``text`` abstractively with the module-level ``summarizer`` pipeline.

    Steps:
      1. Split the text into sentences and keep at most ``max_sentences`` of them.
      2. Tokenize each sentence and group the tokens into chunks that stay below
         the tokenizer's ``model_max_length`` (minus a margin of 10 tokens).
      3. Summarise every chunk separately and join the chunk summaries with spaces.

    Args:
        text: the text to summarise.
        max_sentences: only the first ``max_sentences`` sentences are considered.
        max_summary_length: upper bound passed as ``max_length`` for each chunk
            summary; lowered to half the chunk's token count for short chunks.
        min_summary_length: value passed as ``min_length`` for each chunk summary.
            Chunks with fewer than twice this many tokens are skipped.

    Returns:
        The chunk summaries joined by single spaces; an empty string when no
        chunk was long enough to summarise (e.g. for empty input).
    """
    # Split text into sentences; the number of sentences must not exceed max_sentences
    sentences = nltk.sent_tokenize(text)[:max_sentences]

    # Group sentences into chunks whose token length stays below what the model accepts.
    tokenizer = summarizer.tokenizer
    # Margin of 10 tokens below the model maximum (headroom for tokens added
    # when the decoded chunk is tokenized again by the pipeline).
    max_token_length = tokenizer.model_max_length - 10
    # Note: encoding a single very long sentence may still make the tokenizer
    # emit its "sequence length is longer than the specified maximum" warning;
    # the chunking below splits such sentences before they reach the model.
    tokenized_sentences = [
        tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences
    ]
    chunks = _chunk_token_ids(tokenized_sentences, max_token_length)

    summaries = []
    for token_chunk in chunks:
        chunk_token_length = len(token_chunk)
        # Too short to be worth summarising
        if chunk_token_length < min_summary_length * 2:
            continue
        # Never ask for a summary longer than half of the chunk itself
        chunk_summary_length = min(max_summary_length, chunk_token_length // 2)

        # Convert token IDs back to text and summarise this chunk
        chunk_text = tokenizer.decode(token_chunk)
        summary = summarizer(
            chunk_text,
            max_length=chunk_summary_length,
            min_length=min_summary_length,
            do_sample=False,
        )
        summaries.append(summary[0]['summary_text'])
    return ' '.join(summaries)

In [10]:
# Use the description at index 1 of uri_description_list as a sample input and print its full text
input_text = uri_description_list[1]["description"]
print(input_text)

one of the largest islands in Europe (being little inferior, in point of superficial extent, to Ireland), is situated in the north part of the Atlantic Ocean, between the 63d and 67th degrees of north latitude, and the 12th and 25th degrees of west longitude. Its extreme length from east to west is about 280 miles, and its breadth from north to south varies from 180 to 200.
The precise period at which this island was discovered and first colonized is unknown; but, from the Landnamabok, an ancient Icelandic chronicle, and a work generally relied upon as authentic, we learn that the Norwegians were the first settlers upon its coasts. Naddodr, a famous pirate of that adventurous nation, w as, on his return to the Feroe Islands from a predatory excursion, about the year 860, driven by a tempest upon the coast of Iceland. He ascended to the summit of a mountain, but observing around him neither the vestige of a human residence, nor aught else than vast and trackless fields of snow, he immed

In [17]:
import time
# Time the extractive summarizer on the sample description (index 1)
input_text = uri_description_list[1]["description"]
start_time = time.time()
summarised_text = summarize_text_extractive(input_text)
end_time = time.time()
# Wall-clock duration in seconds, followed by the generated summary
elapsed_time = end_time - start_time
print(elapsed_time)
print(summarised_text)

257
4.857506036758423
one of the largest islands in Europe (being little inferior, in point of superficial extent, to Ireland), is situated in the north part of the Atlantic Ocean, between the 63d and 67th degrees of north latitude, and the 12th and 25th degrees of west longitude. Its extreme length from east to west is about 280 miles, and its breadth from north to south varies from 180 to 200. When Ingolf approached the coast of Iceland, we are informed that he threw into the sea the wooden door of his former habitation in Norway; and finding it some time afterwards cast upon the shore at Reikiavik, he fixed his abode on the spot where the capital of the island now stands. He was followed by several others, and in a short time considerable portions of the southern and eastern districts of the island were taken possession of. To obviate these, a chief was named, under whose guidance and direction the concerns, interests, and feelings of the yet separate communities might be regulated 

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
import time
# Time the abstractive summarizer on the same sample description (index 1)
input_text = uri_description_list[1]["description"]
start_time = time.time()
summarised_text = summarise_text_abstractive(input_text)
end_time = time.time()
# Wall-clock duration in seconds, followed by the generated summary
elapsed_time = end_time - start_time
print(elapsed_time)
print(summarised_text)

24.0132737159729
Naddodr, a famous pirate of that adventurous nation, returned to the Feroe Islands from a predatory excursion, about the year 860, driven by a tempest upon the coast of Iceland . he ascended to the summit of a mountain, but observing around him neither the vestige of an human residence nor aught else than vast and trackless fields of snow, he immediately abandoned it . The third adventurer Harold Harfagra threw into the sea the wooden door of his former habitation in Norway . he imposed a fine of four ounces of fine silver upon every person who should leave Norway to settle in Iceland . The Landnamabok describes with singular minuteness the arrival and spreading of the different settlers, and records . This beneficial change was effected A. d. 928, and a republican form of government was thus established, well calculated to provide for the emergencies which gave it birth . The ancient Icelanders possessed, as is still case with their posterity, few of the luxuries or r

## Generate summary for each description in uri_description_list

In [102]:
# Summarise every description and store the result on its record.
# This loop is long-running, so it is written to be resumable: records that
# already carry a summary (e.g. from a run that was interrupted) are skipped,
# and re-running the cell continues where the previous run stopped.
total_descriptions = len(uri_description_list)
print(f"total number of topics: {total_descriptions}")
for position, record in enumerate(uri_description_list, start=1):
    if record["summary"] is not None:
        continue
    print(f"------Summarising description {position} of {total_descriptions} ---------")
    record["summary"] = summarise_text_abstractive(record["description"])

total number of topics: 959
------Summarising 1th description ---------
6732
chunking the text....
text is chunked into 9 pieces
------Summarising 2th description ---------
801
chunking the text....
text is chunked into 11 pieces
------Summarising 3th description ---------
257
chunking the text....
text is chunked into 11 pieces
------Summarising 4th description ---------
5464
chunking the text....
text is chunked into 12 pieces
------Summarising 5th description ---------
188
chunking the text....
text is chunked into 10 pieces


Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors


------Summarising 6th description ---------
180
chunking the text....
text is chunked into 12 pieces
------Summarising 7th description ---------
230
chunking the text....
text is chunked into 14 pieces
------Summarising 8th description ---------
106
chunking the text....
text is chunked into 13 pieces
------Summarising 9th description ---------
371
chunking the text....
text is chunked into 10 pieces
------Summarising 10th description ---------
299
chunking the text....
text is chunked into 10 pieces
------Summarising 11th description ---------
388
chunking the text....
text is chunked into 9 pieces
------Summarising 12th description ---------
109
chunking the text....
text is chunked into 11 pieces
------Summarising 13th description ---------
420
chunking the text....
text is chunked into 7 pieces
------Summarising 14th description ---------
2383
chunking the text....
text is chunked into 13 pieces
------Summarising 15th description ---------
253
chunking the text....
text is chunked 

KeyboardInterrupt: 

In [None]:
from rdflib import RDF, Literal, XSD

# Attach a Summary node to every description that has a non-empty summary.
for record in uri_description_list:
    summary_text = record["summary"]
    # Skip descriptions that were never summarised (None) or whose summary is empty
    if summary_text is None or summary_text == "":
        continue

    source_uri = record["description_uri"]
    # Reuse the last path segment of the description URI as the summary's identifier
    local_id = str(source_uri).rsplit("/", 1)[-1]
    summary_node = URIRef("https://w3id.org/hto/Summary/" + local_id)

    new_triples = (
        (summary_node, RDF.type, hto.Summary),
        (source_uri, hto.hasSummary, summary_node),
        (summary_node, hto.text, Literal(summary_text, datatype=XSD.string)),
    )
    for triple in new_triples:
        graph.add(triple)

In [None]:
# Write the graph, now including the summary triples, to a new Turtle file
graph.serialize(
    destination="../results/hto_eb_7th_hq_summary.ttl",
    format="turtle",
)