In [1]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [159]:
import warnings
warnings.filterwarnings('ignore')
from funcy import print_durations

### Fetch article

In [3]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11696 sha256=35bca2dc19216449116e0b3ce33db17a25c34cd0ef71f1ead37e849b46cc4831
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [160]:
import wikipedia

@print_durations
def get_text_of_wiki_page(article_name: str):
    """Given an article name(need not be exact title of page),
    return the textual content of the wikipedia article.
    We do a search for the articles and select the top-1 result, in case
    where the article name is not the exact title.

    Args:
        article_name (str): Name of a wikipedia article

    Returns:
        str: The text of that article.
    """
    article_name_result = wikipedia.page(wikipedia.search(article_name)[0], auto_suggest=False)
    article_name_content = article_name_result.content
    article_name_content.replace("\n", "").replace("\t", "")
    return article_name_content

In [161]:
tea = get_text_of_wiki_page("Tea")

  450.08 ms in get_text_of_wiki_page('Tea')


### Preprocess

In [21]:
import nltk
nltk.download("punkt")
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
tea_sentences = sent_tokenize(tea)
len(tea_sentences)

280

In [167]:
def annotate_sentence(sentence, mention):
    match = re.search(mention.lower(), sentence.lower())
    start, end = match.span()
    sentence = sentence[:start] + " [START_ENT] " + sentence[start:end] + " [END_ENT] " + sentence[end:]
    return sentence

In [163]:
annotate_sentence(tea_sentences[0], "China"), annotate_sentence(tea_sentences[0], "Myanmar")

   13.78 mks in annotate_sentence('Tea is an aromatic be..., 'China')
    9.97 mks in annotate_sentence('Tea is an aromatic be..., 'Myanmar')


('Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern  [START_ENT] China [END_ENT]  and northern Myanmar.',
 'Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern  [START_ENT] Myanmar [END_ENT] .')

### REBEL

In [149]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [168]:
import ast
import torch
import pandas as pd
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 10,
    "num_return_sequences": 10,
}

# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

@print_durations
def extract_relations_rebel(model, tokenizer, text):
    
#     tokenized_sentences = sentence_tokenize(text)
    tokenized_sentences = [text]
    list_triples = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for text in tokenized_sentences:
        model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')
        # Generate
        generated_tokens = model.generate(
            model_inputs["input_ids"].to(model.device),
            attention_mask=model_inputs["attention_mask"].to(model.device),
            **gen_kwargs,
        )

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

        l1 = []
        for idx, sentence in enumerate(decoded_preds):
            #print(f'Prediction triplets sentence {idx}')
            #print(extract_triplets(sentence))
            l1 += extract_triplets(sentence)

        d1 = {}
        ctr = 0


        for x in l1:
            ctr += 1
            if not str(x) in d1:
                d1[str(x)] = 0
            d1[str(x)] += 1

        for x in d1:
            t = x.replace("}", "")
            final_dict = t + ", 'Confidence': " + str(d1[x]/ctr) + "}"
            #print(final_dict)
            final_dictionary = ast.literal_eval(final_dict)
            list_triples.append(final_dictionary)

    return pd.DataFrame(list_triples).sort_values(by="Confidence", ascending=False)

In [173]:
tea_sentences[0], sent_tokenize(tea_sentences[0])

('Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern Myanmar.',
 ['Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern Myanmar.'])

In [169]:
tea_0_triples = extract_relations_rebel(model=model, tokenizer=tokenizer, text=tea_sentences[0])
tea_0_triples

    1.10 s in extract_relations_rebel(model=BartForConditionalGene..., tokenizer=BartTokenizerFast(name..., text='Tea is an aromatic be...)


Unnamed: 0,head,type,tail,Confidence
1,China,shares border with,Myanmar,0.222222
3,Myanmar,shares border with,China,0.222222
0,China,part of,East Asia,0.2
4,East Asia,has part,China,0.133333
2,Myanmar,part of,East Asia,0.088889
5,Myanmar,located on terrain feature,East Asia,0.066667
6,East Asia,has part,Myanmar,0.044444
7,Tea,subclass of,beverage,0.022222


### GENRE

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
genre_tokenizer = AutoTokenizer.from_pretrained("facebook/genre-linking-blink")
genre_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/genre-linking-blink").eval()

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/196 [00:00<?, ?B/s]

In [30]:
def EL_GENRE(annotated_sentences, model, tokenizer):
    """A method to perform entity linking for entity-mentions annotated
    in sentences using the GENRE model.

    ```
    tokenizer = AutoTokenizer.from_pretrained("facebook/genre-linking-blink")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/genre-linking-blink").eval()

    sentences = [
        "[START_ENT] England [END_ENT] won the cricket world cup in 2019",
        "I just finished reading [START_ENT] 'The Jungle Book' [END_ENT]",
        "India is a country in Asia. [START_ENT] It [END_ENT] has a rich cultural heritage"
    ]

    EL_GENRE(annotated_sentences=sentences, model=model, tokenizer=tokenizer)
    
    ```

    Args:
        annotated_sentences (list): A list of sentences annotated with entity-mentions
        model : GENRE model from huggingface hub
        tokenizer : Appropriate tokenizer for GENRE model
    """
    outputs = model.generate(
    **tokenizer(annotated_sentences, return_tensors="pt", padding=True),
    num_beams=5,
    num_return_sequences=1,
    # OPTIONAL: use constrained beam search
    # prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
    )

    entites = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # These entites are in the form of wikipedia page titles. Need to 
    # add the https://dbpedia/resource to each of them as postprocessing step
    return entites

In [41]:
sub, pred, obj, score = tea_0_triples.iloc[0].values

In [150]:
sub_ent = EL_GENRE(annotate_sentence(tea_sentences[0], sub), genre_model, genre_tokenizer)
obj_ent = EL_GENRE(annotate_sentence(tea_sentences[0], obj), genre_model, genre_tokenizer)

In [44]:
sub_ent, obj_ent

(['China'], ['Myanmar'])

### Onto-Embeddings

In [45]:
!pip3 install sentence-transformers gensim SPARQLWrapper

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━

In [170]:
from sentence_transformers import SentenceTransformer
encoder_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_sentence_transformer_model(model_name):
    model = SentenceTransformer(model_name_or_path=model_name)
    return model

@print_durations
def get_embeddings(labels, sent_tran_model):
    embeddings = sent_tran_model.encode(labels, show_progress_bar=False)
    return embeddings

In [54]:
from SPARQLWrapper import SPARQLWrapper, JSON

NS_RESOURCE = 'http://dbpedia.org/resource/'
NS_RESOURCE_LEN = len(NS_RESOURCE)

NS_ONTOLOGY = 'http://dbpedia.org/ontology/'
NS_ONTOLOGY_LEN = len(NS_ONTOLOGY)


def retrieve_tbox(lang='en', offset=0):
    sparql = SPARQLWrapper('http://dbpedia.org/sparql')
    query = f"""
    SELECT ?uri ?label {{
      ?uri a ?type ; rdfs:label ?label .
      values(?type) {{ (owl:Class) (rdf:Property) }}
      filter(lang(?label) = '{lang}' && regex(?uri, "http://dbpedia.org/ontology/"))
    }} LIMIT 10000 OFFSET {offset}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    tbox = {}
    for result in results['results']['bindings']:
        uri = result['uri']['value']
        label = result['label']['value']
        if label not in tbox:
            tbox[label] = set()
        tbox[label].add(uri)
    return tbox

def get_labels_and_tbox():
    offset = 0
    tbox = {}
    while True:
        tbox_chunk = retrieve_tbox(lang='en', offset=offset)
        if len(tbox_chunk) == 0:
            break
        offset += 10000
        for k, v in tbox_chunk.items():
            if k not in tbox:
                tbox[k] = set()
            tbox[k] = tbox[k].union(v)
    labels = [l.replace('\n', ' ') for l in tbox]
    return labels, tbox

def to_uri(label, tbox):
    return list(filter(lambda x: 'A' <= x[NS_ONTOLOGY_LEN : NS_ONTOLOGY_LEN+1] <= 'z', tbox[label]))

def write_embeddings_to_file(embeddings, labels, filename):
    with open(filename, 'w', encoding='utf-8') as f_out:
        f_out.write(f"{len(labels)} {len(embeddings[0])}\n")
        for label, embedding in zip(labels, embeddings):
            f_out.write(f"{label.replace(' ', '_')} {' '.join([str(x) for x in embedding])}\n")
    print("Embeddings written to file successfully")

In [147]:
from gensim.models import KeyedVectors


def ontosim_search(term, gensim_model, sent_tran_model, tbox):
    result = gensim_model.most_similar(
        positive=sent_tran_model.encode([term], show_progress_bar=False), topn=5)
    out = []
    for label, score in result:
        out.append({'label': label.replace('_', ' '), 'score': score})
    df = pd.DataFrame(out)
    df.insert(0, 'URIs', df['label'].map(lambda x: to_uri(x, tbox=tbox)))
    return df

def load_gensim_model_from_file(filepath):
    model = KeyedVectors.load_word2vec_format(filepath, binary=False)
    return model

In [55]:
import time
start = time.time()
labels, tbox = get_labels_and_tbox()
embeddings = get_embeddings(labels, encoder_model)
write_embeddings_to_file(embeddings, labels, "dbpedia-ontology.vectors")
end = time.time()
print(f"Time taken to compute and write embeddings - {end-start} seconds")

Batches:   0%|          | 0/110 [00:00<?, ?it/s]

Embeddings written to file successfully
Time taken to compute and write embeddings - 6.913477182388306 seconds


In [56]:
gensim_model = load_gensim_model_from_file("dbpedia-ontology.vectors")

In [59]:
print(pred)
db_pred = ontosim_search(pred, gensim_model, encoder_model, tbox)
print(db_pred)

shares border with


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

                                            URIs              label     score
0           [http://dbpedia.org/ontology/border]             border  0.728290
1       [http://dbpedia.org/ontology/flagBorder]        flag border  0.631572
2  [http://dbpedia.org/ontology/hasJunctionWith]  has junction with  0.487150
3      [http://dbpedia.org/ontology/linkedSpace]       linked space  0.423550
4    [http://dbpedia.org/ontology/routeJunction]     route junction  0.405855


### End-2-End

In [158]:
@print_durations
def get_triple_from_triple(sub, relation, obj, sentence):
    
    subject_entity = EL_GENRE(
        annotate_sentence(sentence, sub), genre_model, genre_tokenizer)[0]
    subject_entity = "https://dbpedia.org/resource/"+"_".join(subject_entity.split())
    
    object_entity = EL_GENRE(
        annotate_sentence(sentence, obj), genre_model, genre_tokenizer)[0]
    object_entity = "https://dbpedia.org/resource/"+"_".join(object_entity.split())
    
    predicates, label, score = ontosim_search(
        relation, gensim_model, encoder_model, tbox).iloc[0].values
    
    predicate = None
    for p in predicates:
        if p.split("/")[-1][0].islower():
            predicate = p
            break
#     return (subject_entity, (predicate, score), object_entity)
    return (subject_entity, predicate, object_entity)
    

In [154]:
@print_durations
def get_triples_from_sentence(sentence):
    sent_triples =  extract_relations_rebel(model=model, tokenizer=tokenizer, text=sentence)
    triples = {}
    
    for i in range(len(sent_triples)):
        subject, relation, objct, score = sent_triples.iloc[i].values
        triple = get_triple_from_triple(subject, relation, objct, sentence)
        triples[subject+"_"+relation+"_"+objct] = triple
    return triples

In [156]:
triples_s0 = get_triples_from_sentence(tea_sentences[0])

   36.70 s in get_triples_from_sentence('Tea is an aromatic be...)


In [121]:
for t,v in triples_s0.items():
    print(t.replace("_"," "))
    print(v)
    print()

China shares border with Myanmar
('https://dbpedia.org/resource/China', 'http://dbpedia.org/ontology/border', 'https://dbpedia.org/resource/Myanmar')

Myanmar shares border with China
('https://dbpedia.org/resource/Myanmar', 'http://dbpedia.org/ontology/border', 'https://dbpedia.org/resource/China')

China part of East Asia
('https://dbpedia.org/resource/China', 'http://dbpedia.org/ontology/part', 'https://dbpedia.org/resource/East_Asia')

East Asia has part China
('https://dbpedia.org/resource/East_Asia', 'http://dbpedia.org/ontology/part', 'https://dbpedia.org/resource/China')

Myanmar part of East Asia
('https://dbpedia.org/resource/Myanmar', 'http://dbpedia.org/ontology/part', 'https://dbpedia.org/resource/East_Asia')

Myanmar located on terrain feature East Asia
('https://dbpedia.org/resource/Myanmar', None, 'https://dbpedia.org/resource/East_Asia')

East Asia has part Myanmar
('https://dbpedia.org/resource/East_Asia', 'http://dbpedia.org/ontology/part', 'https://dbpedia.org/resou

In [171]:
triples_s200 = get_triples_from_sentence(tea_sentences[200])

  487.28 ms in extract_relations_rebel(model=BartForConditionalGene..., tokenizer=BartTokenizerFast(name..., text='During the Second Wor...)
    5.08 s in get_triple_from_triple('Canadian', 'participant in', 'Second World War', 'During the Second Wor...)
    5.19 s in get_triple_from_triple('British', 'conflict', 'Second World War', 'During the Second Wor...)
    4.93 s in get_triple_from_triple('British', 'participant in', 'Second World War', 'During the Second Wor...)
    5.05 s in get_triple_from_triple('Canadian', 'conflict', 'Second World War', 'During the Second Wor...)
    4.68 s in get_triple_from_triple('Canadian soldiers', 'conflict', 'Second World War', 'During the Second Wor...)
    5.24 s in get_triple_from_triple('British and Canadian ..., 'conflict', 'Second World War', 'During the Second Wor...)
    4.94 s in get_triple_from_triple('Compo', 'conflict', 'Second World War', 'During the Second Wor...)
    5.19 s in get_triple_from_triple('composite ration pack', 'conflict'

In [172]:
for t,v in triples_s200.items():
    print(t.replace("_"," "))
    print(v)
    print()

Canadian participant in Second World War
('https://dbpedia.org/resource/Canada', 'http://dbpedia.org/ontology/participant', 'https://dbpedia.org/resource/World_War_II')

British conflict Second World War
('https://dbpedia.org/resource/United_Kingdom', 'http://dbpedia.org/ontology/conflict', 'https://dbpedia.org/resource/World_War_II')

British participant in Second World War
('https://dbpedia.org/resource/United_Kingdom', 'http://dbpedia.org/ontology/participant', 'https://dbpedia.org/resource/World_War_II')

Canadian conflict Second World War
('https://dbpedia.org/resource/Canada', 'http://dbpedia.org/ontology/conflict', 'https://dbpedia.org/resource/World_War_II')

Canadian soldiers conflict Second World War
('https://dbpedia.org/resource/Canadian_Armed_Forces', 'http://dbpedia.org/ontology/conflict', 'https://dbpedia.org/resource/World_War_II')

British and Canadian soldiers conflict Second World War
('https://dbpedia.org/resource/British_and_Canadian_Army_during_World_War_II', 'htt