In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re

In [2]:
# pip install networkx pyvis

# 01 - Load Data

In [3]:
# data source: Kaggle - https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus
df1 = pd.read_csv("../Data/ner.csv")
df = df1.head(20)

In [4]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
# Load model and tokenizer
model_name = "Babelscape/rebel-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [6]:
def extract_relations(text):

    inputs = tokenizer(
        text, 
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=3
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print (decoded) 
    
    return str(decoded)

def extract_triplet(text):
    """
    Extract (subject, relation, object) from REBEL-style output.
    """
    # Remove start/end tokens
    text = text.replace("<s>", "").replace("</s>", "").strip()

    # Regex pattern for REBEL format
    pattern = r"<triplet>([^<]+)<subj>([^<]+)<obj>([^<]+)"

    triplets = re.findall(pattern, text)
    #print (triplets)

    #for t in triplets:
    #    print(t)

    return triplets

def relation_extraction(strr):
    temp = extract_relations(strr)
    triples = extract_triplet(temp)
    return (triples)

In [8]:
strrs = "<s><triplet> Kalam <subj> Rameswaram <obj> place of birth</s>"
extract_triplet(strrs)

[(' Kalam ', ' Rameswaram ', ' place of birth')]

In [9]:
txt = 'Kalam was born in Rameswaram'
triplets = relation_extraction(txt)
triplets

<s><triplet> Kalam <subj> Rameswaram <obj> place of birth</s>


[(' Kalam ', ' Rameswaram ', ' place of birth')]

In [10]:
txt = 'Rameswaram is where Kalam was born '
relation_extraction(txt)

<s><triplet> Kalam <subj> Rameswaram <obj> place of birth</s>


[(' Kalam ', ' Rameswaram ', ' place of birth')]

In [11]:
# Apply relation extraction
df["triplets"] = df["Sentence"].apply(relation_extraction)

<s><triplet> war in Iraq <subj> British troops <obj> participant <triplet> British troops <subj> war in Iraq <obj> conflict</s>
<s><triplet> families <subj> soldiers <obj> has part <triplet> soldiers <subj> families <obj> part of</s>
<s><triplet> Hyde Park <subj> Houses of Parliament <obj> has part</s>
<s><triplet> Kraków <subj> Poland <obj> country <subj> Poland <obj> country <triplet> Poland <subj> Kraków <obj> capital <subj> Poland <obj> shares border with <triplet> Poland <subj> Kraków <obj> capital <subj> Poland <obj> shares border with</s>
<s><triplet> annual conference <subj> Brighton <obj> location</s>
<s><triplet> continued deployment of 8,500 British troops in that country <subj> Iraq conflict <obj> part of</s>
<s><triplet> Rome <subj> anti-war protests today <obj> part of <triplet> Paris <subj> anti-war protests today <obj> part of <triplet> Madrid <subj> anti-war protests today <obj> part of</s>
<s><triplet> International Atomic Energy Agency <subj> Vienna <obj> headquarter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["triplets"] = df["Sentence"].apply(relation_extraction)


In [12]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,triplets
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...","[( war in Iraq , British troops , participan..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[( families , soldiers , has part ), ( soldi..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[( Hyde Park , Houses of Parliament , has pa..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[( Kraków , Poland , country ), ( Poland , ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[( annual conference , Brighton , location)]"


In [13]:
df.to_csv('Relations.csv')