In [None]:
!git clone https://github.com/thunlp/OpenNRE.git
!cp -r OpenNRE/opennre opennre
!pip install transformers

fatal: destination path 'OpenNRE' already exists and is not an empty directory.


In [None]:
import spacy
import opennre

## Named-Entity Recognition

In [None]:
# Load the `en_core_web_sm` spaCy pipeline once; the functions in this notebook
# read `nlp` as a module-level global.
nlp = spacy.load("en_core_web_sm")
def info_retrieval(document):
    """Group the named entities that spaCy finds in ``document`` by entity label.

    Only entities labelled ``ORG``, ``PERSON``, ``GPE`` and ``DATE`` are kept;
    entities with any other label are ignored. Every ``DATE`` entity is
    reported under ``"Years:"``, whatever kind of date expression it is.

    Args:
        document: The text to analyse.

    Returns:
        A list of four ``(heading, texts)`` tuples, always in the order
        organizations, persons, locations, years. ``texts`` holds the entity
        strings in the order they appear in the document.
    """
    # One bucket per label of interest; every other label is skipped.
    texts_by_label = {"ORG": [], "PERSON": [], "GPE": [], "DATE": []}

    for entity in nlp(document).ents:
        bucket = texts_by_label.get(entity.label_)
        # Compare against None explicitly: an empty bucket is falsy but valid.
        if bucket is not None:
            bucket.append(entity.text)

    return [
        ("Organizations:", texts_by_label["ORG"]),
        ("Persons:", texts_by_label["PERSON"]),
        ("Locations:", texts_by_label["GPE"]),
        ("Years:", texts_by_label["DATE"]),
    ]

In [None]:
# Try the NER helper on a sentence that contains an organization, a person,
# two places and a date.
info_retrieval("Apple Inc. was founded by Steve Jobs in Cupertino, California, in 1976.")

[('Organizations:', ['Apple Inc.']),
 ('Persons:', ['Steve Jobs']),
 ('Locations:', ['Cupertino', 'California']),
 ('Years:', ['1976'])]

## Relationship Extraction

In [None]:
# Load the OpenNRE model named 'wiki80_cnn_softmax'; the relation-extraction
# function below reads `model` as a module-level global.
model = opennre.get_model('wiki80_cnn_softmax')
def relation_extraction_opennre(document):
    """Predict the relation between the first two distinct entities in ``document``.

    spaCy finds the entities; the OpenNRE ``model`` is then asked for the
    relation with the first entity as head and the second as tail, using their
    character spans inside ``document``.

    Args:
        document: The sentence to analyse.

    Returns:
        A ``(subject, relation, object)`` tuple of strings, or ``None`` when
        spaCy finds fewer than two distinct entity texts (so there is no pair
        to classify).
    """
    doc = nlp(document)

    # Keyed by surface text, so repeated mentions of the same string collapse
    # into one entry that keeps the span of the last mention.
    entities = {}
    for ent in doc.ents:
        entities[ent.text] = {'pos': (ent.start_char, ent.end_char)}

    if len(entities) < 2:
        return None

    # Only the first ordered pair is ever returned, so run inference for that
    # pair alone instead of for every pair of entities.
    names = list(entities)
    subj, obj = names[0], names[1]
    relation_info = model.infer({'text': document, 'h': entities[subj], 't': entities[obj]})
    # The first element of the model's result is the relation label.
    return (subj, relation_info[0], obj)

In [None]:
# Try the relation extractor on the same example sentence; it reports a single
# (subject, relation, object) triple.
relation_extraction_opennre("Apple Inc. was founded by Steve Jobs in Cupertino, California, in 1976.")

('Apple Inc.', 'owned by', 'Steve Jobs')

### NER and RE on the given text data

In [None]:
# NOTE: this rebinds the global `model` to the same model name that the earlier
# relation-extraction cell already loaded.
model = opennre.get_model('wiki80_cnn_softmax')

def relation_extraction_opennre_from_xml(document):
    """Predict a relation for each ``<p1>…</p1>`` / ``<p2>…</p2>`` pair in ``document``.

    The tags are removed and the model is run on the whole remaining sentence,
    with the character spans of the two tagged entities as head and tail. The
    words around the entities are therefore part of the model input, rather
    than only the two entity strings joined by a space.

    Args:
        document: Text in which the head entity is wrapped in ``<p1>`` tags and
            the tail entity in ``<p2>`` tags.

    Returns:
        A list of ``(p1, relation, p2)`` tuples, one per tagged pair (the i-th
        ``<p1>`` is paired with the i-th ``<p2>``). ``p1`` and ``p2`` are the
        tag contents exactly as written, including any padding whitespace.
        The list is empty when either tag is missing.
    """
    tagged = re.compile(r'<(p1|p2)>(.*?)</\1>')

    pieces = []       # fragments of the tag-free sentence
    clean_len = 0     # length of the tag-free sentence built so far
    cursor = 0        # read position in the tagged input
    mentions = {'p1': [], 'p2': []}

    for match in tagged.finditer(document):
        before = document[cursor:match.start()]
        pieces.append(before)
        clean_len += len(before)

        raw = match.group(2)
        # Tags are often padded ("<p1> Google </p1>"); keep the padding in the
        # sentence but point the span at the entity itself.
        lead = len(raw) - len(raw.lstrip())
        start = clean_len + lead
        mentions[match.group(1)].append((raw, (start, start + len(raw.strip()))))

        pieces.append(raw)
        clean_len += len(raw)
        cursor = match.end()

    pieces.append(document[cursor:])
    text = ''.join(pieces)

    relationships = []
    for (p1, p1_pos), (p2, p2_pos) in zip(mentions['p1'], mentions['p2']):
        relation_info = model.infer({'text': text, 'h': {'pos': p1_pos}, 't': {'pos': p2_pos}})
        # The first element of the model's result is the relation label.
        relationships.append((p1, relation_info[0], p2))

    return relationships

In [None]:
import re
import pandas as pd

# Read the tagged sentences, one per line.
with open('/content/sample_data/text.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# One entry per input line in each list, so they line up as DataFrame columns.
p1_list = []
p2_list = []
ner_list = []
relationship_list = []

for line in lines:
    # Take the tagged entities from this line only: a line without a tag gets ""
    # instead of silently inheriting the previous line's entity.
    p1_match = re.search(r'<p1>(.*?)</p1>', line)
    p1 = p1_match.group(1) if p1_match else ""
    p2_match = re.search(r'<p2>(.*?)</p2>', line)
    p2 = p2_match.group(1) if p2_match else ""

    # Relation extraction needs the tags, so it runs on the untouched line.
    relationship_list.append(relation_extraction_opennre_from_xml(line))

    # NER runs on the plain sentence with every <...> tag removed.
    plain_line = re.sub(r'<.*?>', '', line)
    ner_list.append(info_retrieval(plain_line))

    p1_list.append(p1)
    p2_list.append(p2)

df = pd.DataFrame({'Input':lines, 'p1': p1_list, 'p2': p2_list, 'NER': ner_list,'Relationship Extraction': relationship_list})

df.head()

Unnamed: 0,Input,p1,p2,NER,Relationship Extraction
0,<p1> Google </p1> assimilates <p2> YouTube </p...,Google,YouTube,"[(Organizations:, [Google, YouTube]), (Persons...","[( Google , headquarters location, YouTube )]"
1,finally <p1> google </p1> bought <p2> youtube ...,google,youtube,"[(Organizations:, []), (Persons:, []), (Locati...","[( google , headquarters location, youtube )]"
2,<p1> Google </p1> actually bought <p2> YouTube...,Google,YouTube,"[(Organizations:, [Google, YouTube]), (Persons...","[( Google , headquarters location, YouTube )]"
3,<p1> Google </p1> Closes <p2> YouTube </p2> Ac...,Google,YouTube,"[(Organizations:, [Google Closes YouTube Ac...","[( Google , headquarters location, YouTube )]"
4,"never even saw this coming , but apparently <p...",Adobe,Macromedia,"[(Organizations:, [Macromedia]), (Persons:, []...","[( Adobe , developer, Macromedia )]"


In [None]:
# Save the full results table as 'output.xlsx' (path relative to the working
# directory), leaving out the DataFrame index column.
df.to_excel('output.xlsx', index=False)