<a href="https://colab.research.google.com/github/elenasoria3/prueba_semantica/blob/main/SpaCy_informationextraction_elenasorialopez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extraer conocimiento del texto: Extracción de información con spaCy y Neo4j
https://towardsdatascience.com/extract-knowledge-from-text-end-to-end-information-extraction-pipeline-with-spacy-and-neo4j-502b2b1e0754


In [None]:
# Install the coreference and spaCy-transformers packages at pinned versions, plus the
# Wikipedia and Neo4j client packages (those two are not pinned).
!pip install crosslingual-coreference==0.2.3 spacy-transformers==1.1.5 wikipedia neo4j
!pip install --upgrade google-cloud-storage
# Pin transformers explicitly. The recorded pip output shows a `transformers<4.18.0`
# requirement being resolved to 4.17.0 first; this line then replaces it with 4.18.0 and
# pip prints an error (presumably the dependency-conflict notice — TODO confirm), so the
# order of these commands matters.
!pip install transformers==4.18.0
# Download the small English spaCy model that later cells load with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers<4.18.0,>=3.4.0
  Using cached transformers-4.17.0-py3-none-any.whl (3.8 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.18.0
    Uninstalling transformers-4.18.0:
      Successfully uninstalled transformers-4.18.0
Successfully installed transformers-4.17.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.18.0
  Using cached transformers-4.18.0-py3-none-any.whl (4.0 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.17.0
    Uninstalling transformers-4.17.0:
      Successfully uninstalled transformers-4.17.0
[31mERROR: pip's dep

In [None]:
import spacy
import crosslingual_coreference

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# https://github.com/Babelscape/rebel/blob/main/spacy_component.py
import requests
import re
import hashlib
from spacy import Language
from typing import List

from spacy.tokens import Doc, Span

from transformers import pipeline

def call_wiki_api(item):
    """Look up ``item`` with Wikidata's ``wbsearchentities`` API and return the top hit's id.

    Args:
        item: Entity text to search for.

    Returns:
        The ``id`` of the first search result, or the sentinel string
        ``'id-less'`` when the request fails or yields no usable result.
    """
    url = "https://www.wikidata.org/w/api.php"
    # Hand the query to requests via ``params`` so the search term is URL-encoded;
    # interpolating it into the URL breaks on characters such as '&', '#' or '+'.
    params = {
        "action": "wbsearchentities",
        "search": item,
        "language": "en",
        "format": "json",
    }
    try:
        # The timeout keeps one stalled lookup from hanging the whole pipeline.
        data = requests.get(url, params=params, timeout=10).json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Network failure, non-JSON body, or no search hit: fall back to the sentinel.
        return 'id-less'

def extract_triplets(text):
    """
    Parse the generated text into a list of relation triplets.

    The text is scanned token by token: ``<triplet>`` opens a new head entity,
    ``<subj>`` opens a tail entity and ``<obj>`` opens a relation label. Plain
    tokens are appended to whichever of the three is currently open. Returns a
    list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``.
    """
    def as_triplet():
        # Snapshot of the three accumulators as one output record.
        return {'head': ' '.join(head_words), 'type': ' '.join(label_words), 'tail': ' '.join(tail_words)}

    found = []
    head_words, label_words, tail_words = [], [], []
    mode = None  # which accumulator plain tokens currently feed; None until a marker is seen

    # Drop the tokenizer's sequence markers before splitting on whitespace.
    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            mode = 'head'
            # A pending relation label means the previous triplet is complete.
            if label_words:
                found.append(as_triplet())
                label_words = []
            head_words = []
        elif piece == "<subj>":
            mode = 'tail'
            # Same head, next tail: emit what was collected so far (label is kept).
            if label_words:
                found.append(as_triplet())
            tail_words = []
        elif piece == "<obj>":
            mode = 'label'
            label_words = []
        elif mode == 'head':
            head_words.append(piece)
        elif mode == 'tail':
            tail_words.append(piece)
        elif mode == 'label':
            label_words.append(piece)

    # Flush the last triplet only when all three parts are present.
    if head_words and label_words and tail_words:
        found.append(as_triplet())

    return found


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """spaCy pipeline component that extracts relation triplets sentence by sentence.

    For each sentence of the incoming ``Doc`` a Hugging Face
    ``text2text-generation`` pipeline is run, its output is parsed by
    ``extract_triplets`` and the resulting relations are stored in
    ``doc._.rel``, keyed by a SHA-1 hash of the triplet and annotated with the
    Wikidata id of the head and tail entities.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """Load the generation pipeline and register the ``rel`` Doc extension.

        Args:
            nlp: Pipeline object passed in by the spaCy factory (not used here).
            name: Component name passed in by the spaCy factory (not used here).
            model_name: Model identifier used for both the model and the tokenizer.
            device: Device index forwarded to ``transformers.pipeline``.
        """
        assert model_name is not None, "model_name must be provided"
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        # Cache of entity text -> Wikidata id so each entity is looked up once.
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        """Return the Wikidata id for ``item``, using the in-memory cache when possible."""
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        else:
            res = call_wiki_api(item)
            self.entity_mapping[item] = res
            return res

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Run the generation model on one sentence and parse its output into triplets."""
        # NOTE(review): the indexing below depends on the output layout of the installed
        # transformers version — confirm before upgrading the pinned release.
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """Store ``triplets`` on ``doc._.rel``.

        Triplets are skipped when head and tail are identical or when either
        entity does not occur literally in ``doc.text``. Each stored relation is
        keyed by the SHA-1 of head + tail + type, so repeats are stored once.
        """
        for triplet in triplets:

            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Search for the entities as literal text. The strings are escaped because
            # they are model output, not patterns: an unescaped "C++" raises re.error
            # and "(CIA)" would fail to match its own occurrence in the text.
            head_span = re.search(re.escape(triplet["head"]), doc.text)
            tail_span = re.search(re.escape(triplet["tail"]), doc.text)

            # Skip the relation if either the head or the tail entity is not present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        """Extract and store relations for every sentence of ``doc``; return the same doc."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [None]:
DEVICE = -1 # Number of the GPU, -1 if want to use CPU

# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE})

# Define rel extraction model
# The parser stays enabled: the "rebel" component declares requires=["doc.sents"].
# 'attribute_ruler' was previously misspelled as 'attribute_rules', so that component
# was never actually disabled; the name now matches the one used for `coref` above.
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_ruler', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )



models/crosslingual-coreference/minilm/model.tar.gz: 358490KB [00:11, 31132.62KB/s]
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225M [00:00<?, ?B/s]

Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-st

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

<__main__.RebelComponent at 0x7f1fbea94a00>

# Input Text 1: Stranger Things

In [None]:
input_text = """Stranger Things is an American science fiction horror drama television series created by the Duffer Brothers, who also serve as showrunners and are executive producers along with Shawn Levy and Dan Cohen. Produced by Monkey Massacre Productions and Levy's 21 Laps Entertainment, the first season was released on Netflix on July 15, 2016. Its second, third, and fourth seasons followed in October 2017, July 2019, and May and July 2022, respectively. In February 2022, the series was renewed for a fifth and final season. Set in the 1980s, primarily in the fictional town of Hawkins, Indiana, the series centers on a number of mysteries and supernatural events occurring around the town and their impact on an ensemble of child and adult characters. It stars Winona Ryder, David Harbour, Finn Wolfhard, Millie Bobby Brown, Gaten Matarazzo, Caleb McLaughlin, Natalia Dyer, Charlie Heaton, Cara Buono, Matthew Modine, Noah Schnapp, Sadie Sink, Joe Keery, Dacre Montgomery, Sean Astin, Paul Reiser, Maya Hawke, Priah Ferguson, and Brett Gelman."""

# Run the coreference pipeline and keep its resolved version of the text.
coref_doc = coref(input_text)
coref_text = coref_doc._.resolved_text

# Run the relation-extraction pipeline on the resolved text.
doc = rel_ext(coref_text)

# Print every stored relation as "<key>: <relation dict>".
relations = doc._.rel
for rel_key in relations:
    print(f"{rel_key}: {relations[rel_key]}")

929c1947865c4a6f98cf5d264f7d8ee9145cdf00: {'relation': 'genre', 'head_span': {'text': 'Stranger Things', 'id': 'Q19798734'}, 'tail_span': {'text': 'science fiction', 'id': 'Q24925'}}
a89f7a968c75099ddb2a61b3b6316d0ca4b6b57d: {'relation': 'genre', 'head_span': {'text': 'Stranger Things', 'id': 'Q19798734'}, 'tail_span': {'text': 'horror drama', 'id': 'Q103319035'}}
05a542e242584bf1bc38a6f0ea8f22d45cec3ffc: {'relation': 'creator', 'head_span': {'text': 'Stranger Things', 'id': 'Q19798734'}, 'tail_span': {'text': 'Duffer Brothers', 'id': 'Q26097618'}}
8b1073b6489aa9f945497157ee3ef014e93d7410: {'relation': 'creator', 'head_span': {'text': 'Stranger Things', 'id': 'Q19798734'}, 'tail_span': {'text': 'Shawn Levy', 'id': 'Q323076'}}
cd572439dec552b13b64328c1fe7fda3312fbd45: {'relation': 'creator', 'head_span': {'text': 'Stranger Things', 'id': 'Q19798734'}, 'tail_span': {'text': 'Dan Cohen', 'id': 'Q22108672'}}
65f829f9ead93ee377663ba41e427fd371e32e1b: {'relation': 'notable work', 'head_span'

In [None]:
import os

import pandas as pd
import wikipedia
from neo4j import GraphDatabase

# These three settings must point at each user's own Neo4j project session.
# They are read from the environment (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so the
# notebook can be re-targeted without editing code; the previous literals remain as
# fallbacks so existing runs behave as before.
# NOTE(review): the fallback password is committed in plain text — rotate it and drop the
# fallback before pointing this at anything other than a throwaway sandbox.
host = os.environ.get('NEO4J_URI', 'bolt://44.192.106.246:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'hairpin-tolerance-blows')
driver = GraphDatabase.driver(host,auth=(user, password))

# Cypher statement that imports extracted relations into Neo4j.
# It expects a `$data` parameter: a list of rows shaped like the values of `doc._.rel`
# (keys `relation`, `head_span`, `tail_span`, each span with `text` and `id`).
# For every row it MERGEs one :Entity node for the head and one for the tail, using the
# span id as the node id or, when the id is 'id-less', the span text instead; `text` is
# set only when the node is created. The two nodes are then linked through
# apoc.merge.relationship with the relation name upper-cased and spaces replaced by '_'.
import_query = """
UNWIND $data AS row
MERGE (h:Entity {id: CASE WHEN NOT row.head_span.id = 'id-less' THEN row.head_span.id ELSE row.head_span.text END})
ON CREATE SET h.text = row.head_span.text
MERGE (t:Entity {id: CASE WHEN NOT row.tail_span.id = 'id-less' THEN row.tail_span.id ELSE row.tail_span.text END})
ON CREATE SET t.text = row.tail_span.text
WITH row, h, t
CALL apoc.merge.relationship(h, toUpper(replace(row.relation,' ', '_')),
  {},
  {},
  t,
  {}
)
YIELD rel
RETURN distinct 'done' AS result;
"""


def run_query(query, params=None):
    """Run a Cypher query through the module-level ``driver`` and return the rows as a DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional dict of query parameters; omitted means no parameters.

    Returns:
        ``pandas.DataFrame`` with one row per result record and one column per
        key reported by the result.
    """
    # Build a fresh dict per call instead of sharing one mutable default object.
    if params is None:
        params = {}
    with driver.session() as session:
        result = session.run(query, params)
        # Materialise the records while the session is still open.
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

def store_wikipedia_summary(page):
    """Fetch a Wikipedia page summary, extract its relations and import them into Neo4j.

    The summary is passed through ``coref`` and then ``rel_ext``; the relation
    dicts found on the resulting doc are sent to ``run_query`` with
    ``import_query``. Any exception is caught and reported with ``print``
    instead of being raised.
    """
    try:
        summary = wikipedia.page(page).summary
        resolved = coref(summary)._.resolved_text
        extracted = rel_ext(resolved)
        relations = list(extracted._.rel.values())
        run_query(import_query, {'data': relations})
    except Exception as err:
        print(f"Couldn't parse text for {page} due to {err}")


# Input Text 2: Eleven

In [None]:
input_text2 = """Eleven is the daughter of Teresa "Terry" Ives, and a participant in the Project MKUltra experiments conducted by the Central Intelligence Agency (CIA). Eleven appears to have been born a psychic with notable telekinetic and extrasensory abilities. However, when she uses these abilities to a significant degree, she becomes temporarily weakened and her nose bleeds. At birth, Eleven was taken away from her mother by Dr. Martin Brenner and was raised as a test subject in Hawkins National Laboratory in order to develop her psychokinetic skills. When placed in a sensory deprivation tank, she can use remote viewing to access other dimensions, primarily for the purposes of espionage. In addition, Eleven can open and close portals, known as “Gates”, to a parallel dimension called the Upside Down."""

# Run the coreference pipeline and keep its resolved version of the second text.
coref_doc2 = coref(input_text2)
coref_text2 = coref_doc2._.resolved_text

# Run the relation-extraction pipeline on the resolved text.
doc2 = rel_ext(coref_text2)

# Print every stored relation as "<key>: <relation dict>".
relations2 = doc2._.rel
for rel_key in relations2:
    print(f"{rel_key}: {relations2[rel_key]}")

dd58a2547cf6ed6073332e24a8857765fdfa372e: {'relation': 'operator', 'head_span': {'text': 'Project MKUltra', 'id': 'Q815614'}, 'tail_span': {'text': 'Central Intelligence Agency', 'id': 'Q37230'}}
61d967bf042060a06b19fd96c1b3e59b1b3aa4cb: {'relation': 'subclass of', 'head_span': {'text': 'extrasensory', 'id': 'Q26157035'}, 'tail_span': {'text': 'psychic', 'id': 'Q2917466'}}
c0a9251752b818e8c42518b8d582692b24fb2010: {'relation': 'subclass of', 'head_span': {'text': 'telekinetic', 'id': 'Q186446'}, 'tail_span': {'text': 'extrasensory abilities', 'id': 'id-less'}}
6b4e10322f2ec5b0f52abd3646255fa375f5790b: {'relation': 'residence', 'head_span': {'text': 'Eleven', 'id': 'Q37136'}, 'tail_span': {'text': 'Hawkins National Laboratory', 'id': 'id-less'}}
76c2628019b7e18255d76dccb3bc7f1b96b2ccc2: {'relation': 'use', 'head_span': {'text': 'remote viewing', 'id': 'Q844197'}, 'tail_span': {'text': 'espionage', 'id': 'Q165950'}}
f03f9e26ef417ff62de7b3fe8a91ea56dfb97ae5: {'relation': 'instance of', 'h

In [None]:
# Gather the relation dicts found in the second text and import them into Neo4j.
params = list(doc2._.rel.values())
run_query(import_query, {'data': params})

Unnamed: 0,result
0,done


# Input Text 3: Cast Members and Characters

In [None]:
input_text3 = """Winona Ryder as Joyce Byers,the mother of Will and Jonathan Byers. She is divorced from Lonnie Byers, the father of Will and Jonathan. In season two, she dates her old high school classmate, Bob, until his death later in the season. She and Hopper have feelings for each other.
David Harbour as Jim Hopper,chief of Hawkins Police Department. After his young daughter Sara died of cancer, Hopper divorced and lapsed into alcoholism. Eventually he grows to be more responsible, saving Will Byers after he is taken in season 1, as well as taking Eleven as his adopted daughter. He and Joyce have feelings for each other.
Finn Wolfhard as Mike Wheeler,middle child of Karen and Ted Wheeler, brother of Nancy and Holly, and one of three friends of Will Byers. He is an intelligent and conscientious student and is committed to his friends. He develops romantic feelings for Eleven and later dates her.
Millie Bobby Brown as Eleven / Jane Hopper, a teen girl with telepathic and psychokinetic abilities, a result of being one of Dr. Brenner's subjects from Hawkins National Laboratory. She escapes the lab and eventually becomes an adoptive daughter to Jim Hopper taking his surname, and adjusts to living a normal life with the help of Mike (whom she later dates) and his friends.
Gaten Matarazzo as Dustin Henderson,one of Will Byers' friends. His cleidocranial dysplasia causes him to lisp. In the second season, he is proud of his new front teeth and is attracted to Max. In season 3, he gets a girlfriend, Suzie (portrayed by Gabriella Pizzolo), whom he met at Camp Know Where prior to the start of the season.
Caleb McLaughlin as Lucas Sinclair,one of Will's friends. He is wary of Eleven but later befriends her. In season two, he is one of Max's love interests and eventually becomes her boyfriend in season three. He becomes more popular in season 4 as a result of joining the Hawkins High basketball team, which briefly puts him at odds with his regular friend group.
Natalia Dyer as Nancy Wheeler,daughter of Karen and Ted and older sister of Mike and Holly. Studious and rule-abiding, Nancy finds another side of herself while investigating the Hawkins Lab and the death of her friend Barbara. In the first two seasons, she is the girlfriend of Steve Harrington but breaks up with him and then dates Jonathan Byers. She is an aspiring journalist.
Charlie Heaton as Jonathan Byers,the older brother of Will Byers and the son of Joyce Byers. He is a quiet and kind-hearted teenager, an outsider at school, and an aspiring photographer. He is close with his mother and brother, and he becomes the boyfriend of Nancy Wheeler.
Cara Buono as Karen Wheeler,  mother of Nancy, Mike, and toddler Holly. Karen has a brief fling with Billy in season 3."""

# Run the coreference pipeline and keep its resolved version of the third text.
coref_doc3 = coref(input_text3)
coref_text3 = coref_doc3._.resolved_text

# Run the relation-extraction pipeline on the resolved text.
doc3 = rel_ext(coref_text3)

# Print every stored relation as "<key>: <relation dict>".
relations3 = doc3._.rel
for rel_key in relations3:
    print(f"{rel_key}: {relations3[rel_key]}")

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length


950a8e2b5bdda9e10c5679da4b0d687953b33bb6: {'relation': 'performer', 'head_span': {'text': 'Will', 'id': 'Q155656'}, 'tail_span': {'text': 'Winona Ryder', 'id': 'Q101797'}}
f18829d1d96012e9025e807d69df0462c3f944bc: {'relation': 'sibling', 'head_span': {'text': 'Will', 'id': 'Q155656'}, 'tail_span': {'text': 'Jonathan Byers', 'id': 'Q65515725'}}
f30b5503711c782084bfd0c6c0288353a8017583: {'relation': 'performer', 'head_span': {'text': 'Jonathan Byers', 'id': 'Q65515725'}, 'tail_span': {'text': 'Winona Ryder', 'id': 'Q101797'}}
fa560bfa311b82ee6b49c66ed3b3957b66054578: {'relation': 'sibling', 'head_span': {'text': 'Jonathan Byers', 'id': 'Q65515725'}, 'tail_span': {'text': 'Will', 'id': 'Q155656'}}
7dcd7e45b34a126f49f0a0975d696a856383d89b: {'relation': 'mother', 'head_span': {'text': 'Will', 'id': 'Q155656'}, 'tail_span': {'text': 'Joyce Byers', 'id': 'Q65515261'}}
fd2d08ff33b544f618f6ed0982ce6de62856973e: {'relation': 'mother', 'head_span': {'text': 'Jonathan Byers', 'id': 'Q65515725'}, '

In [None]:
# Gather the relation dicts found in the third text and import them into Neo4j.
params = list(doc3._.rel.values())
run_query(import_query, {'data': params})

Unnamed: 0,result
0,done


In [None]:
#Elena: el código de ladies no se ha ejecutado en este caso, pues no había indicación de si era necesario.

In [None]:


# Cypher statement that enriches the stored entities from Wikidata.
# apoc.periodic.iterate walks every :Entity whose id starts with 'Q' and, for each one,
# builds a SPARQL query for its English label and the English labels of its
# "instance of" (P31) values, requests it from the Wikidata query endpoint, stores the
# label in `wikipedia_name` and links the entity to :Class nodes via INSTANCE_OF.
# The SPARQL text is passed through apoc.text.urlencode: it contains spaces, braces and
# quotes, and concatenating it raw onto the URL made the endpoint answer HTTP 400 for
# every batch (see the previously recorded output: 26 batches, 26 failed).
sparqlQuery = """
CALL apoc.periodic.iterate("
  MATCH (e:Entity)
  WHERE e.id STARTS WITH 'Q'
  RETURN e
","
  // Prepare a SparQL query
  WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + e.id + ') filter (lang(?name) = \\\"en\\\") ' +
     'OPTIONAL {?item wdt:P31 [rdfs:label ?label] .filter(lang(?label)=\\\"en\\\")}}' AS sparql, e
  // make a request to Wikidata
  CALL apoc.load.jsonParams(
    'https://query.wikidata.org/sparql?query=' +
      apoc.text.urlencode(sparql),
      { Accept: 'application/sparql-results+json'}, null)
  YIELD value
  UNWIND value['results']['bindings'] as row
  SET e.wikipedia_name = row.name.value
  WITH e, row.label.value AS label
  MERGE (c:Class {id:label})
  MERGE (e)-[:INSTANCE_OF]->(c)
  RETURN distinct 'done'", {batchSize:1, retry:1})
"""
run_query(sparqlQuery)

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
0,26,26,2,0,26,26,0,{'Server returned HTTP response code: 400 for ...,"{'total': 26, 'committed': 0, 'failed': 26, 'e...","{'total': 26, 'committed': 0, 'failed': 26, 'e...",False,{},"{'nodesDeleted': 0, 'labelsAdded': 0, 'relatio..."
