# Entity Recognition Pipeline

Implement a trainable end-to-end entity recognition and linking pipeline
that leverages database schemas to query for entities.

[https://spacy.io/api/coref](https://spacy.io/api/coref)

[https://explosion.ai/blog/coref](https://explosion.ai/blog/coref)

In [5]:
!which python

/home/sean/.cache/pypoetry/virtualenvs/promptedgraphs-6U6kQWDY-py3.10/bin/python


In [6]:
!spacy_experimental

/bin/bash: line 1: spacy_experimental: command not found


In [8]:
# !pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl

import spacy
import spacy_experimental
from spacy.tokens import Doc
import pandas as pd
from promptedgraphs.vis import render_entities
from promptedgraphs.models import EntityReference

assert spacy.__version__ == "3.4.4", spacy.__version__

AssertionError: 3.7.2

In [None]:
# !python3 -m spacy download en_core_web_sm

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
nlp_coref = spacy.load("en_coreference_web_trf", vocab=nlp.vocab)

In [None]:
doc = nlp_coref("The cats were startled by the dog as it growled at them.")
doc.spans

In [None]:
# Detach "coref" and "span_resolver" from the "transformer" component they listen to.
# NOTE(review): presumably this gives each component its own copy of the embedding
# layer so it can be sourced into `nlp`, which was loaded from en_core_web_sm —
# confirm against the spaCy `Language.replace_listeners` documentation.
nlp_coref.replace_listeners("transformer", "coref", ['model.tok2vec'])
nlp_coref.replace_listeners("transformer", "span_resolver", ['model.tok2vec'])

# Add the two coreference components to the main `nlp` pipeline, sourced from nlp_coref.
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

# Re-run the example sentence, this time through the combined pipeline.
doc = nlp("The cats were startled by the dog as it growled at them.")

In [None]:
doc.spans

In [None]:
# Define a lightweight function for resolving references in text, excluding generic terms
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output, excluding generic terms.

    Every span group in ``doc.spans`` whose key starts with "coref_cluster" is
    treated as one cluster. Within a cluster, the first mention whose leading
    token is not a pronoun becomes the representative (falling back to the
    first mention if all are pronouns); every other mention is replaced by the
    representative's text. Empty clusters are ignored.

    doc (Doc): The Doc object processed by the coref pipeline.
    RETURNS (str): The Doc string with resolved references.
    """
    # Pronoun forms that must not be chosen as the replacement text.
    generic_terms = {
        'he', 'him', 'she', 'it', 'them', 'his', 'her', 'its', 'their', 'they',
    }

    # token.idx : replacement_text
    token_mention_mapper = {}
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    for cluster in clusters:
        # Guard first: the fallback `cluster[0]` below is evaluated eagerly and
        # would raise IndexError on an empty cluster.
        if len(cluster) == 0:
            continue

        # Find the first non-generic mention in the cluster
        first_mention = next(
            (span for span in cluster if span[0].lower_ not in generic_terms),
            cluster[0],
        )

        for mention_span in cluster:
            if mention_span != first_mention:
                # The whole mention collapses onto its first token, so the
                # spacing that follows must come from the mention's LAST token
                # (e.g. "the dog." must become "Rex.", not "Rex .").
                token_mention_mapper[mention_span[0].idx] = (
                    first_mention.text + mention_span[-1].whitespace_
                )
                for token in mention_span[1:]:
                    # Remaining tokens of the replaced mention emit nothing
                    token_mention_mapper[token.idx] = ""

    # Emit the replacement where one exists, otherwise the original token text
    return "".join(
        token_mention_mapper.get(token.idx, token.text + token.whitespace_)
        for token in doc
    )

In [None]:
# Example usage
# nlp = spacy.load("en_core_web_sm")  # Make sure to load your coreference model or pipeline instead
text = "John said that he would attend the meeting. He arrived late."  # Example text
doc = nlp(text)  # Assuming 'doc' has been processed by a coreference resolution pipeline

resolved_text = resolve_references(doc)
print(resolved_text)

In [None]:
# nlp = spacy.load("en_coreference_web_trf")

# Example sentences, each containing a pronoun for the coref components to resolve.
text = [
    "Philip plays the bass because he loves it.",
    "Sam thanked the doctor for helping him.",
    "Tina drove the car to the shops because they were about to close.",
]

df = pd.DataFrame(text, columns=['text'])

# Run every sentence through the pipeline and store the reference-resolved
# text next to the original.
df['text-coref'] = [resolve_references(coref_doc) for coref_doc in nlp.pipe(df['text'])]

for txt in df['text-coref']:
    print(txt)

In [None]:
doc = next(nlp.pipe(df['text']))

In [None]:
resolve_references(doc)

In [None]:
doc.ents

In [None]:
doc.spans


In [None]:
doc.ents

In [None]:
doc2 = nlp(resolve_references(doc)) 

In [None]:
doc2.spans['coref_clusters_1'][1].start

In [None]:
e = list(doc2.ents)[0]
e.label_

In [None]:
render_entities(doc.text, [EntityReference(e.start_char, e.end_char, label=e.label_, text=e.text) for e in list(doc.ents)])

In [None]:
from promptedgraphs.vis import render_entities

# doc2 was built from the reference-resolved string, so its entity character
# offsets index into doc2.text; rendering them over doc.text misaligns the highlights.
render_entities(doc2.text, [EntityReference(e.start_char, e.end_char, label=e.label_, text=e.text) for e in doc2.ents])

In [None]:
text = "Yesterday, Google announced its own AI chatbot, Bard, a competitor to ChatGPT, developed by OpenAI. However, the tech giant embarrassed itself by sharing an inaccurate information generated with the new platform. As a result, the company's stock plunged pretrading before recouping its losses during the day."
doc = nlp(text)
print(doc.spans)

render_entities(doc.text, [EntityReference(e.start_char, e.end_char, label=e.label_, text=e.text) for e in list(doc.ents)])

In [None]:
from spacy import displacy

displacy.render(doc, style="dep")

In [None]:
render_entities(doc.text, [EntityReference(e.start_char, e.end_char, label=e.label_, text=e.text) for e in list(doc.ents)])

# Natural language understanding steps

1. Understand the sentence structure and coreference resolutions
2. What is the information provided by the question? (Create an ER graph)
3. Map this information to domain-specific schemas we know about (Entity Recognition)
4. What is the query intent?
5. Planning Steps to query information (RAG)

In [None]:
# https://github.com/emorynlp/elit/tree/main
# !poetry add git+https://github.com/python-poetry/poetry.git#develop/dev-candidate-1
!poetry add amrlib

In [None]:
# https://github.com/bjascob/amrlib-models/releases
# !wget -O /usr/local/data/amr_models/model_parse_t5-v0_2_0.tar.gz https://github.com/bjascob/amrlib-models/releases/download/model_parse_t5-v0_2_0/model_parse_t5-v0_2_0.tar.gz
# !tar -xzvf /usr/local/data/amr_models/model_parse_t5-v0_2_0.tar.gz -C /usr/local/data/amr_models/

# https://github.com/bjascob/amrlib-models/releases/download/parse_xfm_bart_base-v0_1_0/model_parse_xfm_bart_base-v0_1_0.tar.gz
# !wget -O /usr/local/data/amr_models/model_parse_xfm_bart_base-v0_1_0.tar.gz https://github.com/bjascob/amrlib-models/releases/download/parse_xfm_bart_base-v0_1_0/model_parse_xfm_bart_base-v0_1_0.tar.gz
# !tar -xzvf /usr/local/data/amr_models/model_parse_xfm_bart_base-v0_1_0.tar.gz -C /usr/local/data/amr_models/

# !wget -O /usr/local/data/amr_models/model_parse_xfm_bart_large-v0_1_0.tar.gz https://github.com/bjascob/amrlib-models/releases/download/parse_xfm_bart_large-v0_1_0/model_parse_xfm_bart_large-v0_1_0.tar.gz
# !tar -xzvf /usr/local/data/amr_models/model_parse_xfm_bart_large-v0_1_0.tar.gz -C /usr/local/data/amr_models/

In [None]:
# amrlib has to be in scope before load_stog_model is called; without this
# import a top-to-bottom run of the notebook fails here with NameError.
import amrlib

# amr_model_dir = '/usr/local/data/amr_models/model_stog'
amr_model_dir = '/usr/local/data/amr_models/model_parse_t5-v0_2_0'
stog = amrlib.load_stog_model(model_dir=amr_model_dir)

In [None]:
import amrlib
graphs = stog.parse_sents(['This is a test of the system.', 'This is a second sentence.'])
for graph in graphs:
    print(graph)

In [None]:
graphs = stog.parse_sents([str(doc.text)])
for graph in graphs:
    print(graph)

In [None]:
type(graph)

In [None]:
import penman

In [None]:
penman_graph = penman.parse(graph)

In [None]:
penman_graph

In [None]:
import amrlib
import spacy
amrlib.setup_spacy_extension()
# nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# The following are roughly equivalent but demonstrate the different objects.
graphs = doc._.to_amr()
for graph in graphs:
    print(graph)

# for span in doc.sents:
#     graphs = span._.to_amr()
#     print(graphs[0])

In [None]:
from   amrlib.graph_processing.amr_plot import AMRPlot
from   amrlib.graph_processing.amr_loading import load_amr_entries
# input_file = 'amrlib/data/LDC2020T02/test.txt'
# # Load the AMR file
# entries = load_amr_entries(input_file)
# entry = entries[125]    # pick an index
# # Plot
plot = AMRPlot()
plot.build_from_graph(graph, debug=False)
plot.view()

In [None]:
!pip install graphviz

In [None]:
import spacy
import networkx as nx
import matplotlib.pyplot as plt

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the function for simple Semantic Role Labeling (SRL)
def simple_srl(sentence, nlp):
    """Collect rough semantic-role candidates from a single sentence.

    sentence: text passed to ``nlp``.
    nlp: callable returning an iterable of tokens that expose ``dep_``,
        ``pos_``, ``text`` and ``lemma_``.
    RETURNS (dict): lists stored under 'subjects', 'verbs', 'objects' and
        'indirect_objects', each in token order. Subjects, objects and
        indirect objects hold token text; verbs hold lemmas.
    """
    tokens = tuple(nlp(sentence))

    def texts_where_dep_has(fragment):
        # Substring match, so "subj" covers labels such as nsubj / nsubjpass.
        return [tok.text for tok in tokens if fragment in tok.dep_]

    return {
        'subjects': texts_where_dep_has("subj"),
        'verbs': [tok.lemma_ for tok in tokens if "VERB" in tok.pos_],
        'objects': texts_where_dep_has("obj"),
        'indirect_objects': texts_where_dep_has("dative"),
    }

def build_and_plot_knowledge_graph_matplotlib(srl_results):
    """Draw a directed subject -> object graph whose edges are labelled with verbs.

    srl_results: iterable of dicts with the keys 'subjects', 'verbs',
        'objects' and 'indirect_objects' (the shape returned by simple_srl).
    RETURNS: None; the figure is displayed through matplotlib.
    """
    graph = nx.DiGraph()

    for entry in srl_results:
        subjects, verbs = entry['subjects'], entry['verbs']
        # Direct objects first, then indirect objects, for every verb.
        targets = [*entry['objects'], *entry['indirect_objects']]

        for source in subjects:
            for verb in verbs:
                for target in targets:
                    graph.add_edge(source, target, label=verb)

    # Fixed seed keeps the layout reproducible between runs.
    layout = nx.spring_layout(graph, seed=20)

    # Nodes and edges
    nx.draw(
        graph,
        layout,
        with_labels=True,
        node_color="skyblue",
        node_size=2000,
        font_size=12,
        font_color="black",
        font_weight="bold",
        arrows=True,
    )

    # Verb labels on the edges
    nx.draw_networkx_edge_labels(
        graph, layout, edge_labels=nx.get_edge_attributes(graph, 'label')
    )

    plt.show()

# Split the module-level `text` into sentences and run simple_srl on each one,
# collecting one role dictionary per sentence.
srl_results = []
for sent in nlp(text).sents:
    result = simple_srl(sent.text, nlp)
    srl_results.append(result)

# Build and plot the knowledge graph with matplotlib
build_and_plot_knowledge_graph_matplotlib(srl_results)

## Better Semantic Role Labeling

https://luheng.github.io/files/acl2017_hllz.pdf

https://github.com/luheng/deep_srl

https://paperswithcode.com/task/semantic-role-labeling