<h2> Entity recognition </h2>

In [1]:
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from IPython.display import HTML, display
from spacy import displacy

In [2]:
# Relative path to the pre-processed English fables.
file_path = os.path.join('..', 'data', 'processed', 'fables_en.json')

# Read the file as UTF-8 text and decode the JSON payload in one step.
with open(file_path, mode='r', encoding='utf-8') as fables_file:
    english_data = json.loads(fables_file.read())

In [3]:
# Load the English spaCy pipeline "en_core_web_lg"; nlp_en is the pipeline object
# passed to analyze_entities and queried for its NER labels in the cells below.
nlp_en = spacy.load("en_core_web_lg")

In [4]:
def analyze_entities(text, nlp):
    """Run ``nlp`` over ``text`` and list the entities it found.

    Args:
        text: The raw string to analyse.
        nlp: A callable pipeline. Its result must expose ``ents``, each item
            having ``text``, ``label_``, ``start_char`` and ``end_char``.

    Returns:
        A list of ``(text, label, start_char, end_char)`` tuples, in the
        order the entities appear in ``ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_, span.start_char, span.end_char))
    return found

In [5]:
# Sample fable ("The Wolf and the Lamb") used to try out the helper on a single text.
wolf_and_lamb_text = "A wolf once saw a lamb who had wandered away from the flock. He did not want to rush upon the lamb and seize him violently. Instead, he sought a reasonable complaint to justify his hatred.'You insulted me last year, when you were small', said the wolf. The lamb replied,'How could I have insulted you last year? I'm not even a year old.' The wolf continued,'Well, are you not cropping the grass of this field which belongs to me?' The lamb said,'No, I haven't eaten any grass; I have not even begun to graze.' Finally the wolf exclaimed,'But didn't you drink from the fountain which I drink from?' The lamb answered,'It is my mother's breast that gives me my drink.' The wolf then seized the lamb and as he chewed he said,'You are not going to make this wolf go without his dinner, even if you are able to easily refute every one of my charges!"

# Last expression of the cell, so the list of entity tuples is shown as the cell output.
analyze_entities(wolf_and_lamb_text, nlp_en)

[('last year', 'DATE', 205, 214),
 ('last year', 'DATE', 301, 310),
 ('a year old', 'DATE', 325, 335),
 ("said,'No", 'PERSON', 440, 448),
 ("The lamb answered,'It", 'ORG', 598, 619)]

In [6]:
# For every fable: print its title, then one "text - label" line per entity found in its body.
for fable in english_data:
    print(f"Title: {fable['title']}")
    # analyze_entities yields (text, label, start_char, end_char); the offsets are not printed.
    for ent_text, ent_label, *_offsets in analyze_entities(fable['body'], nlp_en):
        print(f"  {ent_text} - {ent_label}")
    print("\n")

Title: The Wolf and the Lamb
  last year - DATE
  last year - DATE
  a year old - DATE
  said,'No - PERSON
  The lamb answered,'It - ORG


Title: The City Mouse and the Country Mouse
  insisted,'My - GPE


Title: The Fox and the Grapes
  the fox remarked,'Oh - FAC


Title: The Wolf and the Crane
  wolf - PERSON
  wolf - PERSON


Title: The Lion and the Mouse on His Mane


Title: The Fox and the Goat in the Well
  fox - ORG
  fox - ORG
  fox - ORG
  fox - ORG




In [7]:
# Parse the sample fable ("The Wolf and the Lamb"); `doc` is also used by the entity view further down.
doc = nlp_en("A wolf once saw a lamb who had wandered away from the flock. He did not want to rush upon the lamb and seize him violently. Instead, he sought a reasonable complaint to justify his hatred.'You insulted me last year, when you were small', said the wolf. The lamb replied,'How could I have insulted you last year? I'm not even a year old.' The wolf continued,'Well, are you not cropping the grass of this field which belongs to me?' The lamb said,'No, I haven't eaten any grass; I have not even begun to graze.' Finally the wolf exclaimed,'But didn't you drink from the fountain which I drink from?' The lamb answered,'It is my mother's breast that gives me my drink.' The wolf then seized the lamb and as he chewed he said,'You are not going to make this wolf go without his dinner, even if you are able to easily refute every one of my charges!")

# displaCy styling for the dependency view.
options = {"compact": True, "bg": "#FFA500", "color": "white", "font": "Helvetica"}

# With jupyter=False the markup is returned as a value, which is then wrapped in
# HTML and displayed explicitly. `HTML` and `display` come from the notebook's
# import cell rather than from an import buried in the middle of this cell.
html = displacy.render(doc, style="dep", options=options, jupyter=False)
display(HTML(html))

In [8]:
# Ask the English pipeline's "ner" component for its labels and list them alphabetically.
ner_labels = nlp_en.get_pipe("ner").labels
# Heading typo fixed ("entitiy" -> "entity") so it matches the Dutch/Spanish cells.
print("Available entity types in spaCy's English model:")
for label in sorted(ner_labels):
    print(f"- {label}")

Available entitiy types in spaCy's English model:
- CARDINAL
- DATE
- EVENT
- FAC
- GPE
- LANGUAGE
- LAW
- LOC
- MONEY
- NORP
- ORDINAL
- ORG
- PERCENT
- PERSON
- PRODUCT
- QUANTITY
- TIME
- WORK_OF_ART


In [9]:
# Entity view of `doc`. With jupyter=False the markup is returned as a value,
# which is wrapped in HTML and displayed explicitly. `HTML` and `display` are
# imported once in the notebook's import cell instead of being re-imported here.
html = displacy.render(doc, style="ent", jupyter=False)
display(HTML(html))

In [10]:
# Load the Dutch pipeline and read the labels from its "ner" component.
nlp_nl = spacy.load("nl_core_news_lg")
dutch_ner = nlp_nl.get_pipe("ner")
ner_labels_nl = dutch_ner.labels

# One "- LABEL" line per entity type, in alphabetical order.
print("Available entity types in Dutch (nl_core_news_lg):")
for entity_type in sorted(ner_labels_nl):
    print("- " + entity_type)


Available entity types in Dutch (nl_core_news_lg):
- CARDINAL
- DATE
- EVENT
- FAC
- GPE
- LANGUAGE
- LAW
- LOC
- MONEY
- NORP
- ORDINAL
- ORG
- PERCENT
- PERSON
- PRODUCT
- QUANTITY
- TIME
- WORK_OF_ART


In [11]:
# Load the Spanish pipeline and read the labels from its "ner" component.
nlp_es = spacy.load("es_core_news_md")
spanish_ner = nlp_es.get_pipe("ner")
ner_labels_es = spanish_ner.labels

# One "- LABEL" line per entity type, in alphabetical order.
print("Available entity types in Spanish (es_core_news_md):")
for entity_type in sorted(ner_labels_es):
    print("- " + entity_type)


Available entity types in Spanish (es_core_news_md):
- LOC
- MISC
- ORG
- PER


In [13]:
# Reload the English pipeline so that re-running this cell does not try to add
# the ruler to a pipeline that already contains it.
# NOTE: this rebinds `nlp_en`; every later use of `nlp_en` goes through the ruler.
nlp_en = spacy.load("en_core_web_lg")

# Token patterns for the rule-based labels used on the fable:
# DATE expressions, LOCATION nouns and the animal CHARACTERs.
# Matching is on the lower-cased token text, so "The lamb" and "the lamb" both match.
animal_patterns = [
    # DATE patterns
    {"label": "DATE", "pattern": [{"LOWER": "last"}, {"LOWER": "year"}]},
    {"label": "DATE", "pattern": [{"LOWER": "this"}, {"LOWER": "year"}]},
    {"label": "DATE", "pattern": [{"LOWER": "a"}, {"LOWER": "year"}, {"LOWER": "ago"}]},

    # LOCATION patterns
    {"label": "LOCATION", "pattern": [{"LOWER": "field"}]},
    {"label": "LOCATION", "pattern": [{"LOWER": "the"}, {"LOWER": "field"}]},
    {"label": "LOCATION", "pattern": [{"LOWER": "this"}, {"LOWER": "field"}]},
    {"label": "LOCATION", "pattern": [{"LOWER": "fountain"}]},
    {"label": "LOCATION", "pattern": [{"LOWER": "the"}, {"LOWER": "fountain"}]},

    # CHARACTER patterns
    {"label": "CHARACTER", "pattern": [{"LOWER": "wolf"}]},
    {"label": "CHARACTER", "pattern": [{"LOWER": "the"}, {"LOWER": "wolf"}]},
    {"label": "CHARACTER", "pattern": [{"LOWER": "lamb"}]},
    {"label": "CHARACTER", "pattern": [{"LOWER": "the"}, {"LOWER": "lamb"}]},
]

# Add the rule-based component by its factory name, ahead of the statistical "ner".
# `overwrite_ents` is passed through the component config (the documented spaCy 3
# setting) instead of being patched onto the component after construction.
# NOTE(review): because the ruler runs before "ner", its spans are set first and
# "ner" still adds its own entities around them (e.g. the PERSON span in the
# output below) - confirm this ordering is what is wanted.
ruler = nlp_en.add_pipe(
    "entity_ruler",
    name="animal_ruler",
    before="ner",
    config={"overwrite_ents": True},
)
ruler.add_patterns(animal_patterns)

# Process the sample fable ("The Wolf and the Lamb") with the extended pipeline.
text = "A wolf once saw a lamb who had wandered away from the flock. He did not want to rush upon the lamb and seize him violently. Instead, he sought a reasonable complaint to justify his hatred.'You insulted me last year, when you were small', said the wolf. The lamb replied,'How could I have insulted you last year? I'm not even a year old.' The wolf continued,'Well, are you not cropping the grass of this field which belongs to me?' The lamb said,'No, I haven't eaten any grass; I have not even begun to graze.' Finally the wolf exclaimed,'But didn't you drink from the fountain which I drink from?' The lamb answered,'It is my mother's breast that gives me my drink.' The wolf then seized the lamb and as he chewed he said,'You are not going to make this wolf go without his dinner, even if you are able to easily refute every one of my charges!"

doc = nlp_en(text)

# Print every entity span with its label.
print("Custom entity recognition:")
for ent in doc.ents:
    print(f"  {ent.text} - {ent.label_}")

Custom entity recognition:
  wolf - CHARACTER
  lamb - CHARACTER
  the lamb - CHARACTER
  last year - DATE
  the wolf - CHARACTER
  The lamb - CHARACTER
  last year - DATE
  a year old - DATE
  The wolf - CHARACTER
  this field - LOCATION
  The lamb - CHARACTER
  said,'No - PERSON
  the wolf - CHARACTER
  the fountain - LOCATION
  The lamb - CHARACTER
  The wolf - CHARACTER
  the lamb - CHARACTER
  wolf - CHARACTER


In [14]:
# Entity view of `doc`. With jupyter=False the markup is returned as a value,
# which is wrapped in HTML and displayed explicitly. `HTML` and `display` are
# imported once in the notebook's import cell instead of being re-imported here.
html = displacy.render(doc, style="ent", jupyter=False)
display(HTML(html))

Trying the same custom pipeline on a different fable now (the fox and the goat in the well).

In [15]:
# Second fable (the fox and the goat), run through the current `nlp_en` pipeline.
text = "A fox had unwittingly fallen down a well and found herself trapped inside its high walls. Meanwhile, a thirsty goat had made his way to that same place and asked the fox whether the water was fresh and plentiful. The fox set about laying her trap. ‘Come down, my friend,’ said the fox. The water is so good that I cannot get enough of it myself!’ The bearded billy-goat lowered himself into the well, whereupon that little vixen leaped up on his lofty horns and emerged from the hole, leaving the goat stuck inside the watery prison."
doc = nlp_en(text)

# One "text - label" line per entity span found in the document.
print("Custom entity recognition:")
for span in doc.ents:
    print("  {} - {}".format(span.text, span.label_))

Custom entity recognition:
  fox - ORG
  fox - ORG
  fox - ORG
  fox - ORG


In [16]:
# Entity view of `doc`. With jupyter=False the markup is returned as a value,
# which is wrapped in HTML and displayed explicitly. `HTML` and `display` are
# imported once in the notebook's import cell instead of being re-imported here.
html = displacy.render(doc, style="ent", jupyter=False)
display(HTML(html))