In [1]:
# load spacy library
import spacy

In [2]:
# load a pipeline
nlp = spacy.load("en_core_web_sm")

In [3]:
# print the name of the pipeline components
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
# print the full pipeline: a list of (name, component) tuples
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x11753c5e0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x124229d60>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x1241415f0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x124430b40>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x12443f9c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x1241417b0>)]


In [5]:
# now let's create a custom pipeline component. First, import the required class
from spacy.language import Language

In [6]:
# then define a custom pipeline component
@Language.component("custom_component")
def custom_component_function(doc):
    """Report the length of the Doc, then hand it on unchanged.

    Registered with spaCy under the component name "custom_component".
    """
    n_tokens = len(doc)
    print("Doc length:", n_tokens)
    return doc

In [7]:
# and finally add this component to the nlp pipeline, before the "ner" component
nlp.add_pipe("custom_component", before="ner")


<function __main__.custom_component_function(doc)>

In [8]:
# Then check all the pipeline component names again:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'custom_component', 'ner']


In [9]:
# Let's process some text. When the custom_component runs, it prints the length of the doc
doc = nlp("Hello word! Today we will do our best!")

Doc length: 10


In [10]:
# As a reminder, there are 10 tokens in the doc. Iterating over a Doc yields
# its tokens, so list(doc) collects them without a manual append loop.
list_tokens = list(doc)

print(list_tokens)

[Hello, word, !, Today, we, will, do, our, best, !]


In [11]:
# Now let's look for animals in a document and
# - find the matching Spans using PhraseMatcher (which matches patterns given in the form of Doc objects)
# - and add them to doc.ents (the known entities)

In [12]:
# import the needed libraries
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [13]:
# load a fresh pipeline (a new nlp object, without custom_component)
nlp = spacy.load("en_core_web_sm")

In [14]:
# tokenize the text that will be processed
doc = nlp("I have a cat and a Golden Retriever. I do not have a Snake or an Elephant!")

In [15]:
# a list of animals to be found
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus", "Elephant"]
print(animals)

['Golden Retriever', 'cat', 'turtle', 'Rattus norvegicus', 'Elephant']


In [16]:
# Turn every animal name into a Doc to use as a PhraseMatcher pattern.
# Only tokenization is needed for that (a PhraseMatcher created without an
# `attr` argument compares the verbatim token text), so stream the strings
# through the tokenizer instead of running the full pipeline on each one.
# If the matcher is ever created with another attr (e.g. "LEMMA"), build the
# patterns with nlp.pipe(animals) again so that attribute is set.
animal_patterns = list(nlp.tokenizer.pipe(animals))
print("animal_patterns:", animal_patterns)

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus, Elephant]


In [17]:
# add the new patterns to a matcher:
# first, create the PhraseMatcher object using the shared vocabulary
matcher = PhraseMatcher(nlp.vocab)
# and then add the animal patterns
matcher.add("ANIMAL", animal_patterns)

In [18]:
# Define the custom component
@Language.component("animal_component")
def animal_component_function(doc):
    """Label every animal phrase found in ``doc`` as an "ANIMAL" entity.

    Runs the module-level ``matcher`` over the Doc, wraps each match in a
    Span and replaces ``doc.ents`` with those spans, so any entities that
    were already set on the Doc are discarded.

    Args:
        doc: The Doc being processed by the pipeline.

    Returns:
        The same Doc, with ``doc.ents`` set to the matched animal spans.
    """
    # Each match is a (match_id, start, end) triple; only the token offsets are needed
    spans = [Span(doc, start, end, label="ANIMAL") for _, start, end in matcher(doc)]
    # doc.ents raises a ValueError when spans overlap, so keep only the
    # longest non-overlapping spans before overwriting the entities
    doc.ents = spacy.util.filter_spans(spans)
    return doc

In [19]:
# before continuing, let's look at what is inside each object

In [20]:
# matcher is a PhraseMatcher object that uses nlp.vocab,
# and nlp is the trained spaCy pipeline en_core_web_sm
print(matcher)

<spacy.matcher.phrasematcher.PhraseMatcher object at 0x1245265f0>


In [21]:
# Search the doc for the animal patterns. 3 matches are found.
matches = matcher(doc)
print(matches)
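# The list contains 3 items. Every item holds 3 pieces of information: the match_id, the start token index and the end token index (exclusive)
# the match_id is the hash value of the pattern name ("ANIMAL"), not of the matched text, which is why it is the same for all three matches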

[(6303828839600189595, 3, 4), (6303828839600189595, 6, 8), (6303828839600189595, 17, 18)]


In [22]:
# Then we will rebuild doc.ents based only on the animal_patterns matches
# We will create one Span for every match in the matches list
# Remember that a Span is a slice of the Doc object
spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
# Overwrite the doc.ents with the matched spans
doc.ents = spans

In [23]:
print(doc.ents)

(cat, Golden Retriever, Elephant)


In [24]:
# let's add the animal_component_function after the "ner" pipeline component
nlp.add_pipe("animal_component", after="ner")

<function __main__.animal_component_function(doc)>

In [25]:
# And now let's run just the spaCy pipeline to get the animals found and their labels
doc = nlp("I have a cat and a Golden Retriever. I do not have a Snake or an Elephant!")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL'), ('Elephant', 'ANIMAL')]
