<a href="https://www.kaggle.com/code/faressayah/spacy-chapter-3-processing-pipelines?scriptVersionId=117768838" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 1. What happens when you call nlp?

- Tokenize the text and apply each pipeline component in order. The tokenizer turns a string of text into a `Doc` object. spaCy then applies every component in the pipeline on document, in order.

# 2. Inspecting the Pipeline

In [1]:
import spacy

print(spacy.__version__)

3.3.2


In [2]:
nlp = spacy.load('en_core_web_sm')

print(nlp.pipe_names)
print(nlp.pipeline)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7f0e25267bb0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7f0e25267a60>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7f0e24faad50>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7f0e24e7c410>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7f0e24e879b0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7f0e24faaa50>)]


# 3. Simple Components


In [3]:
import spacy
from spacy.language import Language

@Language.component("Length")
def length_component(doc):
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    return doc

nlp = spacy.load("en_core_web_sm")

nlp.add_pipe("Length", first=True)
print(nlp.pipe_names)

doc = nlp("This is a sentence.")

['Length', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
This document is 5 tokens long.


# 4. Complex Components

In [4]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language


nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "trutule", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns: ", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

@Language.component("Animal")
def animal_component(doc):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    
    doc.ents = spans
    return doc

nlp.add_pipe("Animal", after="ner")
print(nlp.pipe_names)

doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns:  [Golden Retriever, cat, trutule, Rattus norvegicus]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'Animal']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


# 5. Setting Extension Attributes

In [5]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

Token.set_extension("is_country", default=False)

doc = nlp("I live in Spain.")
doc[3]._.is_country = True

print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [6]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

def get_reversed(token):
    return token.text[::-1]

Token.set_extension("reversed", getter=get_reversed)

doc = nlp("All generalizations are false, including this one.")

for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [7]:
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

def get_has_number(doc):
    return any(token.like_num for token in doc)

Doc.set_extension("has_number", getter=get_has_number)

doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)

has_number: True


In [8]:
from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

def to_html(span, tag):
    return f"<{tag}>{span.text}</{tag}>"

Span.set_extension("to_html", method=to_html)

doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print("to_html", span._.to_html('strong'))

to_html <strong>Hello world</strong>


# 6. Entities and Extensions

In [9]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

def get_wikipedia_url(span):
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search="+entity_text
    
    
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)

for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)

over fifty years None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie
