# Lesson 03

## Processing Pipelines

https://course.spacy.io/chapter3

In [18]:
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

from spacy.tokens import Doc, Span, Token

# 
from print_util import print_doc_analysis, print_matcher_results

In [2]:
# Ask spaCy to use the GPU when one is available; this has to happen before
# the model is loaded.
spacy.prefer_gpu()
# Load the small English model. No blank English() pipeline is created first:
# it would be replaced by this assignment without ever being used.
nlp = spacy.load("en_core_web_sm")

### 1 Pipeline Attributes

In [3]:
# Names of the pipeline components, in execution order.
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [4]:
# (name, component) tuples for every pipeline step.
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fb2903c2320>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fb261d49408>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fb261d49468>)]


### 4 Custom Pipeline Components

In [5]:
def custom_component(doc):
    """Print the length of ``doc`` and hand the same object back.

    Pipeline components must return the doc so the next component can
    continue processing it.
    """
    token_count = len(doc)
    print('Doc Length:', token_count)
    return doc

In [6]:
# first=True places the component ahead of the model's tagger/parser/ner.
nlp.add_pipe(custom_component, first=True)
print('Pipeline:', nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [7]:
# Calling nlp runs every pipeline component, so custom_component prints the
# token count as a side effect.
doc = nlp("Hello World!")

Doc Length: 3


### 6 Simple Components

In [8]:
def length_component(doc):
    """Report how many tokens ``doc`` holds, then return it unchanged."""
    message = "doc is {} tokens long.".format(len(doc))
    print(message)
    return doc

In [9]:
# Reload the model to get a fresh pipeline: the previous nlp object still has
# custom_component registered.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Insert length_component as the first step and show the resulting order.
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

['length_component', 'tagger', 'parser', 'ner']


In [11]:
# Processing a text triggers length_component, which reports the token count.
doc = nlp("this is a sentsnce.")

doc is 5 tokens long.


### 7 Complex Components

In [12]:
# Reload the model so the pipeline starts without length_component.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Terms to recognise, converted to Doc objects so they can be used as
# PhraseMatcher patterns.
animals = [
    "Golden Retriever",
    "cat",
    "turtle",
    "Rattus norvegicus",
]
animal_patterns = list(nlp.pipe(animals))
print("animal patterns:", animal_patterns)

animal patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]


In [14]:
# Build a phrase matcher on the pipeline's vocab and register all animal
# patterns under the "ANIMAL" key. The None argument fills the optional
# on_match callback slot of the spaCy v2 add() signature.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

In [15]:
def animal_component(doc):
    """Mark every phrase-matcher hit in ``doc`` as an 'animal' entity.

    Runs the notebook-level ``matcher`` over the doc, wraps each match in a
    ``Span`` labelled 'animal', and assigns those spans to ``doc.ents``
    (replacing any entities set before). Returns the same doc.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label='animal'))
    doc.ents = animal_spans
    return doc

In [16]:
# Run animal_component before the model's own components.
nlp.add_pipe(animal_component, first=True)
print(nlp.pipe_names)

['animal_component', 'tagger', 'parser', 'ner']


In [17]:
# Process a sample sentence and list each entity with its label.
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'animal'), ('Golden Retriever', 'animal')]


### 9 Setting Extension Attributes (1)

In [19]:
# Blank English pipeline; the extension-attribute cells below only need
# tokenized text.
nlp = English()

In [21]:
# Register the token extension. force=True replaces an existing registration,
# so re-running this cell does not raise an "already exists" error.
Token.set_extension('is_country', default=False, force=True)

In [23]:
doc = nlp("I live in Spain")
# Token index 3 is "Spain"; flag it through the custom extension.
doc[3]._.is_country = True

In [25]:
# Show the extension value for every token in the doc.
print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True)]


#### step 2

In [26]:
def get_reversed(token):
    """Getter returning the characters of ``token.text`` in reverse order."""
    original_text = token.text
    return "".join(reversed(original_text))

In [28]:
# Register a computed token attribute backed by get_reversed. force=True
# replaces an existing registration, so re-running this cell does not raise.
Token.set_extension('reversed', getter=get_reversed, force=True)

In [32]:
# Print each token next to the value computed by the 'reversed' getter.
doc = nlp("All generalizations are false, including this one")
for token in doc:
    print(token, " --reversed--> ", token._.reversed)

All  --reversed-->  llA
generalizations  --reversed-->  snoitazilareneg
are  --reversed-->  era
false  --reversed-->  eslaf
,  --reversed-->  ,
including  --reversed-->  gnidulcni
this  --reversed-->  siht
one  --reversed-->  eno


### 10 Setting Extension Attributes (2)

In [33]:
# Start again from a blank English pipeline.
nlp = English()

In [34]:
def to_html(span, tag):
    """Return ``span.text`` enclosed in an opening and closing ``tag`` element.

    The text is inserted as-is; no HTML escaping is applied.
    """
    template = "<{tag}>{text}</{tag}>"
    return template.format(text=span.text, tag=tag)

In [35]:
# Register to_html as a Span method extension: span._.to_html(tag) calls
# to_html(span, tag). force=True replaces an existing registration, so
# re-running this cell does not raise.
Span.set_extension('to_html', method=to_html, force=True)

In [37]:
# Wrap the first two tokens of the doc in a <strong> element.
doc = nlp("Hello World, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

<strong>Hello World</strong>


### 11 Entities & Extensions
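Set the extension name and the getter: register a `wikipedia_url` extension on `Span` whose getter builds a Wikipedia search URL for the entity.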

In [71]:
# The choice of pipeline determines which entities end up in doc.ents:
#   - spacy.load("en_core_web_sm"): 3 entities were found for the example text
#   - English(): doc.ents == () -- a blank pipeline has no entity recognizer,
#     so nothing is ever added to doc.ents
# The large model is the one used for the cells that follow.
# TODO: work out how the sm and lg predictions differ for the example text.
nlp = spacy.load("en_core_web_lg")

In [72]:
def get_wikipedia_url(span):
    """Return a Wikipedia search URL for a person, organisation or place span.

    Args:
        span: Object exposing ``label_`` (entity label string) and ``text``.

    Returns:
        The search URL as a string when ``span.label_`` is one of the
        supported labels, otherwise ``None``.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote

    # spaCy's English models tag non-GPE locations as "LOC"; "LOCATION" is
    # kept so anything relying on the old label still gets a URL.
    linkable_labels = ("PERSON", "ORG", "GPE", "LOC", "LOCATION")
    if span.label_ not in linkable_labels:
        return None

    entity_text = span.text.replace(" ", "_")
    # Percent-encode characters such as "&" or "#" that would otherwise cut
    # off or corrupt the query string.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text)

In [73]:
# Register a computed Span attribute: span._.wikipedia_url calls
# get_wikipedia_url(span) on access. force=True replaces an existing
# registration, which keeps the cell re-runnable.
Span.set_extension("wikipedia_url", getter=get_wikipedia_url, force=True)

In [74]:
# Process the example text and show the entities the model predicted.
doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture"
)
print(doc.ents)

(over fifty years, first, David Bowie)


In [75]:
# Each entity with its Wikipedia search URL (None for unsupported labels).
for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


### 12 Components with extensions