# Lesson 03

## Processing Pipelines

https://course.spacy.io/chapter3

In [1]:
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

from spacy.tokens import Doc, Span, Token

# 
from print_util import print_doc_analysis, print_matcher_results

In [2]:
spacy.prefer_gpu()
nlp = English()
nlp = spacy.load("en_core_web_sm")

### 1 Pipeline Attributes

In [3]:
print (nlp.pipe_names)

['tagger', 'parser', 'ner']


In [4]:
print (nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fb2903c2320>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fb261d49408>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fb261d49468>)]


### 4 Custom Pipeline Components

In [5]:
def custom_component(doc):
    print('Doc Length:', len(doc))
    return doc

In [6]:
nlp.add_pipe(custom_component, first=True)
print('Pipeline:', nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [7]:
doc = nlp("Hello World!")

Doc Length: 3


### 6 Simple Components

In [8]:
def length_component(doc):
    doc_length = len(doc)
    print ("doc is {} tokens long.".format(doc_length))
    return doc

In [9]:
# you need to reload this pipeline
nlp = spacy.load("en_core_web_sm")

In [10]:
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

['length_component', 'tagger', 'parser', 'ner']


In [11]:
doc = nlp("this is a sentsnce.")

doc is 5 tokens long.


### 7 Complex Components

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print ("animal patterns:", animal_patterns)

animal patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]


In [14]:
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

In [15]:
def animal_component(doc):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label='animal') for match_id, start, end in matches]
    doc.ents = spans
    return doc

In [16]:
nlp.add_pipe(animal_component, first=True)
print(nlp.pipe_names)

['animal_component', 'tagger', 'parser', 'ner']


In [17]:
doc = nlp("I have a cat and a Golden Retriever")
print ([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'animal'), ('Golden Retriever', 'animal')]


### 9 Setting Extension Attributes (1)

In [19]:
nlp = English()

In [21]:
# Register the token extension
Token.set_extension('is_country', default=False)

In [23]:
doc = nlp("I live in Spain")
doc[3]._.is_country = True

In [25]:
print ([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True)]


#### step 2

In [26]:
def get_reversed(token):
    return token.text[::-1]

In [28]:
Token.set_extension('reversed', getter=get_reversed)

In [32]:
doc = nlp("All generalizations are false, including this one")
for token in doc:
    print (token, " --reversed--> ", token._.reversed)

All  --reversed-->  llA
generalizations  --reversed-->  snoitazilareneg
are  --reversed-->  era
false  --reversed-->  eslaf
,  --reversed-->  ,
including  --reversed-->  gnidulcni
this  --reversed-->  siht
one  --reversed-->  eno


### 10 Setting Extension Attributes

In [33]:
nlp = English()

In [34]:
def to_html(span, tag):
    # Wrap the span in HTML
    return "<{tag}>{text}</{tag}>".format(tag=tag, text=span.text)

In [35]:
Span.set_extension('to_html', method=to_html)

In [37]:
doc = nlp("Hello World, this is a sentence.")
span = doc[0:2]
print (span._.to_html("strong"))

<strong>Hello World</strong>


### 11 Entities & Extensions
set the extension name and the getter

In [71]:
# notice the difference here
# I don't get it
# nlp = spacy.load("en_core_web_sm") # doc.ents = 3 
nlp = spacy.load("en_core_web_lg")
# nlp = English() -- doc.ents = ()

In [72]:
def get_wikipedia_url(span):
    # Get a Wiki URLif apsn has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text

In [73]:
# set the Span extension
Span.set_extension("wikipedia_url", getter=get_wikipedia_url, force=True)

In [74]:
doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture")
print (doc.ents)

(over fifty years, first, David Bowie)


In [75]:
for ent in doc.ents:
    print (ent.text, ent._.wikipedia_url)

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


### 12 Componentts with extensions

In [17]:
with open("countries.json") as f:
    COUNTRIES = json.loads(f.read())

In [18]:
with open("capitals.json") as f:
    CAPITALS = json.loads(f.read())

In [27]:
# this gives you an empty pipeline 
nlp = English()

# this gives you a full pipeline
# nlp = spacy.load("en_core_web_lg")

print (nlp.pipe_names)

[]


In [28]:
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))

In [29]:
def countries_component(doc):
    matches = matcher(doc)
    doc.ents = [Span(doc, start,end, label='GPE') for match_id, start, end in matches]
    return doc

In [30]:
nlp.add_pipe(countries_component, first=True)
print (nlp.pipe_names)   # only one component due to how you created nlp

['countries_component']


In [34]:
# Getter that looks up the CAPITAL in the ditionary given country
get_capital = lambda span: CAPITALS.get(span.text)

Span.set_extension("capital", getter=get_capital, force=True)

In [36]:
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]


### 13 Performance Tips
- use nlp.pipe(LOTS of texts)
- passing a context

In [49]:
# you must have all attributes - you can't leave an attribute missing

data = [
    ('text string 1', {'id': 1, 'restaurant': 'ATL', 'operator': 'Bob'}),
    ('text string 2', {'id': 2, 'restaurant': 'NYC', 'operator': 'Judy'}),
    ('text string 3', {'id': 3, 'restaurant': 'MSY', 'operator': 'corp'})
]

In [47]:
# doc extensions
Doc.set_extension('id', default=None, force=True)
Doc.set_extension('restaurant', default=None, force=True)
Doc.set_extension('operator', default='corp', force=True)

In [50]:
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.restaurant = context['restaurant']
    doc._.operator = context['operator']
    print(doc.text, doc._.restaurant, doc._.operator)

text string 1 ATL Bob
text string 2 NYC Judy
text string 3 MSY corp


In [51]:
doc = nlp.make_doc("Hello World!")

### 14 Processing Streams

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
with open("tweets.json") as f:
    TEXTS = json.loads(f.read())

In [5]:
for text in TEXTS:
    doc = nlp(text)
    print ([token.text for token in doc if token.pos_ == 'ADJ'])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
[]
['terrible', 'gettin', 'payin']


In [10]:
docs = list(nlp.pipe(TEXTS))

In [11]:
# reminder doc.ents = PROPN or entities
for doc in docs:
    print (doc.ents)

(McDonalds,)
(@McDonalds,)
(McDonalds,)
(McDonalds, Spain)
(The Arch Deluxe,)
(WANT, McRib)
(This morning,)


In [14]:
print (docs[0])
print_doc_analysis(docs[0])

McDonalds is my favorite restaurant.
Index: 0 |  is_alpha True | is_punct False | like_num False | is_title False | POS PROPN | Text: McDonalds
Index: 1 |  is_alpha True | is_punct False | like_num False | is_title False | POS VERB | Text: is
Index: 2 |  is_alpha True | is_punct False | like_num False | is_title False | POS DET | Text: my
Index: 3 |  is_alpha True | is_punct False | like_num False | is_title False | POS ADJ | Text: favorite
Index: 4 |  is_alpha True | is_punct False | like_num False | is_title False | POS NOUN | Text: restaurant
Index: 5 |  is_alpha False | is_punct True | like_num False | is_title False | POS PUNCT | Text: .


### 14 - part 3

In [15]:
nlp = English()

In [16]:
people = ["David Bowie", "Angela Merkel", "Lady Gaga", "Jay Duff"]

In [18]:
patterns_1 = [nlp(person) for person in people]
print (type(patterns_1), patterns_1)

<class 'list'> [David Bowie, Angela Merkel, Lady Gaga, Jay Duff]


In [21]:
patterns_2 = list(nlp.pipe(people))
print (type(patterns_2), patterns_2)

<class 'list'> [David Bowie, Angela Merkel, Lady Gaga, Jay Duff]


### 15 Processing Data With Context

In [25]:
with open("bookquotes.json") as f:
    DATA = json.loads(f.read())

In [26]:
nlp = English()

In [27]:
Doc.set_extension("author", default=None)
Doc.set_extension("book", default=None)

In [34]:
for doc, context in nlp.pipe(DATA, as_tuples=True):
    doc._.book = context['book']
    doc._.author = context['author']
    
    # print the text and the custom attributes
    print (doc.text, "\n", "- '{}' by []".format(doc._.book, doc._.author), "\n")

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. 
 - 'Metamorphosis' by [] 

I know not all that may be coming, but be it what it will, I'll go to it laughing. 
 - 'Moby-Dick or, The Whale' by [] 

It was the best of times, it was the worst of times. 
 - 'A Tale of Two Cities' by [] 

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars. 
 - 'On the Road' by [] 

It was a bright cold day in April, and the clocks were striking thirteen. 
 - '1984' by [] 

Nowadays people know the price of everything and the value of nothing. 
 - 'The Picture Of Dorian Gray' by [] 



### 16 Selective Processing

In [41]:
nlp = spacy.load("en_core_web_sm")

In [42]:
text = ("Chick-fil-A is an American fast food restaurant chain headquartered in "
       "the city of College Park, Georgia, specializing in chicken sandwiches."
       )

In [39]:
# part 1
doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [45]:
# part 2
print (nlp.pipe_names)
with nlp.disable_pipes("tagger", "parser"):
    doc = nlp(text)
    print ([token.text for token in doc])

['tagger', 'parser', 'ner']
['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']
