# Lesson 03

## Processing Pipelines

https://course.spacy.io/chapter3

In [None]:
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

from spacy.tokens import Doc, Span, Token

# 
from print_util import print_doc_analysis, print_matcher_results

In [None]:
# Prefer the GPU when one is available, then load the small English pipeline.
# The former `nlp = English()` line was dropped: it was overwritten immediately
# by the spacy.load() call and had no effect.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

### 1 Pipeline Attributes

In [None]:
# Names of the components in the loaded pipeline.
print(nlp.pipe_names)

In [None]:
# The pipeline itself, as (name, component) pairs.
print(nlp.pipeline)

### 4 Custom Pipeline Components

In [None]:
def custom_component(doc):
    """Print the length of ``doc`` and return it unchanged.

    A pipeline component receives the doc and must hand it back so the next
    component can continue working on it.
    """
    token_count = len(doc)
    print('Doc Length:', token_count)
    return doc

In [None]:
# Insert the component at the front of the pipeline, then show the new order.
nlp.add_pipe(custom_component, first=True)
print('Pipeline:', nlp.pipe_names)

In [None]:
# Processing a text runs every pipeline component, including the custom one.
doc = nlp("Hello World!")

### 6 Simple Components

In [None]:
def length_component(doc):
    """Report how many tokens ``doc`` contains and return it unchanged."""
    print(f"doc is {len(doc)} tokens long.")
    return doc

In [None]:
# Reload the pipeline so components added in earlier cells are gone before
# the next custom component is added.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Register length_component as the first pipeline step and list the components.
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

In [None]:
# Running the pipeline triggers length_component, which prints the token count.
doc = nlp("this is a sentsnce.")

### 7 Complex Components

In [None]:
# Start again from a freshly loaded pipeline without the earlier custom components.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Phrases to recognise; nlp.pipe turns each string into a Doc that can be
# handed to the PhraseMatcher as a pattern.
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal patterns:", animal_patterns)

In [None]:
# The matcher must share the pipeline's vocab. The second argument of add()
# is the optional on-match callback (None here), followed by the pattern Docs.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

In [None]:
def animal_component(doc):
    """Label every phrase found by the module-level ``matcher`` as an 'animal' entity.

    The doc's entities are replaced with the matched spans and the doc is
    returned so the pipeline can continue.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label='animal'))
    doc.ents = animal_spans
    return doc

In [None]:
# Place animal_component at the front of the pipeline and show the resulting order.
nlp.add_pipe(animal_component, first=True)
print(nlp.pipe_names)

In [None]:
doc = nlp("I have a cat and a Golden Retriever")
# Each entity as a (text, label) pair.
print([(ent.text, ent.label_) for ent in doc.ents])

### 9 Setting Extension Attributes (1)

In [None]:
# A bare English pipeline is enough for the extension-attribute examples.
nlp = English()

In [None]:
# Register the token extension with a default of False.
# force=True overwrites an existing registration, so re-running this cell does
# not raise; this matches the other set_extension calls in this notebook.
Token.set_extension('is_country', default=False, force=True)

In [None]:
doc = nlp("I live in Spain")
# Flag the token at index 3; every other token keeps the extension's default.
doc[3]._.is_country = True

In [None]:
# Token text alongside its is_country flag.
print([(token.text, token._.is_country) for token in doc])

#### step 2

In [None]:
def get_reversed(token):
    """Return the text of ``token`` with its characters in reverse order."""
    return "".join(reversed(token.text))

In [None]:
# Getter-backed token attribute: the value comes from get_reversed.
# force=True overwrites an existing registration so the cell can be re-run.
Token.set_extension('reversed', getter=get_reversed, force=True)

In [None]:
doc = nlp("All generalizations are false, including this one")
# Read the getter-backed `reversed` attribute of every token.
for token in doc:
    print(token, " --reversed--> ", token._.reversed)

### 10 Setting Extension Attributes (2)

In [None]:
# Fresh bare English pipeline for the method-extension example.
nlp = English()

In [None]:
def to_html(span, tag):
    """Wrap the text of ``span`` in an opening and closing ``tag`` element."""
    return f"<{tag}>{span.text}</{tag}>"

In [None]:
# Method extension: span._.to_html(tag) calls to_html(span, tag).
# force=True overwrites an existing registration so the cell can be re-run.
Span.set_extension('to_html', method=to_html, force=True)

In [None]:
doc = nlp("Hello World, this is a sentence.")
# Slice of the first two tokens.
span = doc[0:2]
print(span._.to_html("strong"))

### 11 Entities & Extensions
set the extension name and the getter

In [None]:
# The pipeline chosen here decides what ends up in doc.ents:
#   - spacy.load("en_core_web_sm") gave 3 entities (author's observation)
#   - English() gave doc.ents == () (presumably because that pipeline has no
#     entity recognizer -- confirm)
# The large model is the one actually used.
nlp = spacy.load("en_core_web_lg")

In [None]:
def get_wikipedia_url(span):
    """Return a Wikipedia search URL for an entity span, or None.

    Args:
        span: Object exposing ``label_`` and ``text`` (a spaCy ``Span``).

    Returns:
        The search URL as a string when the span is labelled as a person,
        organisation or place; ``None`` for any other label.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    # "LOC" is the label spaCy's English models use for non-GPE locations;
    # "LOCATION" is kept so existing behaviour is preserved.
    if span.label_ not in ("PERSON", "ORG", "GPE", "LOC", "LOCATION"):
        return None

    entity_text = span.text.replace(" ", "_")
    # Percent-encode so characters such as "&" or "#" cannot break the query.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe="")

In [None]:
# Register a getter-backed Span attribute; force=True overwrites any earlier
# registration so the cell can be re-run.
Span.set_extension("wikipedia_url", getter=get_wikipedia_url, force=True)

In [None]:
doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture"
)
# Entities found by the loaded pipeline.
print(doc.ents)

In [None]:
# Each entity next to the value of its wikipedia_url extension.
for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)

### 12 Components with extensions

In [None]:
# Read the file as UTF-8 explicitly so any non-ASCII text decodes the same way
# on every platform, and let json parse straight from the file object.
with open("countries.json", encoding="utf-8") as f:
    COUNTRIES = json.load(f)

In [None]:
# Read the file as UTF-8 explicitly so any non-ASCII text decodes the same way
# on every platform, and let json parse straight from the file object.
with open("capitals.json", encoding="utf-8") as f:
    CAPITALS = json.load(f)

In [None]:
# English() gives you an empty pipeline
nlp = English()

# Alternative: spacy.load gives you a full pipeline
# nlp = spacy.load("en_core_web_lg")

print(nlp.pipe_names)

In [None]:
# One pattern Doc per country name, registered under the "COUNTRY" key
# (None means no on-match callback).
matcher = PhraseMatcher(nlp.vocab)
country_patterns = nlp.pipe(COUNTRIES)
matcher.add("COUNTRY", None, *country_patterns)

In [None]:
def countries_component(doc):
    """Label every phrase found by the module-level ``matcher`` as a 'GPE' entity.

    The doc's entities are replaced with the matched spans and the doc is
    returned so the pipeline can continue.
    """
    country_spans = []
    for _match_id, start, end in matcher(doc):
        country_spans.append(Span(doc, start, end, label='GPE'))
    doc.ents = country_spans
    return doc

In [None]:
nlp.add_pipe(countries_component, first=True)
# only one component is listed because of how nlp was created
print(nlp.pipe_names)

In [None]:
def get_capital(span):
    """Look up the capital for the country named by ``span``.

    Returns the CAPITALS entry for the span's text, or None when there is none.
    """
    return CAPITALS.get(span.text)


Span.set_extension("capital", getter=get_capital, force=True)

In [None]:
# Report each entity with its label and the value of its capital extension.
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

### 13 Performance Tips
- use nlp.pipe(LOTS of texts)
- passing a context

In [None]:
# Each item is a (text, context) pair. Every context dict must carry all of
# the attributes - you can't leave an attribute missing.
data = [
    ("text string 1", {"id": 1, "restaurant": "ATL", "operator": "Bob"}),
    ("text string 2", {"id": 2, "restaurant": "NYC", "operator": "Judy"}),
    ("text string 3", {"id": 3, "restaurant": "MSY", "operator": "corp"}),
]

In [None]:
# Doc-level extensions, one per context key. force=True overwrites earlier
# registrations so the cell can be re-run. Note that 'operator' defaults to
# 'corp' while the other two default to None.
Doc.set_extension('id', default=None, force=True)
Doc.set_extension('restaurant', default=None, force=True)
Doc.set_extension('operator', default='corp', force=True)

In [None]:
# With as_tuples=True, nlp.pipe takes (text, context) pairs and yields
# (doc, context) pairs, so each context dict stays attached to its Doc.
for doc, context in nlp.pipe(data, as_tuples=True):
    # Copy the context values onto the Doc's custom attributes.
    doc._.id = context['id']
    doc._.restaurant = context['restaurant']
    doc._.operator = context['operator']
    print(doc.text, doc._.restaurant, doc._.operator)

In [None]:
# make_doc turns the text into a Doc with the tokenizer only; the pipeline
# components are not run.
doc = nlp.make_doc("Hello World!")

### 14 Processing Streams

In [None]:
# Load the small English pipeline for the stream-processing examples.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the file as UTF-8 explicitly so any non-ASCII text decodes the same way
# on every platform, and let json parse straight from the file object.
with open("tweets.json", encoding="utf-8") as f:
    TEXTS = json.load(f)

In [None]:
# Process the texts one at a time and print the adjectives found in each.
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == 'ADJ'])

In [None]:
# Process all texts as a stream with nlp.pipe and keep the resulting Docs.
docs = list(nlp.pipe(TEXTS))

In [None]:
# reminder: doc.ents holds the entity spans of a doc (Span objects), which is
# not the same thing as the tokens tagged PROPN
for doc in docs:
    print(doc.ents)

In [None]:
# Show the first processed doc, then its analysis via the print_util helper.
print(docs[0])
print_doc_analysis(docs[0])

### 14 - part 3

In [None]:
# Bare English pipeline; used below only to turn names into pattern Docs.
nlp = English()

In [None]:
# Names to be converted into pattern Docs.
people = ["David Bowie", "Angela Merkel", "Lady Gaga", "Jay Duff"]

In [None]:
# Variant 1: call nlp once per name.
patterns_1 = [nlp(person) for person in people]
print(type(patterns_1), patterns_1)

In [None]:
# Variant 2: process all names as one stream with nlp.pipe.
patterns_2 = list(nlp.pipe(people))
print(type(patterns_2), patterns_2)

### 15 Processing Data With Context

In [None]:
# Read the file as UTF-8 explicitly so any non-ASCII text decodes the same way
# on every platform, and let json parse straight from the file object.
with open("bookquotes.json", encoding="utf-8") as f:
    DATA = json.load(f)

In [None]:
# Bare English pipeline for the context-processing example.
nlp = English()

In [None]:
# Doc-level attributes filled from each quote's context.
# force=True overwrites an existing registration, so re-running this cell does
# not raise; this matches the other Doc.set_extension calls in this notebook.
Doc.set_extension("author", default=None, force=True)
Doc.set_extension("book", default=None, force=True)

In [None]:
# With as_tuples=True, nlp.pipe takes (text, context) pairs and yields
# (doc, context) pairs.
for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Copy the quote's metadata onto the Doc's custom attributes.
    doc._.book = context['book']
    doc._.author = context['author']

    # Print the text and the custom attributes. The second placeholder used to
    # be "[]", so the author was never interpolated into the output.
    print(doc.text, "\n", "- '{}' by {}".format(doc._.book, doc._.author), "\n")

### 16 Selective Processing

In [None]:
# Load the small English pipeline for the selective-processing examples.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample sentence; the adjacent string literals are joined into one string.
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

In [None]:
# part 1: make_doc only tokenizes the text; no pipeline components are run.
doc = nlp.make_doc(text)
print([token.text for token in doc])

In [None]:
# part 2: temporarily switch off the tagger and parser. The components are
# restored when the with-block exits.
print(nlp.pipe_names)
with nlp.disable_pipes("tagger", "parser"):
    doc = nlp(text)
    print([token.text for token in doc])