Named entities are "real world objects" that are assigned a name – for example, a person, an organization or a country.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Process the text; the predicted entities are then available on the doc
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

In [None]:
# Look up spaCy's description of the "GPE" label
spacy.explain("GPE")

In [None]:
# Look up spaCy's description of the "NNP" label
spacy.explain("NNP")

In [None]:
# Look up spaCy's description of the "dobj" label
spacy.explain("dobj")

#### Visualizers: https://spacy.io/usage/visualizers

In [None]:
# displacy is spaCy's built-in visualizer
from spacy import displacy

# Render the doc with its named entities highlighted
displacy.render(doc, style="ent")

#### Exercise

In [None]:
import spacy

# Exercise template: each `____` below is a placeholder for the learner to
# replace with real code.
nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = ____

# Iterate over the entities
for ____ in ____.____:
    # Print the entity text and label
    print(____.____, ____.____)

# Get the span for "iPhone X"
iphone_x = ____

# Print the span text
print("Missing entity:", iphone_x.text)

### Rule Matching

Why not just regular expressions?

* Match on Doc objects, not just strings
* Match on tokens and token attributes
* Use a model's predictions
* Example: "duck" (verb) vs. "duck" (noun)

#### Matching Text

In [None]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher: one dict per token, here two consecutive
# tokens whose exact text is "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

In [None]:
# Each match is a (match_id, start, end) tuple; start and end are token
# indices into the doc
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

#### Matching lexical attributes

In [None]:
# Process the example text
doc = nlp("2018 FIFA World Cup: France won!")

In [None]:
# One dict per token, matched in sequence
pattern = [
    {"IS_DIGIT": True},  # a token consisting of digits
    {"LOWER": "fifa"},  # lowercase form is "fifa"
    {"LOWER": "world"},  # lowercase form is "world"
    {"LOWER": "cup"},  # lowercase form is "cup"
    {"IS_PUNCT": True}  # a punctuation token
]

In [None]:
# Add the pattern to the matcher under the key "FIFA_PATTERN"
matcher.add("FIFA_PATTERN", [pattern])

In [None]:
# Call the matcher on the doc
matches = matcher(doc)

In [None]:
# Each match is a (match_id, start, end) tuple; start and end are token
# indices into the doc
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

#### Matching other token attributes

In [None]:
# Process the example text
doc = nlp("I loved dogs but now I love cats more.")

In [None]:
# One dict per token, matched in sequence
pattern = [
    {"LEMMA": "love", "POS": "VERB"},  # a verb whose lemma is "love"
    {"POS": "NOUN"}  # followed by a noun
]

# Add the pattern to the matcher under the key "POS_PATTERN"
matcher.add("POS_PATTERN", [pattern])

In [None]:
# Call the matcher on the doc
matches = matcher(doc)

In [None]:
# Each match is a (match_id, start, end) tuple; start and end are token
# indices into the doc
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

#### Using operators and quantifiers (1)

* !  Negation: match 0 times
* ?  Optional: match 0 or 1 times
* \+  Match 1 or more times
* \* Match 0 or more times

In [None]:
# Process the example text
doc = nlp("I bought a smartphone. Now I'm buying apps.")

In [None]:
# One dict per token, matched in sequence
pattern = [
    {"LEMMA": "buy"},  # a token whose lemma is "buy"
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}  # a noun
]

In [None]:
# Register under its own key: "POS_PATTERN" is already used for the
# love-plus-noun pattern added earlier, and reusing it would file both
# patterns under the same match ID
matcher.add("BUY_PATTERN", [pattern])

In [None]:
# Call the matcher on the doc
matches = matcher(doc)

In [None]:
# Each match is a (match_id, start, end) tuple; start and end are token
# indices into the doc
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

### Exercise

#### Number 1:

In [None]:
import spacy

# Import the Matcher
from spacy.____ import ____

# Exercise template: each `____` below is a placeholder for the learner to
# replace with real code.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = ____(____.____)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [____]

# Add the pattern to the matcher
____.____("IPHONE_X_PATTERN", ____)

# Use the matcher on the doc
matches = ____
# Slice the doc with each match's start and end to list the matched texts
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

#### Number 2:

In [None]:
import spacy
from spacy.matcher import Matcher

# Exercise template: each `____` below is a placeholder for the learner to
# replace with real code.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": ____}, {"POS": ____}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

#### Number 3:

In [None]:
import spacy
from spacy.matcher import Matcher

# Exercise template: each `____` below is a placeholder for the learner to
# replace with real code.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": ____}, {"POS": ____}, {"POS": ____, "OP": ____}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)