# Chapter 2

https://course.spacy.io/chapter2

### continued from less02.py
where I switched from python script to a notebook in Jupyter


In [1]:
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

from spacy.tokens import Doc, Span

# 
from print_util import print_doc_analysis, print_matcher_results

In [2]:
spacy.prefer_gpu()
nlp = English()
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp("testing 1, 2, 3, 4")
print_doc_analysis(doc)

Index: 0 |  is_alpha True | is_punct False | like_num False | is_title False | POS VERB | Text: testing
Index: 1 |  is_alpha False | is_punct False | like_num True | is_title False | POS NUM | Text: 1
Index: 2 |  is_alpha False | is_punct True | like_num False | is_title False | POS PUNCT | Text: ,
Index: 3 |  is_alpha False | is_punct False | like_num True | is_title False | POS NUM | Text: 2
Index: 4 |  is_alpha False | is_punct True | like_num False | is_title False | POS PUNCT | Text: ,
Index: 5 |  is_alpha False | is_punct False | like_num True | is_title False | POS NUM | Text: 3
Index: 6 |  is_alpha False | is_punct True | like_num False | is_title False | POS PUNCT | Text: ,
Index: 7 |  is_alpha False | is_punct False | like_num True | is_title False | POS NUM | Text: 4


## 14 - Efficient Phrase Matching

In [4]:

with open("countries.json") as f:
        COUNTRIES = json.loads(f.read())

In [5]:
doc = nlp("Czech Republic may help Slovakia protect its airspace")

In [6]:
matcher = PhraseMatcher(nlp.vocab)

### pipe a list
COUNTRIES is a list  

In [7]:
print('COUNTRIES type:', type(COUNTRIES))

countries_pipe = nlp.pipe(COUNTRIES)
print('nlp.pipe type:', type(countries_pipe))

patterns = list(countries_pipe)
print('patterns type:', type(patterns))

matcher.add("COUNTRY", None, *patterns)

COUNTRIES type: <class 'list'>
nlp.pipe type: <class 'generator'>
patterns type: <class 'list'>


In [8]:
matches = matcher(doc)
print_matcher_results(doc,matches)

0 2 : Czech Republic
4 5 : Slovakia


   ## 15 Extract Countries & Relationships

In [9]:
with open("country_text.txt") as f:
    TEXT = f.read()
    
print ("document length:", type(TEXT), len(TEXT))
# print ('TEXT:', TEXT)

document length: <class 'str'> 4577


In [19]:
nlp = English()   # note - learned this the hardway - you need to re-initialize nlp - not sure why
                  #      - you might have a problem where you didn't clear nlp.pipe?

matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

In [29]:

doc = nlp(TEXT)
print ('initial doc.ents:', type(doc.ents), doc.ents)
print (' - - - - -  - - - -')
# print_doc_analysis(doc)

initial doc.ents: <class 'tuple'> ()
 - - - - -  - - - -


1. Iterate over the matches - create a span with label GPE (geoplitical entity)
2. Overwrite the entities in doc.ents and add the matched span
3. get the matched span root head token
4. print head and span

In [30]:
# doc.ents = () -- that it nothing, given your reintialized nlp
for match_id, start, end in matcher(doc):
    # 1 - create the span
    span = Span(doc, start, end, label="GPE")
    # 2 - overwrite entities
    doc.ents = list(doc.ents) + [span]
    
    # 3 - get the root head token
    span_root_head = span.root.head
    # 4 - print
    doc_ents_length = len(list(doc.ents))
    print(doc_ents_length, span, ' : ', span_root_head.text, "-->", span.text)
    # print (doc.ents)

1 Namibia  :  Namibia --> Namibia
2 South Africa  :  South --> South Africa
3 Cambodia  :  Cambodia --> Cambodia
4 Kuwait  :  Kuwait --> Kuwait
5 Somalia  :  Somalia --> Somalia
6 Haiti  :  Haiti --> Haiti
7 Mozambique  :  Mozambique --> Mozambique
8 Somalia  :  Somalia --> Somalia
9 Rwanda  :  Rwanda --> Rwanda
10 Singapore  :  Singapore --> Singapore
11 Sierra Leone  :  Sierra --> Sierra Leone
12 Afghanistan  :  Afghanistan --> Afghanistan
13 Iraq  :  Iraq --> Iraq
14 Sudan  :  Sudan --> Sudan
15 Congo  :  Congo --> Congo
16 Haiti  :  Haiti --> Haiti


In [31]:
# Print the entities in the document
print (type(doc.ents))
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])

<class 'tuple'>
[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]
