# NLP Basics with spacy

In [1]:
import spacy

In [2]:
import en_core_web_sm

## Spacy basics

In [3]:
# This nlp is the pipeline object with which we use different techniques
# This is loading a model, and we loaded an nlp english model
nlp = spacy.load("en_core_web_sm")

In [5]:
# This will parse this in different tokens
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 million")

In [6]:
doc

Tesla is looking at buying U.S. startup for $6 million

In [7]:
# token.pos_ -> provides the type of english character it is
# dep_ -> syntactic dependency
for token in doc:
    print(token,token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [8]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x28c7feccf70>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x28c7ee9c2e0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x28c7ee9c160>)]

In [9]:
for token in doc:
    print(token.lemma_)

Tesla
be
look
at
buy
U.S.
startup
for
$
6
million


In [10]:
doc2 = nlp(u"Leverage agile frameworks to provide a robust synopsis for high leveloverviews. Iterative approaches to corporate strategy foster collaborativethinking to further the overall value proposition. Organically grow theholistic world view of disruptive innovation via workplace diversity andempowerment.")

In [11]:
quote = doc2[16:30]

In [12]:
print(quote)

strategy foster collaborativethinking to further the overall value proposition. Organically grow theholistic world


In [13]:
type(quote)

spacy.tokens.span.Span

In [14]:
# Getting different sentences
for sentence in doc2.sents:
    print(sentence)

Leverage agile frameworks to provide a robust synopsis for high leveloverviews.
Iterative approaches to corporate strategy foster collaborativethinking to further the overall value proposition.
Organically grow theholistic world view of disruptive innovation via workplace diversity andempowerment.


In [15]:
# checking the vocab of the language loaded 
len(doc.vocab)

531

In [16]:
# Document objects does not support reassignment like doc[2] = "something"

In [17]:
# Getting named entity
doc3 = nlp(u"Apple to build a factory in Hong Kong for $6 million.")

In [18]:
for entity in doc3.ents:
    print(entity, entity.label_)

Apple ORG
Hong Kong GPE
$6 million MONEY


In [19]:
from spacy import displacy

In [20]:
displacy.render(doc3, style='ent', jupyter=True)

# Stemming
### porter's algorithm

In [21]:
import nltk
from nltk.stem.porter import PorterStemmer

In [22]:
p_stemmer = PorterStemmer()

In [23]:
words = ["run", "runner", "ran", "runs","easily", "fairly","fairness"]

In [24]:
for word in words:
    print(f"{word} -----> {p_stemmer.stem(word=word)}")

run -----> run
runner -----> runner
ran -----> ran
runs -----> run
easily -----> easili
fairly -----> fairli
fairness -----> fair


### Snowball stemmer

In [25]:
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer(language="english")

In [26]:
for word in words:
    print(f"{word} -----> {snowball_stemmer.stem(word=word)}")

run -----> run
runner -----> runner
ran -----> ran
runs -----> run
easily -----> easili
fairly -----> fair
fairness -----> fair


# Lemmatization

In [27]:
doc_l = nlp(u"I am a runner running in the race because I love to run since i ran today")

In [28]:
for token in doc_l:
    print(token.text, "\t", token.pos_, "\t", token.lemma_)

I 	 PRON 	 -PRON-
am 	 AUX 	 be
a 	 DET 	 a
runner 	 NOUN 	 runner
running 	 VERB 	 run
in 	 ADP 	 in
the 	 DET 	 the
race 	 NOUN 	 race
because 	 SCONJ 	 because
I 	 PRON 	 -PRON-
love 	 VERB 	 love
to 	 PART 	 to
run 	 VERB 	 run
since 	 SCONJ 	 since
i 	 PRON 	 i
ran 	 VERB 	 run
today 	 NOUN 	 today


# Stop words

In [29]:
print(nlp.Defaults.stop_words)

{'should', 'there', 'together', 'has', 'make', 'same', 'us', 'somehow', 'yourselves', 'yet', 'your', 'who', 'formerly', 'eleven', 'due', 'some', 'whom', 'with', 'then', 'hereafter', 'regarding', 'between', 'else', 'indeed', 'those', 'cannot', 'whereby', 'namely', 'five', 'besides', 'off', 'which', 'its', 'one', 'it', 'least', 'before', 'please', 'what', 'call', 'mostly', 'seeming', 'until', 'across', 'my', '‘m', 'thus', 'a', 'whoever', 'seems', 'although', 'latterly', 'herein', 'less', 'none', 'something', 'thereupon', 'could', 'beside', 'via', 'here', 'well', 'amongst', 'without', 'three', 'last', 'most', 'yours', 'six', 'into', 'used', 're', 'whereafter', 'various', '‘s', 'was', 'might', 'amount', 'sometime', 'two', 'see', 'among', 'during', 'nevertheless', 'through', 'am', 'few', 'behind', 'anything', '’re', 'per', 'back', 'still', 'but', 'once', '’ll', 'get', 'anyway', 'wherein', 'up', 'thru', 'hundred', 'under', 'ca', 'when', 'while', 'whereas', 'itself', 'quite', 'she', 'is', 'fo

In [30]:
len(nlp.Defaults.stop_words)

326

In [31]:
# Adding a custom stop word
nlp.Defaults.stop_words.add("btw")

In [32]:
nlp.vocab["btw"].is_stop

True

In [33]:
len(nlp.Defaults.stop_words)

327

In [34]:
# Removing a stopword
nlp.Defaults.stop_words.remove("already")

In [35]:
len(nlp.Defaults.stop_words)

326

## Phrase Matching and Vocabulary
So far we've seen how a body of text is divided into tokens, and how individual tokens are parsed and tagged with parts of speech, dependencies and lemmas.

In this section we will identify and label specific phrases that match patterns we can define ourselves. 

In [36]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [37]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [38]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [39]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


This found both two-word patterns, with and without the hyphen!

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>

# Assesment

In [40]:
with open("..\sample_files\owlcreek.txt") as f:
    owl_doc = nlp(f.read())

In [41]:
print(owl_doc[:36])

AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  


### <b>How many tokens are in the file</b>
### How many sentences are in the file

In [42]:
print(len(owl_doc))

4835


In [44]:
sent_list3 = []
for senten in owl_doc.sents:
    sent_list.append(senten)
print(len(sent_list3))

0


# Named Entity Recognition

In [48]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(f"{ent.text} - {ent.label_} - {str(spacy.explain(ent.label_))}")
    else:
        print("No Entities found")

In [49]:
show_ents(doc_l)

today - DATE - Absolute or relative dates or periods


In [52]:
show_ents(doc3)

Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [57]:
doc_ent = nlp("May I go to the washington DC. next may to see my friend josh and see the washington monument")

In [59]:
show_ents(doc_ent)

washington DC - GPE - Countries, cities, states
next may - DATE - Absolute or relative dates or periods
josh - PERSON - People, including fictional
washington - GPE - Countries, cities, states


In [60]:
doc3

Apple to build a factory in Hong Kong for $6 million.

In [3]:
import spacy
nlp_en = spacy.load('en_core_web_sm')

In [4]:
doc = nlp_en("The quick brown fox jumps over the lazy dog") 
print(doc[2:4])

brown fox
