### Tokenization using spacy

In [3]:
import spacy
import nltk

In [8]:
# Load the "en_core_web_md" English pipeline; the cells below call it as `nlp`.
nlp = spacy.load(name="en_core_web_md")

In [9]:
# Sample sentence that the following cells tokenize.
text = "Microsoft is shit"
print(text)

Microsoft is shit


In [11]:
# Run the pipeline on the text; iterating the resulting Doc yields its tokens,
# which are printed one per line.
doc = nlp(text)
print("\n".join(token.text for token in doc))

Microsoft
is
shit


In [12]:
# Number of tokens in the Doc.
len(doc)

3

In [13]:
# Size of the vocabulary object attached to the Doc (doc.vocab) — this is
# not the number of tokens in the sentence.
len(doc.vocab)

766

### Indexing and slicing in tokens

In [16]:
# Tokenize a longer sentence and print each token by position.
sentence = nlp("Ferrari is the best f1 team in the world!")
for position in range(len(sentence)):
    print(sentence[position].text)

Ferrari
is
the
best
f1
team
in
the
world
!


In [17]:
# Slice out the first five tokens.
sentence[:5]

Ferrari is the best f1

In [18]:
# Negative indices count from the end, so -2 is the token just before the final "!".
sentence[-2]

world

### Named Entities

In [19]:
# Sentence with a company name and a money amount, used by the entity cells below.
sentence2 = nlp("Apple is developing a car for around $300k")

In [21]:
# sentence2.ents holds the named entities found by the spaCy pipeline;
# print the text of each one.
for entity in sentence2.ents:
    print(entity.text)

Apple
around $
300k


In [22]:
# For each entity show its text, its label, and spaCy's explanation of that
# label, followed by blank lines as a separator.
for ent in sentence2.ents:
    label = ent.label_
    print(ent, label, str(spacy.explain(label)), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


around $
MONEY
Monetary values, including unit


300k
ORG
Companies, agencies, institutions, etc.




### Noun Chunks

In [23]:
# A noun chunk is a noun together with the words that describe it.
sentence3 = nlp("It is difficult to insure autonomous cars")
for noun_phrase in sentence3.noun_chunks:
    print(noun_phrase.text)

It
autonomous cars


### Visualizing tokens

In [24]:
from spacy import displacy

In [29]:
# Draw the dependency parse of the sentence inline in the notebook.
doc = nlp("Apple is developing a car for around $300k")
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 100},
)

In [30]:
# Render the same Doc again, this time with the entity visualizer.
displacy.render(doc, style='ent', jupyter=True)

### Token Attributes

In [39]:
# Parse the example sentence whose tokens are inspected in the cells below.
doc = nlp("Apple is developing a car for around $300k")

In [40]:
# First token of the Doc.
token0 = doc[0]
token0

Apple

In [43]:
# Does the token consist of alphabetic characters?
token0.is_alpha

True

In [44]:
# Does the token look like a number?
token0.like_num

False

In [45]:
# Token at index 7, the "$" sign (note: despite the name `token1`, this is not doc[1]).
token1 = doc[7]
token1

$

In [46]:
# Is the token a currency symbol?
token1.is_currency

True

In [47]:
# Print a one-line summary of a few token attributes for every token in the Doc.
for token in doc:
    summary = (
        token, "==>",
        "index: ", token.i,
        "is_alpha:", token.is_alpha,
        "is_punct:", token.is_punct,
        "like_num:", token.like_num,
        "is_currency:", token.is_currency,
    )
    print(*summary)

Apple ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
is ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
developing ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False
a ==> index:  3 is_alpha: True is_punct: False like_num: False is_currency: False
car ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
for ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
around ==> index:  6 is_alpha: True is_punct: False like_num: False is_currency: False
$ ==> index:  7 is_alpha: False is_punct: False like_num: False is_currency: True
300k ==> index:  8 is_alpha: False is_punct: False like_num: False is_currency: False


### Customizing Tokenizer

In [48]:
from spacy.symbols import ORTH

# Start from a blank English pipeline. Note that this rebinds the global
# `nlp` that earlier cells bound to "en_core_web_md".
nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [piece.text for piece in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [49]:
# Register a tokenizer special case so that "gimme" is split into "gim" + "me",
# then re-tokenize the same sentence to see the effect.
special_pieces = [{ORTH: part} for part in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", special_pieces)
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [piece.text for piece in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

### Exercise
(1) *Think Stats* is a free book for studying statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf).

This book references many websites from which you can download free datasets. You are an NLP engineer working for a company, and you want to collect all the dataset websites mentioned in this book. To keep the exercise simple, you are given one paragraph from the book and should extract all URLs from it using spaCy.

In [62]:
# Paragraph used for the URL exercise (this rebinds `text`).
# Note that one URL is broken across two lines ("http://www.science." / "gov/"),
# so it cannot be matched as a single token.
text="""
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
"""

In [63]:
# Run the current `nlp` pipeline over the exercise paragraph.
exercise_doc = nlp(text)

In [72]:
# Print only the tokens that spaCy flags as URL-like.
for token in exercise_doc:
    if not token.like_url:
        continue
    print(token)

http://www.data.gov/
http://www.science
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.


(2) Extract all money transactions from the sentence below, along with their currency. The output should be:

two $

500 €

In [74]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc_transaction = nlp(transactions)
n = len(doc_transaction)

# Look at each pair of adjacent tokens. A transaction is a number-like token
# ("two", "500") immediately followed by a currency symbol. Requiring
# `like_num` on the first token avoids reporting an arbitrary word that
# merely happens to precede a currency symbol.
# Iterating up to n - 1 keeps i + 1 in range without a per-iteration check.
for i in range(n - 1):
    amount, currency = doc_transaction[i], doc_transaction[i + 1]
    if amount.like_num and currency.is_currency:
        print(amount, currency)

two $
500 €


### Stemming in NLTK

In [32]:
from nltk.stem import PorterStemmer
# Single PorterStemmer instance reused by the stemming cell below.
stemmer = PorterStemmer()

In [35]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

# The stemmer returns a stem, which need not be a dictionary word
# (e.g. "ability" -> "abil") and does not resolve irregular forms
# ("ate" stays "ate"). The output is therefore labelled "Stem", not "Lemma".
for word in words:
    print("Original word: ", word, " | ", "Stem: ", stemmer.stem(word))

Original word:  eating  |  Lemma:  eat
Original word:  eats  |  Lemma:  eat
Original word:  eat  |  Lemma:  eat
Original word:  ate  |  Lemma:  ate
Original word:  adjustable  |  Lemma:  adjust
Original word:  rafting  |  Lemma:  raft
Original word:  ability  |  Lemma:  abil
Original word:  meeting  |  Lemma:  meet


### Lemmatization in Spacy

In [36]:
# Bind `nlp` to the "en_core_web_md" pipeline again (earlier in the notebook
# it was rebound to a blank pipeline).
nlp = spacy.load(name="en_core_web_md")

doc = nlp("Mando talked for 3 hours although talking isn't his thing")

# Show each token next to its lemma, e.g. "talked" -> "talk".
for word in doc:
    print("Original word: ", word, " | ", "Lemma: ", word.lemma_)

Original word:  Mando  |  Lemma:  Mando
Original word:  talked  |  Lemma:  talk
Original word:  for  |  Lemma:  for
Original word:  3  |  Lemma:  3
Original word:  hours  |  Lemma:  hour
Original word:  although  |  Lemma:  although
Original word:  talking  |  Lemma:  talk
Original word:  is  |  Lemma:  be
Original word:  n't  |  Lemma:  not
Original word:  his  |  Lemma:  his
Original word:  thing  |  Lemma:  thing


### Customizing Lemmatizer

In [37]:
# Names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [38]:
# Fetch the pipeline's attribute ruler so custom token attributes can be added.
ar = nlp.get_pipe('attribute_ruler')

# Give the slang forms "Bro" and "Brah" the lemma "Brother".
slang_patterns = [[{"TEXT": slang}] for slang in ("Bro", "Brah")]
ar.add(slang_patterns, {"LEMMA": "Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(f"{token.text} | {token.lemma_}")

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhausted


### Part of Speech (POS)

In [75]:
from spacy.attrs import POS

In [77]:
doc = nlp("Mando talked for 3 hours although talking isn't his thing")

# count_by(POS) maps each part-of-speech id to how often it occurs in the Doc.
count = doc.count_by(POS)

# Look each id up in the vocab to print its text next to the frequency.
# NOTE(review): the saved output (" | 11") shows an empty tag name covering all
# 11 tokens, which suggests this cell was last run while `nlp` was the blank
# pipeline. Confirm `nlp` is "en_core_web_md" here and re-run.
for pos_id, occurrences in count.items():
    print(doc.vocab[pos_id].text, "|", occurrences)

 | 11
