In [1]:
import spacy
print(spacy.__version__)

2.2.2


In [2]:
nlp = spacy.load('en')

In [3]:
nlf = spacy.load('fr')

In [17]:
doc = nlp(u'I am learning how to build chatbots')

In [18]:
for token in doc:
    print(token.text, token.pos_)

I PRON
am AUX
learning VERB
how ADV
to PART
build VERB
chatbots NOUN


In [19]:
doc = nlp(u'I am going to London next week for a meeting.')

In [22]:
for token in doc:
    print(token.text, token.pos_)

I PRON
am AUX
going VERB
to ADP
London PROPN
next ADJ
week NOUN
for ADP
a DET
meeting NOUN
. PUNCT


In [24]:
docs = nlf(u'Je m\'appelle Abdou ' )

In [25]:
for token in docs:
    print(token.text, token.pos_)

Je PRON
m' PRON
appelle AUX
Abdou PROPN


In [26]:
doc = nlp(u'Google release "Move Mirror" AI experiment that matches your pose from 80,000 images')

In [27]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

Google Google PROPN NNP compound Xxxxx True False
release release NOUN NN nmod xxxx True False
" " PUNCT `` punct " False False
Move Move PROPN NNP nmod Xxxx True True
Mirror Mirror PROPN NNP nmod Xxxxx True False
" " PUNCT '' punct " False False
AI AI PROPN NNP compound XX True False
experiment experiment NOUN NN ROOT xxxx True False
that that PRON WDT nsubj xxxx True True
matches match VERB VBZ relcl xxxx True False
your -PRON- PRON PRP$ poss xxxx True True
pose pose NOUN NN dobj xxxx True False
from from ADP IN prep xxxx True True
80,000 80,000 NUM CD nummod dd,ddd False False
images image NOUN NNS pobj xxxx True False


In [28]:
doc = nlp(u'I am learning how to build chatbots')
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

I -PRON- PRON PRP nsubj X True True
am be AUX VBP aux xx True True
learning learn VERB VBG ROOT xxxx True False
how how ADV WRB advmod xxx True True
to to PART TO aux xx True True
build build VERB VB xcomp xxxx True False
chatbots chatbot NOUN NNS dobj xxxx True False


## Stemming and Lemmatization

In [84]:
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

In [49]:
lemma_index = {"verb": ("cope", "cop")}
lemma_exc = {"verb": {"coping": ("cope",)}}
lemma_rules = {"verb": [["ing", ""]]}

In [50]:
lookups = Lookups()

In [51]:
lookups.add_table("lemma_index", lemma_index)
lookups.add_table("lemma_exc", lemma_exc)
lookups.add_table("lemma_rules", lemma_rules)

Table([(6360137228241296794, [['ing', '']])])

In [52]:
lemmatizer = Lemmatizer(lookups)

In [53]:
lemmatizer('chuckles', 'NOUN')

['chuckles']

In [54]:
lemmatizer('blazing', 'VERB')

['blaz']

In [55]:
lemmatizer('fastest', 'ADJ')

['fastest']

In [57]:
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

In [58]:
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
print(porter_stemmer.stem("fastest"))
print(snowball_stemmer.stem("fastest"))

fastest
fastest


## Named-Entity Recognition

In [62]:
my_string = u"Google has its headquarters in Mountain View, California having revenue amounted to 109.65 billion US dollars"
doc = nlp(my_string)
for ent in doc.ents:
    print(ent.text, ent.label_)

Google ORG
Mountain View GPE
California GPE
109.65 billion US dollars MONEY


In [63]:
my_string = u"Mark Zuckerberg born May 14, 1984 in New York is an American technology entrepreneur and philanthropist best known for co-founding and leading Facebook as its chairman and CEO."
doc = nlp(my_string)
for ent in doc.ents:
    print(ent.text, ent.label_)

Mark Zuckerberg PERSON
May 14, 1984 DATE
New York GPE
American NORP
Facebook ORG


In [64]:
my_string = u"I usually wake up at 9:00 AM. 90% of my daytime goes in learning new things."
doc = nlp(my_string)
for ent in doc.ents:
    print(ent.text, ent.label_)

9:00 AM TIME
90% PERCENT


#### Deux Phrases avec des mots similaires mais avec une signification différente

In [69]:
my_string1 = u"Imagine Dragons are the best band."
my_string2 = u"Imagine dragons come and take over the city."
doc1 = nlp(my_string1)
doc2 = nlp(my_string2)
for ent in doc1.ents:
    print(ent.text, ent.label_)

##  Stop Words

In [73]:
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'two', '‘re', 'see', 'it', 'hundred', 'forty', 'afterwards', 'across', 'either', 'whoever', 'become', 'its', 'thereupon', 'else', '’m', 'regarding', 'onto', '’ve', 'cannot', 'most', 'not', 'so', 'am', 'how', 'below', 'until', 'yourselves', 'may', "n't", 'do', 'namely', "'ll", 'please', 'out', 'here', 'yourself', 'during', '‘m', 'becomes', 'herein', 'same', "'ve", 'keep', '’d', 'third', 'elsewhere', '‘ll', 'some', 'although', 'is', 'using', 'latterly', 'they', 'anyhow', 'myself', 'whither', 'nine', 'hers', 'neither', 'everything', 'four', 'while', 'whence', 'formerly', 'nevertheless', 'seemed', 'very', 'take', 'now', 'hence', 'quite', 'such', 'between', 'thence', '’ll', 'whereas', 'in', 'someone', 'around', 'others', 'who', 'full', 'few', 'made', 're', 'whereby', 'together', 'whether', 'than', 'thus', 'get', 'within', "'m", 'often', 'moreover', 'ourselves', 'was', 'but', 'her', 'less', 'as', 'up', 'also', 'beforehand', 'none', 'a', 'show', 'any', 'twenty', 'himself', 'ten', 'themselves

In [74]:
nlp.vocab[u'is'].is_stop

True

In [75]:
nlp.vocab[u'hello'].is_stop

False

In [76]:
nlp.vocab[u'with'].is_stop

True

## Dependency Parsing

In [77]:
doc = nlp(u'Book me a flight from Bangalore to Goa')
blr, goa = doc[5], doc[7]
list(blr.ancestors)

[from, flight, Book]

In [78]:
list(goa.ancestors)

[to, flight, Book]

In [79]:
list(doc[4].ancestors) #doc[4]==flight

[flight, Book]

In [80]:
doc[3].is_ancestor(doc[5])

True

In [92]:
doc = nlp(u'Book a table at the restaurant and the taxi to the hotel')
tasks = doc[2], doc[8] #(table, taxi)
tasks_target = doc[5], doc[11] #(restaurant, hotel)
for task in tasks_target:
    for tok in task.ancestors:
        if tok in tasks:
            print("Booking of {} belongs to {}".format(tok, task))
            break

Booking of table belongs to restaurant


### Interactive Visualization for Dependency Parsing

In [85]:
from spacy import displacy

In [86]:
doc = nlp(u'Book a table at the restaurant and the taxi to the hotel')
displacy.serve(doc, style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [91]:
doc = nlp(u"What are some places to visit in Berlin and stay in Lubeck")
places = [doc[7], doc[11]] #[Berlin, Lubeck]
actions = [doc[5], doc[9]] #[visit, stay]

for place in places:
    for tok in place.ancestors:
        if tok in actions:
            print("User is referring {} to {}".format(place, tok))
            break

User is referring Berlin to visit
User is referring Lubeck to stay


## Noun Chunks

In [93]:
doc = nlp(u"Boston Dynamics is gearing up to produce thousands of robot dogs")
list(doc.noun_chunks)

[Boston Dynamics, thousands, robot dogs]

In [94]:
doc = nlp(u"Deep learning cracks the code of messenger RNAs and protein-coding potential")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Deep learning learning nsubj cracks
the code code dobj cracks
messenger RNAs RNAs pobj of
protein-coding potential potential conj RNAs


## Finding Similarity

In [95]:
doc = nlp(u'How are you doing today?')
for token in doc:
    print(token.text, token.vector[:5])

How [-1.1488694  -1.9446867  -3.3982992  -1.7499528  -0.79467654]
are [-0.19671196 -5.245683   -1.0717657  -0.3345598   0.455879  ]
you [ 0.30779707 -3.2884548   2.9664392  -0.3070859  -0.4289852 ]
doing [-1.036085   -2.7334254  -2.331066   -0.67264587 -1.9747161 ]
today [-0.70118666 -0.54315114  0.54996264 -3.8326564   2.543304  ]
? [-2.408741   3.010358   2.4110384 -2.378118   0.9366494]


In [96]:
hello_doc = nlp(u"hello")
hi_doc = nlp(u"hi")
hella_doc = nlp(u"hella")
print(hello_doc.similarity(hi_doc))
print(hello_doc.similarity(hella_doc))

  "__main__", mod_spec)


0.8586564135458238


  "__main__", mod_spec)


0.3203011220450762


In [97]:
GoT_str1 = nlp(u"When will next season of Game of Thrones be releasing?")
GoT_str2 = nlp(u"Game of Thrones next season release date?")
GoT_str1.similarity(GoT_str2)

  "__main__", mod_spec)


0.7285646101203324

In [99]:
example_doc = nlp(u"car truck google")
for t1 in example_doc:
    for t2 in example_doc:
        similarity_perc = int(t1.similarity(t2) * 100)
        print ("Word {} is {}% similar to word {}".format(t1.text,similarity_perc, t2.text))

Word car is 100% similar to word car


  "__main__", mod_spec)


Word car is 44% similar to word truck


  "__main__", mod_spec)


Word car is 32% similar to word google


  "__main__", mod_spec)


Word truck is 44% similar to word car
Word truck is 100% similar to word truck


  "__main__", mod_spec)


Word truck is 42% similar to word google


  "__main__", mod_spec)


Word google is 32% similar to word car


  "__main__", mod_spec)


Word google is 42% similar to word truck
Word google is 100% similar to word google


In [100]:
doc = nlp(u'Brexit is the impending withdrawal of the U.K. from the European Union.')
for token in doc:
    print(token.text)

Brexit
is
the
impending
withdrawal
of
the
U.K.
from
the
European
Union
.


## Regular Expressions

In [104]:
import re

sentence1 = "Book me a metro from Airport Station to Hong Kong Station."
sentence2 = "Book me a cab to Hong Kong Airport from AsiaWorld-Expo."

from_to = re.compile('.* from (.*) to (.*)')
to_from = re.compile('.* to (.*) from (.*)')

from_to_match = from_to.match(sentence1)
to_from_match = to_from.match(sentence1)

if (from_to_match and from_to_match.groups()):
    _from = from_to_match.groups()[0]
    _to = from_to_match.groups()[1]
    print("from_to pattern matched correctly. Printing values\n")
    print("From: {}, To: {}".format(_from, _to))
elif to_from_match and to_from_match.groups():
    _to = to_from_match.groups()[0]
    _from = to_from_match.groups()[1]
    print("to_from pattern matched correctly. Printing values\n")
    print("From: {}, To: {}".format(_from, _to))

from_to pattern matched correctly. Printing values

From: Airport Station, To: Hong Kong Station.
