In [1]:
import spacy

In [2]:
# import spacy and load the language library

nlp=spacy.load('en_core_web_sm')



In [3]:
# create a Doc object

doc=nlp(u'Tesla is looking at buying u.s. startup for $6 million')

In [4]:
# print each token seperately

for token in doc:
    print(token.text, token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
u.s PROPN compound
. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1443486b0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1443495b0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x14b57e030>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x14b7d5fd0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x14b7d8dd0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x14b57e180>)]

In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
# Create a string that includes opening and closing quotations marks

mystring = '"we\' re moving to L.A.!"'
print(mystring)

"we' re moving to L.A.!"


In [8]:
# Create a doc object and exlore tokens
doc=nlp(mystring)

In [9]:
for token in doc:
    print(token.text, end='|')

"|we|'|re|moving|to|L.A.|!|"|

In [10]:
# Prefixes, suffixes and infixes
doc2=nlp(u"we're here to help! send snail-mail, email support@oursite.com or visit us at htto://www.oursite.com!")

In [11]:
for t in doc2:
    print(t)

we
're
here
to
help
!
send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
htto://www.oursite.com
!


In [12]:
doc3=nlp(u' A skm NYC cab ride costs $10.30')

In [13]:
for t in doc3:
    print(doc3)

 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30
 A skm NYC cab ride costs $10.30


In [14]:
# Exceptions

doc4 = nlp(u"Let's visit st. Louis in the U.S. next year.")

In [15]:
for t in doc4:
    print(t)

Let
's
visit
st
.
Louis
in
the
U.S.
next
year
.


In [16]:
# Counting tokens
len(doc)

9

In [17]:
# Tokens can be retrieved by index position and slice
doc5=nlp(u'It is better to give than to receive')

In [18]:
# Retrieve the third Token:
doc5[2]

better

In [19]:
doc5[2:5]

better to give

In [20]:
doc5[-4:]

give than to receive

In [21]:
# Tokens cannot be reassigned

In [22]:
doc6=nlp(u'My dinner was horrible.')
doc7=nlp(u'Your dinner was delicious.')

In [23]:
doc6[3],doc7[3]

(horrible, delicious)

In [24]:
# Named entities
doc8=nlp(u'Apple to build a Hong Kong factory for $6 million')

In [25]:
for token in doc8:
    print(token.text, end='|')
print('\n----')
for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+'-'+str(spacy.explain(ent.label_)))

Apple|to|build|a|Hong|Kong|factory|for|$|6|million|
----
Apple - ORG-Companies, agencies, institutions, etc.
Hong Kong - GPE-Countries, cities, states
$6 million - MONEY-Monetary values, including unit


In [26]:
#STOP WORDS

In [27]:
# Perform standard imports
import spacy

In [28]:
nlp = spacy.load('en_core_web_sm')

In [29]:
print(nlp.Defaults.stop_words)

{'several', 'within', 'before', 'those', 'must', 'somehow', 'a', 'do', 'please', 'less', 'together', 'could', 'noone', 'call', 'due', 'fifteen', 'been', 'as', 'around', 'might', 'sometime', 'with', 'few', 'his', 'serious', 'it', 'can', 'besides', 'itself', 'whereafter', 'whole', 'he', 'over', '‘s', 'enough', 'either', 'show', 'all', 'am', 'eleven', 'then', 'ten', 'say', 'be', 'yours', 'everywhere', 'own', 'whenever', 'however', 'various', 'using', 'bottom', 'former', 'we', 'same', 'next', 'your', 'twenty', 'whence', 'back', 'her', 'seemed', 'mine', '‘ll', 'used', 'each', 'is', 'what', 'herself', 'who', 'does', 'unless', 'cannot', 'full', 'if', 'off', 'up', 'yet', 'in', 'against', 'and', 'anyone', 'done', 'hence', 'that', 'hereby', 'about', 'eight', 'how', 'indeed', 'out', 'something', 'too', 'under', 'latter', 'ca', 'hereupon', '’re', 'him', 'first', 'least', 'else', 'whereby', 'but', 'one', 'put', 'hers', "'re", 'thereby', 'on', 'only', 'nine', 'already', 'whether', 'moreover', 'other

In [30]:
len(nlp.Defaults.stop_words)

326

In [31]:
nlp.vocab['myself'].is_stop

True

In [32]:
nlp.vocab['mystery'].is_stop

False

In [33]:
nlp.vocab['btw'].is_stop

False

In [34]:
nlp.Defaults.stop_words.add('btw')

In [35]:
nlp.vocab['btw'].is_stop = True

In [36]:
len(nlp.Defaults.stop_words)

327

In [37]:
nlp.vocab['btw'].is_stop

True

In [38]:
# FRESH MATCHING

In [39]:
import spacy

In [40]:
nlp=spacy.load('en_core_web_sm')

In [41]:
from spacy.matcher import Matcher

In [42]:
matcher = Matcher(nlp.vocab)

In [43]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'Solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

In [44]:
matcher.add('solarpower',[pattern1, pattern2, pattern3])

In [45]:
doc = nlp(u'The solar power industry continues to grow a solarpower increase. solar-power cars are gaining popularity.')

In [46]:
found_matcher = matcher(doc)
print(found_matcher)

[(5703546853475899243, 1, 3), (5703546853475899243, 8, 9)]


In [47]:
pattern1 = [{'LOWER':'machine'},{'LOWER':'intelligence'}]
pattern2 = [{'LOWER':'artificial'},{'LOWER':'intelligence'}]
pattern3 = [{'LOWER':'deep'},{'LOWER':'Learning'}]

In [48]:
matcher.add("DS Terms" ,[pattern1,pattern2,pattern3])

In [49]:
doc = nlp("I will learn machine-learning and artificial-intelligence.")
matches = matcher(doc)

In [50]:
matches

[]

# Visualizing part of speech

In [52]:
import spacy 

In [53]:
nlp = spacy.load("en_core_web_sm")



In [54]:
from spacy import displacy

In [55]:
doc = nlp(u"The quick browen fox jumped over the lazy dog's back.")
doc

The quick browen fox jumped over the lazy dog's back.

In [56]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110, 'compact': 'True', 'color': 'yellow', 'bg': 'red', 'font': 'Times'})

In [None]:
displacy.serve(doc, port=5001)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5001 ...



In [None]:
displacy.serve(doc, auto_select_port=True)

In [None]:
doc2 = nlp(u"This is a sentence. This is another, possibly longer sentence.")

## N-grams

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv