In [5]:
import spacy
# Load the English pipeline by its full package name. The 'en' shortcut link
# is deprecated and cannot be resolved by spaCy v3+, which makes this cell
# fail on a fresh environment.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default
# download target) -- confirm against the original environment.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Next week I'll   be in Madrid.")

# One tab-separated row per token: text, character offset (idx), lemma,
# is_punct flag, is_space flag, shape, pos_ and tag_.
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	-PRON-	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	VERB	MD
  	15	  	False	True	  	SPACE	_SP
be	17	be	False	False	xx	VERB	VB
in	20	in	False	False	xx	ADP	IN
Madrid	23	madrid	False	False	Xxxxx	PROPN	NNP
.	29	.	True	False	.	PUNCT	.


In [12]:
# Basic sentence detection: print every sentence found in the doc on its own line.
doc = nlp("Next week. I be in Madrid.")
print(*doc.sents, sep="\n")

Next week.
I be in Madrid.


In [18]:
# POS tagging: print each token's position in the doc, its text and its tag_.
for position, token in enumerate(doc):
    print("{} {} {}".format(position, token.text, token.tag_))

0 Next JJ
1 week NN
2 . .
3 I PRP
4 be VBP
5 in IN
6 Madrid NNP
7 . .


In [21]:
# Show the entity spans of the current doc, then the same entities as
# (text, label) pairs.
print(doc.ents)
labelled_entities = [(entity.text, entity.label_) for entity in doc.ents]
print(labelled_entities)

(Next week, Madrid)
[('Next week', 'DATE'), ('Madrid', 'GPE')]


In [22]:
# IOB tagging (Inside, Outside, Beginning):
# a token is tagged I when it is inside a chunk, O when it belongs to no chunk, and B when it starts a chunk.

In [29]:
# Run the pipeline on a longer sentence, list every entity with its label,
# then draw the displaCy entity view inline in the notebook.
doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the Wall Street Journal")
for entity in doc.ents:
    print("{} {}".format(entity.text, entity.label_))
spacy.displacy.render(doc, style='ent', jupyter=True)

2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
the Wall Street Journal ORG


In [30]:
# For every noun chunk print its text, its label and its root word
# (root.text gives the main word of the phrase).
for noun_chunk in doc.noun_chunks:
    print(" ".join((noun_chunk.text, noun_chunk.label_, noun_chunk.root.text)))

I NP I
2 shares NP shares
9 a.m. NP a.m.
the stock NP stock
just 2 days NP days
the Wall Street Journal NP Journal


In [36]:
# Dependency parsing: for each token print "token/tag <--dep-- head/head_tag".
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
arc_template = "{0}/{1} <--{2}-- {3}/{4}"
for token in doc:
    head = token.head
    print(arc_template.format(token.text, token.tag_, token.dep_, head.text, head.tag_))

# Visualize the relations between the tokens of the sentence inline.
spacy.displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [38]:
# Word vectors / embeddings: rebind nlp to the en_core_web_lg pipeline and
# print the vector stored for the vocabulary entry 'banana'.
nlp = spacy.load('en_core_web_lg')
banana_entry = nlp.vocab['banana']
print(banana_entry.vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [48]:
# Word similarity - based off word vectors.

# Look up the four vocabulary entries that are compared pairwise below.
x, y, a, b = (nlp.vocab[word] for word in ('ship', 'sea', 'dog', 'animal'))
print(x.similarity(y), x.similarity(b))
print(b.similarity(a), y.similarity(a),'\n')

# The same comparison can be used on sentences/texts as well.
target = nlp("Cats are beautiful animals.")
doc1, doc2, doc3 = (
    nlp(text)
    for text in (
        "Dogs are awesome.",
        "Some gorgeous creatures are felines.",
        "Dolphins are swimming mammals.",
    )
)

print('Sentence similarities:')
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.47626138 0.16165273
0.66185343 0.24735726 

Sentence similarities:
0.8901765218466683
0.9115828449161616
0.7822956752876101
