# **Enhancing NLP Pipelines With spaCy**

**Import Libraries**

In [1]:
import spacy
from spacy import displacy

**Load the spaCy Language Model**

In [2]:
# Name of the pretrained pipeline package to load; kept in one place so it is
# easy to swap for another installed model.
MODEL_NAME = "en_core_web_sm"

# Shared pipeline object reused by every cell below.
nlp = spacy.load(MODEL_NAME)

**Linguistic Annotations**

In [3]:
# Run the full pipeline on one sentence, then list each token with its
# coarse part-of-speech tag and its dependency relation.
sentence = "Company Y is planning to acquire stake in X company for $23 billion"
doc = nlp(sentence)
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.dep_}")

Company NOUN compound
Y PROPN nsubj
is AUX aux
planning VERB ROOT
to PART aux
acquire VERB xcomp
stake NOUN dobj
in ADP prep
X NOUN compound
company NOUN pobj
for ADP prep
$ SYM quantmod
23 NUM compound
billion NUM pobj


**Print Active Pipeline Components**

In [14]:
# `pipe_names` is read from the loaded `nlp` pipeline object itself, so no text
# has to be parsed first; as the cell's last expression it is shown as output.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

**Tokenization**

In [7]:
# Tokenize a sentence and print one token per line.
# `doc` is kept as the variable name because the POS-tagging cell below reads it.
headline = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(headline)
for tok in doc:
    print(tok.text)

Reliance
is
looking
at
buying
U.K.
based
analytics
startup
for
$
7
billion


**Print Each Token's Part-of-Speech (POS) Tags**

In [8]:
# Uses the `doc` produced by the tokenization cell above.
# Columns: token, fine-grained tag, coarse POS, human-readable tag description.
for tok in doc:
    fine_tag = tok.tag_
    print(tok, fine_tag, tok.pos_, spacy.explain(fine_tag))

Reliance NN NOUN noun, singular or mass
is VBZ AUX verb, 3rd person singular present
looking VBG VERB verb, gerund or present participle
at IN ADP conjunction, subordinating or preposition
buying VBG VERB verb, gerund or present participle
U.K. NNP PROPN noun, proper singular
based VBN VERB verb, past participle
analytics NNS NOUN noun, plural
startup VB VERB verb, base form
for IN ADP conjunction, subordinating or preposition
$ $ SYM symbol, currency
7 CD NUM cardinal number
billion CD NUM cardinal number


**Visualize the Dependency Parse**

In [9]:
# Parse a short sentence and draw its dependency tree inline in the notebook.
dep_sentence = "board member meet with senior manager"
doc = nlp(dep_sentence)
displacy.render(
    doc,
    style="dep",
    jupyter=True,
)

**Entity Detection**

In [10]:
# Sample passage for named-entity recognition. The line breaks and leading
# spaces inside the literal are part of the pipeline input, so they are kept
# exactly as written.
amazon_passage = """The Amazon rainforest,[a] alternatively,
the Amazon Jungle, also known in English as Amazonia,
is a moist broadleaf tropical rainforest in the Amazon
 biome that covers most of the Amazon basin of South America.
  This basin encompasses 7,000,000 km2 (2,700,000 sq mi), of
   which 5,500,000 km2 (2,100,000 sq mi) are covered by the rainforest.
   This region includes territory belonging to nine nations."""
doc = nlp(amazon_passage)

# One (entity span, label name, integer label ID) triple per detected entity.
entities = [(ent, ent.label_, ent.label) for ent in doc.ents]
entities

[(Amazon, 'ORG', 383),
 (Amazon, 'ORG', 383),
 (Jungle, 'PRODUCT', 386),
 (English, 'LANGUAGE', 389),
 (Amazonia, 'GPE', 384),
 (Amazon, 'ORG', 383),
 (Amazon, 'ORG', 383),
 (South America, 'LOC', 385),
 (7,000,000, 'CARDINAL', 397),
 (2,700,000 sq mi, 'QUANTITY', 395),
 (5,500,000, 'CARDINAL', 397),
 (2,100,000 sq mi, 'QUANTITY', 395),
 (nine, 'CARDINAL', 397)]

**Display Input Text With Detected Entities Highlighted**

In [11]:
# Render the passage parsed in the entity-detection cell above, with each
# detected entity highlighted and labelled inline.
displacy.render(
    doc,
    style="ent",
    jupyter=True,
)

**Similarity**

In [12]:
# Inspect vector-related attributes for three real words and one nonsense string.
# NOTE(review): with the small model loaded above, the recorded output reports
# is_oov=True for every token, including "dog". spaCy's docs state that `sm`
# packages ship without static word vectors; a model such as `en_core_web_md`
# is presumably needed for these columns to be informative. Confirm before
# relying on them.
tokens = nlp("dog cat banana afskfsd")

for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 6.814786 True
cat True 7.3709016 True
banana True 7.6460695 True
afskfsd True 7.192256 True


In [13]:
# Print the similarity score for every ordered pair of tokens (self-pairs included).
# NOTE(review): these scores come from the small model loaded above; spaCy's
# docs recommend a package with word vectors (e.g. `en_core_web_md`) for
# meaningful similarity. Treat the numbers as illustrative only.
tokens = nlp("dog cat banana")

for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

dog dog 1.0
dog cat 0.5957574844360352
dog banana 0.43743896484375
cat dog 0.5957574844360352
cat cat 1.0
cat banana 0.46431881189346313
banana dog 0.43743896484375
banana cat 0.46431881189346313
banana banana 1.0


  print(token1.text, token2.text, token1.similarity(token2))
