In [None]:
# spaCy ships with part-of-speech tagging and dependency parsing.
import spacy

nlp = spacy.load("en_core_web_sm")

# Run the pipeline on one sentence, then list every token together with
# its coarse part-of-speech tag and its dependency label.
text = "Company Y is planning to acquire stake in X company for $23 billion"
doc = nlp(text)
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Company NOUN compound
Y PROPN nsubj
is AUX aux
planning VERB ROOT
to PART aux
acquire VERB xcomp
stake NOUN dobj
in ADP prep
X PROPN compound
company NOUN pobj
for ADP prep
$ SYM quantmod
23 NUM compound
billion NUM pobj


In [None]:
# Which components make up the loaded spaCy pipeline?
# Components you do not need can be switched off for performance,
# e.g. nlp.disable_pipes('tagger', 'parser')
import spacy

nlp = spacy.load("en_core_web_sm")

# Process a sample sentence into a Doc
sample_sentence = "He went to play cricket with friends in the stadium"
doc = nlp(sample_sentence)

# Last expression of the cell: the names of the pipeline components
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Tokenization: print the text of each token on its own line.
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)
for tok in doc:
    print(tok.text)

Reliance
is
looking
at
buying
U.K.
based
analytics
startup
for
$
7
billion


Part-of-Speech tags:
Noun
Pronoun
Adjective
Verb
Adverb
Preposition
Conjunction
Interjection

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Process the sentence into a Doc
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)

# For each token show: the token, its fine-grained tag, its coarse
# part-of-speech tag, and spaCy's description of the fine-grained tag.
for tok in doc:
    fine_tag = tok.tag_
    print(tok, fine_tag, tok.pos_, spacy.explain(fine_tag))

Reliance NN NOUN noun, singular or mass
is VBZ AUX verb, 3rd person singular present
looking VBG VERB verb, gerund or present participle
at IN ADP conjunction, subordinating or preposition
buying VBG VERB verb, gerund or present participle
U.K. NNP PROPN noun, proper singular
based VBN VERB verb, past participle
analytics NNS NOUN noun, plural
startup NN NOUN noun, singular or mass
for IN ADP conjunction, subordinating or preposition
$ $ SYM symbol, currency
7 CD NUM cardinal number
billion CD NUM cardinal number


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Process the sentence into a Doc
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)

# Show each token next to its dependency label
for tok in doc:
    dep_label = tok.dep_
    print(tok.text, "-->", dep_label)

Reliance --> nsubj
is --> aux
looking --> ROOT
at --> prep
buying --> pcomp
U.K. --> nmod
based --> amod
analytics --> compound
startup --> dobj
for --> prep
$ --> quantmod
7 --> compound
billion --> pobj


In [None]:
# Look up spaCy's description of a few dependency labels; the cell displays
# the results as a tuple, in the same order as the labels.
tuple(spacy.explain(label) for label in ("nsubj", "ROOT", "aux", "advcl", "dobj"))

('nominal subject',
 None,
 'auxiliary',
 'adverbial clause modifier',
 'direct object')

In [None]:
import spacy
from spacy import displacy

# NOTE: `nlp` is not loaded here; this cell reuses the pipeline created by an earlier cell.
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)

# Draw the dependency parse inline in the notebook
displacy.render(doc, style="dep", jupyter=True)

In [None]:
# Lemmatization: map each token to its base form
import spacy

nlp = spacy.load("en_core_web_sm")

# Process the sentence into a Doc
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)

# Show each token next to its lemma
for tok in doc:
    lemma = tok.lemma_
    print(tok.text, "-->", lemma)

Reliance --> reliance
is --> be
looking --> look
at --> at
buying --> buy
U.K. --> U.K.
based --> base
analytics --> analytic
startup --> startup
for --> for
$ --> $
7 --> 7
billion --> billion


In [None]:
# Sentence boundary detection (SBD)
import spacy

nlp = spacy.load("en_core_web_sm")

# Process a text that contains several sentences into a Doc
text = "Reliance is looking at buying U.K. based analytics startup for $7 billion.This is India.India is great"
doc = nlp(text)

# Materialize the sentence spans into a list so they can be counted and reused
sentences = [sent for sent in doc.sents]
len(sentences)

3

In [None]:
# Print every detected sentence on its own line (`sentences` comes from the previous cell).
for sent in sentences:
    print(sent)

Reliance is looking at buying U.K. based analytics startup for $7 billion.
This is India.
India is great


In [None]:
# Named entity recognition (NER)
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Reliance is looking at buying U.K. based analytics startup for $7 billion"
doc = nlp(text)

# For every entity: its text, start/end character offsets, and label
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Reliance 0 8 ORG
U.K. 30 34 GPE
$7 billion 63 73 MONEY


In [None]:
# Named entity recognition on a longer, multi-paragraph passage
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

passage = """The Amazon rainforest,[a] alternatively, the Amazon Jungle, also known in English as Amazonia, is a moist broadleaf tropical rainforest in the Amazon biome that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 km2 (2,700,000 sq mi), of which 5,500,000 km2 (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations.

The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Bolivia, Ecuador, French Guiana, Guyana, Suriname, and Venezuela. Four nations have "Amazonas" as the name of one of their first-level administrative regions and France uses the name "Guiana Amazonian Park" for its rainforest protected area. The Amazon represents over half of the planet's remaining rainforests,[2] and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.[3]

Etymology
The name Amazon is said to arise from a war Francisco de Orellana fought with the Tapuyas and other tribes. The women of the tribe fought alongside the men, as was their custom.[4] Orellana derived the name Amazonas from the Amazons of Greek mythology, described by Herodotus and Diodorus.[4]

History
See also: History of South America § Amazon, and Amazon River § History
Tribal societies are well capable of escalation to all-out wars between tribes. Thus, in the Amazonas, there was perpetual animosity between the neighboring tribes of the Jivaro. Several tribes of the Jivaroan group, including the Shuar, practised headhunting for trophies and headshrinking.[5] The accounts of missionaries to the area in the borderlands between Brazil and Venezuela have recounted constant infighting in the Yanomami tribes. More than a third of the Yanomamo males, on average, died from warfare.[6]"""
doc = nlp(passage)

# One triple per entity: the entity span, its label string, and the label's integer ID
entities = [(ent, ent.label_, ent.label) for ent in doc.ents]
entities

[(Amazon, 'ORG', 383),
 (English, 'LANGUAGE', 389),
 (Amazonia, 'GPE', 384),
 (Amazon, 'ORG', 383),
 (Amazon, 'LOC', 385),
 (South America, 'LOC', 385),
 (7,000,000 km2, 'QUANTITY', 395),
 (2,700,000 sq mi, 'MONEY', 394),
 (5,500,000 km2, 'QUANTITY', 395),
 (2,100,000 sq mi, 'MONEY', 394),
 (nine, 'CARDINAL', 397),
 (Brazil, 'GPE', 384),
 (60%, 'PERCENT', 393),
 (Peru, 'GPE', 384),
 (13%, 'PERCENT', 393),
 (Colombia, 'GPE', 384),
 (10%, 'PERCENT', 393),
 (Bolivia, 'GPE', 384),
 (Ecuador, 'GPE', 384),
 (French Guiana, 'PERSON', 380),
 (Guyana, 'PERSON', 380),
 (Venezuela, 'GPE', 384),
 (Four, 'CARDINAL', 397),
 (Amazonas, 'PERSON', 380),
 (one, 'CARDINAL', 397),
 (first, 'ORDINAL', 396),
 (France, 'GPE', 384),
 (Guiana Amazonian Park, 'WORK_OF_ART', 388),
 (Amazon, 'ORG', 383),
 (over half, 'CARDINAL', 397),
 (an estimated 390 billion, 'QUANTITY', 395),
 (16,000, 'CARDINAL', 397),
 (Amazon, 'ORG', 383),
 (Francisco de Orellana, 'PRODUCT', 386),
 (Orellana, 'PERSON', 380),
 (Amazonas, 'G

In [None]:
# Highlight the recognized entities inline with displaCy
# (`doc` and `displacy` come from the previous cell).

displacy.render(doc, style="ent", jupyter=True)

In [None]:
 # Similarity measures: inspect the vector-related attributes of each token
import spacy

nlp = spacy.load("en_core_web_sm")
tokens = nlp("dog cat banana afskfsd")

# Columns printed per token: text, whether a vector is available, the norm of
# that vector, and whether the token is out-of-vocabulary.
# NOTE(review): the recorded output flags every token as out-of-vocabulary,
# ordinary words included. That matches spaCy's documentation that the small
# models ship without static word vectors -- confirm with a larger model
# before relying on these values.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 19.266302 True
cat True 19.220264 True
banana True 17.748499 True
afskfsd True 20.882006 True


In [None]:
import spacy

# Similarity scores are only meaningful with a model that ships real word
# vectors, so prefer the medium model. spacy.load raises OSError when a model
# package is not installed; in that case fall back to the small model so the
# cell still runs, but treat the resulting scores as unreliable.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    nlp = spacy.load("en_core_web_sm")

tokens = nlp("dog cat banana")

# Compare every token with every token, itself included
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.47130838
dog banana 0.3275862
cat dog 0.47130838
cat cat 1.0
cat banana 0.3547838
banana dog 0.3275862
banana cat 0.3547838
banana banana 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
