In [1]:
import spacy

In [2]:
# Start from a blank English pipeline and run a sample sentence through it.
nlp = spacy.blank("en")
doc = nlp(
    "Captain america ate 100$ of samosa. Then he said I can do this all day."
)

# Show how the text was split: one token per line.
for token in doc:
    print(token.text)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
# The blank pipeline has no processing components, so this displays an empty list.
nlp.pipe_names

[]

In [4]:
# Rebind `nlp` to the pretrained English pipeline and list its component names.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Display the (name, component object) pairs that make up the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f1199ceff50>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f1199ceea50>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f11d45d4f20>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f1199684a10>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f1199686250>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f11d45d4ba0>)]

In [6]:
# Re-run the sample sentence, this time through the pretrained English pipeline.
doc = nlp(
    "Captain america ate 100$ of samosa. Then he said I can do this all day."
)

# One row per token: text, human-readable part-of-speech description, lemma.
for token in doc:
    pos_description = spacy.explain(token.pos_)
    print(token, pos_description, token.lemma_, sep="  |  ")

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


In [7]:
# Run named-entity recognition on a sentence containing a company and an amount.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# One row per detected entity: its text and its label.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla Inc  |  ORG
$45 billion  |  MONEY


In [8]:
# NOTE(review): this import would ideally live in the import cell at the top of the notebook.
from spacy import displacy

# Visualize the entities found in `doc` using displaCy's "ent" style.
displacy.render(doc, style="ent")

In [9]:
# Rebind `nlp` to the French pipeline; the following cells process French text with it.
nlp = spacy.load("fr_core_news_sm")

In [10]:
# The same acquisition sentence in French, processed by the French pipeline.
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

# One row per detected entity: text, label, and spaCy's description of that label.
for ent in doc.ents:
    label_description = spacy.explain(ent.label_)
    print(ent.text, ent.label_, label_description, sep="  |  ")

Tesla Inc  |  PER  |  Named person or family.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [11]:
# One row per token of the French sentence: text, coarse POS tag, lemma.
for token in doc:
    print(token, token.pos_, token.lemma_, sep="  |  ")

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
racheter  |  VERB  |  racheter
Twitter  |  VERB  |  twitter
pour  |  ADP  |  pour
$  |  NOUN  |  dollar
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


## Adding a custom component to the pipeline

In [14]:
# Load the pretrained English pipeline only as a source of components.
source_nlp= spacy.load("en_core_web_sm")
# Start from a blank English pipeline and copy just the `ner` component into it.
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)
# The resulting pipeline contains the single sourced component.
nlp.pipe_names

['ner']

In [16]:
# Check that the pipeline holding only the sourced `ner` component still finds entities.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# One row per detected entity: text, label, and spaCy's description of that label.
for ent in doc.ents:
    label_description = spacy.explain(ent.label_)
    print(ent.text, ent.label_, label_description, sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


## spaCy Language Processing Pipelines: Exercises

In [17]:
# `nlp` was last bound to a blank pipeline holding only `ner`; rebind it for the exercises.
nlp = spacy.load("en_core_web_sm")  # load the pretrained English pipeline used by the exercises below

### Exercise: 1
- Collect all the proper nouns from a given text in a list and also count how many there are.
- **Proper Noun** means a noun that names a particular person, place, or thing.

In [28]:
text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and 
visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.
They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!
'''

# https://spacy.io/usage/linguistic-features

# Entity labels whose tokens are accepted even when the tagger did not mark them PROPN.
NAME_ENTITY_LABELS = {"PERSON", "GPE"}
# Manual overrides for names the model mis-tags in this text: spaCy tags "Ravi"
# as NOUN, so it is listed here explicitly instead of being special-cased
# inside the loop. Add further names here if the model misses them.
MANUAL_PROPER_NOUNS = {"Ravi"}

# Run the pipeline, then keep every token that is a proper noun by POS tag,
# by entity type, or by manual override. Each token is added at most once.
doc = nlp(text)
proper_nouns = [
    token.text
    for token in doc
    if token.pos_ == "PROPN"
    or token.ent_type_ in NAME_ENTITY_LABELS
    or token.text in MANUAL_PROPER_NOUNS
]

print("All Proper Nouns:", proper_nouns)
print("Count of proper nouns:", len(proper_nouns))

All Proper Nouns: ['Ravi', 'Raju', 'Paris', 'London', 'Dubai', 'Rome', 'Mohan', 'Hyderabad']
Count of proper nouns: 8


### Exercise: 2
- Get all company names from a given text and also count them.
- **Hint:** Use the spacy **ner** functionality

In [29]:
text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in 
India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''


# Run the pipeline and keep only the entities that the NER component labels as ORG.
doc = nlp(text)
company_names = [
    entity.text
    for entity in doc.ents
    if entity.label_ == "ORG"
]

# Report the extracted names followed by how many were found.
company_count = len(company_names)
print("Company Names: ", company_names)
print("Count: ", company_count)

Company Names:  ['Tesla', 'Walmart', 'Amazon', 'Microsoft', 'Google', 'Infosys', 'Reliance', 'HDFC Bank', 'Hindustan Unilever', 'Bharti Airtel']
Count:  10
