In [14]:
import spacy
import pandas as pd
from spacy import displacy
nlp = spacy.load('en_core_web_md')

In [2]:
doc = nlp("Alicia and me went to school by bus.")

for token in doc:
    print(token.text, token.pos_, spacy.explain(token.pos_), token.tag_, spacy.explain(token.tag_))

Alicia PROPN proper noun NNP noun, proper singular
and CCONJ coordinating conjunction CC conjunction, coordinating
me PRON pronoun PRP pronoun, personal
went VERB verb VBD verb, past tense
to ADP adposition IN conjunction, subordinating or preposition
school NOUN noun NN noun, singular or mass
by ADP adposition IN conjunction, subordinating or preposition
bus NOUN noun NN noun, singular or mass
. PUNCT punctuation . punctuation mark, sentence closer


In [3]:
doc2 = nlp("My friend will fly to New York fast and she is staying there for 3 days.")

for token in doc2:
    print(token.text, token.pos_, token.tag_, 
         spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
friend NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fly VERB VB verb verb, base form
to ADP IN adposition conjunction, subordinating or preposition
New PROPN NNP proper noun noun, proper singular
York PROPN NNP proper noun noun, proper singular
fast ADV RB adverb adverb
and CCONJ CC coordinating conjunction conjunction, coordinating
she PRON PRP pronoun pronoun, personal
is AUX VBZ auxiliary verb, 3rd person singular present
staying VERB VBG verb verb, gerund or present participle
there ADV RB adverb adverb
for ADP IN adposition conjunction, subordinating or preposition
3 NUM CD numeral cardinal number
days NOUN NNS noun noun, plural
. PUNCT . punctuation punctuation mark, sentence closer


In [4]:
## Lets use a word that can be both noun/verb (ship)

doc3 = nlp("I will ship my the items tommorow. The ship is leaving tommorow.")
for token in doc3:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))

I PRON PRP pronoun, personal
will AUX MD verb, modal auxiliary
ship VERB VB verb, base form
my PRON PRP$ pronoun, possessive
the DET DT determiner
items NOUN NNS noun, plural
tommorow PROPN NNP noun, proper singular
. PUNCT . punctuation mark, sentence closer
The DET DT determiner
ship NOUN NN noun, singular or mass
is AUX VBZ verb, 3rd person singular present
leaving VERB VBG verb, gerund or present participle
tommorow PROPN NNP noun, proper singular
. PUNCT . punctuation mark, sentence closer


In [5]:
doc4 = nlp("My cat will fish for a fish tommorow in a fishy way.")

for token in doc4:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
cat NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fish VERB VB verb verb, base form
for ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fish NOUN NN noun noun, singular or mass
tommorow NOUN NN noun noun, singular or mass
in ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fishy ADJ JJ adjective adjective (English), other noun-modifier (Chinese)
way NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


### Dealing with WSD (Word-Sense Disambiguation)

#### Filtering sentences with intent to book a flight

I flew to Rome 3 days ago. I still didn't get the bill, please send it ASAP. <br />
I have flown to Rome this morning and forgot my laptop on the
airplane. Can you please connect me to lost and found?<br />
I'm flying to Rome next week. Can you check flight availability? <br />
I need to fly to Rome. Can you check flights on next Tuesday? <br />
I will fly to Rome next week. Can you check the flights?

In [6]:
sent1 = "I flew to Rome"
sent2 = "I'm flying to Rome"
sent3 = "I will fly to Rome"
sent4 = "I have flown to Rome"

In [7]:
doc5 = nlp(sent1)
doc6 = nlp(sent2)
doc7 = nlp(sent3)
doc8 = nlp(sent4)

In [8]:
for doc in [doc5, doc6, doc7, doc8]:
    print([(w.text, w.lemma_) for w in doc if w.tag_ == 'VBG' or w.tag_ == 'VB'])
print("Therefore second and third sentence have shown interest in booking a flight")

[]
[('flying', 'fly')]
[('fly', 'fly')]
[]
Therefore second and third sentence have shown interest in booking a flight


### Understanding number, symbol, and punctuation tags

In [9]:
doc9 = nlp("He earned $5 million in 2020 and paid 35% in tax.")

for token in doc9:
    print(token.text, token.tag_, spacy.explain(token.tag_))

He PRP pronoun, personal
earned VBD verb, past tense
$ $ symbol, currency
5 CD cardinal number
million CD cardinal number
in IN conjunction, subordinating or preposition
2020 CD cardinal number
and CC conjunction, coordinating
paid VBD verb, past tense
35 CD cardinal number
% NN noun, singular or mass
in IN conjunction, subordinating or preposition
tax NN noun, singular or mass
. . punctuation mark, sentence closer


In [10]:
doc10 = nlp("red bottle")
for token in doc10:
    print(token.text, token.dep_)
# amod is a dependency label for an adjective-noun relation
# ROOT represents the root of the phrase

red amod
bottle ROOT


### Dependency tree

In [17]:
doc11 = nlp("I counted white sheep")
for token in doc11:
    print(token.text, token.pos_, token.dep_, token.head)
print(displacy.render(doc11, style = 'dep'))

I PRON nsubj counted
counted VERB ROOT counted
white ADJ amod sheep
sheep NOUN dobj counted


None


In [13]:
doc12 = nlp("We are trying to understand the difference.")

for token in doc12:
    print(token.text, token.tag_, token.pos_, token.dep_, token.head)

We PRP PRON nsubj trying
are VBP AUX aux trying
trying VBG VERB ROOT trying
to TO PART aux understand
understand VB VERB xcomp trying
the DT DET det difference
difference NN NOUN dobj understand
. . PUNCT punct trying


In [15]:
displacy.render(doc12, style = 'dep')

### Named Entity Recognition

In [18]:
doc13 = nlp("President Trump visited France")
doc13.ents

(Trump, France)

In [23]:
doc14 = nlp("I worked for Microsoft last year")
for token in doc14:
    print(token.text, token.ent_type_)

I 
worked 
for 
Microsoft ORG
last DATE
year DATE


In [27]:
doc15 = nlp("Albert Einstein was born in Ulm on 1879. He studied electronical engineering at ETH Zurich.")
for token in doc15:
    print(token.text, token.ent_type_, '[', spacy.explain(token.ent_type_), ']')

Albert PERSON [ People, including fictional ]
Einstein PERSON [ People, including fictional ]
was  [ None ]
born  [ None ]
in  [ None ]
Ulm GPE [ Countries, cities, states ]
on  [ None ]
1879 DATE [ Absolute or relative dates or periods ]
.  [ None ]
He  [ None ]
studied  [ None ]
electronical  [ None ]
engineering  [ None ]
at  [ None ]
ETH ORG [ Companies, agencies, institutions, etc. ]
Zurich ORG [ Companies, agencies, institutions, etc. ]
.  [ None ]


In [28]:
doc = nlp("Jean-Michel Basquiat was an American artist of Haitian and Puerto Rican descent who gained fame with"
          "his graffiti and street art work")
doc.ents

(Jean-Michel Basquiat, American, Haitian, Puerto Rican)

In [32]:
for ent in doc.ents:
    print(ent.text, ent.label_, '( %s )' %(spacy.explain(ent.label_)))

Jean-Michel Basquiat PERSON ( People, including fictional )
American NORP ( Nationalities or religious or political groups )
Haitian GPE ( Countries, cities, states )
Puerto Rican NORP ( Nationalities or religious or political groups )


### Real - world example

NER is a popular and frequently used pipeline component of spaCy. NER is
one of the key components of understanding the text topic, as named
entities usually belong to a semantic category. For instance, President
Trump invokes the politics subject in our minds, whereas Leonardo
DiCaprio is more about movies. If you want to go deeper into resolving the
text meaning and understanding who made what, you also need named
entities.
This real-world example includes processing a New York Times article. Let's
go ahead and download the article first by running the following code:

In [36]:
from bs4 import BeautifulSoup
import requests

In [45]:
def url_text(url_string):
    res = requests.get(url_string)
    
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    
    for script in soup(["script", "style", "aside"]):
        script.extract()
    text = soup.get_text()
    
    return " ".join(text.split())

In [51]:
ap_art = url_text("https://apnews.com/article/donald-trump-mar-a-lago-criminal-investigations-government-"
                  "and-politics-4ee554f467252929f96c70d42a93a05d?utm_source=homepage&utm_medium=TopNews&utm_campaign=position_1")
ap_article = open("article.txt", "w")
ap_article.writelines(ap_art)
ap_article.close()

In [53]:
doc = nlp(ap_art)
len(doc.ents)

118

In [54]:
from collections import Counter

labels = [ent.label_ for ent in doc.ents]

In [55]:
Counter(labels)

Counter({'ORG': 50,
         'NORP': 3,
         'GPE': 9,
         'PERSON': 19,
         'DATE': 21,
         'CARDINAL': 12,
         'ORDINAL': 2,
         'LAW': 1,
         'WORK_OF_ART': 1})

In [58]:
items = [ent.text for ent in doc.ents]
Counter(items).most_common(15)

[('Trump', 12),
 ('FBI', 10),
 ('Friday', 6),
 ('U.S.', 4),
 ('the Justice Department', 4),
 ('Bruce Reinhart', 3),
 ('15', 3),
 ('AP', 2),
 ("Donald Trump's", 2),
 ('Friday, Aug. 26, 2022', 2),
 ('the White House', 2),
 ('first', 2),
 ('Aug. 8', 2),
 ('Justice Department', 2),
 ('Congress', 2)]

In [67]:
for ent in doc[:100].ents:
    print(ent.text, ent.label_, '[%s]' %spacy.explain(ent.label_))

AP ORG [Companies, agencies, institutions, etc.]
NewsAP NEWS ORG [Companies, agencies, institutions, etc.]
NewsWorld NewsAfricaAsia ORG [Companies, agencies, institutions, etc.]
AmericaMiddle EastPoliticsPresident ORG [Companies, agencies, institutions, etc.]
CourtSportsMLBWNBA NORP [Nationalities or religious or political groups]
environmentOdditiesPhotographyTravelAP Fact CheckLifestyleReligionPress ORG [Companies, agencies, institutions, etc.]
copyhttps://apnews.com/article/donald-trump-mar-a-lago-criminal-investigations-government-and-politics-4ee554f467252929f96c70d42a93a05dClick GPE [Countries, cities, states]
NewsDonald GPE [Countries, cities, states]
ERIC TUCKER PERSON [People, including fictional]
MICHAEL PERSON [People, including fictional]
26, 2022 GMT1 DATE [Absolute or relative dates or periods]
FBI ORG [Companies, agencies, institutions, etc.]
Donald Trump's PERSON [People, including fictional]


In [64]:
part_article = doc[:200]

In [65]:
displacy.render(part_article, style = 'ent')