In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
doc1 = nlp("Tokyo, officially Tokyo Metropolis, is the capital city of Japan.")

for token in doc1:
    print(f'{token.text}|{token.pos_}|{token.tag_}|{spacy.explain(token.tag_)}')
print()
print('Entities')
for entity in doc1.ents:
    print(f'{entity}|{entity.label_}|{spacy.explain(entity.label_)}')

Tokyo|PROPN|NNP|noun, proper singular
,|PUNCT|,|punctuation mark, comma
officially|ADV|RB|adverb
Tokyo|PROPN|NNP|noun, proper singular
Metropolis|PROPN|NNP|noun, proper singular
,|PUNCT|,|punctuation mark, comma
is|AUX|VBZ|verb, 3rd person singular present
the|DET|DT|determiner
capital|NOUN|NN|noun, singular or mass
city|NOUN|NN|noun, singular or mass
of|ADP|IN|conjunction, subordinating or preposition
Japan|PROPN|NNP|noun, proper singular
.|PUNCT|.|punctuation mark, sentence closer

Entities
Tokyo|GPE|Countries, cities, states
Tokyo|GPE|Countries, cities, states
Metropolis|PERSON|People, including fictional
Japan|GPE|Countries, cities, states


In [3]:
doc2 = nlp("The twelve-year-old cat chased the mouse across the back yard.")
for chunk in doc2.noun_chunks:
    print(chunk)

The twelve-year-old cat
the mouse
the back yard


In [4]:
from spacy import displacy
doc3 = nlp("Apple Inc. is planning to buy a UK startup for $1 billion.")
displacy.render(doc3,style="ent",jupyter=True)

In [5]:
doc4 = nlp("I am running. He runs. We will be running a marathon.")
new_sentence = ""
length = len(doc4)
for i,token in enumerate(doc4):
    if(i != length - 1):
        if(doc4[i+1].text == "."):
            new_sentence += token.lemma_
        else:
            new_sentence += token.lemma_ + " "
new_sentence += doc4[length - 1].lemma_
new_doc4 = nlp(new_sentence)
print(new_doc4)

I be run. he run. we will be run a marathon.


In [6]:
def mark_changes_between_docs(old_sentence,new_sentence):
    changed_tokens = []
    for i,token in enumerate(old_sentence):
        if old_sentence[i].text == new_sentence[i].text:
            continue
        else:
            changed_tokens.append(new_sentence[i].text)
    print(f'Changed tokens: {changed_tokens}')

In [7]:
mark_changes_between_docs(doc4,new_doc4)

Changed tokens: ['be', 'run', 'he', 'run', 'we', 'run']


In [8]:
doc5 = nlp("It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.")
count_stop_words = 0
new_sentence = ""
for token in doc5:
    if nlp.vocab[token.text].is_stop:
        count_stop_words += 1
    else:
        new_sentence += token.text + " "
print(f'Number of stop words: {count_stop_words}')
print(f'Text without stop words: {new_sentence}')

Number of stop words: 13
Text without stop words: truth universally acknowledged , single man possession good fortune , want wife . 


In [9]:
stop_word = "brb"
nlp.Defaults.stop_words.add(stop_word)
nlp.vocab[stop_word].is_stop = True
doc6 = nlp("I will brb")
for token in doc6:
    if nlp.vocab[token.text].is_stop:
        print(f'nlp is recognizing "{token.text}" as stop word')

nlp is recognizing "I" as stop word
nlp is recognizing "will" as stop word
nlp is recognizing "brb" as stop word


In [10]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern_1 = [{"LEMMA": "pat"},{"IS_PUNCT":True, "OP": "*"},{"LEMMA":"cat"}]
pattern_2 = [{"LOWER": "pat_cat"}]
matcher.add("Cat",[pattern_1,pattern_2])

In [11]:
doc7 = nlp("My grandmother has a pet cat.")
doc8 = nlp("Many people enjoy having pets. They calling them pat-cat or pat_cat")
doc9 = nlp("Do you think a cat makes a good pet? Pat cats are the best!")

matches_doc7 = matcher(doc7)
matches_doc8 = matcher(doc8)
matches_doc9 = matcher(doc9)

def print_matches(matches,doc):
    for match_id, start, end in matches:
        print(doc[start:end])

print_matches(matches_doc7,doc7)
print()
print_matches(matches_doc8,doc8)
print()
print_matches(matches_doc9,doc9)


pat-cat
pat_cat

Pat cats


In [12]:
#class exercise

In [13]:
doc1 = nlp("Barack Obama was born on August 4, 1961, in Honolulu, Hawaii")
for ent in doc1.ents:
    print(f'{ent}: {spacy.explain(ent.label_)} Hash: {ent.label}')

Barack Obama: People, including fictional Hash: 380
August 4, 1961: Absolute or relative dates or periods Hash: 391
Honolulu: Countries, cities, states Hash: 384
Hawaii: Countries, cities, states Hash: 384


In [14]:
text = "Maccabi Tel Aviv played against Hapoel Tel Aviv in the finals of the National Basketball League"

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
pharse_list = ["Maccabi Tel Aviv","Hapoel Tel Aviv"]
pharses = [nlp(text) for text in pharse_list]
matcher.add("sport_teams",pharses)
doc2 = nlp(text)
matches = matcher(doc2)

doc2 = nlp(text)
doc2.ents = [ent for ent in doc2.ents if ent.text not in ['Tel Aviv','Hapoel Tel Aviv']]

from spacy.tokens import Span
hash_org = nlp.vocab.strings['ORG']
new_ent = [Span(doc2,match[1],match[2],label=hash_org) for match in matches]
doc2.ents = list(doc2.ents) + new_ent

for ent in doc2.ents:
    print(ent)

Maccabi Tel Aviv
Hapoel Tel Aviv
the National Basketball League


In [15]:
text = """Google announced new features for its product line including the Pixel 5 and Google"
Home. Meanwhile, Apple released the iPhone 13 and MacBook Pro in a recent event. Microsoft
introduced updates to the Surface Pro and Windows 11. Amazon continues to expand its
services, and Facebook is planning to rebrand its products to Meta. Samsung unveiled the
"Galaxy S21 and Galaxy Buds Pro."""
doc3 = nlp(text)
org_count = 0
prod_count = 0
for ent in doc3.ents:
    if ent.label_ == "ORG":
        org_count += 1

    elif ent.label_ == "PRODUCT":
        prod_count += 1
    print(f'{ent}: Start: {ent.start}, End: {ent.end}')
print(org_count,prod_count)

Google: Start: 0, End: 1
Google: Start: 13, End: 14
Apple: Start: 20, End: 21
MacBook Pro: Start: 26, End: 28
Microsoft: Start: 33, End: 34
the Surface Pro: Start: 38, End: 41
Amazon: Start: 45, End: 46
Meta: Start: 62, End: 63
Samsung: Start: 64, End: 65
Galaxy S21: Start: 69, End: 71
Galaxy Buds Pro: Start: 72, End: 75
8 0


In [16]:
from spacy import displacy
displacy.render(doc3, style="ent", jupyter=True,options={"ents":["ORG"]})

In [19]:
text = """User1: hello User2: Hey how are you? User1: I'm good how are you? User2: I'm good as
well, thanks"."""
doc5 = nlp(text)
# nlp.remove_pipe("custom_sentence_boundries")
from spacy.language import Language
@Language.component("custom_sentence_boundries")
def custom_sentaece_boundries(doc):
    for token in doc[:-1]:
        if token == ":":
            doc[token.i + 1].is_sent_start = True
    return doc


nlp.add_pipe("custom_sentence_boundries", before="parser")

<function __main__.custom_sentaece_boundries(doc)>

In [20]:
doc4 = nlp(text)
for sent in doc4.sents:
    print(sent)

User1: hello User2:
Hey how are you?
User1:
I'm good how are you?
User2:
I'm good as
well, thanks".
