In [1]:
import spacy
from spacy import displacy

In [2]:
# Load the 'en_core_web_md' pipeline once; every later cell reuses this `nlp` object.
nlp = spacy.load('en_core_web_md')

In [3]:
# Short example sentence; the resulting Doc is visualised in the next cell.
doc = nlp('I own a ginger cat')

In [4]:
# Draw `doc` with displaCy's dependency ('dep') visualiser.
displacy.render(doc, style='dep')

In [5]:
# Parse a sentence naming a person and a company, then highlight whatever
# entities the pipeline found using displaCy's 'ent' style.
doc2 = nlp('Bill Gates is the CEO of Microsoft')
displacy.render(doc2, style='ent')

## Tokenization

In [6]:
# Tokenise a plain sentence and collect the surface text of every token.
doc3 = nlp("I went to the market")
token1 = [tok.text for tok in doc3]
token1

['I', 'went', 'to', 'the', 'market']

In [7]:
# Same token-text extraction, applied to the entity example parsed earlier (doc2).
token2 = [tok.text for tok in doc2]
token2

['Bill', 'Gates', 'is', 'the', 'CEO', 'of', 'Microsoft']

In [8]:
# Sample news article kept as one multi-line string.
#source: https://www.reuters.com/business/futures-rise-after-biden-xi-call-oil-bounce-2021-09-10/
# NOTE(review): the literal is pasted page text, so it still contains non-article
# lines such as "Report ad", "Sponsored Video" and "read more".
# NOTE(review): none of the cells shown reads this value before `text` is rebound
# to a short sentence in the special-case tokenizer section; confirm it is still needed.
text = '''
Sept 10 (Reuters) - Wall Street's main indexes were subdued on Friday as signs of higher inflation and a drop in Apple shares following an unfavorable court ruling offset expectations of an easing in U.S.-China tensions.

Data earlier in the day showed U.S. producer prices rose solidly in August, leading to the biggest annual gain in nearly 11 years and indicating that high inflation was likely to persist as the pandemic pressures supply chains. read more .

"Today's data on wholesale prices should be eye-opening for the Federal Reserve, as inflation pressures still don't appear to be easing and will likely continue to be felt by the consumer in the coming months," said Charlie Ripley, senior investment strategist for Allianz Investment Management.

Apple Inc (AAPL.O) fell 2.7% following a U.S. court ruling in "Fortnite" creator Epic Games' antitrust lawsuit that stroke down some of the iPhone maker's restrictions on how developers can collect payments in apps.


Sponsored by Advertising Partner
Sponsored Video
Watch to learn more
Report ad
Apple shares were set for their worst single-day fall since May this year, weighing on the Nasdaq (.IXIC) and the S&P 500 technology sub-index (.SPLRCT), which fell 0.1%.

Sentiment also took a hit from Cleveland Federal Reserve Bank President Loretta Mester's comments that she would still like the central bank to begin tapering asset purchases this year despite the weak August jobs report. read more

Investors have paid keen attention to the labor market and data hinting towards higher inflation recently for hints on a timeline for the Federal Reserve to begin tapering its massive bond-buying program.

The S&P 500 has risen around 19% so far this year on support from dovish central bank policies and re-opening optimism, but concerns over rising coronavirus infections and accelerating inflation have lately stalled its advance.


Report ad
The three main U.S. indexes got some support on Friday from news of a phone call between U.S. President Joe Biden and Chinese leader Xi Jinping that was taken as a positive sign which could bring a thaw in ties between the world's two most important trading partners.

At 1:01 p.m. ET, the Dow Jones Industrial Average (.DJI) was up 12.24 points, or 0.04%, at 34,891.62, the S&P 500 (.SPX) was up 2.83 points, or 0.06%, at 4,496.11, and the Nasdaq Composite (.IXIC) was up 12.85 points, or 0.08%, at 15,261.11.

Six of the eleven S&P 500 sub-indexes gained, with energy (.SPNY), materials (.SPLRCM) and consumer discretionary stocks (.SPLRCD) rising the most.

U.S.-listed Chinese e-commerce companies Alibaba and JD.com , music streaming company Tencent Music (TME.N) and electric car maker Nio Inc (NIO.N) all gained between 0.7% and 1.4%


Report ad
Grocer Kroger Co (KR.N) dropped 7.1% after it said global supply chain disruptions, freight costs, discounts and wastage would hit its profit margins.

Advancing issues outnumbered decliners by a 1.12-to-1 ratio on the NYSE and by a 1.02-to-1 ratio on the Nasdaq.

The S&P index recorded 14 new 52-week highs and three new lows, while the Nasdaq recorded 49 new highs and 38 new lows.
'''

### How spaCy tokenization differs from Python's `str.split()`

In [9]:
# Sentence ending in repeated punctuation, used to compare str.split() with spaCy's tokenizer.
msg = "It has been an amazing week by the grace of God!!!"

In [10]:
# Whitespace-only splitting: the trailing "!!!" stays glued to "God".
msg.split()

['It',
 'has',
 'been',
 'an',
 'amazing',
 'week',
 'by',
 'the',
 'grace',
 'of',
 'God!!!']

In [11]:
# spaCy tokenisation of the same sentence: each "!" comes out as its own token.
doc4 = nlp(msg)
[tok.text for tok in doc4]

['It',
 'has',
 'been',
 'an',
 'amazing',
 'week',
 'by',
 'the',
 'grace',
 'of',
 'God',
 '!',
 '!',
 '!']

In [12]:
# Count the pieces produced by each approach, then report them side by side.
split_count = len(msg.split())
spacy_count = len(doc4)
print("Split length: %d\nspaCy token length: %d" % (split_count, spacy_count))

Split length: 11
spaCy token length: 14


In [13]:
# The contraction "Let's" is split into two tokens: "Let" and "'s".
doc5 = nlp("Let's go to the market")
[tok.text for tok in doc5]

['Let', "'s", 'go', 'to', 'the', 'market']

### Adding special-case tokenization rules

In [14]:
from spacy.symbols import ORTH

In [15]:
# Informal contraction used to demonstrate a custom tokenizer rule.
# Note: this rebinds `text`, replacing the news article assigned earlier.
text = "lemme get that hat"

In [16]:
# Tokenise before any special case is registered, to capture the default behaviour.
doc6 = nlp(text)

In [17]:
# Default tokenisation keeps "lemme" as a single token.
print([tok.text for tok in doc6])

['lemme', 'get', 'that', 'hat']


In [18]:
# Register a tokenizer special case: the string "lemme" is split into the two
# tokens "lem" + "me". This mutates the shared `nlp` tokenizer, so it affects
# later calls to nlp() as well.
special_case = [
    {ORTH: "lem"},
    {ORTH: "me"},
]
nlp.tokenizer.add_special_case("lemme", special_case)

In [19]:
# Re-run the pipeline on the same text to show the effect of the new rule.
print("After tokenizer rule adjustment:")
retokenized = nlp(text)
print([tok.text for tok in retokenized])

After tokenizer rule adjustment:
['lem', 'me', 'get', 'that', 'hat']


### Debugging the tokenizer

In [20]:
# tokenizer.explain() reports, for each token, which tokenizer rule produced it.
token_exp = nlp.tokenizer.explain("Let's go now!")

In [21]:
# Each entry pairs a rule label with the token text it produced;
# print the text first, then the rule.
for rule, piece in token_exp:
    print(piece, rule)

Let SPECIAL-1
's SPECIAL-2
go TOKEN
now TOKEN
! SUFFIX


### spaCy sentence segmentation

In [22]:
# Two-sentence input for the sentence segmenter.
statement = "I took a bus to downtown. Then took a Toyo back home."
doc7 = nlp(statement)

# Doc.sents yields one item per detected sentence.
for sent in doc7.sents:
    print(sent)

I took a bus to downtown.
Then took a Toyo back home.


In [24]:
# Doc.sents is a generator, so printing it directly only shows the generator object...
print(doc7.sents)
# ...while wrapping it in list() materialises the sentences.
print(list(doc7.sents))

<generator object at 0x00000232998620E0>
[I took a bus to downtown., Then took a Toyo back home.]


### Understanding Lemmatization

A lemma is the base form of a token, i.e. the form under which the word
is listed in a dictionary. For instance, *eating*, *eats* and *ate* all
share the lemma *eat*.

In [25]:
# One headline sentence to lemmatize.
news = "Apple shares were set for their worst single-day fall since May this year"

doc8 = nlp(news)

# Show each token next to its lemma (base form).
for token in doc8:
    print(token.text, '-->', token.lemma_)

Apple --> Apple
shares --> share
were --> be
set --> set
for --> for
their --> their
worst --> bad
single --> single
- --> -
day --> day
fall --> fall
since --> since
May --> May
this --> this
year --> year


In [70]:
## Lemmatization in special case
from spacy.symbols import ORTH, LEMMA

# Override lemmas through the pipeline's attribute_ruler: a token whose text is
# 'Chuga' / 'Bongo' is assigned the lemma given in the second argument.
# NOTE(review): re-running this cell presumably registers the same patterns
# again on the shared `nlp` object.
ruler = nlp.get_pipe("attribute_ruler")
ruler.add([[{"TEXT": 'Chuga'}]], {"LEMMA": 'Arusha'})
ruler.add([[{"TEXT": 'Bongo'}]], {"LEMMA": 'Dar Es Salaam'})

# Note: `text` and `doc8` are rebound here, replacing their earlier values.
text = "I am going to visit my father in Chuga. Then my mother at Bongo"
doc8 = nlp(text)

for token in doc8:
    print(token.text, ' ====> ', token.lemma_)

I  ====>  I
am  ====>  be
going  ====>  go
to  ====>  to
visit  ====>  visit
my  ====>  my
father  ====>  father
in  ====>  in
Chuga  ====>  Arusha
.  ====>  .
Then  ====>  then
my  ====>  my
mother  ====>  mother
at  ====>  at
Bongo  ====>  Dar Es Salaam


In [86]:
## Dealing with named entities
# Doc.ents holds the entities the pipeline recognised in the sentence.
# In the recorded output 'Asha' is not tagged as an entity.
doc9 = nlp("I flew to New York with Jordan and Asha")
list(doc9.ents)

[New York, Jordan]

In [88]:
## Extracting noun chunks: these are noun phrases rather than single nouns
## (the output includes the pronoun "I" and the two-word "New York")
list(doc9.noun_chunks)

[I, New York, Jordan, Asha]

## Token

In [92]:
# Indexing a Doc returns a single token; .text is its string form.
doc9[3].text

'New'

In [93]:
doc9[3].text_with_ws # token text plus the whitespace that follows it ('New ' here)

'New '

In [95]:
doc9[3].i # index of the token within its Doc

3

In [96]:
doc9[3].idx # character offset of the token in the Doc's text ('New' starts at character 10)

10

In [98]:
# Keep a handle on the token so the next cells can inspect more of its attributes.
token = doc9[3]
token.doc # the Doc this token belongs to

I flew to New York with Jordan and Asha

In [100]:
# Checking if a token starts a sentence ('New' is mid-sentence, hence False)
token.is_sent_start

False

In [101]:
# Lemma of the token; for 'New' it is the same as the token text.
token.lemma_

'New'

In [102]:
# Doc.ents is displayed here directly, as a tuple of entities.
doc10 = nlp("President Uhuru visited Tanzania last year")
doc10.ents

(Uhuru, Tanzania, last year)

In [105]:
# Entity label of the second token ('Uhuru').
# NOTE(review): the recorded label is NORP, which looks like a model
# misclassification of a person's name; confirm before relying on it.
doc10[1].ent_type_

'NORP'

In [106]:
# Per-token entity labels; tokens outside any entity print an empty label.
for token in doc10:
    print(token, token.ent_type_)

President 
Uhuru NORP
visited 
Tanzania GPE
last DATE
year DATE


In [108]:
# shape_ abstracts the orthography: uppercase letters become X, lowercase become x.
# Runs of the same character class appear capped at four (e.g. 'President' -> 'Xxxxx').
for token in doc10:
    print(token, token.shape_)

President Xxxxx
Uhuru Xxxxx
visited xxxx
Tanzania Xxxxx
last xxxx
year xxxx


In [110]:
# Tracking stop words (very common words that carry little meaning on their own)
doc11 = nlp("I just wanted to say thank you!")
# is_stop is True for tokens treated as stop words.
for token in doc11:
    print(token, '---->', token.is_stop)

I ----> True
just ----> True
wanted ----> False
to ----> True
say ----> True
thank ----> False
you ----> True
! ----> False
