# Chapter 2 Examples

#### Example 2-0

In [3]:
# Example 2-0. Prepare the environment
# Run the command below once to download spaCy's English model
# !python -m spacy download en_core_web_sm

## Corpora, Tokens, and Types

#### Example 2-1

In [4]:
# Example 2-1. Tokenizing text
import spacy

# Load spaCy's English model and tokenize the lowercased sentence,
# printing each token as a plain string.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don’t slap the green witch"
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [5]:
# Example 2-1 (continued). Tokenizing a tweet with NLTK's TweetTokenizer
from nltk.tokenize import TweetTokenizer

# Two-line tweet; the second line keeps its four leading spaces.
tweet = (
    "Snow White and the Seven Degrees\n"
    "    #MakeAMovieCold@midnight:-)"
)
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


## Unigrams, Bigrams, Trigrams, …, N-grams

#### Example 2-2

In [6]:
# Example 2-2. Generating n-grams from text
def n_grams(text, n):
    '''
    Return every contiguous n-gram of ``text``, in order of appearance.

    Args:
        text: any sequence that supports ``len()`` and slicing, e.g. a list
            of tokens or a plain string.
        n: number of items per n-gram; must be at least 1.

    Returns:
        A list of slices ``text[i:i+n]``, one per valid start index. Each
        slice has the same type as ``text`` (lists for a token list,
        substrings for a string). The list is empty when ``n`` exceeds
        ``len(text)``.

    Raises:
        ValueError: if ``n`` is less than 1.
    '''
    # Without this guard, n == 0 yields len(text) + 1 empty slices and a
    # negative n yields arbitrary slices, neither of which is an n-gram.
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Sample token list; print all of its trigrams.
cleaned = ["mary", ",", "n't", "slap", "green", "witch", "."]
print(n_grams(text=cleaned, n=3))

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


## Lemmas and Stems

#### Example 2-3

In [7]:
# Example 2-3. Lemmatization: reducing words to their root forms
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("he was running late")

# Print each token alongside its lemma.
for token in doc:
    print("{} --> {}".format(token, token.lemma_))

he --> he
was --> be
running --> run
late --> late


## Categorizing Words: POS Tagging

#### Example 2-4

In [8]:
# Example 2-4. Part-of-speech tagging
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Mary slapped the green witch.")

# Print each token alongside its part-of-speech tag.
for token in doc:
    print("{} - {}".format(token, token.pos_))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


## Categorizing Spans: Chunking and Named Entity Recognition

#### Example 2-5

In [9]:
# Example 2-5. Noun Phrase (NP) chunking
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Mary slapped the green witch.")

# Print each noun chunk alongside its span label.
for chunk in doc.noun_chunks:
    print("{} - {}".format(chunk, chunk.label_))

Mary - NP
the green witch - NP
