# NLP Demystified | Preprocessing: Tokenization

In [60]:
import spacy

In [61]:
# Fetch spaCy's small English pipeline so that spacy.load('en_core_web_sm') works.
# NOTE(review): `!python` may not be the interpreter backing this kernel — confirm,
# or invoke the kernel's own interpreter (sys.executable) instead.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'c:\Users\chiou\OneDrive\Bureau\Projects_CDL\NLP demystified\NLPdymenv\Scripts\python.exe -m pip install --upgrade pip' command.


In [62]:
# Load the downloaded English pipeline; `nlp` is then called on raw strings below.
nlp = spacy.load('en_core_web_sm')

In [63]:
# Check what kind of object spacy.load returned (the English language class).
type(nlp)

spacy.lang.en.English

#### Tokenizing a simple string 

In [64]:
# Calling the pipeline on a string returns a Doc that holds the tokenized text.
s = "He didn't want to pay $20 for this book."
doc = nlp(s)

In [65]:
# Confirm that calling nlp on a string produced a Doc object.
type(doc)

spacy.tokens.doc.Doc

In [66]:
# Iterating a Doc yields its tokens; show the raw text of each one.
print([token.text for token in doc])

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [67]:
# A Doc supports integer indexing; position 0 is the first token.
print(doc[0])


He


In [68]:
print(type(doc[0]))  # indexing a Doc with an integer yields a single Token object

<class 'spacy.tokens.token.Token'>


In [69]:
print(type(doc[1:4]))  # slicing a Doc yields a Span object, not a list of tokens

<class 'spacy.tokens.span.Span'>


In [70]:
# Pair each token's text with its position in the Doc (the `i` attribute).
print([(tok.text, tok.i) for tok in doc])

[('He', 0), ('did', 1), ("n't", 2), ('want', 3), ('to', 4), ('pay', 5), ('$', 6), ('20', 7), ('for', 8), ('this', 9), ('book', 10), ('.', 11)]


In [71]:
# spaCy's tokenization is non-destructive: the original input string is always
# retrievable from the Doc after tokenization.
print(doc.text)

He didn't want to pay $20 for this book.


In [72]:
# Tokenization of a full text 

s = """Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next. First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs."""

# Process the multi-sentence passage, then print the sentences spaCy detected
# (exposed through `doc.sents`).
doc = nlp(s)

print(list(doc.sents))

[Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next., First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs.]


### Exercises

In [73]:
#
# EXERCISE:
# 1) Tokenize the following text
# 2) Iterate through the tokens to check whether there's a currency symbol.
# 3) If there is, and the currency label is followed by a number, print
#    both the symbol and the number.
# 
# Look through https://spacy.io/api/token#attributes on how to check whether
# a token is a currency symbol or a number.
#
# Expected output: "$20".
s = "He didn't want to pay $20 for this book."
doc = nlp(s)

# Anchor on the currency symbol (exercise step 2), then inspect the token that
# immediately follows it (step 3). The earlier version anchored on the number
# and printed whatever token preceded it without checking that it was a
# currency symbol, and raised NameError when the first token was a number.
for token in doc:
    # A currency symbol in last position has no following token to inspect.
    if token.is_currency and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.like_num:
            # Token.text is the verbatim token string, so this prints e.g. "$20".
            print(token.text + next_token.text)

$20


In [74]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
# For comparison with spaCy: NLTK's Treebank tokenizer returns a plain list of
# strings. On this sentence it keeps the abbreviation "N.Y.C." as one token and
# splits the contraction "Let's" into "Let" + "'s".
s = "Let's go to N.Y.C. for the weekend."
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(s)

['Let', "'s", 'go', 'to', 'N.Y.C.', 'for', 'the', 'weekend', '.']