## 2.1 - Tokenization

#### Word tokenization

In [2]:
# Baseline tokenizer: str.split() with no arguments splits on whitespace.
s = "The quick brown fox jumped over the lazy dog"

s.split()

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

In [4]:
# Limitation of whitespace splitting: punctuation stays glued to the
# neighbouring word, so the comma ends up inside the token 'fox,'.
s = "The quick brown fox, and a lazy dog"

s.split()

['The', 'quick', 'brown', 'fox,', 'and', 'a', 'lazy', 'dog']

In [8]:
from nltk.tokenize import word_tokenize

# Same sentence as the previous cell, now run through NLTK's word tokenizer:
# the comma comes out as a separate token instead of being attached to 'fox'.
s = "The quick brown fox, and a lazy dog"

word_tokenize(s)

['The', 'quick', 'brown', 'fox', ',', 'and', 'a', 'lazy', 'dog']

In [18]:
# Abbreviation handling: 'Dr.' keeps its period, while the sentence-final
# period after 'patient' is split off as its own token.
s = "Dr. Smith is visiting the patient."

word_tokenize(s)

['Dr.', 'Smith', 'is', 'visiting', 'the', 'patient', '.']

In [19]:
# Edge case: an acronym at the very end of the text. The last period of
# 'U.S.A.' is treated as sentence-final punctuation and split off.
s = "USA vs. U.S.A."

word_tokenize(s)

['USA', 'vs.', 'U.S.A', '.']

In [20]:
# Same acronym, but no longer at the end of the text: 'U.S.A.' now stays
# intact and only the final period after 'words' is split off.
s = "USA vs. U.S.A. and more words."

word_tokenize(s)

['USA', 'vs.', 'U.S.A.', 'and', 'more', 'words', '.']

#### Sentence tokenization a.k.a. sentence segmentation

In [21]:
# word_tokenize on multi-sentence text returns one flat list of tokens;
# the boundary between the two sentences is not preserved in the result.
s = "Text with many sentences. This is a sentence."

word_tokenize(s)

['Text', 'with', 'many', 'sentences', '.', 'This', 'is', 'a', 'sentence', '.']

In [22]:
from nltk.tokenize import sent_tokenize

# sent_tokenize splits the text into a list of sentence strings.
s = "Text with many sentences. This is a sentence."

sent_tokenize(s)

['Text with many sentences.', 'This is a sentence.']

In [23]:
# Two-step pipeline: segment the text into sentences first, then tokenize
# each sentence into words, printing one token list per sentence.
s = "Text with many sentences. This is a sentence."

for sentence in sent_tokenize(s):
    print(word_tokenize(sentence))

['Text', 'with', 'many', 'sentences', '.']
['This', 'is', 'a', 'sentence', '.']


#### Different data domains (e.g. Twitter)

In [24]:
# The general-purpose tokenizer on a tweet: the @mention, the ':D' emoticon,
# the URL and the #hashtag are all broken into several tokens.
s = 'Hi @marcobonzanini just an example! :D http://example.com #NLP'

word_tokenize(s)

['Hi',
 '@',
 'marcobonzanini',
 'just',
 'an',
 'example',
 '!',
 ':',
 'D',
 'http',
 ':',
 '//example.com',
 '#',
 'NLP']

In [25]:
from nltk.tokenize import TweetTokenizer

# Tokenizer created with default settings.
tokenizer = TweetTokenizer()

# Same tweet as the previous cell: here the @mention, emoticon, URL and
# #hashtag each come out as a single token.
s = 'Hi @marcobonzanini just an example! :D http://example.com #NLP'
tokenizer.tokenize(s)

[u'Hi',
 u'@marcobonzanini',
 u'just',
 u'an',
 u'example',
 u'!',
 u':D',
 u'http://example.com',
 u'#NLP']

#### Phrases

N-grams (sequences of adjacent tokens) capture multi-word concepts like "quick brown fox", "good movie" or "nice restaurant".

In [26]:
from nltk import bigrams, trigrams, ngrams

# Tokenize once; `tokens` is reused by the n-gram cells that follow.
# (word_tokenize is imported in an earlier cell.)
s = "The quick brown fox jumped over the lazy dog"
tokens = word_tokenize(s)

# Pairs of adjacent tokens; list() materializes the result for display.
list(bigrams(tokens))

[('The', 'quick'),
 ('quick', 'brown'),
 ('brown', 'fox'),
 ('fox', 'jumped'),
 ('jumped', 'over'),
 ('over', 'the'),
 ('the', 'lazy'),
 ('lazy', 'dog')]

In [27]:
# Sliding windows of three adjacent tokens over `tokens` from the previous cell.
list(trigrams(tokens))

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumped'),
 ('fox', 'jumped', 'over'),
 ('jumped', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [28]:
# ngrams with n=2 produces the same pairs as bigrams(tokens) above.
list(ngrams(tokens, 2))

[('The', 'quick'),
 ('quick', 'brown'),
 ('brown', 'fox'),
 ('fox', 'jumped'),
 ('jumped', 'over'),
 ('over', 'the'),
 ('the', 'lazy'),
 ('lazy', 'dog')]

In [30]:
# ngrams takes the window size as its second argument: here, 4-token windows.
list(ngrams(tokens, 4))

[('The', 'quick', 'brown', 'fox'),
 ('quick', 'brown', 'fox', 'jumped'),
 ('brown', 'fox', 'jumped', 'over'),
 ('fox', 'jumped', 'over', 'the'),
 ('jumped', 'over', 'the', 'lazy'),
 ('over', 'the', 'lazy', 'dog')]