# 2. Build your vocabulary (word tokenization)

## 2.2 Building your vocabulary with a tokenizer

In [1]:
sentence = "Thomas Jefferson began building Monticello at the age of twenty-six."
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 'twenty-six.']

In [3]:
import numpy as np

token_sequence = str.split(sentence)

vocab = sorted(set(token_sequence))

', '.join(vocab)

'Jefferson, Monticello, Thomas, age, at, began, building, of, the, twenty-six.'

In [6]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

for i, word in enumerate(token_sequence):
    onehot_vectors[i, vocab.index(word)] = 1
                   
', '.join(vocab)         

'Jefferson, Monticello, Thomas, age, at, began, building, of, the, twenty-six.'

In [7]:
onehot_vectors

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [10]:
import pandas as pd

df = pd.DataFrame(onehot_vectors, columns=vocab)
df

Unnamed: 0,Jefferson,Monticello,Thomas,age,at,began,building,of,the,twenty-six.
0,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0
7,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,1


In [11]:
df[df == 0] = ''
df

Unnamed: 0,Jefferson,Monticello,Thomas,age,at,began,building,of,the,twenty-six.
0,,,1.0,,,,,,,
1,1.0,,,,,,,,,
2,,,,,,1.0,,,,
3,,,,,,,1.0,,,
4,,1.0,,,,,,,,
5,,,,,1.0,,,,,
6,,,,,,,,,1.0,
7,,,,1.0,,,,,,
8,,,,,,,,1.0,,
9,,,,,,,,,,1.0


In [12]:
num_rows = 3000 * 3500 * 15
num_rows

157500000

In [13]:
num_bytes = num_rows * 1000000
num_bytes

157500000000000

In [15]:
num_bytes / 1e9 # gigabytes

157500.0

In [16]:
_ / 1000 # terabytes

157.5

In [17]:
sentence_bow = {}

for token in sentence.split():
    sentence_bow[token] = 1
    
sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [18]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [37]:
sentences = "Thomas Jefferson began building Monticello at the age of 26.\n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
sentences += "He moved into the South Pavilion in 1770.\n" 
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."

corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    corpus['sent{}'.format(i + 1)] = dict((tok, 1) for tok in sent.split())
    # print(corpus)
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
# df[df.columns[:10]]
df.head(10)

# corpus = {'sent0': df.T['sent'].to_dict()}
# for i, sent in enumerate(sentences.split('\n')):
#     corpus['sent{}'.format(i + 1)] = dict((tok, 1) for tok in sent.split())
# df = pd.DataFrame(corpus, dtype=int).fillna(0)
# df.head(10)  # show the first 10 tokens in our vocabulary for this 4-document corpus

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.,...,South,Pavilion,in,1770.,Turning,a,neoclassical,masterpiece,Jefferson's,obsession.
sent1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent3,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,0,0,0,0,0,0
sent4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1


### 2.2.1 Dot product

In [25]:
v1 = pd.np.array([1, 2, 3])
v2 = pd.np.array([2, 3, 4])

v1.dot(v2)

  v1 = pd.np.array([1, 2, 3])
  v2 = pd.np.array([2, 3, 4])


20

In [26]:
(v1 * v2).sum()

20

In [28]:
sum([x1 * x2 for x1, x2 in zip(v1, v2)])

20

### 2.2.2 Measuring bag-of-words overlap

In [38]:
df = df.T

df.sent1.dot(df.sent2)

0

In [39]:
df.sent1.dot(df.sent3)

1

In [40]:
df.sent1.dot(df.sent4)

1

In [41]:
[{k, v} for (k, v) in (df.sent1 & df.sent4).items() if v]

[{1, 'Monticello'}]

### 2.2.3 A token improvement

In [42]:
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."

tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [43]:
pattern= re.compile(r"([-\s.,;!?])+")

tokens = pattern.split(sentence)

tokens[-10:] # just the last 10 tokens

[' ', 'the', ' ', 'age', ' ', 'of', ' ', '26', '.', '']

In [45]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."

tokens = pattern.split(sentence)

[x for x in tokens if x not in '- \t\n.,;!>']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [46]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [47]:
from nltk.tokenize import TreebankWordTokenizer

sentence = """Monticello wasn't designated as UNESCO Word Heritage\
    Sit until 1987."""

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'Word',
 'Heritage',
 'Sit',
 'until',
 '1987',
 '.']

In [48]:
from nltk.tokenize.casual import casual_tokenize

message = """RT @TJMonticello Best day everrrrrrr at Monticello.\
    Awesommmmmeeeeeee day :*)"""

casual_tokenize(message)

['RT',
 '@TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello',
 '.',
 'Awesommmmmeeeeeee',
 'day',
 ':*)']

In [49]:
casual_tokenize(message, reduce_len=True, strip_handles=True)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

### 2.2.4 Extending your vocabulary with n-grams