# 2. Build your vocabulary (word tokenization)

## 2.2 Building your vocabulary with a tokenizer

In [1]:
sentence = "Thomas Jefferson began building Monticello at the age of twenty-six."
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 'twenty-six.']

In [2]:
import numpy as np

token_sequence = str.split(sentence)

vocab = sorted(set(token_sequence))

', '.join(vocab)

'Jefferson, Monticello, Thomas, age, at, began, building, of, the, twenty-six.'

In [3]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

for i, word in enumerate(token_sequence):
    onehot_vectors[i, vocab.index(word)] = 1
                   
', '.join(vocab)         

'Jefferson, Monticello, Thomas, age, at, began, building, of, the, twenty-six.'

In [4]:
onehot_vectors

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [5]:
import pandas as pd

df = pd.DataFrame(onehot_vectors, columns=vocab)
df

Unnamed: 0,Jefferson,Monticello,Thomas,age,at,began,building,of,the,twenty-six.
0,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0
7,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,1


In [6]:
df[df == 0] = ''
df

Unnamed: 0,Jefferson,Monticello,Thomas,age,at,began,building,of,the,twenty-six.
0,,,1.0,,,,,,,
1,1.0,,,,,,,,,
2,,,,,,1.0,,,,
3,,,,,,,1.0,,,
4,,1.0,,,,,,,,
5,,,,,1.0,,,,,
6,,,,,,,,,1.0,
7,,,,1.0,,,,,,
8,,,,,,,,1.0,,
9,,,,,,,,,,1.0


In [7]:
num_rows = 3000 * 3500 * 15
num_rows

157500000

In [8]:
num_bytes = num_rows * 1000000
num_bytes

157500000000000

In [9]:
num_bytes / 1e9 # gigabytes

157500.0

In [10]:
_ / 1000 # terabytes

157.5

In [11]:
sentence_bow = {}

for token in sentence.split():
    sentence_bow[token] = 1
    
sorted(sentence_bow.items())

[('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1),
 ('twenty-six.', 1)]

In [12]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,twenty-six.
sent,1,1,1,1,1,1,1,1,1,1


In [13]:
sentences = "Thomas Jefferson began building Monticello at the age of 26.\n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
sentences += "He moved into the South Pavilion in 1770.\n" 
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."

corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    corpus['sent{}'.format(i + 1)] = dict((tok, 1) for tok in sent.split())
    # print(corpus)
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
# df[df.columns[:10]]
df.head(10)

# corpus = {'sent0': df.T['sent'].to_dict()}
# for i, sent in enumerate(sentences.split('\n')):
#     corpus['sent{}'.format(i + 1)] = dict((tok, 1) for tok in sent.split())
# df = pd.DataFrame(corpus, dtype=int).fillna(0)
# df.head(10)  # show the first 10 tokens in our vocabulary for this 4-document corpus

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.,...,South,Pavilion,in,1770.,Turning,a,neoclassical,masterpiece,Jefferson's,obsession.
sent1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent3,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,0,0,0,0,0,0
sent4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1


### 2.2.1 Dot product

In [14]:
v1 = pd.np.array([1, 2, 3])
v2 = pd.np.array([2, 3, 4])

v1.dot(v2)

  v1 = pd.np.array([1, 2, 3])
  v2 = pd.np.array([2, 3, 4])


20

In [15]:
(v1 * v2).sum()

20

In [16]:
sum([x1 * x2 for x1, x2 in zip(v1, v2)])

20

### 2.2.2 Measuring bag-of-words overlap

In [17]:
df = df.T

df.sent1.dot(df.sent2)

0

In [18]:
df.sent1.dot(df.sent3)

1

In [19]:
df.sent1.dot(df.sent4)

1

In [20]:
[{k, v} for (k, v) in (df.sent1 & df.sent4).items() if v]

[{1, 'Monticello'}]

### 2.2.3 A token improvement

In [21]:
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."

tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [22]:
pattern= re.compile(r"([-\s.,;!?])+")

tokens = pattern.split(sentence)

tokens[-10:] # just the last 10 tokens

[' ', 'the', ' ', 'age', ' ', 'of', ' ', '26', '.', '']

In [23]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."

tokens = pattern.split(sentence)

[x for x in tokens if x not in '- \t\n.,;!>']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [24]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [25]:
from nltk.tokenize import TreebankWordTokenizer

sentence = """Monticello wasn't designated as UNESCO Word Heritage\
    Sit until 1987."""

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'Word',
 'Heritage',
 'Sit',
 'until',
 '1987',
 '.']

In [26]:
from nltk.tokenize.casual import casual_tokenize

message = """RT @TJMonticello Best day everrrrrrr at Monticello.\
    Awesommmmmeeeeeee day :*)"""

casual_tokenize(message)

['RT',
 '@TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello',
 '.',
 'Awesommmmmeeeeeee',
 'day',
 ':*)']

In [27]:
casual_tokenize(message, reduce_len=True, strip_handles=True)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

### 2.2.4 Extending your vocabulary with n-grams

In [28]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r"([-\s.,;!?])+")
tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [29]:
from nltk.util import ngrams

list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [30]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [31]:
two_grams = list(ngrams(tokens, 2))
[" ".join(x) for x in two_grams]

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [32]:
stop_words = ['a', 'an', 'the', 'on', 'of', 'off', 'this', 'is']
tokens = ['the', 'house', 'is', 'on', 'file']
tokens_without_stopwords = [x for x in tokens if x not in stop_words]
print(tokens_without_stopwords)

['house', 'file']


In [34]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
stop_words = nltk.corpus.stopwords.words('english')
len(stop_words)

179

In [37]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [38]:
[sw for sw in stop_words if len(sw) == 1]

['i', 'a', 's', 't', 'd', 'm', 'o', 'y']

In [39]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

len(sklearn_stop_words)

318

In [41]:
len(set(stop_words).union(sklearn_stop_words))

378

In [42]:
len(set(stop_words).intersection(sklearn_stop_words))

119

### 2.2.5 Normalizing your vocabulary

In [43]:
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

['house', 'visitor', 'center']


In [45]:
def stem(phrase):
    return ' '.join([re.findall('^(.*ss|.*?)(s)?$', word)[0][0].strip("'") for word in phrase.lower().split()])

stem('houses')

'house'

In [46]:
stem("Doctor House's calls")

'doctor house call'

In [47]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

' '.join([stemmer.stem(w).strip("'") for w in "dish washer's washed dishes".split()])

'dish washer wash dish'

In [48]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [50]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [51]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize("better")

'better'

In [52]:
lemmatizer.lemmatize("better", pos="a")

'good'

In [53]:
lemmatizer.lemmatize("good", pos="a")

'good'

In [54]:
lemmatizer.lemmatize("goods", pos="a")

'goods'

In [55]:
lemmatizer.lemmatize("goods", pos="n")

'good'

In [56]:
lemmatizer.lemmatize("goodness", pos="n")

'goodness'

In [57]:
lemmatizer.lemmatize("best", pos="a")

'best'

In [58]:
stemmer.stem('goodness')

'good'

## 2.3 Sentiment

### 2.3.1 VADER - A rule-based sentiment analyzer

In [60]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 KB[0m [31m481.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [68]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
str(sa.lexicon)[:100]

'{\'$:\': -1.5, \'%)\': -0.4, \'%-)\': -1.5, \'&-:\': -0.4, \'&:\': -0.7, "( \'}{\' )": 1.6, \'(%\': -0.9, "(\'-:": '

In [73]:
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [74]:
sa.polarity_scores(text=\
    "Python is very readable and it's great for NLP.")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [75]:
sa.polarity_scores(text=\
    "Python is not a bad choice for most applications.")

{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.431}

In [76]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
    "Horrible! Completely useless. :(",
    "It was OK. Some good and some bad things."]
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print('{:+}: {}'.format(scores['compound'], doc))

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
-0.1531: It was OK. Some good and some bad things.


### 2.3.2 Naive Bayes

In [79]:
!pip install ipdb

Collecting ipdb
  Downloading ipdb-0.13.9.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting toml>=0.10.2
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: ipdb
  Building wheel for ipdb (setup.py) ... [?25ldone
[?25h  Created wheel for ipdb: filename=ipdb-0.13.9-py3-none-any.whl size=11648 sha256=396f7abe2c9adf7b918f085fae6699945b08a6d65c8a7925a2dd7c2dcdf92aaa
  Stored in directory: /home/jovyan/.cache/pip/wheels/f7/29/9a/cf774cd86e9802f075a0be1c9e0830bc062d07897b2e9e87cd
Successfully built ipdb
Installing collected packages: toml, ipdb
Successfully installed ipdb-0.13.9 toml-0.10.2


In [80]:
from nlpia.data.loaders import get_data
movies = get_data('hutto_movies')
movies.head().round(2)

Loading file with name: /home/jovyan/nlpia/src/nlpia/data/hutto_ICWSM_2014/movieReviewSnippets_GroundTruth.csv.gz


  np = pd.np


Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [81]:
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [82]:
import pandas as pd
pd.set_option('display.width', 75)
from nltk.tokenize import casual_tokenize
bags_of_words = []
from collections import Counter
for text in movies.text:
    bags_of_words.append(Counter(casual_tokenize(text)))
df_bows = pd.DataFrame.from_records(bags_of_words)
df_bows = df_bows.fillna(0).astype(int)
df_bows.shape

(10605, 20756)

In [83]:
df_bows.head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
df_bows.head()[list(bags_of_words[0].keys())]

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Schwarzenegger,",",Jean,Claud,Van,Damme,or,Steven,Segal,.
0,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [86]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = nb.fit(df_bows, movies.sentiment > 0)
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
movies.error.mean() #.round(1)

2.3911742904638262

In [88]:
movies['sentiment_ispositive'] = (movies.sentiment > 0).astype(int)
movies['predicted_ispos'] = (movies.predicted_sentiment > 0).astype(int)
movies['sentiment predicted_sentiment sentiment_ispositive predicted_ispos'.split()].head(8)

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1
6,2.533333,4,1,1
7,2.466667,4,1,1
8,1.266667,-4,1,0


In [90]:
(movies.predicted_ispos == movies.sentiment_ispositive).sum() / len(movies)

0.9344648750589345

In [91]:
products = get_data('hutto_products')
bags_of_words = []
for text in products.text:
    bags_of_words.append(Counter(casual_tokenize(text)))
df_product_bows = pd.DataFrame.from_records(bags_of_words)
df_product_bows = df_product_bows.fillna(0).astype(int)
df_all_bows = df_bows.append(df_product_bows)
df_all_bows.columns

Loading file with name: /home/jovyan/nlpia/src/nlpia/data/hutto_ICWSM_2014/amazonReviewSnippets_GroundTruth.csv.gz


  df_all_bows = df_bows.append(df_product_bows)


Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st',
       'Century's', 'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23302)

In [93]:
df_product_bows = df_all_bows.iloc[len(movies):][df_bows.columns]
df_product_bows.shape

(3546, 20756)

In [94]:
df_bows.shape

(10605, 20756)

In [96]:
products['ispos'] = (products.sentiment > 0).astype(int)
products['pred'] = nb.predict(df_product_bows.fillna(0).values).astype(int)

products.head()



Unnamed: 0,id,sentiment,text,ispos,pred
0,1_1,-0.9,troubleshooting ad-2500 and ad-2600 no picture...,0,0
1,1_2,-0.15,"repost from january 13, 2004 with a better fit...",0,0
2,1_3,-0.2,does your apex dvd player only play dvd audio ...,0,0
3,1_4,-0.1,or does it play audio and video but scrolling ...,0,0
4,1_5,-0.5,before you try to return the player or waste h...,0,0


In [97]:
(products.pred == products.ispos).sum() / len(products)

0.5572476029328821