# Tokenization

In [1]:
# Sample documents reused by the tokenization demos below: one with an
# embedded newline and punctuation, one containing a URL, and one with a
# number and a hashtag.
lines = [
    'How to tokenize?\nLike a boss.',
    'Google is accessible via http://www.google.com',
    '1000 new followers! #TwitterFamous',
]

In [2]:
# Echo the raw documents before any tokenization is applied.
lines

['How to tokenize?\nLike a boss.',
 'Google is accessible via http://www.google.com',
 '1000 new followers! #TwitterFamous']

In [3]:
# Baseline tokenizer: str.split() with no arguments splits on any run of
# whitespace (including the embedded newline) and leaves punctuation attached
# to the neighbouring word.
for line in lines:
    whitespace_tokens = line.split()
    print(whitespace_tokens)

['How', 'to', 'tokenize?', 'Like', 'a', 'boss.']
['Google', 'is', 'accessible', 'via', 'http://www.google.com']
['1000', 'new', 'followers!', '#TwitterFamous']


In [4]:
import re

import pandas as pd

# Regex tokenizer: every maximal run of word characters becomes a token, so
# punctuation is dropped and the URL is broken into its word-like pieces.
_token_pattern = r"\w+"
token_pattern = re.compile(_token_pattern)

for line in lines:
    word_tokens = token_pattern.findall(line)
    print(word_tokens)

['How', 'to', 'tokenize', 'Like', 'a', 'boss']
['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']
['1000', 'new', 'followers', 'TwitterFamous']


In [5]:
_token_pattern = r"\w+"
token_pattern = re.compile(_token_pattern)

def tokenizer(line):
    """Lower-case ``line``, mask special entities, and split it into tokens.

    Substitutions run in a fixed order so that the more specific patterns are
    masked before the generic digit pattern can touch them:

    1. URLs (``http://`` / ``https://``)  -> ``_url_``
    2. clock-like ``digits:digits``       -> ``_time_``
    3. hashtags such as ``#nlp``          -> ``_hashtag_``
    4. any remaining run of digits        -> ``_num_``

    The masked text is then split with the module-level ``token_pattern``.
    That name is looked up at call time, so rebinding ``token_pattern`` in a
    later cell changes the tokens this function returns.

    Args:
        line: Text to tokenize.

    Returns:
        list of str: the lower-cased tokens, with the placeholders above
        standing in for the masked entities.
    """
    line = line.lower()
    # The character class also covers query strings, ports and fragments
    # (= & % # : ~ + @). Without them a URL such as ``http://a.com/?q=1`` is
    # cut at ``=`` and its tail leaks out as extra ``_num_``/word tokens.
    line = re.sub(r'https?://[\w/\-.?=&%#:~+@]+', '_url_', line)
    # Times must be masked before bare numbers, otherwise ``10:30`` would
    # become two separate ``_num_`` tokens.
    line = re.sub(r'\d+:\d+', '_time_', line)
    line = re.sub(r'#\w+', '_hashtag_', line)
    line = re.sub(r'\d+', '_num_', line)
    return token_pattern.findall(line)

# Run the entity-masking tokenizer over the sample documents.
for line in lines:
    masked_tokens = tokenizer(line)
    print(masked_tokens)

['how', 'to', 'tokenize', 'like', 'a', 'boss']
['google', 'is', 'accessible', 'via', '_url_']
['_num_', 'new', 'followers', '_hashtag_']


In [6]:
import re

# Pattern that only keeps tokens of two or more word characters, so the
# single-letter word 'a' is dropped (this is scikit-learn's default
# ``token_pattern``).
#
# NOTE: this rebinds the global ``token_pattern`` that ``tokenizer`` reads at
# call time, so from this cell onward ``tokenizer`` also drops one-character
# tokens. That is consistent with 'a' being absent from the vocabulary printed
# after the CountVectorizer fit further down.
_token_pattern = r"(?u)\b\w\w+\b"
token_pattern = re.compile(_token_pattern)

for line in lines:
    multi_char_tokens = token_pattern.findall(line)
    print(multi_char_tokens)

['How', 'to', 'tokenize', 'Like', 'boss']
['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']
['1000', 'new', 'followers', 'TwitterFamous']


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Plug the custom entity-masking tokenizer into CountVectorizer.
# token_pattern=None states explicitly that the built-in regex is not used
# when a tokenizer callable is supplied; leaving the default in place makes
# scikit-learn emit a "token_pattern will not be used" UserWarning.
vec = CountVectorizer(lowercase=True, tokenizer=tokenizer, token_pattern=None)

# Learn the vocabulary and build the sparse document-term count matrix.
x = vec.fit_transform(lines)

In [8]:
# vocabulary_ maps each term to its column index; iterating the mapping
# yields the terms themselves.
print(list(vec.vocabulary_))

['how', 'to', 'tokenize', 'like', 'boss', 'google', 'is', 'accessible', 'via', '_url_', '_num_', 'new', 'followers', '_hashtag_']


In [9]:
# Show the document-term matrix as a table: one row per document, one column
# per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2; toarray() yields a plain ndarray
# instead of the legacy np.matrix returned by todense().
df = pd.DataFrame(
    x.toarray(),
    columns=vec.get_feature_names_out(),
)

df.index.name = 'doc-id'

df

Unnamed: 0_level_0,_hashtag_,_num_,_url_,accessible,boss,followers,google,how,is,like,new,to,tokenize,via
doc-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0,0,0,1,0,0,1,0,1,0,1,1,0
1,0,0,1,1,0,0,1,0,1,0,0,0,0,1
2,1,1,0,0,0,1,0,0,0,0,1,0,0,0


In [10]:
# Two sentences built from exactly the same words in a different order; used
# below to compare unigram, bigram and character n-gram features.
flight_delayed_lines = [
    'Flight was delayed, I am not happy',
    'Flight was not delayed, I am happy'
]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words: both sentences contain the same words, so their rows
# come out identical and word order is lost.
vec = CountVectorizer(ngram_range=(1, 1))

x = vec.fit_transform(flight_delayed_lines)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(
    x.toarray(),
    columns=vec.get_feature_names_out(),
)

df.index.name = 'doc-id'

df

Unnamed: 0_level_0,am,delayed,flight,happy,not,was
doc-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,1,1,1,1,1
1,1,1,1,1,1,1


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Word bigrams only: pairs such as "not happy" vs "not delayed" now tell the
# two sentences apart.
vec = CountVectorizer(ngram_range=(2, 2))

x = vec.fit_transform(flight_delayed_lines)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(
    x.toarray(),
    columns=vec.get_feature_names_out(),
)

df.index.name = 'doc-id'

df

Unnamed: 0_level_0,am happy,am not,delayed am,flight was,not delayed,not happy,was delayed,was not
doc-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,1,1,0,1,1,0
1,1,0,1,1,1,0,0,1


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Character-level features: every 4-character window (spaces and punctuation
# included) becomes a column.
vec = CountVectorizer(analyzer='char', ngram_range=(4, 4))

x = vec.fit_transform(flight_delayed_lines)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(
    x.toarray(),
    columns=vec.get_feature_names_out(),
)

df.index.name = 'doc-id'

df

Unnamed: 0_level_0,am,del,hap,i a,not,was,", i",am h,am n,appy,...,not,ot d,ot h,s de,s no,t de,t ha,t wa,was,"yed,"
doc-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,0,1,1,...,1,0,1,1,0,0,1,1,1,1
1,1,1,1,1,1,1,1,1,0,1,...,1,1,0,0,1,1,0,1,1,1


In [14]:
# Three documents that share "I like" and differ only in the last word; used
# below to contrast raw counts with TF-IDF weights.
lines_fruits = [
    'I like apples',
    'I like oranges',
    'I like pears',
]

In [15]:
from IPython.display import display_html

def display_side_by_side(*args, spacing=20):
    """Render several tables next to each other in a single output cell.

    Each argument is converted with its ``to_html()`` method and followed by
    ``spacing`` non-breaking spaces. Every opening ``<table`` tag then gets an
    inline ``display:inline`` style so the tables flow horizontally, and the
    combined markup is passed to ``display_html(..., raw=True)``.

    Args:
        *args: Objects exposing ``to_html()`` (DataFrames in this notebook),
            shown left to right.
        spacing: Number of ``&nbsp;`` entities appended after each table.
    """
    spacer = '&nbsp;' * spacing
    html_str = ''.join(df.to_html() + spacer for df in args)

    # Patch opening tags only. Replacing the bare word 'table' would also
    # rewrite the closing </table> tags and any column name or cell text that
    # happens to contain "table".
    html_str = html_str.replace('<table', '<table style="display:inline;"')

    display_html(html_str, raw=True)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts. token_pattern=r'\w+' keeps single-character tokens such as
# "i", which the default pattern would drop.
vec = CountVectorizer(token_pattern=r'\w+')

x = vec.fit_transform(lines_fruits)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df1 = pd.DataFrame(
    x.toarray().astype(float).round(2),
    columns=vec.get_feature_names_out(),
)

df1.index.name = 'CountVectorizer'

from sklearn.feature_extraction.text import TfidfVectorizer

# Same documents and tokenization, weighted by TF-IDF instead of raw counts.
vec = TfidfVectorizer(token_pattern=r'\w+')

x = vec.fit_transform(lines_fruits)

df2 = pd.DataFrame(
    x.toarray().round(2),
    columns=vec.get_feature_names_out(),
)

df2.index.name = 'TfidfVectorizer'


display_side_by_side(df1, df2)

Unnamed: 0_level_0,apples,i,like,oranges,pears
CountVectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,1.0,1.0,0.0,0.0
1,0.0,1.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0

Unnamed: 0_level_0,apples,i,like,oranges,pears
TfidfVectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.77,0.45,0.45,0.0,0.0
1,0.0,0.45,0.45,0.77,0.0
2,0.0,0.45,0.45,0.0,0.77


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts for the fruit sentences, keeping single-character tokens.
vec = CountVectorizer(token_pattern=r'\w+')

x = vec.fit_transform(lines_fruits)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df1 = pd.DataFrame(
    x.toarray().astype(float).round(2),
    columns=vec.get_feature_names_out(),
)

df1.index.name = 'CountVectorizer'

# Show only the first two documents.
df1.head(2)

Unnamed: 0_level_0,apples,i,like,oranges,pears
CountVectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,1.0,1.0,0.0,0.0
1,0.0,1.0,1.0,1.0,0.0


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the fruit sentences, shown unrounded.
vec = TfidfVectorizer(token_pattern=r'\w+')

x = vec.fit_transform(lines_fruits)

# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(
    x.toarray(),
    columns=vec.get_feature_names_out(),
)

df.index.name = 'doc-id'

df

Unnamed: 0_level_0,apples,i,like,oranges,pears
doc-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.767495,0.453295,0.453295,0.0,0.0
1,0.0,0.453295,0.453295,0.767495,0.0
2,0.0,0.453295,0.453295,0.0,0.767495


In [19]:
import spacy

# Load the spaCy pipeline that supplies the vectors used below.
nlp = spacy.load('en_core_web_md')

terms = ['I', 'like', 'apples', 'oranges', 'pears']

# Each term is processed as its own one-word document; the document vector is
# converted to a plain Python list so it can be passed to pandas/scikit-learn.
vectors = [term_doc.vector.tolist() for term_doc in map(nlp, terms)]

In [20]:
# Dimensionality of a single term vector (looked up by the term's position).
len(vectors[terms.index('apples')])

300

In [21]:
# Show the 'apples' vector as a named Series for a compact preview.
pd.Series(vectors[terms.index('apples')], name='apples')

0     -0.633400
1      0.189810
2     -0.535440
3     -0.526580
4     -0.300010
         ...   
295    0.068773
296   -0.238810
297   -1.178400
298    0.255040
299    0.611710
Name: apples, Length: 300, dtype: float64

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances

# Pairwise cosine distances between the term vectors, labelled by term on both
# axes.
# NOTE(review): the recorded output shows a distance of 0.0 between 'apples'
# and 'pears', i.e. identical vectors - presumably a property of the loaded
# model's vector table; confirm before drawing conclusions from it.
term_distances = cosine_distances(vectors)
pd.DataFrame(term_distances, index=terms, columns=terms)

Unnamed: 0,I,like,apples,oranges,pears
I,0.0,0.444509,0.795573,0.811759,0.795573
like,0.444509,0.0,0.670129,0.722825,0.670129
apples,0.795573,0.670129,0.0,0.221906,0.0
oranges,0.811759,0.722825,0.221906,0.0,0.221906
pears,0.795573,0.670129,0.0,0.221906,0.0


In [23]:
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Light-to-gray colormap used to shade the table cells.
cm = sns.light_palette("Gray", as_cmap=True)

# Pairwise cosine similarities, rounded to two decimals and shaded by value.
term_similarities = pd.DataFrame(
    cosine_similarity(vectors),
    index=terms,
    columns=terms,
)
term_similarities.round(2).style.background_gradient(cmap=cm)

Unnamed: 0,I,like,apples,oranges,pears
I,1.0,0.56,0.2,0.19,0.2
like,0.56,1.0,0.33,0.28,0.33
apples,0.2,0.33,1.0,0.78,1.0
oranges,0.19,0.28,0.78,1.0,0.78
pears,0.2,0.33,1.0,0.78,1.0
