# Classifying Text Using Naive Bayes

## Tokenization

In [1]:
# Sample inputs used throughout the tokenization section. Each one stresses a
# different case: an embedded newline with punctuation, a URL, and a number
# together with a hashtag.
lines = [
    'How to tokenize?\nLike a boss.',
    'Google is accessible via http://www.google.com',
    '1000 new followers! #TwitterFamous',
]

# Bare last expression so the notebook displays the list.
lines

['How to tokenize?\nLike a boss.',
 'Google is accessible via http://www.google.com',
 '1000 new followers! #TwitterFamous']

In [2]:
# Baseline: plain whitespace splitting, which leaves punctuation attached to
# the neighbouring word (e.g. 'tokenize?').
for words in map(str.split, lines):
    print(words)

['How', 'to', 'tokenize?', 'Like', 'a', 'boss.']
['Google', 'is', 'accessible', 'via', 'http://www.google.com']
['1000', 'new', 'followers!', '#TwitterFamous']


### Regex

In [3]:
import re

# A token is any maximal run of word characters; punctuation acts purely as a
# separator, which also splits the URL into its pieces.
_token_pattern = r"\w+"
token_pattern = re.compile(_token_pattern)

for tokens in map(token_pattern.findall, lines):
    print(tokens)

['How', 'to', 'tokenize', 'Like', 'a', 'boss']
['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']
['1000', 'new', 'followers', 'TwitterFamous']


In [5]:
import re

# Same extraction, but a token now needs at least two word characters between
# word boundaries, so one-letter words such as 'a' are dropped.
_token_pattern = r"(?u)\b\w\w+\b"
token_pattern = re.compile(_token_pattern)

for tokens in map(token_pattern.findall, lines):
    print(tokens)


['How', 'to', 'tokenize', 'Like', 'boss']
['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']
['1000', 'new', 'followers', 'TwitterFamous']


In [6]:
# Final token extractor: maximal runs of word characters. The placeholders
# inserted by `tokenizer` (e.g. "_url_") consist only of word characters, so
# each one survives this pattern as a single token.
_token_pattern = r"\w+"
token_pattern = re.compile(_token_pattern)

# Normalisation rules, applied in the order listed. The time rule has to run
# before the generic number rule; otherwise "10:30" would already have been
# rewritten to "_num_:_num_" and could no longer be recognised as a time.
_NORMALIZATION_RULES = (
    # A URL extends to the next whitespace so that query strings, ports and
    # fragments ("?q=1&p=2", ":8080", "#top") are collapsed together with it
    # instead of leaking into the token stream.
    (re.compile(r'https?://\S+'), '_url_'),
    (re.compile(r'\d+:\d+'), '_time_'),
    (re.compile(r'#\w+'), '_hashtag_'),
    (re.compile(r'\d+'), '_num_'),
)


def tokenizer(line):
    """Lowercase ``line``, replace special spans with placeholders, and tokenize.

    URLs, ``digits:digits`` times, hashtags and remaining digit runs are
    replaced by ``_url_``, ``_time_``, ``_hashtag_`` and ``_num_`` respectively,
    after which every run of word characters becomes one token.

    Parameters
    ----------
    line : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in order of appearance; empty for empty input.
    """
    line = line.lower()
    for pattern, placeholder in _NORMALIZATION_RULES:
        line = pattern.sub(placeholder, line)
    return token_pattern.findall(line)

# Run the normalising tokenizer over the same sample sentences.
for tokens in map(tokenizer, lines):
    print(tokens)

['how', 'to', 'tokenize', 'like', 'a', 'boss']
['google', 'is', 'accessible', 'via', '_url_']
['_num_', 'new', 'followers', '_hashtag_']


# Vectorizing text into matrices

## Bag of words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts built with the custom tokenizer.
# token_pattern=None states explicitly that the built-in regex is not used:
# leaving the default in place while also passing `tokenizer` makes
# scikit-learn warn that token_pattern will be ignored.
vec = CountVectorizer(lowercase=True, tokenizer=tokenizer, token_pattern=None)

X = vec.fit_transform(lines)

# Bare last expression: show the container type returned by fit_transform.
type(X)

scipy.sparse.csr.csr_matrix

In [8]:
# Show the sparse matrix summary: shape, dtype and number of stored entries.
X

<3x15 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [10]:
import pandas as pd


# Densify only because this toy matrix is tiny (3 documents x 15 terms).
# toarray() yields a plain ndarray, whereas todense() can return the
# discouraged numpy.matrix type.
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,_hashtag_,_num_,_url_,a,accessible,boss,followers,google,how,is,like,new,to,tokenize,via
0,0,0,0,1,0,1,0,0,1,0,1,0,1,1,0
1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1
2,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0


### Different sentences, same representation

In [11]:
# Two sentences with opposite meanings built from exactly the same words.
flight = [
    'Flight was delayed, I am not happy',
    'Flight was not delayed, I am happy',
]

In [14]:
# token_pattern=None states explicitly that the built-in regex is not used:
# leaving the default in place while also passing `tokenizer` makes
# scikit-learn warn that token_pattern will be ignored.
flight_vec = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
X_flight = flight_vec.fit_transform(flight)

# Bag-of-words discards word order, so both sentences get the same row of
# counts. toarray() is used instead of todense() to get a plain ndarray rather
# than the discouraged numpy.matrix type.
flight_df = pd.DataFrame(X_flight.toarray(), columns=flight_vec.get_feature_names_out())
flight_df

Unnamed: 0,am,delayed,flight,happy,i,not,was
0,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1
