### Introduction to Bag of words

In [2]:
data = ['The boy has a green bird',
              'The bird eats a duck',
              'The duck eats a worm']

In [3]:
sents = [sent.split() for sent in data]

In [4]:
sents

[['The', 'boy', 'has', 'a', 'green', 'bird'],
 ['The', 'bird', 'eats', 'a', 'duck'],
 ['The', 'duck', 'eats', 'a', 'worm']]

#### Computing vocabularies from the sentences

In [5]:
vocab = list(set([word for sent in sents for word in sent]))

In [6]:
sents

[['The', 'boy', 'has', 'a', 'green', 'bird'],
 ['The', 'bird', 'eats', 'a', 'duck'],
 ['The', 'duck', 'eats', 'a', 'worm']]

In [7]:
vocab

['boy', 'eats', 'has', 'bird', 'duck', 'a', 'The', 'worm', 'green']

In [8]:
sents_vec = [[1 if (v in sent) else 0 for v in vocab ] for sent in sents]

In [9]:
sents_vec

[[1, 0, 1, 1, 0, 1, 1, 0, 1],
 [0, 1, 0, 1, 1, 1, 1, 0, 0],
 [0, 1, 0, 0, 1, 1, 1, 1, 0]]

In [11]:
print(vocab)
print("S1 : ", sents_vec[0])
print("S2 : ", sents_vec[1])
print("S3 : ", sents_vec[2])

['boy', 'eats', 'has', 'bird', 'duck', 'a', 'The', 'worm', 'green']
S1 :  [1, 0, 1, 1, 0, 1, 1, 0, 1]
S2 :  [0, 1, 0, 1, 1, 1, 1, 0, 0]
S3 :  [0, 1, 0, 0, 1, 1, 1, 1, 0]


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
cv = CountVectorizer()

In [57]:
matrix = cv.fit_transform(data)

In [63]:
cv.get_feature_names() # vocabulary of the term-document matrix

['bird', 'boy', 'duck', 'eats', 'green', 'has', 'the', 'worm']

In [68]:
for i,x in enumerate(matrix):
    print(f"sentence {i}")
    print(x)
    
# It is important to note that the letter a is removed by the CountVectorizer because 
# by default minimum 2 character words are counted as words.

sentence 0
  (0, 0)	1
  (0, 4)	1
  (0, 5)	1
  (0, 1)	1
  (0, 6)	1
sentence 1
  (0, 2)	1
  (0, 3)	1
  (0, 0)	1
  (0, 6)	1
sentence 2
  (0, 7)	1
  (0, 2)	1
  (0, 3)	1
  (0, 6)	1


### Tokenizers

In [10]:
from nltk import word_tokenize

In [11]:
foo = "I haven't had any of the cabbage-soups"

In [12]:
word_tokenize(foo)

['I', 'have', "n't", 'had', 'any', 'of', 'the', 'cabbage-soups']

In [13]:
footweet = "@Tomato Wassssssup man, that's so cooool"

In [14]:
from nltk.tokenize import TweetTokenizer

In [15]:
TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(footweet)

['Wasssup', 'man', ',', "that's", 'so', 'coool']

In [16]:
from nltk import sent_tokenize

In [17]:
bar = "Sent tokenize knows that time period from 10 a.m. to 1 p.m. are not sentence boundaries. neither are the names G.H.Hardy and J.J.Thompson. you can even start the sentence without Caps"

In [18]:
sent_tokenize(bar)

['Sent tokenize knows that time period from 10 a.m. to 1 p.m. are not sentence boundaries.',
 'neither are the names G.H.Hardy and J.J.Thompson.',
 'you can even start the sentence without Caps']

### Stemming & Lemmatization

In [None]:
from nltk import PorterStemmer, SnowballStemmer, 

In [19]:
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [20]:
ps = PorterStemmer()

In [21]:
foo = "Babies in my buildings are crying laughing and playing all the time"

In [22]:
print([ps.stem(tok) for tok in foo.split()])

['babi', 'in', 'my', 'build', 'are', 'cri', 'laugh', 'and', 'play', 'all', 'the', 'time']


In [23]:
wnl = WordNetLemmatizer()

In [24]:
bar = "Those people were crying and running every day"

In [25]:
[wnl.lemmatize(tok, pos="v") for tok in bar.split()]

['Those', 'people', 'be', 'cry', 'and', 'run', 'every', 'day']