### Bag of Words

In [1]:
import pandas as pd

In [2]:
# Toy corpus: four short meal reviews, each paired with a binary label.
# Built row-wise as (text, label) tuples with explicit column names.
df = pd.DataFrame(
    [
        ('This meal is very tasty and affordable', 1),
        ('This meal is not tasty and is affordable', 0),
        ('This meal is delicious and cheap', 1),
        ('meal is tasty and meal tastes good', 1),
    ],
    columns=['text_data', 'output'],
)

df

Unnamed: 0,text_data,output
0,This meal is very tasty and affordable,1
1,This meal is not tasty and is affordable,0
2,This meal is delicious and cheap,1
3,meal is tasty and meal tastes good,1


In [3]:
# Summarise the frame: row count, per-column dtype and non-null count, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_data  4 non-null      object
 1   output     4 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 192.0+ bytes


In [4]:
# Feature extraction: bag of words (BOW) via scikit-learn's CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer() # frequency BOW: each cell holds how many times the term occurs in the document
cv1 = CountVectorizer(binary = True) # binary BOW: each cell is 1 if the term occurs at all, else 0

In [5]:
# Learn the vocabulary from the corpus and encode every document as term counts.
BOW = cv.fit_transform(df['text_data'])
# vocabulary_ maps each (lower-cased) token to its column index in the BOW matrix.
print(cv.vocabulary_)

{'this': 10, 'meal': 6, 'is': 5, 'very': 11, 'tasty': 9, 'and': 1, 'affordable': 0, 'not': 7, 'delicious': 3, 'cheap': 2, 'tastes': 8, 'good': 4}


In [6]:
# Same corpus through the binary vectorizer; only the matrix values differ,
# the learned token -> column-index mapping is the same as for the frequency BOW.
BOW1 = cv1.fit_transform(df['text_data'])
print(cv1.vocabulary_)

{'this': 10, 'meal': 6, 'is': 5, 'very': 11, 'tasty': 9, 'and': 1, 'affordable': 0, 'not': 7, 'delicious': 3, 'cheap': 2, 'tastes': 8, 'good': 4}


In [7]:
# fit_transform returned a sparse matrix; .toarray() converts it to a dense NumPy
# array so the per-document term counts can be read directly (note the 2s for repeated words).
BOW.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 2, 0, 1, 1, 0, 0]])

In [8]:
# Dense view of the binary BOW: repeated words are recorded as 1 instead of their count
# (compare rows 1 and 3 with the frequency matrix).
BOW1.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0]])

### N-gram

In [9]:
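# ngram_range=(min_n, max_n) selects which n-gram lengths become features:
# CountVectorizer(ngram_range=(1, 1), binary=True)  -> uni-grams only (this is the Binary BOW)
# CountVectorizer(ngram_range=(2, 2), binary=True)  -> bi-grams only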

In [10]:
# Uni-gram features only, recorded as presence/absence (binary) rather than counts.
cv = CountVectorizer(binary=True, ngram_range=(1, 1))
BOW = cv.fit_transform(df.text_data)
print(cv.vocabulary_)

{'this': 10, 'meal': 6, 'is': 5, 'very': 11, 'tasty': 9, 'and': 1, 'affordable': 0, 'not': 7, 'delicious': 3, 'cheap': 2, 'tastes': 8, 'good': 4}


In [11]:
# Bi-grams only: every feature is a pair of adjacent tokens, stored as presence/absence.
cv = CountVectorizer(binary=True, ngram_range=(2, 2))
BOW = cv.fit_transform(df.text_data)
print(cv.vocabulary_)   # bi-gram -> column index
print(BOW.toarray())    # one row per document, one column per bi-gram

{'this meal': 15, 'meal is': 10, 'is very': 9, 'very tasty': 16, 'tasty and': 14, 'and affordable': 0, 'is not': 7, 'not tasty': 12, 'and is': 2, 'is affordable': 5, 'is delicious': 6, 'delicious and': 4, 'and cheap': 1, 'is tasty': 8, 'and meal': 3, 'meal tastes': 11, 'tastes good': 13}
[[1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1]
 [0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0]
 [0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 1 0 1 1 0 1 1 0 0]]


In [12]:
# Uni-grams and bi-grams together: ngram_range=(1, 2) keeps both lengths in one vocabulary.
cv = CountVectorizer(binary=True, ngram_range=(1, 2))
BOW = cv.fit_transform(df.text_data)
# Vocabulary mapping first, then the dense binary matrix, one per line.
print(cv.vocabulary_, BOW.toarray(), sep='\n')

{'this': 25, 'meal': 16, 'is': 10, 'very': 27, 'tasty': 23, 'and': 1, 'affordable': 0, 'this meal': 26, 'meal is': 17, 'is very': 15, 'very tasty': 28, 'tasty and': 24, 'and affordable': 2, 'not': 19, 'is not': 13, 'not tasty': 20, 'and is': 4, 'is affordable': 11, 'delicious': 7, 'cheap': 6, 'is delicious': 12, 'delicious and': 8, 'and cheap': 3, 'tastes': 21, 'good': 9, 'is tasty': 14, 'and meal': 5, 'meal tastes': 18, 'tastes good': 22}
[[1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1]
 [1 1 0 0 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0]
 [0 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0]
 [0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 0]]


In [13]:
# Tri-grams only: every feature is a run of three adjacent tokens, stored as presence/absence.
cv = CountVectorizer(binary=True, ngram_range=(3, 3))
BOW = cv.fit_transform(df.text_data)
# Vocabulary mapping first, then the dense binary matrix, one per line.
print(cv.vocabulary_, BOW.toarray(), sep='\n')

{'this meal is': 16, 'meal is very': 10, 'is very tasty': 6, 'very tasty and': 17, 'tasty and affordable': 13, 'meal is not': 8, 'is not tasty': 4, 'not tasty and': 12, 'tasty and is': 14, 'and is affordable': 0, 'meal is delicious': 7, 'is delicious and': 3, 'delicious and cheap': 2, 'meal is tasty': 9, 'is tasty and': 5, 'tasty and meal': 15, 'and meal tastes': 1, 'meal tastes good': 11}
[[0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1]
 [1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0]
 [0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0]]


In [14]:
# bi-gram & tri-gram: ngram_range=(2, 3) keeps n-grams of length 2 and 3 only (no single words)
cv = CountVectorizer(ngram_range=(2,3), binary=True)
BOW = cv.fit_transform(df['text_data'])
print(cv.vocabulary_)
print(BOW.toarray())

{'this meal': 31, 'meal is': 17, 'is very': 15, 'very tasty': 33, 'tasty and': 27, 'and affordable': 0, 'this meal is': 32, 'meal is very': 21, 'is very tasty': 16, 'very tasty and': 34, 'tasty and affordable': 28, 'is not': 11, 'not tasty': 24, 'and is': 2, 'is affordable': 8, 'meal is not': 19, 'is not tasty': 12, 'not tasty and': 25, 'tasty and is': 29, 'and is affordable': 3, 'is delicious': 9, 'delicious and': 6, 'and cheap': 1, 'meal is delicious': 18, 'is delicious and': 10, 'delicious and cheap': 7, 'is tasty': 13, 'and meal': 4, 'meal tastes': 22, 'tastes good': 26, 'meal is tasty': 20, 'is tasty and': 14, 'tasty and meal': 30, 'and meal tastes': 5, 'meal tastes good': 23}
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 1]
 [0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0]
 [0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 0]]


In [15]:
# uni, bi & tri-gram: ngram_range=(1, 3) keeps every n-gram of length 1, 2 and 3
cv = CountVectorizer(ngram_range=(1,3), binary=True)
BOW = cv.fit_transform(df['text_data'])
print(cv.vocabulary_)
print(BOW.toarray())

{'this': 41, 'meal': 23, 'is': 13, 'very': 44, 'tasty': 36, 'and': 1, 'affordable': 0, 'this meal': 42, 'meal is': 24, 'is very': 21, 'very tasty': 45, 'tasty and': 37, 'and affordable': 2, 'this meal is': 43, 'meal is very': 28, 'is very tasty': 22, 'very tasty and': 46, 'tasty and affordable': 38, 'not': 31, 'is not': 17, 'not tasty': 32, 'and is': 4, 'is affordable': 14, 'meal is not': 26, 'is not tasty': 18, 'not tasty and': 33, 'tasty and is': 39, 'and is affordable': 5, 'delicious': 9, 'cheap': 8, 'is delicious': 15, 'delicious and': 10, 'and cheap': 3, 'meal is delicious': 25, 'is delicious and': 16, 'delicious and cheap': 11, 'tastes': 34, 'good': 12, 'is tasty': 19, 'and meal': 6, 'meal tastes': 29, 'tastes good': 35, 'meal is tasty': 27, 'is tasty and': 20, 'tasty and meal': 40, 'and meal tastes': 7, 'meal tastes good': 30}
[[1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0
  1 1 1 0 0 1 1 1 1 1 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 1

In [17]:
# Binary BOW over uni-, bi- and tri-grams, shown as a table with one named column per n-gram.
cv = CountVectorizer(ngram_range=(1, 3), binary=True)
# fit_transform learns the vocabulary and encodes the corpus in a single step;
# a separate fit() call would only hand back the vectorizer itself.
BOW = cv.fit_transform(df['text_data'])
# Feature names come back in matrix column order, so they can label the columns directly.
feature_names = cv.get_feature_names_out()
pd.DataFrame(BOW.toarray(), columns=feature_names)

Unnamed: 0,affordable,and,and affordable,and cheap,and is,and is affordable,and meal,and meal tastes,cheap,delicious,...,tasty and,tasty and affordable,tasty and is,tasty and meal,this,this meal,this meal is,very,very tasty,very tasty and
0,1,1,1,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,1,1,1
1,1,1,0,0,1,1,0,0,0,0,...,1,0,1,0,1,1,1,0,0,0
2,0,1,0,1,0,0,0,0,1,1,...,0,0,0,0,1,1,1,0,0,0
3,0,1,0,0,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,0


### TF-IDF (Term Frequency - Inverse Document Frequency)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over uni-, bi- and tri-grams (same n-gram range as the last BOW example).
tfidf = TfidfVectorizer(ngram_range=(1,3))
# Fit on the corpus and print the dense document-by-term weight matrix.
print(tfidf.fit_transform(df['text_data']).toarray())

[[0.23002364 0.15225029 0.29175583 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.15225029 0.         0.         0.         0.
  0.         0.         0.         0.29175583 0.29175583 0.15225029
  0.15225029 0.         0.         0.         0.29175583 0.
  0.         0.         0.         0.         0.         0.
  0.18622396 0.18622396 0.29175583 0.         0.         0.18622396
  0.18622396 0.18622396 0.29175583 0.29175583 0.29175583]
 [0.20658538 0.13673674 0.         0.         0.26202735 0.26202735
  0.         0.         0.         0.         0.         0.
  0.         0.27347349 0.26202735 0.         0.         0.26202735
  0.26202735 0.         0.         0.         0.         0.13673674
  0.13673674 0.         0.26202735 0.         0.         0.
  0.         0.26202735 0.26202735 0.26202735 0.         0.
  0.16724867 0.16724867 0.         0.26202735 0.         0.16724867
  0.16724867 0.16724867 0.         0.         0.      

In [19]:
# Learned inverse-document-frequency weight for each term, in column order.
# Terms that occur in every document (e.g. 'and', 'is', 'meal') show the lowest value, 1.
print(tfidf.idf_)

[1.51082562 1.         1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.         1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.
 1.         1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.22314355 1.22314355 1.91629073 1.91629073 1.91629073 1.22314355
 1.22314355 1.22314355 1.91629073 1.91629073 1.91629073]


In [20]:
# The n-gram behind each column index; lines up positionally with the idf_ values.
print(tfidf.get_feature_names_out())

['affordable' 'and' 'and affordable' 'and cheap' 'and is'
 'and is affordable' 'and meal' 'and meal tastes' 'cheap' 'delicious'
 'delicious and' 'delicious and cheap' 'good' 'is' 'is affordable'
 'is delicious' 'is delicious and' 'is not' 'is not tasty' 'is tasty'
 'is tasty and' 'is very' 'is very tasty' 'meal' 'meal is'
 'meal is delicious' 'meal is not' 'meal is tasty' 'meal is very'
 'meal tastes' 'meal tastes good' 'not' 'not tasty' 'not tasty and'
 'tastes' 'tastes good' 'tasty' 'tasty and' 'tasty and affordable'
 'tasty and is' 'tasty and meal' 'this' 'this meal' 'this meal is' 'very'
 'very tasty' 'very tasty and']


### n-gram: ngramise the sentence using TextBlob & NLTK

In [21]:
from nltk import ngrams

In [None]:
# Tri-grams over whitespace-separated tokens using NLTK's ngrams helper.
# The text is bound to `sentence` rather than `input` so the built-in input() is not shadowed.
sentence = 'I want to ngramise the given sentences'

for trigram in ngrams(sentence.split(), 3):
    print(trigram)

('I', 'want', 'to')
('want', 'to', 'ngramise')
('to', 'ngramise', 'the')
('ngramise', 'the', 'given')
('the', 'given', 'sentences')


In [None]:
# One-time setup (uncomment and run once if TextBlob is not installed yet):
# %pip install -U textblob
# !python -m textblob.download_corpora


In [None]:
from textblob import TextBlob

input = TextBlob("I want to ngramise the given sentences")

In [None]:
print(input.ngrams(1)) # uni-gram

[WordList(['I']), WordList(['want']), WordList(['to']), WordList(['ngramise']), WordList(['the']), WordList(['given']), WordList(['sentences'])]


In [None]:
print(input.ngrams(2)) # bi-gram

[WordList(['I', 'want']), WordList(['want', 'to']), WordList(['to', 'ngramise']), WordList(['ngramise', 'the']), WordList(['the', 'given']), WordList(['given', 'sentences'])]


In [None]:
print(input.ngrams(3)) # tri-gram

[WordList(['I', 'want', 'to']), WordList(['want', 'to', 'ngramise']), WordList(['to', 'ngramise', 'the']), WordList(['ngramise', 'the', 'given']), WordList(['the', 'given', 'sentences'])]


In [None]:
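# TextBlob used to offer translation through TextBlob.translate(); that method was deprecated and
# later removed from TextBlob, so the example below only runs on older releases.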
# chinese_lang = TextBlob("赖清德致词说：“中华民国已经在台澎金马落地生根，和中华人民共和国互不隶属；民主自由在这块土地上成长茁壮，中华人民共和国无权代表台湾；2300万台湾人民更要向全世界开枝散叶，迎向未来。”")
# chinese_lang.translate(from_lang='zh-CN', to='en')

In [None]:
# Spelling correction: correct() returns a new TextBlob with its best-guess fixes applied.
spell_check = TextBlob("I haave tw saay thatt youu havv aa goood knoledgee about englich")
spell_check.correct()

TextBlob("I have to say that you have a good knowledge about english")

#### Reference: [TextBlob documentation](https://textblob.readthedocs.io/en/dev/)