# NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [11]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [12]:
df.shape

(5572, 3)

In [13]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [38]:
X_train.head()

391                             Huh so late... Fr dinner?
4702                               I liked the new mobile
1089    You are awarded a SiPix Digital Camera! call 0...
4947             I'm already back home so no probably not
1665                                        Ü v ma fan...
Name: Message, dtype: object

In [17]:
y_train

391     0
4702    0
1089    1
4947    0
1665    0
       ..
1112    0
3001    0
3707    0
3280    0
5090    0
Name: spam, Length: 4457, dtype: int64

In [23]:
X_train.shape

(4457,)

In [24]:
X_test.shape

(1115,)

In [29]:
y_train[:4]

391     0
4702    0
1089    1
4947    0
Name: spam, dtype: int64

In [35]:
X_train.values

array(['Huh so late... Fr dinner?', 'I liked the new mobile',
       'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p p£3.99',
       ...,
       'Reading gud habit.. Nan bari hudgi yorge pataistha ertini kano:-)',
       "Solve d Case : A Man Was Found Murdered On  &lt;DECIMAL&gt; . &lt;#&gt;  AfterNoon. 1,His wife called Police. 2,Police questioned everyone. 3,Wife: Sir,I was sleeping, when the murder took place. 4.Cook: I was cooking. 5.Gardener: I was picking vegetables. 6.House-Maid: I went 2 d post office. 7.Children: We went 2 play. 8.Neighbour: We went 2 a marriage. Police arrested d murderer Immediately. Who's It? Reply With Reason, If U r Brilliant.",
       "St andre, virgil's cream"], dtype=object)

# Create bag of words representation using CountVectorizer

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7781 sparse matrix of type '<class 'numpy.int64'>'
	with 59115 stored elements in Compressed Sparse Row format>

In [42]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
X_train_cv.shape

(4457, 7781)

In [54]:
pd.DataFrame(data=X_train_cv.toarray(),columns=v.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zeros,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
v.get_feature_names_out()[3769]

'is'

In [61]:
v.vocabulary_

{'huh': 3562,
 'so': 6328,
 'late': 4051,
 'fr': 2980,
 'dinner': 2323,
 'liked': 4142,
 'the': 6863,
 'new': 4797,
 'mobile': 4585,
 'you': 7743,
 'are': 1045,
 'awarded': 1161,
 'sipix': 6233,
 'digital': 2312,
 'camera': 1624,
 'call': 1604,
 '09061221061': 179,
 'from': 3028,
 'landline': 4029,
 'delivery': 2230,
 'within': 7595,
 '28days': 368,
 'cs': 2088,
 'box177': 1453,
 'm221bp': 4314,
 '2yr': 415,
 'warranty': 7432,
 '150ppm': 295,
 '16': 303,
 '99': 729,
 'already': 918,
 'back': 1183,
 'home': 3492,
 'no': 4837,
 'probably': 5472,
 'not': 4873,
 'ma': 4324,
 'fan': 2770,
 'de': 2181,
 'am': 929,
 'seeing': 6038,
 'in': 3661,
 'online': 4993,
 'shop': 6163,
 'that': 6860,
 'asked': 1091,
 'we': 7460,
 'spend': 6422,
 'our': 5054,
 'days': 2178,
 'waiting': 7402,
 'for': 2947,
 'ideal': 3611,
 'path': 5157,
 'to': 6978,
 'appear': 1018,
 'front': 3031,
 'of': 4938,
 'us': 7263,
 'but': 1573,
 'what': 7521,
 'forget': 2954,
 'is': 3769,
 'paths': 5159,
 'made': 4340,
 'by': 1

In [65]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [79]:
np.where(X_train_np[0]!=0)

(array([2323, 2980, 3562, 4051, 6328], dtype=int64),)

In [80]:
v.get_feature_names_out()[2323]

'dinner'

In [88]:
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [83]:
ts=pd.DataFrame(data=X_train_np[0])

In [87]:
ts[ts[0]!=0]

Unnamed: 0,0
2323,1
2980,1
3562,1
4051,1
6328,1


In [93]:
X_train.head()

391                             Huh so late... Fr dinner?
4702                               I liked the new mobile
1089    You are awarded a SiPix Digital Camera! call 0...
4947             I'm already back home so no probably not
1665                                        Ü v ma fan...
Name: Message, dtype: object

# Train the naive bayes model

In [94]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [95]:
X_test_cv = v.transform(X_test)

# Evaluate Performance

In [98]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       946
           1       0.98      0.95      0.97       169

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [99]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [100]:
emails_count

<2x7781 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [101]:
pd.DataFrame(data=emails_count.toarray(),columns=v.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zeros,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train the model using sklearn pipeline and reduce number of lines of code

By use below pipeline, we can reduce the code 

In [107]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [108]:
clf.fit(X_train, y_train)

In [109]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       946
           1       0.98      0.95      0.97       169

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



# Here precision / recall is high because of unbalnced data

In [110]:
np.where(X_train_np[0]!=0)

(array([2323, 2980, 3562, 4051, 6328], dtype=int64),)