<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2, stratify=df.spam)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [12]:
y_train.value_counts()

spam
0    3859
1     598
Name: count, dtype: int64

In [13]:
type(X_train)

pandas.core.series.Series

In [14]:
X_train[:4]

1196    You have 1 new voicemail. Please call 08719181503
1396    Thats cool! I am a gentleman and will treat yo...
77      I like you peoples very much:) but am very shy...
3458    Friendship poem: Dear O Dear U R Not Near But ...
Name: Message, dtype: object

In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
y_train[:4]

1196    1
1396    0
77      0
3458    0
Name: spam, dtype: int64

In [17]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7759 sparse matrix of type '<class 'numpy.int64'>'
	with 59335 stored elements in Compressed Sparse Row format>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
X_train_cv.shape

(4457, 7759)

In [21]:
v.get_feature_names_out()[1771]

'cheered'

In [22]:
v.vocabulary_

{'you': 7719,
 'have': 3364,
 'new': 4771,
 'voicemail': 7336,
 'please': 5267,
 'call': 1619,
 '08719181503': 153,
 'thats': 6831,
 'cool': 2018,
 'am': 943,
 'gentleman': 3116,
 'and': 967,
 'will': 7535,
 'treat': 7050,
 'with': 7565,
 'dignity': 2332,
 'respect': 5759,
 'like': 4116,
 'peoples': 5160,
 'very': 7290,
 'much': 4645,
 'but': 1587,
 'shy': 6166,
 'pa': 5057,
 'friendship': 3012,
 'poem': 5302,
 'dear': 2203,
 'not': 4843,
 'near': 4730,
 'can': 1642,
 'hear': 3386,
 'dont': 2423,
 'get': 3125,
 'fear': 2803,
 'live': 4156,
 'cheer': 1770,
 'no': 4808,
 'more': 4593,
 'tear': 6758,
 'always': 941,
 'my': 4678,
 'gud': 3270,
 'ni8': 4779,
 'good': 3191,
 'afternoon': 864,
 'boytoy': 1485,
 'how': 3529,
 'goes': 3175,
 'that': 6828,
 'walking': 7385,
 'here': 3419,
 'there': 6849,
 'day': 2190,
 'did': 2312,
 'police': 5308,
 'abstract': 777,
 'are': 1054,
 'still': 6510,
 'out': 5023,
 'about': 771,
 'wake': 7377,
 'miss': 4515,
 'babe': 1197,
 'should': 6141,
 'picked':

In [23]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
np.where(X_train_np[0]!=0)

(array([ 153, 1619, 3364, 4771, 5267, 7336, 7719]),)

In [27]:
X_train[:4][1196]

'You have 1 new voicemail. Please call 08719181503'

In [31]:
X_train_np[0][153]

np.int64(1)

<h3>Train the naive bayes model</h3>

In [32]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [33]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.90      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [35]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [36]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [38]:
clf.fit(X_train, y_train)

In [39]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.90      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

