In [1]:
######################### Bag of words ######################### 

In [2]:
# Importing the libraries

import pandas as pd
import numpy as np

In [4]:
# Reading the spam file and saving it to a dataframe

df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Checking the number of all the categories of emails in the df

df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
# creating another column spam based on the category column

df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Checking the data volume

df.shape

(5572, 3)

In [9]:
# Splitting the dataset into train and test for training

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.25)

In [10]:
# Taking a peek in X_train and y_train

X_train[:5]

2026    Yes obviously, but you are the eggs-pert and t...
2122    In xam hall boy asked girl Tell me the startin...
4485         Shopping? Eh ger i toking abt syd leh...Haha
3196    Great. P diddy is my neighbor and comes for to...
4408    Awesome, plan to get here any time after like ...
Name: Message, dtype: object

In [11]:
y_train[:5]

2026    0
2122    0
4485    0
3196    0
4408    0
Name: spam, dtype: int64

In [12]:
# Creating bag of words

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

Xtrain_cv = cv.fit_transform(X_train.values)
Xtrain_cv

<4179x7506 sparse matrix of type '<class 'numpy.int64'>'
	with 55372 stored elements in Compressed Sparse Row format>

In [14]:
Xtrain_cv.toarray()[:2][0]

cv.get_feature_names_out()[1771]

'clocks'

In [15]:
cv.vocabulary_

{'yes': 7457,
 'obviously': 4754,
 'but': 1510,
 'you': 7473,
 'are': 1007,
 'the': 6615,
 'eggs': 2461,
 'pert': 5024,
 'and': 925,
 'potato': 5191,
 'head': 3266,
 'speak': 6169,
 'soon': 6135,
 'in': 3544,
 'xam': 7415,
 'hall': 3203,
 'boy': 1411,
 'asked': 1051,
 'girl': 3047,
 'tell': 6556,
 'me': 4283,
 'starting': 6254,
 'term': 6573,
 'for': 2843,
 'dis': 2251,
 'answer': 948,
 'can': 1565,
 'den': 2156,
 'manage': 4213,
 'on': 4801,
 'my': 4538,
 'own': 4900,
 'after': 810,
 'lot': 4094,
 'of': 4763,
 'hesitation': 3318,
 'lookin': 4079,
 'around': 1028,
 'silently': 5985,
 'she': 5900,
 'said': 5707,
 'intha': 3614,
 'ponnungale': 5157,
 'ipaditan': 3633,
 'shopping': 5934,
 'eh': 2463,
 'ger': 3023,
 'toking': 6746,
 'abt': 734,
 'syd': 6465,
 'leh': 3953,
 'haha': 3195,
 'great': 3135,
 'diddy': 2214,
 'is': 3642,
 'neighbor': 4607,
 'comes': 1829,
 'toothpaste': 6778,
 'every': 2581,
 'morning': 4462,
 'awesome': 1121,
 'plan': 5089,
 'to': 6732,
 'get': 3025,
 'here': 33

In [17]:
Xtrain_np = Xtrain_cv.toarray()

In [31]:
# checking indexes where value is not 0

np.where(Xtrain_np[0]!=0)

(array([ 925, 1007, 1510, 2461, 3266, 4754, 5024, 5191, 6135, 6169, 6615,
        7457, 7473]),)

In [36]:
# Checking the content of the first email in first 5 emails in X_train

X_train[:5][2026]

'Yes obviously, but you are the eggs-pert and the potato head… Speak soon!'

In [None]:
Xtrain_np[0][1510]

In [23]:
# Training the naive bayes model

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(Xtrain_cv, y_train)

In [24]:
Xtest_cv = cv.transform(X_test)

In [25]:
# Validation

from sklearn.metrics import classification_report

y_pred = model.predict(Xtest_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1201
           1       0.97      0.93      0.95       192

    accuracy                           0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [37]:
# Checking the spam email, where 1 represents that it is a spam!

test_emails = [
    'Hey, can we watch Oppenheimer tomorrow?',
    '30% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

count = cv.transform(test_emails)
model.predict(count)

array([0, 1])