In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay



In [27]:
data = pd.read_csv('../datasets/spam_or_not_spam.csv')
data

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [28]:
#check for NA values
data.isna().sum()

email    1
label    0
dtype: int64

In [29]:
# remove row with NA in email:
data.dropna(inplace=True)

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['email'], data['label'], test_size=0.4, random_state=46)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1799,)
(1799,)
(1200,)
(1200,)


In [31]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [32]:
len(vectorizer.vocabulary_)

27058

In [33]:
# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict
y_pred = model.predict(X_test_tfidf)

In [34]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [35]:
print('accuracy:', accuracy)

accuracy: 0.8916666666666667


In [36]:
# # Plot the confusion matrix
# ConfusionMatrixDisplay.from_estimator(model, X_test_tfidf, y_test, cmap=plt.cm.Blues)
# plt.title("Confusion Matrix for Spam/Ham Classifier")
# plt.show()

In [37]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)


[[1001    0]
 [ 130   69]]
