In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv('spam_ham_dataset.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [5]:
data.shape

(5171, 4)

In [6]:
data.drop(columns=["Unnamed: 0","label"],axis=1)

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["label_num"], test_size=0.2, random_state=42
)

In [9]:
#TEXT → COUNT VECTORIZATION
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
X_train_vec.shape

(4136, 44934)

In [11]:
y_train.shape

(4136,)

In [12]:
X_test_vec.shape

(1035, 44934)

In [13]:
y_test.shape

(1035,)

In [14]:
#TRAIN MULTINOMIAL NAIVE BAYES
multinb_vectorizer = MultinomialNB()
multinb_vectorizer.fit(X_train_vec, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [15]:
#PREDICT
y_pred = multinb_vectorizer.predict(X_test_vec)

In [16]:
#EVALUATION
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9710144927536232

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       742
           1       0.95      0.95      0.95       293

    accuracy                           0.97      1035
   macro avg       0.97      0.96      0.96      1035
weighted avg       0.97      0.97      0.97      1035


Confusion Matrix:
 [[728  14]
 [ 16 277]]


In [17]:
#TEST ON CUSTOM MESSAGE (for Spam)
sample = ["Congratulations! You won a free lottery worth 50000"]
sample_vec = vectorizer.transform(sample)
print("\nPrediction for sample message:", multinb_vectorizer.predict(sample_vec))


Prediction for sample message: [1]


In [18]:
#TEST ON CUSTOM MESSAGE (for not spam)
sample = ["This is a meeting for discussing Machine Learning Chapters"]
sample_vec = vectorizer.transform(sample)
print("\nPrediction for sample message:", multinb_vectorizer.predict(sample_vec))


Prediction for sample message: [0]


In [19]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words="english")

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [20]:
#TRAIN MULTINOMIAL NAIVE BAYES
multinb_tfidf = MultinomialNB()
multinb_tfidf.fit(X_train_tfidf, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [21]:
#PREDICT
y_pred = multinb_tfidf.predict(X_test_tfidf)

In [22]:
#EVALUATION
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.923671497584541

Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       742
           1       1.00      0.73      0.84       293

    accuracy                           0.92      1035
   macro avg       0.95      0.87      0.90      1035
weighted avg       0.93      0.92      0.92      1035


Confusion Matrix:
 [[741   1]
 [ 78 215]]


In [23]:
#TEST ON CUSTOM MESSAGE (for Spam)
sample = ["Congratulations! You won a free lottery worth 50000"]
sample_vec = vectorizer.transform(sample)
print("\nPrediction for sample message:", multinb_tfidf.predict(sample_vec))


Prediction for sample message: [1]


In [24]:
#TEST ON CUSTOM MESSAGE (for not spam)
sample = ["This is a meeting for discussing Machine Learning Chapters"]
sample_vec = vectorizer.transform(sample)
print("\nPrediction for sample message:", multinb_tfidf.predict(sample_vec))


Prediction for sample message: [0]
