In [24]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [7]:
# Load the dataset
data = pd.read_csv('../datasets/email_spam.csv').drop("Unnamed: 0.1",axis=1).drop("Unnamed: 0",axis=1)
data.head()
data.groupby('Label').describe()


Unnamed: 0_level_0,Body,Body,Body,Body
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5000,4927,Subject: calpine daily gas nomination\r\n >\r\...,9
1,5000,4760,Subject: \r\n,16


In [11]:
#spam if it is one and not spam if it is a zero
data['spam']=data['Label'].apply(lambda x:1 if x=='spam' else 0)
data

Unnamed: 0,Body,Label,spam
0,Subject: stock promo mover : cwtd\r\n * * * ur...,1,0
1,Subject: are you listed in major search engine...,1,0
2,"Subject: important information thu , 30 jun 20...",1,0
3,Subject: = ? utf - 8 ? q ? bask your life with...,1,0
4,"Subject: "" bidstogo "" is places to go , things...",1,0
...,...,...,...
9995,"Subject: monday 22 nd oct\r\n louise ,\r\n do ...",0,0
9996,Subject: missing bloomberg deals\r\n stephanie...,0,0
9997,Subject: eops salary survey questionnaire\r\n ...,0,0
9998,"Subject: q 3 comparison\r\n hi louise ,\r\n i ...",0,0


In [20]:
# Split the dataset into training and testing data
X = data["Body"].values
y = data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Initialize the Naive Bayes classifier
naive_bayes = MultinomialNB()

# Train the classifier
naive_bayes.fit(X_train_vectorized, y_train)



In [23]:
# Predictions on the testing set
y_pred = naive_bayes.predict(X_test_vectorized)


In [28]:
test_email = ["reward money click"]

test_email_vectorized = vectorizer.transform(test_email)

prediction = naive_bayes.predict(test_email_vectorized)

if prediction[0] == 'spam':
    print("The email is classified as spam.")
else:
    print("The email is not classified as spam.")


The email is not classified as spam.


In [26]:
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


# Calculating metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.99
Accuracy: 0.9865
Precision: 0.9910269192422732
Recall: 0.9822134387351779
F1 Score: 0.9866004962779157
