In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import matplotlib.pyplot as plt
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
display(df.head())

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df['v2'] = df['v2'].str.lower()
df.head()

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [None]:
# Define a function to clean the text by removing punctuation, digits, and stopwords
def clean_text(text):
    text = ''.join([char for char in text if char not in string.punctuation])  #removing  punctuations
    text = ''.join([char for char in text if not char.isdigit()])  #removing digits
    stop_words = set(stopwords.words('english'))  #removing stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Apply the clean_text function to the 'v2' column
df['v2'] = df['v2'].apply(clean_text)
display(df.head())

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [None]:
# Initialize the PorterStemmer for stemming
stemmer = PorterStemmer()  #stemming
# Apply stemming to the 'v2' column
df['v2'] = df['v2'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
display(df.head())

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize TF-IDF Vectorizer and convert text to numeric form
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['v2'])
y = df['v1']
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 45170 stored elements and shape (5572, 7111)>
  Coords	Values
  (0, 2346)	0.1407985047780855
  (0, 3118)	0.351010008864237
  (0, 4597)	0.23967705518047827
  (0, 1275)	0.2718330894367227
  (0, 402)	0.2669147226548332
  (0, 786)	0.2965346739155272
  (0, 2436)	0.19536728958722402
  (0, 6847)	0.2385379711411889
  (0, 3254)	0.2877661035163326
  (0, 784)	0.33507699478462705
  (0, 1057)	0.2965346739155272
  (0, 2399)	0.16472605029470383
  (0, 210)	0.351010008864237
  (0, 6640)	0.19499481620328865
  (1, 4218)	0.2827396376113674
  (1, 3286)	0.42078899608869724
  (1, 3084)	0.47731294876998304
  (1, 6757)	0.44480400570972006
  (1, 4248)	0.5633086751818669
  (2, 2166)	0.1234436032789319
  (2, 1823)	0.38445868660308324
  (2, 6806)	0.2034958059952195
  (2, 1153)	0.21006000796381494
  (2, 6770)	0.15537688175582845
  (2, 1942)	0.501681110754003
  :	:
  (5568, 2346)	0.295966624030453
  (5568, 2691)	0.37536575942320294
  (5568, 2155)	0.577107

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Initialize and train the Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [None]:
# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%\n")

Accuracy: 95.93%



In [None]:
# Print the classification report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1453
        spam       1.00      0.69      0.82       219

    accuracy                           0.96      1672
   macro avg       0.98      0.84      0.90      1672
weighted avg       0.96      0.96      0.96      1672



The Multinomial Naive Bayes model achieved a high accuracy of 95.93%, indicating that it correctly classified a large percentage of the emails. The precision for spam was 100.00%, which means that when the model predicted an email as spam, it was always correct. However, the recall for spam was lower at 68.95%, suggesting that the model missed approximately 31% of the actual spam emails. The F1-score of 81.62% provides a combined measure of precision and recall, indicating a reasonably good balance.

In this context, achieving 100% precision for spam is highly desirable, as it minimizes the inconvenience of legitimate emails being incorrectly filtered as spam. The lower recall suggests there is room for improvement in identifying all spam emails. Depending on the specific application, a higher recall might be prioritized, even if it means a slight decrease in precision. Further
improvements could involve exploring different text preprocessing techniques, using other feature extraction methods (e.g., word embeddings), or trying different classification algorithms.

In [None]:
def clean_text(text):
    text = ''.join([char for char in text if char not in string.punctuation])  #removing  punctuations
    text = ''.join([char for char in text if not char.isdigit()])  #removing digits
    stop_words = set(stopwords.words('english'))  #removing stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

stemmer = PorterStemmer()  #stemming

email = 'Your Weekly Writing Update It looks like you might be signed out of your Grammarly account. To keep tracking your achievements, try signing back in when you have a chance'

# Preprocess the email
email = email.lower()
# Apply the same cleaning and stemming as done for the training data
email = clean_text(email)
email = ' '.join([stemmer.stem(word) for word in email.split()])

# Transform the email using the fitted TF-IDF vectorizer
email_transformed = tfidf_vectorizer.transform([email])

# Predict the class of the email using the trained model
prediction = model.predict(email_transformed)

print(f"The email is predicted as: {prediction[0]}")

The email is predicted as: ham


USE K-Fold method to recalculate the accuracy

In [None]:
from sklearn.model_selection import cross_val_score

# Apply K-Fold Cross Validation with k folds
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Print the accuracy scores for each fold
print("Accuracy scores for each fold:", scores)

# Calculate and print the mean and standard deviation of the accuracy scores
print("Mean accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())

Accuracy scores for each fold: [0.96950673 0.96053812 0.95691203 0.96229803 0.96409336]
Mean accuracy: 0.9626696508360773
Standard deviation of accuracy: 0.0041618337143877705
