In [None]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB #Added Multinomial NB.
from sklearn.metrics import accuracy_score, classification_report #Interpretation of result.

from nltk.stem import PorterStemmer, WordNetLemmatizer #Word pre-processing
nltk.download('stopwords') #Downloaded stop words


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Upload the dataset through the google.colab file helper. In the recorded run
# this saved emails.csv, which the following cell reads with pandas.
# NOTE(review): this import is presumably only available inside Google Colab — confirm
# before running the notebook elsewhere.
from google.colab import files
uploaded = files.upload()  # result is kept in `uploaded`; not used in the visible cells

Saving emails.csv to emails.csv


In [None]:
# Load the uploaded dataset. The recorded preview shows three columns:
# "Unnamed: 0", "label" and "text".
emails = pd.read_csv("emails.csv")
# Preview the first rows once (the original cell repeated this call; only the
# last expression of a notebook cell is displayed, so the duplicate was redundant).
emails.head()

Unnamed: 0,label,text
0,1,Congratulations! You've won a free gift. Click...
1,1,Exclusive deal just for you! 50% off all produ...
2,1,Urgent: Your account has been compromised. Ver...
3,1,Get rich quick with this simple investment opp...
4,1,You've been selected for a special reward. Don...


In [None]:
stemmer = PorterStemmer()

# Build the English stop-word set once instead of on every call: the corpus
# lookup is invariant, and preprocess_text runs once per email.
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one email body into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Coerce the value to ``str`` and lowercase it.
      2. Replace every non-word character (regex ``\\W``) with a space.
      3. Split on whitespace and drop English stop words.
      4. Reduce each remaining word with the module-level Porter stemmer.

    Args:
        text: The raw value to clean; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces. The result
        is an empty string when no tokens remain.
    """
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)  # punctuation/special characters become separators
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in _STOP_WORDS
    )

# Clean every email body and store the result in a new column.
emails['processed_text'] = emails['text'].apply(preprocess_text)

# Hold out 30% of the rows for evaluation (70% for training); the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    emails['processed_text'],
    emails['label'],
    test_size=0.3,
    random_state=42,
)

# Bag-of-words counts for Multinomial NB: the vectorizer is fitted on the
# training split only and then reused, unchanged, to encode the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes model on the training data only.
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out emails.
y_pred = model.predict(X_test_vec)

# Print per-class precision, recall and F1 to judge the model's performance.
print("Classification Report:")
print(classification_report(y_test, y_pred))

#I used a stemmer to reduce words to their root form, which helps
#normalize the text so that variants of the same word are matched together and the vocabulary (dimensionality) is smaller.
#This improves efficiency in text classification, search, and sentiment analysis by treating words with the same root as equivalent.

#Class 0 (ham):
#High precision (1.00): The model is very confident when predicting ham, meaning no spam was incorrectly labeled as ham.
#Low recall (0.33): The model only correctly identified 33% of actual ham emails.
#F1-score (0.50): A balance of precision and recall, showing poor recall performance.

#Class 1 (spam):
#Low precision (0.33): Many predictions labeled as spam were actually ham.
#High recall (1.00): The model identified all actual spam emails correctly.
#F1-score (0.50): The poor precision pulls down the F1-score.

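#Overall Model Performance:
#Accuracy: 50% – The model correctly classified only half of the test emails (2 of 4).
#Macro Average F1-score: 0.50 – The unweighted mean of the per-class F1-scores, showing the overall balance between precision and recall.
#Weighted Average F1-score: 0.50 – Similar to the macro average, but each class's F1-score is weighted by its support, so it accounts for class imbalance.
#Note: the test set contains only 4 emails, so these figures are very coarse and should not be over-interpreted.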

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.33      1.00      0.50         1

    accuracy                           0.50         4
   macro avg       0.67      0.67      0.50         4
weighted avg       0.83      0.50      0.50         4

