In [1]:
"""
Created on Fri Sep  6 21:13:18 2024

@author: cyphereman
github: https://github.com/cyphereman/

"""

'\nCreated on Fri Sep  6 21:13:18 2024\n\n@author: cyphereman\ngithub: https://github.com/cyphereman/\n\n'

In [3]:
"""Step 1: Collect Data Set"""

import pandas as pd

# Dataset: Kaggle "Email Spam Classification Dataset"
# https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset
file_path = 'data/combined_data.csv'
df = pd.read_csv(file_path)

# Preview the first five rows to check that the file loaded as expected
df.head(n=5)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [5]:
"""Step2: Preprocessing """

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# Make sure the NLTK corpora used below are installed. On a fresh environment
# they are absent and the corpus lookups raise LookupError, so fetch them on
# demand instead of relying on a manual (previously commented-out) download.
for corpus_name in ('stopwords', 'wordnet'):
    try:
        nltk.data.find(f'corpora/{corpus_name}')
    except LookupError:
        nltk.download(corpus_name)

# Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))  # a set keeps membership tests cheap
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function for preprocessing using stopwords, stemming, and lemmatization
def preprocess_text_advanced(text):
    """Normalize one document into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, strip every character listed in
    ``string.punctuation``, split on whitespace, drop tokens present in the
    module-level ``stop_words`` set, then lemmatize and stem each survivor
    with the module-level ``lemmatizer`` and ``stemmer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Translation table that deletes all ASCII punctuation characters
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    # Filter stopwords, then lemmatize first and stem the lemma
    normalized_tokens = (
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in cleaned.split()
        if token not in stop_words
    )
    return ' '.join(normalized_tokens)

# Run the preprocessing function over the raw 'text' column and store the
# result in a new 'processed_text' column. This is done exactly once: the
# function runs in Python for every row, so repeating the pass only wastes
# time and produces the same column.
df['processed_text'] = df['text'].apply(preprocess_text_advanced)

# Display the first few rows of the preprocessed dataset
print(df.head())

   label                                               text  \
0      1  ounce feather bowl hummingbird opec moment ala...   
1      1  wulvob get your medircations online qnb ikud v...   
2      0   computer connection from cnn com wednesday es...   
3      1  university degree obtain a prosperous future m...   
4      0  thanks for all your answers guys i know i shou...   

                                      processed_text  
0  ounc feather bowl hummingbird opec moment alab...  
1  wulvob get medirc onlin qnb ikud viagra escape...  
2  comput connect cnn com wednesday escapenumb ma...  
3  univers degre obtain prosper futur money earn ...  
4  thank answer guy know check rsync manual would...  


In [6]:
"""Step 3:Feature Extraction"""

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Target labels, shared by both feature representations
y = df['label']

# --- Bag of Words (CountVectorizer) ---
# Learn the vocabulary from the processed text and build the count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
print(f"Feature matrix shape: {X.shape}")

# --- TF-IDF (TfidfVectorizer) ---
# Same corpus, weighted by TF-IDF instead of raw counts
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])
print(f"TF-IDF feature matrix shape: {X_tfidf.shape}")


Feature matrix shape: (83448, 268077)
TF-IDF feature matrix shape: (83448, 268077)


In [10]:
"""Step 4: Feature Selection"""
#Using Chi-Squared Test for Feature Selection

from sklearn.feature_selection import SelectKBest, chi2

# Choose the number of top features you want to select
k = 1000  # You can adjust this number based on your needs

# Chi-squared selection on the CountVectorizer features. `selector` stays
# fitted on the count matrix, so its support mask keeps describing the columns
# of X_new and it can be used to transform new count vectors consistently.
selector = SelectKBest(chi2, k=k)
X_new = selector.fit_transform(X, y)

# The TF-IDF features get their own selector. Refitting the same instance
# would silently overwrite the count-based fit above.
tfidf_selector = SelectKBest(chi2, k=k)
X_tfidf_new = tfidf_selector.fit_transform(X_tfidf, y)

# NOTE(review): both selectors are fitted on the full dataset, i.e. before the
# train/test split, so the held-out labels influence which features are kept.
# Consider fitting the selectors on the training portion only.

# Display the new shape after feature selection
print(f"New feature matrix shape after selection: {X_new.shape}")

New feature matrix shape after selection: (83448, 1000)


In [11]:
"""Step 5: Splitting the Data into Training(70%) and Testing(30%) Sets"""
# train_test_split partitions the data into training and testing sets:
#   test_size=0.3   -> 30% of the rows are held out for testing
#   random_state=42 -> the shuffle is reproducible

from sklearn.model_selection import train_test_split

# Split the CountVectorizer features, the TF-IDF features and the labels in a
# single call so all three are partitioned on the same rows.
(
    X_train, X_test,
    X_tfidf_train, X_tfidf_test,
    y_train, y_test,
) = train_test_split(X_new, X_tfidf_new, y, test_size=0.3, random_state=42)

In [12]:
"""Step 6: Train the models (Multinomial Naive Byes and Decision Tree)"""

# Estimators used in this step
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Both models are fitted on the selected CountVectorizer features.
# Multinomial Naive Bayes with default settings:
mnb_model = MultinomialNB().fit(X_train, y_train)

# Decision Tree with a fixed seed so the fitted tree is reproducible:
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Cell output: confirmation that training finished
"Multinomial Naive Bayes and Decision Tree models have been successfully trained."


'Multinomial Naive Bayes and Decision Tree models have been successfully trained.'

In [13]:
"""Step 7: Make Predictions"""

# Predict labels for the held-out test set with each fitted model, in the
# order: Multinomial Naive Bayes first, Decision Tree second.
mnb_predictions, dt_predictions = (
    fitted_model.predict(X_test) for fitted_model in (mnb_model, dt_model)
)

# Cell output: confirmation that predictions were produced
"Predictions made using both Multinomial Naive Bayes and Decision Tree Classifier models."

'Predictions made using both Multinomial Naive Bayes and Decision Tree Classifier models.'

In [14]:
"""Step 8: Model Evaluation"""

# Metric functions used for evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Each model is scored with the same three metrics, computed in this order:
# accuracy, confusion matrix, text classification report.

# Multinomial Naive Bayes
mnb_accuracy, mnb_confusion, mnb_classification_report = (
    metric(y_test, mnb_predictions)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Decision Tree
dt_accuracy, dt_confusion, dt_classification_report = (
    metric(y_test, dt_predictions)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Collect everything in one nested dict keyed by model name
evaluation_results = {}
evaluation_results["Multinomial Naive Bayes"] = {
    "Accuracy": mnb_accuracy,
    "Confusion Matrix": mnb_confusion,
    "Classification Report": mnb_classification_report,
}
evaluation_results["Decision Tree Classifier"] = {
    "Accuracy": dt_accuracy,
    "Confusion Matrix": dt_confusion,
    "Classification Report": dt_classification_report,
}

# Cell output: the collected results
evaluation_results

{'Multinomial Naive Bayes': {'Accuracy': 0.920511284202117,
  'Confusion Matrix': array([[11479,   339],
         [ 1651, 11566]], dtype=int64),
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.87      0.97      0.92     11818\n           1       0.97      0.88      0.92     13217\n\n    accuracy                           0.92     25035\n   macro avg       0.92      0.92      0.92     25035\nweighted avg       0.93      0.92      0.92     25035\n'},
 'Decision Tree Classifier': {'Accuracy': 0.9516676652686239,
  'Confusion Matrix': array([[11239,   579],
         [  631, 12586]], dtype=int64),
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.95      0.95      0.95     11818\n           1       0.96      0.95      0.95     13217\n\n    accuracy                           0.95     25035\n   macro avg       0.95      0.95      0.95     25035\nweighted avg       0.95      0.95  

In [15]:
# Score the Multinomial Naive Bayes predictions against the true test labels
mnb_accuracy = accuracy_score(y_true=y_test, y_pred=mnb_predictions)
mnb_confusion = confusion_matrix(y_true=y_test, y_pred=mnb_predictions)
mnb_classification_report = classification_report(
    y_true=y_test, y_pred=mnb_predictions
)

# Print the metrics in a readable, multi-line form
print("Multinomial Naive Bayes - Accuracy:", mnb_accuracy)
print("Confusion Matrix:\n", mnb_confusion)
print("Classification Report:\n", mnb_classification_report)

Multinomial Naive Bayes - Accuracy: 0.920511284202117
Confusion Matrix:
 [[11479   339]
 [ 1651 11566]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92     11818
           1       0.97      0.88      0.92     13217

    accuracy                           0.92     25035
   macro avg       0.92      0.92      0.92     25035
weighted avg       0.93      0.92      0.92     25035



In [16]:
# Score the Decision Tree predictions against the true test labels
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_predictions)
dt_confusion = confusion_matrix(y_true=y_test, y_pred=dt_predictions)
dt_classification_report = classification_report(
    y_true=y_test, y_pred=dt_predictions
)

# Print the metrics in a readable, multi-line form
print("Decision Tree Classifier - Accuracy:", dt_accuracy)
print("Confusion Matrix:\n", dt_confusion)
print("Classification Report:\n", dt_classification_report)


Decision Tree Classifier - Accuracy: 0.9516676652686239
Confusion Matrix:
 [[11239   579]
 [  631 12586]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95     11818
           1       0.96      0.95      0.95     13217

    accuracy                           0.95     25035
   macro avg       0.95      0.95      0.95     25035
weighted avg       0.95      0.95      0.95     25035

