In [None]:
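# Set up your environment: create and activate a virtual environment.
# Run these commands in a terminal, not in a notebook cell.
# python -m venv myenv

# On Windows:
# myenv\Scripts\activate

# On macOS/Linux:
# source myenv/bin/activate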


In [1]:
# Install the required libraries (run in a terminal with the virtual environment active):
# pip install pandas scikit-learn


In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [5]:
# Function to clean text data
def clean_text(text):
    """Normalize a raw message into lowercase, letters-only text.

    Steps, in order: strip HTML-style tags, drop every character that is not
    an ASCII letter or whitespace, lowercase, then collapse whitespace runs
    into a single space and trim both ends.

    Args:
        text: The raw message. ``None`` and float NaN (a missing cell in a
            pandas object column) are treated as empty text; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing cells reach .apply() as None or NaN, and re.sub() raises
    # TypeError on them. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [6]:
# 1. Load the dataset from messages.csv (path is relative to the working directory)
df = pd.read_csv('messages.csv')

# Display dataset preview and information
print("Dataset Preview:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()


Dataset Preview:
                                             subject  \
0            job posting - apple-iss research center   
1                                                NaN   
2  query : letter frequencies for text identifica...   
3                                               risk   
4                           request book information   

                                             message  label  
0  content - length : 3386 apple-iss research cen...      0  
1  lang classification grimes , joseph e . and ba...      0  
2  i am posting this inquiry for sergei atamas ( ...      0  
3  a colleague and i are researching the differin...      0  
4  earlier this morning i was on the phone with a...      0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2831 non-null   object
 1   message  2893 non-null   object
 2   

In [7]:
# 2. Make sure a text column exists, then build the cleaned-text feature.
if 'text' not in df.columns and 'message' not in df.columns:
    raise ValueError("Expected either a 'text' or 'message' column in the dataset.")

# 'text' takes precedence; 'message' is only used when 'text' is absent.
df['clean_text'] = df['text' if 'text' in df.columns else 'message'].apply(clean_text)

In [8]:
# 3. The target column is mandatory; stop early when it is missing.
if not ('label' in df.columns):
    raise ValueError("Expected a 'label' column in the dataset.")

# 4. Features are the cleaned text, the target is the label column.
X, y = df['clean_text'], df['label']

# 5. Hold out 20% of the rows for testing; the fixed seed keeps the split
#    reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# 6. Convert the cleaned text into TF-IDF features (vocabulary capped at 5000 terms).
#    The vectorizer is fitted on the training split only; the test split is
#    merely transformed with what was learned there.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# 7. Fit a multinomial Naive Bayes classifier on the training features.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_vect, y=y_train)

In [10]:
# 8. Predict on the held-out test features and score the predictions.
y_pred = nb_model.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
# average='binary' scores the positive class only.
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')

print("Spam Email Detection Evaluation:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_label, metric_value)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Spam Email Detection Evaluation:
Accuracy: 0.9930915371329879
Precision: 1.0
Recall: 0.9652173913043478

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       464
           1       1.00      0.97      0.98       115

    accuracy                           0.99       579
   macro avg       1.00      0.98      0.99       579
weighted avg       0.99      0.99      0.99       579

