In [12]:
# Step 1: Import necessary libraries
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [13]:
# Step 2: Load the dataset
# The CSV location is resolved at run time rather than being tied to one
# user's machine: set the EMAILS_CSV environment variable to the file's path,
# or leave it unset to fall back to ~/Downloads/emails.csv for whoever is
# running the notebook.
default_csv_path = Path.home() / 'Downloads' / 'emails.csv'
csv_file_path = os.environ.get('EMAILS_CSV', str(default_csv_path))
df = pd.read_csv(csv_file_path)

In [14]:
# Step 3: Explore the dataset
# Each entry pairs a heading with the summary printed beneath it: a preview of
# the first rows, the column index, and the per-column count of missing values.
overview = (
    ("First few rows of the dataset:", df.head()),
    ("\nColumn names:", df.columns),
    ("\nMissing values in each column:", df.isnull().sum()),
)
for heading, summary in overview:
    print(heading)
    print(summary)


First few rows of the dataset:
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...

Column names:
Index(['file', 'message'], dtype='object')

Missing values in each column:
file       0
message    0
dtype: int64


In [15]:
# Step 4: Preprocess the data
# The dataset has only 'file' and 'message' columns and no spam / not-spam
# label, so a target column has to be created before a classifier can be trained.
# Real labels (0 for not spam, 1 for spam, or equivalent strings) must come from
# manual annotation, a separate label file, or some other trusted source.
#
# What follows is only a placeholder: dummy labels are generated so the rest of
# the pipeline can be run end to end. Replace them with actual labels for your
# dataset before drawing any conclusions from the model.
# NOTE(review): this import would conventionally sit with the other imports in
# the first cell.
import numpy as np

In [16]:
# Generate dummy labels for demonstration (replace with actual logic)
# WARNING: these labels are drawn uniformly at random, so they carry no
# information about the message text. A classifier trained on them can only
# perform at chance level (about 50% accuracy on two balanced classes). Swap
# this cell for real spam / not-spam annotations before trusting the model.
#
# A dedicated RandomState(42) is used instead of np.random.seed(42) so the
# process-wide NumPy RNG is left untouched; it produces the same label
# sequence as seeding the global generator with 42 would.
label_rng = np.random.RandomState(42)  # For reproducibility
df['label'] = label_rng.choice(['not spam', 'spam'], size=len(df))

In [17]:
# Pick out the model inputs and outputs from the frame:
#   X - the email content used as the feature
#   y - the spam / not spam target label
X, y = df['message'], df['label']

In [18]:
# Step 5: Split the data
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



In [19]:
# Step 6: Vectorize the email content
# The vectorizer is fitted on the training messages only and then reused,
# already fitted, to transform the test messages.
vectorizer = TfidfVectorizer()
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),       # evaluated second: transform only
)

In [20]:
# Step 7: Train the model
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix and
# its labels. The fit call is left as the cell's final expression so the
# fitted estimator is still shown as the cell output.
model = MultinomialNB()
model.fit(X=X_train_vectorized, y=y_train)

In [21]:
# Step 8: Evaluate the model
# Predict on the held-out test matrix, then print the per-class
# precision/recall/F1 report followed by the confusion matrix.
y_pred = model.predict(X_test_vectorized)
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(
    "\nClassification Report:",
    report_text,
    "\nConfusion Matrix:",
    matrix,
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

    not spam       0.50      0.49      0.49     51798
        spam       0.50      0.51      0.51     51683

    accuracy                           0.50    103481
   macro avg       0.50      0.50      0.50    103481
weighted avg       0.50      0.50      0.50    103481


Confusion Matrix:
[[25217 26581]
 [25079 26604]]


In [22]:
# Step 9: Save the model and vectorizer
# Each fitted object is written to its own pickle file in the current working
# directory; the mapping keeps the classifier first and the vectorizer second.
artifacts = {
    'email_classifier.pkl': model,
    'vectorizer.pkl': vectorizer,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)
print("\nModel and vectorizer saved successfully.")



Model and vectorizer saved successfully.
