In [1]:
import kagglehub

# Fetch the most recent version of the dataset through kagglehub.
# `path` is the local directory the dataset files were downloaded to.
DATASET_HANDLE = "balaka18/email-spam-classification-dataset-csv"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/email-spam-classification-dataset-csv


In [2]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# --- Configuration ---------------------------------------------------------
# Everything a reader might want to tune lives here. The values are the same
# ones the original run used, so the recorded metrics remain reproducible.
DATA_FILE = "emails.csv"
ID_COLUMN = "Email No."
TARGET_COLUMN = "Prediction"
TEST_SIZE = 0.2
SEED = 42

# --- Load dataset ----------------------------------------------------------
# Resolve the CSV relative to the directory returned by
# kagglehub.dataset_download() (`path`, set in the download cell) instead of a
# hard-coded /kaggle/input/... location, so the notebook is not tied to the
# Kaggle filesystem layout.
df = pd.read_csv(Path(path) / DATA_FILE)

# Drop the identifier column when present; errors="ignore" makes this a no-op
# if the column is missing, which also keeps the cell safe to re-run.
df = df.drop(columns=[ID_COLUMN], errors="ignore")

# Fail early with a clear message if the label column is missing.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        "Expected label column {!r} not found in {}".format(TARGET_COLUMN, DATA_FILE)
    )

# --- Separate features and labels ------------------------------------------
X = df.drop(columns=[TARGET_COLUMN])  # Features: word counts
y = df[TARGET_COLUMN]                 # Labels: 0 = ham, 1 = spam

# --- Train / test split ----------------------------------------------------
# A fixed random_state makes the split (and therefore the metrics below)
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

# --- Train Naive Bayes model -----------------------------------------------
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# --- Predict on the held-out set -------------------------------------------
y_pred = nb_model.predict(X_test)

# --- Evaluation ------------------------------------------------------------
conf_matrix = confusion_matrix(y_test, y_pred)
print("=== Confusion Matrix ===")
print(conf_matrix)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# f1_score is called with its defaults, exactly as in the original cell.
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# Static commentary printed after the metrics. The heading emoji is restored
# so the source matches the recorded cell output.
print("""
🧠 Analysis:
- The Naive Bayes classifier works well with discrete features like word counts.
- This model assumes conditional independence between word features.
- Performance is typically strong for spam detection problems, and the model is fast to train.
- Confusion matrix shows how well spam/ham are separated; F1 score reflects the balance between precision and recall.
""")


=== Confusion Matrix ===
[[704  35]
 [ 12 284]]

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       739
           1       0.89      0.96      0.92       296

    accuracy                           0.95      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.95      0.96      1035

F1 Score: 0.9236

🧠 Analysis:
- The Naive Bayes classifier works well with discrete features like word counts.
- This model assumes conditional independence between word features.
- Performance is typically strong for spam detection problems, and the model is fast to train.
- Confusion matrix shows how well spam/ham are separated; F1 score reflects the balance between precision and recall.

