In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv


In [14]:
df = pd.read_csv('/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [15]:
df.tail()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


## drop the Unnamed :0 column

In [16]:
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [17]:
df.tail()

Unnamed: 0,label,text,label_num
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0
5170,spam,Subject: important online banking alert\r\ndea...,1


In [18]:
df.isnull().sum()

label        0
text         0
label_num    0
dtype: int64

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Define the preprocessing function
def preprocess_email_data(df):
    # Initialize the stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    # Define stopwords list
    stop_words = set(stopwords.words('english'))
    
    # Define the preprocessing function for each email
    def clean_text(text):
        # Convert to lowercase
        text = text.lower()

        # Remove "subject" at the start of the email
        if text.startswith("subject"):
            text = text[7:].strip()  # Remove the word "subject" (7 characters) and any leading spaces
        
        # Remove punctuation and non-alphabetic characters
        text = re.sub(r'[^a-z\s]', '', text)
        
        # Remove stopwords
        text = ' '.join([word for word in text.split() if word not in stop_words])
        
        # Apply stemming (optional, you can also choose lemmatization instead)
        text = ' '.join([stemmer.stem(word) for word in text.split()])
        
        # Alternatively, you could use lemmatization like this:
        # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
        
        return text
    
    # Apply the clean_text function to the 'mail' column
    df['text'] = df['text'].apply(clean_text)
    
    return df


# Preprocess the dataset
df_cleaned = preprocess_email_data(df)

# Show the cleaned data
print(df_cleaned.head())


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  label                                               text  label_num
0   ham  enron methanol meter follow note gave monday p...          0
1   ham  hpl nom januari see attach file hplnol xl hpln...          0
2   ham  neon retreat ho ho ho around wonder time year ...          0
3  spam  photoshop window offic cheap main trend aba da...          1
4   ham  indian spring deal book teco pvr revenu unders...          0


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



# Vectorize using TF-IDF
tfidf_vectorizer = TfidfVectorizer()

# Apply TF-IDF vectorizer on the 'mail' column
X = tfidf_vectorizer.fit_transform(df['text'])

# Step 2: Split the dataset into training and test sets
y = df['label_num']  # Target variable (spam or not)

# Split the data (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 4: Predict using the trained model
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 91.24%

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1121
           1       1.00      0.68      0.81       431

    accuracy                           0.91      1552
   macro avg       0.95      0.84      0.88      1552
weighted avg       0.92      0.91      0.91      1552



In [23]:
# Define the preprocessing function for a single email
def preprocess_single_email(text):
    # Preprocess using the same steps as before
    # Convert to lowercase
    text = text.lower()

    # Remove "subject" at the start of the email
    if text.startswith("subject"):
        text = text[7:].strip()  # Remove the word "subject" (7 characters) and any leading spaces

    # Remove punctuation and non-alphabetic characters (except spaces)
    text = re.sub(r'[^a-z\s]', '', text)

    # Apply stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text

# Predict a single email
def predict_email_spam(model, vectorizer, email):
    # Preprocess the email first
    preprocessed_email = preprocess_single_email(email)
    
    # Transform the email to TF-IDF vector
    email_vector = vectorizer.transform([preprocessed_email])
    
    # Predict using the trained model
    prediction = model.predict(email_vector)
    
    # Return prediction (1 = spam, 0 = not spam)
    return "Spam" if prediction == 1 else "Not Spam"

# Example usage:
sample_email = "Subject: Win a free iPhone now! Claim your prize by clicking the link."

# Preprocess and predict the sample email
result = predict_email_spam(model, tfidf_vectorizer, sample_email)
print(f"The email is: {result}")


The email is: Spam
