In [1]:
import nltk
import os
import re
import pandas as pd
import matplotlib.pyplot as plt

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from nltk.stem import WordNetLemmatizer




from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
# from sklearn.preprocessing import LabelEncoder
# import seaborn as sns

from sklearn.pipeline import Pipeline


from sklearn.model_selection import train_test_split
# Set the NLTK data download path
# nltk_data_path = '/Users/gowthamkishorevijay/Desktop/Playground/projects/my-venv/CEAS_08.csv'

# Create the directory if it doesn't exist
# if not os.path.exists(nltk_data_path):
#     os.makedirs(nltk_data_path)

# # Add the path where NLTK data will be downloaded
# nltk.data.path.append(nltk_data_path)

# # Ensure necessary NLTK resources are downloaded
# nltk.download('punkt', download_dir=nltk_data_path)
# nltk.download('stopwords', download_dir=nltk_data_path)
# nltk.download('wordnet', download_dir=nltk_data_path)
# nltk.download('omw-1.4', download_dir=nltk_data_path)

# Initialize the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # 1. Lowercasing
    text = text.lower()
    
    # 2. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 3. Tokenization
    tokens = word_tokenize(text)
    
    # 4. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # 5. Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

# Load the CSV file (replace this path with your actual file path)
file_path = '/Users/gowthamkishorevijay/Desktop/Playground/projects/my-venv/CEAS_08.csv'  # Adjust based on your local environment
df = pd.read_csv(file_path)


def clean_text(text):
    # Check if the value is a string, otherwise convert it to an empty string
    if not isinstance(text, str):
        text = ''
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Apply the cleaning function
if 'subject' in df.columns and 'body' in df.columns:
    # Fill NaN values with empty strings
    df['subject'] = df['subject'].fillna('')
    df['body'] = df['body'].fillna('')
    
    # Apply the clean_text function to 'subject' and 'body'
    df['subject'] = df['subject'].apply(clean_text)
    df['body'] = df['body'].apply(clean_text)

    # Display cleaned data
    print(df[['subject', 'body']].head())
else:
    print("The CSV file does not contain 'subject' or 'body' columns.")
df.to_csv('preprocessed_emails.csv', index=False)
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gowthamkishorevijay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             subject  \
0                          Never agree to be a loser   
1                             Befriend Jenna Jameson   
2                                  CNNcom Daily Top    
3  Re svn commit r  in spamassassintrunk libMailS...   
4                         SpecialPricesPharmMoreinfo   

                                                body  
0  Buck up your troubles caused by small dimensio...  
1  \nUpgrade your sex and pleasures with these te...  
2   THE DAILY TOP  from CNNcom Top videos and sto...  
3  Would anyone object to removing so from this l...  
4  \nWelcomeFastShippingCustomerSupport\nhttpiwfn...  
                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com> 

In [2]:
X = df['body'] 
y = df['label'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


print(y_train)

model = Pipeline([
    ('vectorizer', TfidfVectorizer()),  # Step 1: Text data transformation
    ('nb', RandomForestClassifier())  # Step 2: Classification using Naive Bayes
])

model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
pred_prob_train = model.predict_proba(X_train)[:,1]
pred_prob_test = model.predict_proba(X_test)[:,1]


# calculate ROC AUC score
roc_auc_train = roc_auc_score(y_train, y_pred_train)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)

# plot the ROC curve
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, pred_prob_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, pred_prob_test)
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
plt.legend()
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# calculate confusion matrix
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

fig, ax = plt.subplots(1, 2, figsize=(11,4))

print("\nConfusion Matrix:")
sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[0])
ax[0].set_xlabel("Predicted Label")
ax[0].set_ylabel("True Label")
ax[0].set_title("Train Confusion Matrix")

sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[1])
ax[1].set_xlabel("Predicted Label")
ax[1].set_ylabel("True Label")
ax[1].set_title("Test Confusion Matrix")

plt.tight_layout()
plt.show()

10951    1
8668     1
8200     1
14981    0
23343    0
        ..
6265     0
11284    1
38158    0
860      1
15795    0
Name: label, Length: 31323, dtype: int64

Train ROC AUC: 1.0
Test ROC AUC: 0.9889738608813544


NameError: name 'plt' is not defined

In [16]:
# new_email = """SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"""
new_email = "How are you mate"


prediction = model.predict([new_email])

if prediction == 0:
    print( "This is a not a spam Email!")
else:
    print( "This is a Spam Email!")


This is a Spam Email!
