In [40]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [33]:
# Read the raw e-mail dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="email.csv")

In [34]:
# Fetch the NLTK stop-word corpus used later by the text-cleaning step.
# The call's return value is left as the cell's displayed result.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\corma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
# Drop rows that contain any missing value (returns a new frame instead of
# mutating the loaded one in place)
df = df.dropna()

df.columns = ["Category", "Message"]

# Convert labels to binary (0 = Ham, 1 = Spam).
# Rows whose Category is neither "ham" nor "spam" map to NaN and are DROPPED:
# filling them with 0 would silently label unknown/junk rows as ham.
df = (
    df.assign(label=df["Category"].map({"ham": 0, "spam": 1}))
    .dropna(subset=["label"])
    .astype({"label": int})
)

# Text Cleaning Function
def clean_text(text, stop_words=None):
    """Normalise one message so it can be vectorised.

    Steps, in order: lower-case the text, delete every run of digits, delete
    all characters in ``string.punctuation``, then drop stop words.

    Args:
        text: The raw message string.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used; it is loaded
            once and cached on the function for later calls.

    Returns:
        The cleaned message, with the remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # stopwords.words() returns a new list on each call and list membership
        # is a linear scan; evaluating it per token made cleaning needlessly
        # slow. Load it once and keep it as a frozenset.
        stop_words = getattr(clean_text, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            clean_text._english_stop_words = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # NOTE(review): punctuation is stripped before this filter, so stop words
    # containing an apostrophe (e.g. "don't") no longer match their entry.
    return " ".join(word for word in text.split() if word not in stop_words)

# Run every message through clean_text and keep the result in a new column
df["clean_text"] = df["Message"].map(clean_text)

In [37]:
# Target vector (0 = ham, 1 = spam)
y = df["label"].astype(int)

# Turn the cleaned messages into a TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on every row of df, so any rows held
# out for testing afterwards have already contributed to its vocabulary.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df["clean_text"])

In [39]:
# Split into training and testing sets BEFORE oversampling.
# Oversampling the full dataset first leaks information: synthetic samples
# would be interpolated from rows that later land in the test set, and the
# test set itself would contain synthetic rows. `stratify=y` keeps the
# ham/spam ratio the same in both parts of the (still imbalanced) data.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to balance classes -- on the training portion only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Keep the original variable names: X_train / y_train refer to the balanced
# training data, while X_test / y_test stay untouched real messages.
X_train, y_train = X_resampled, y_resampled