In [2]:
# Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 2: Download Required NLTK Resources
# Tokenizer model, stop-word lists and the WordNet corpus, fetched in the
# same order as before.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Step 3: Initialize Tools
# Shared by the preprocessing functions below.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))

# Step 4: Define Preprocessing Functions
def clean_text(text):
    """Normalize raw email text to lowercase letters separated by single spaces.

    Steps, in order: lowercase, replace HTML-like tags with a space, drop
    URLs, drop every character that is not ``a-z`` or whitespace, collapse
    runs of whitespace and trim the ends.

    Args:
        text: Raw email body. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty input.

    Returns:
        The cleaned string, or ``''`` when nothing survives cleaning.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Replace tags with a space (not nothing) so that words separated only by
    # markup, e.g. "<p>hello</p><p>world</p>", are not fused into one token.
    text = re.sub(r'<.*?>', ' ', text)
    # "http\S+" already matches https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_and_normalize(text):
    """Tokenize *text*, drop stop words and lemmatize the remaining tokens.

    Args:
        text: String to split with ``word_tokenize``.

    Returns:
        A list of lemmatized tokens, in their original order, excluding any
        token found in the module-level ``stop_words`` set.
    """
    normalized = []
    for token in word_tokenize(text):
        # The stop-word test is applied to the raw token, before lemmatizing.
        if token in stop_words:
            continue
        normalized.append(lemmatizer.lemmatize(token))
    return normalized

def preprocess_email(text):
    """Run the full preprocessing pipeline on one email.

    The text is cleaned with ``clean_text``, tokenized and normalized with
    ``tokenize_and_normalize``, and the surviving tokens are joined back into
    a single space-separated string.

    Args:
        text: Raw email text.

    Returns:
        The processed email as one string (empty if no tokens remain).
    """
    cleaned = clean_text(text)
    tokens = tokenize_and_normalize(cleaned)
    return ' '.join(tokens)

# Step 5: Load Dataset
DATASET_PATH = "C:/Users/Dell/Desktop/PROJECTS/Email-spam-Detection-Using-Blockchain/archive1/emails.csv"
df = pd.read_csv(DATASET_PATH)
# Normalize header names so the column checks below ignore case and padding.
df.columns = df.columns.str.strip().str.lower()

# Ensure required columns exist
# Raised explicitly instead of using `assert`, which is stripped when Python
# runs with -O and would let a malformed dataset through.
missing_columns = {'text', 'spam'} - set(df.columns)
if missing_columns:
    raise ValueError("Dataset must contain 'text' and 'spam' columns.")

# Step 6: Apply Preprocessing
df['text'] = df['text'].fillna('')
df['processed_email'] = df['text'].apply(preprocess_email)

# Step 7: Remove Empty Entries
# Reset the index after filtering so the labels in `y` stay positionally
# aligned with the rows of the feature matrix `X` built below.
df = df[df['processed_email'].str.strip() != ''].reset_index(drop=True)

# Step 8: Save Cleaned Data
df.to_csv('cleaned_data.csv', index=False)
print("✅ Cleaned data saved to 'cleaned_data.csv'")

# Step 9: Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# NOTE: .toarray() materializes a dense (n_rows, <=5000) matrix in memory.
X = tfidf_vectorizer.fit_transform(df['processed_email']).toarray()
y = df['spam']

print("✅ TF-IDF features extracted.")
print("Feature matrix shape:", X.shape)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Cleaned data saved to 'cleaned_data.csv'
✅ TF-IDF features extracted.
Feature matrix shape: (5728, 5000)
