In [4]:
# Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Step 2: Download NLTK Resources
# Tokenizer model, stopword lists, and WordNet data used by the helpers below.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt'
# for word_tokenize — confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Step 3: Initialize Tools
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 4: Cleaning Functions
def clean_text(text):
    """Clean and normalize input text.

    Lowercases the input, strips HTML tags and URLs, drops every character
    that is not an ASCII letter or whitespace, and collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned, lowercase string (possibly empty).
    """
    # Missing values would otherwise be stringified into the junk tokens
    # "none" / "nan" (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to lowercase
    # `[^>]*` also matches newlines (unlike `.*?` without DOTALL), so tags that
    # wrap across lines are removed too. Substituting a space keeps the words
    # on either side of a tag from being fused ("a<br>b" -> "a b").
    text = re.sub(r'<[^>]*>', ' ', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

def tokenize_text(text):
    """Split text into tokens, drop stopwords, and lemmatize the rest.

    Returns the lemmatized tokens as a list, in their original order.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # stopwords are discarded
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess_email(text):
    """Run the full pipeline on one email: clean, tokenize, lemmatize.

    Returns the surviving tokens joined by single spaces.
    """
    cleaned = clean_text(text)
    tokens = tokenize_text(cleaned)
    return ' '.join(tokens)

# Step 5: Load Dataset
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
df = pd.read_csv("C:/Users/Dell/Desktop/PROJECTS/Email-spam-Detection-Using-Blockchain/archive1/emails.csv")

# Step 6: Ensure Columns Are Named Correctly
# Normalize headers so the lookup below tolerates stray spaces / capitals.
df.columns = df.columns.str.strip().str.lower()
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would let a malformed dataset fail later with an
# unhelpful KeyError.
if 'text' not in df.columns:
    raise ValueError("Dataset must have a 'text' column.")

# Step 7: Clean Text Data
df['text'] = df['text'].fillna('')  # Missing bodies become empty strings
df['processed_email'] = df['text'].apply(preprocess_email)
# Remove empty results; .copy() makes the filtered frame independent so later
# column assignments do not trigger chained-assignment warnings.
df = df[df['processed_email'].str.strip() != ''].copy()

# Step 8: Save to File
df.to_csv('cleaned_data.csv', index=False)
print("✅ Cleaned data saved to 'cleaned_data.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Cleaned data saved to 'cleaned_data.csv'
