In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download the NLTK resources this script uses, one call per package.
# NOTE(review): some newer NLTK releases reportedly also need 'punkt_tab'
# for word_tokenize — confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared tools for the tokenization step: a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define text cleaning and normalization
def clean_text(text):
    """Normalize raw text to lowercase letters separated by single spaces.

    Steps, in order: lowercase, remove HTML-style tags, remove URLs, remove
    every character that is not ``a-z`` or whitespace, collapse whitespace
    runs to one space and trim the ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise be stringified into the words
    # "none" / "nan" and leak into the token stream.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Replace tags with a space rather than '' so that words separated only
    # by markup ("foo<br>bar") are not glued together; the extra whitespace
    # is collapsed below.
    text = re.sub(r'<.*?>', ' ', text)
    # Anything starting with "http" or "www" up to the next whitespace.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Digits and punctuation are dropped outright (e.g. "don't" -> "dont").
    text = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def tokenize_and_normalize(text):
    """Split *text* into word tokens, drop stop words, lemmatize the rest.

    Args:
        text: String to tokenize (the script passes the output of
            ``clean_text``).

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    lemmas = []
    for token in word_tokenize(text):
        # The stop-word check is applied to the raw token, before lemmatizing.
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Load dataset
df = pd.read_csv("C:/Users/Dell/Desktop/PROJECTS/Email-spam-Detection-Using-Blockchain/archive1/emails.csv")

# Normalize header names (trim whitespace, lowercase) so the column lookup
# is insensitive to case and stray spaces in the CSV header.
df.columns = df.columns.str.strip().str.lower()
# Validate with an explicit raise: `assert` statements are removed when Python
# runs with -O, which would let a malformed dataset slip through and fail
# later with a less helpful KeyError.
if 'text' not in df.columns:
    raise ValueError("Dataset must have a 'text' column.")

# Clean and tokenize the text
def _preprocess(raw):
    """Run one raw text value through cleaning, then tokenization."""
    return tokenize_and_normalize(clean_text(raw))


# Missing texts become empty strings before processing.
filled_text = df['text'].fillna('')
df['text'] = filled_text
df['tokens'] = filled_text.apply(_preprocess)

# Show sample
preview = df[['text', 'tokens']].head()
print(preview)

# Optionally save to file (written to the current working directory,
# without the index column)
df.to_csv("tokenized_cleaned_data.csv", index=False)
print("✅ Tokenized and cleaned data saved to 'tokenized_cleaned_data.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  \
0  Subject: naturally irresistible your corporate...   
1  Subject: the stock trading gunslinger  fanny i...   
2  Subject: unbelievable new homes made easy  im ...   
3  Subject: 4 color printing special  request add...   
4  Subject: do not have money , get software cds ...   

                                              tokens  
0  [subject, naturally, irresistible, corporate, ...  
1  [subject, stock, trading, gunslinger, fanny, m...  
2  [subject, unbelievable, new, home, made, easy,...  
3  [subject, color, printing, special, request, a...  
4  [subject, money, get, software, cd, software, ...  
✅ Tokenized and cleaned data saved to 'tokenized_cleaned_data.csv'
