# 02 – Preprocessing

In this notebook we:
- clean the email body text
- remove noise (URLs, HTML tags, numbers, punctuation)
- lowercase
- optionally remove stopwords
- save a cleaned CSV for modeling

In [None]:
import pandas as pd
import re
import nltk
from pathlib import Path

# Fetch the NLTK stopwords corpus so that `nltk.corpus.stopwords` can be read later on.
# NOTE(review): this call runs every time the cell executes — presumably NLTK skips the
# transfer when the corpus is already present locally; confirm for offline runs.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Raw email CSV, addressed relative to the notebook's working directory.
DATA_PATH = Path("../data/raw/CEAS_08.csv")
# low_memory=False makes pandas parse the file in one piece instead of in chunks,
# so each column's dtype is inferred from the whole file.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Preview the first rows as a quick check that the load produced the expected columns.
df.head()

In [2]:
# English stopword vocabulary from NLTK, stored as a set for fast membership checks.
STOPWORDS = {*stopwords.words('english')}

def clean_text(text, remove_stopwords=False, stopword_set=None):
    """Normalize one raw email body into lowercase, noise-free text.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip digits,
    strip punctuation (including underscores), collapse whitespace and,
    optionally, drop stopwords.

    Parameters
    ----------
    text : str or missing value
        Raw body text. ``None`` / NaN / ``pd.NA`` are treated as an empty
        body; any other non-string value is converted with ``str()``.
    remove_stopwords : bool, default False
        When True, whitespace-separated tokens found in the stopword set
        are removed from the result.
    stopword_set : collection of str, optional
        Tokens to drop when ``remove_stopwords`` is True. Defaults to the
        notebook-level ``STOPWORDS`` set.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing remains.
    """
    if not isinstance(text, str):
        # A CSV column with empty cells comes back with float NaN entries;
        # calling .lower() on those would raise AttributeError.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()

    # Remove HTML tags first: a URL inside an attribute (href="http://...">)
    # would otherwise be stripped together with the tag's closing '>',
    # leaving tag debris behind. [^>] also matches newlines, so tags that
    # span several lines are handled. Substitute a space so words that were
    # only separated by a tag (a<br>b) do not get glued together.
    text = re.sub(r'<[^>]+>', ' ', text)

    # remove URLs
    text = re.sub(r'http\S+|www\.\S+', ' ', text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation. '_' counts as a word character for \w, so it has
    # to be listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)

    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    if remove_stopwords:
        vocab = STOPWORDS if stopword_set is None else stopword_set
        text = ' '.join(tok for tok in text.split() if tok not in vocab)

    return text

In [None]:
# Run every raw body through clean_text and keep the result next to the original column.
df['clean_body'] = df['body'].map(clean_text)
# Side-by-side preview (first five rows) of raw vs. cleaned text.
df[['body', 'clean_body']].head()

In [None]:
# Keep only the columns the modeling step needs: the cleaned text and its label.
df_clean = df.loc[:, ['clean_body', 'label']]
df_clean.head()

In [None]:
# Destination of the cleaned dataset that the modeling step reads.
OUTPUT_PATH = Path("../data/processed/cleaned.csv")
# to_csv does not create missing directories; on a fresh checkout the
# processed/ folder may not exist yet, so create it (no-op when present).
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the row index as an extra CSV column.
df_clean.to_csv(OUTPUT_PATH, index=False)

# Echo the path so the cell output shows where the file was written.
OUTPUT_PATH