# 02 – Preprocessing

In this notebook we:
- clean the email body text
- remove noise (URLs, HTML, punctuation)
- lowercase
- optionally remove stopwords
- save a cleaned CSV for modeling

In [1]:
# --- Imports (kept together so a fresh-kernel run shows every dependency up front) ---
import re
from pathlib import Path

import nltk
import pandas as pd
# Importing the corpus reader before the download is safe: NLTK corpus objects
# are lazy and only touch the files on disk when first accessed.
from nltk.corpus import stopwords

# Fetch the stopword corpus if it is not present yet. quiet=True keeps the
# machine-specific download log (local user paths) out of the saved output.
nltk.download('stopwords', quiet=True)

# --- Load the raw dataset ---
DATA_PATH = Path("../data/raw/CEAS_08.csv")
# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype instead of chunk-by-chunk guesses.
df = pd.read_csv(DATA_PATH, low_memory=False)

df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [2]:
# English stopword list from the NLTK corpus, stored as a set so membership
# checks are O(1).
# NOTE(review): no cell visible in this notebook references STOPWORDS yet,
# although the intro mentions optional stopword removal — confirm intended use.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False, stopword_set=None):
    """Normalize a raw email body into lowercase, space-separated words.

    Processing order: lowercase -> strip HTML tags -> strip URLs -> strip
    digits -> strip punctuation -> collapse whitespace -> (optionally) drop
    stopwords.

    Parameters
    ----------
    text : str
        Raw body text. Any non-string value (e.g. ``None`` or the float NaN
        that pandas uses for empty CSV cells) is treated as an empty body.
    remove_stopwords : bool, default False
        When True, words contained in ``stopword_set`` are removed from the
        cleaned text. The default keeps the previous behaviour.
    stopword_set : collection of str, optional
        Words to drop when ``remove_stopwords`` is True. Falls back to the
        module-level ``STOPWORDS`` when omitted.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives cleaning.
    """
    # Missing / non-text cells have no .lower(); return an empty body instead
    # of raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # remove HTML tags
    # Done BEFORE URL stripping: otherwise 'http\S+' swallows the closing
    # quote, '>' and the link text of <a href="http://...">text</a>.
    # '[^>]*' (unlike '.*?') also matches tags that span several lines, and
    # substituting a space keeps neighbouring words from being glued together.
    text = re.sub(r'<[^>]*>', ' ', text)

    # remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove punctuation (underscores count as word characters and are kept)
    text = re.sub(r'[^\w\s]', '', text)

    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    if remove_stopwords:
        if stopword_set is None:
            stopword_set = STOPWORDS
        # Filtering runs on the punctuation-free text, so contractions have
        # already lost their apostrophe at this point.
        text = ' '.join(word for word in text.split() if word not in stopword_set)

    return text

In [3]:
# Empty CSV cells are loaded by pandas as NaN (a float), which has no
# .lower(); replace them with empty strings before cleaning so a single
# missing body cannot abort the whole column.
df['clean_body'] = df['body'].fillna('').apply(clean_text)

# Side-by-side preview of raw vs. cleaned text for a quick sanity check.
df[['body', 'clean_body']].head(5)

Unnamed: 0,body,clean_body
0,"Buck up, your troubles caused by small dimensi...",buck up your troubles caused by small dimensio...
1,\nUpgrade your sex and pleasures with these te...,upgrade your sex and pleasures with these tech...
2,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,the daily top from cnncom top videos and stori...
3,Would anyone object to removing .so from this ...,would anyone object to removing so from this l...
4,\nWelcomeFastShippingCustomerSupport\nhttp://7...,welcomefastshippingcustomersupport


In [4]:
# Narrow the frame to the two columns that get written out below:
# the cleaned body text and its label.
df_clean = df.loc[:, ['clean_body', 'label']]
df_clean.head(5)

Unnamed: 0,clean_body,label
0,buck up your troubles caused by small dimensio...,1
1,upgrade your sex and pleasures with these tech...,1
2,the daily top from cnncom top videos and stori...,1
3,would anyone object to removing so from this l...,0
4,welcomefastshippingcustomersupport,1


In [5]:
OUTPUT_PATH = Path("../data/processed/cleaned.csv")

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so the notebook also runs on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information, so it is not written.
df_clean.to_csv(OUTPUT_PATH, index=False)

OUTPUT_PATH

WindowsPath('../data/processed/cleaned.csv')