In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import unicodedata

In [4]:
# Read the raw CSV into the working DataFrame used by the cells below.
raw_csv_path = '../data/data.csv'
df = pd.read_csv(raw_csv_path)

In [5]:
# Optional: uncomment to stop pandas truncating long text columns when displaying rows.
# pd.set_option('display.max_colwidth', None)

In [6]:
# Show the first five rows of each 'label' group as a quick sanity check.
df.groupby(by='label').head(n=5)

Unnamed: 0,text,label
0,"Subject: [External Email] Amit, your applicati...",applied
1,Subject: [External Email] Barclays: We’ve rece...,applied
2,Subject: [External Email] Thank you for applyi...,applied
3,"Subject: [External Email] Amit, your applicati...",applied
4,"Subject: [External Email] Amit, your applicati...",applied
500,Subject: [External Email] Your application to ...,rejects
501,Subject: [External Email] Regarding your appli...,rejects
502,Subject: [External Email] Application Follow U...,rejects
503,Subject: [External Email] Your Job Application...,rejects
504,Subject: [External Email] State Street Externa...,rejects


In [7]:
def clean_text(text):
    """
    Turn a raw (possibly HTML) text into lowercase, ASCII-only plain text.

    Processing order:
      1. Strip HTML markup with BeautifulSoup.
      2. NFKD-normalise so accented letters decompose into an ASCII base
         letter plus combining marks (the marks are discarded in step 5).
      3. Remove URLs.
      4. Remove e-mail addresses.
      5. Drop every character except ASCII letters, digits, whitespace
         and the punctuation ``. , ? !``.
      6. Lowercase.
      7. Collapse whitespace runs into a single space.
      8. Trim leading/trailing '.', '-', '_' and spaces.

    Args:
        text: The raw text. Any non-``str`` value (e.g. None, NaN) is treated
            as missing.

    Returns:
        str: The cleaned text, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""  # Handle non-string inputs (e.g., None, NaN)

    # 1. Remove HTML tags; strip=True trims whitespace around each text node.
    soup = BeautifulSoup(text, 'html.parser')
    cleaned = soup.get_text(separator=' ', strip=True)

    # 2. Unicode normalisation MUST happen before the ASCII filter in step 5.
    #    Previously it ran last, after every non-ASCII character had already
    #    been deleted, so it was a no-op and "café" became "caf" instead of
    #    "cafe". After NFKD, "é" is "e" + a combining accent; step 5 then
    #    drops only the accent and keeps the base letter.
    cleaned = unicodedata.normalize('NFKD', cleaned)

    # 3. Remove URLs.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)

    # 4. Remove e-mail addresses (they tend to carry unique IDs/names).
    cleaned = re.sub(r'\S*@\S*\s?', '', cleaned)

    # 5. Keep only ASCII letters, digits, whitespace and basic punctuation.
    #    This also discards the combining marks produced by step 2.
    cleaned = re.sub(r'[^a-zA-Z0-9\s.,?!]', '', cleaned)

    # 6. Lowercase.
    cleaned = cleaned.lower()

    # 7. Collapse tabs, newlines and repeated spaces into single spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # 8. Trim common leading/trailing artifacts.
    cleaned = cleaned.strip('.-_ ')

    # The result is already pure ASCII at this point (step 5 + step 7), so no
    # further encode/decode round-trip is needed.
    return cleaned

In [8]:
# Derive the cleaned text column by running clean_text on every raw 'text' value.
df['plain_text'] = df['text'].map(clean_text)

In [9]:
# Compare raw and cleaned text side by side for the first five rows.
print(df.head(5)[['text', 'plain_text']])

                                                text  \
0  Subject: [External Email] Amit, your applicati...   
1  Subject: [External Email] Barclays: We’ve rece...   
2  Subject: [External Email] Thank you for applyi...   
3  Subject: [External Email] Amit, your applicati...   
4  Subject: [External Email] Amit, your applicati...   

                                          plain_text  
0  subject external email amit, your application ...  
1  subject external email barclays weve received ...  
2  subject external email thank you for applying ...  
3  subject external email amit, your application ...  
4  subject external email amit, your application ...  


In [10]:
# Keep only the columns that are exported below. Selecting by name (instead of
# dropping 'text' first) gives the same result and keeps this cell idempotent:
# re-running it no longer raises KeyError because 'text' was already removed.
df = df[['plain_text', 'label']]

In [11]:
# Write the cleaned frame to CSV in the current working directory, without the index.
output_filename = "clean_data.csv"
df.to_csv(path_or_buf=output_filename, index=False)