# Mail Preprocessor

In [36]:
import os
import email
import pandas as pd
import mailparser
from bs4 import BeautifulSoup
import quopri
import mailparser
from bs4 import Comment


def decode_payload(payload):
    if payload is None:
        return None, 0

    encodings = ['utf-8', 'cp437', 'ISO-8859-1']
    for encoding in encodings:
        try:
            decoded_text = quopri.decodestring(payload).decode(encoding, errors='replace')
            return decoded_text, decoded_text.count('�')
        except UnicodeDecodeError:
            pass
    return None, 0

def extract_info_from_email(file_path):

    mail = mailparser.parse_from_file(file_path)
    
    with open(file_path, 'r', encoding='cp437', errors='replace') as f:
        msg = email.message_from_file(f)

        unknown_chars_count = 0
        body = ''
        unsubscribe_links = []
        comments = []
        soup = None  # Initialize soup here

        if msg.is_multipart():
            for part in msg.walk():
                content_disposition = str(part.get("Content-Disposition"))
                if "attachment" not in content_disposition:
                    part_body, unknown_count = decode_payload(part.get_payload(decode=True))
                    if part_body:
                        body += part_body
                        unknown_chars_count += unknown_count

                        if part_body.strip():  # Check if part_body is not empty
                            try:
                                part_soup = BeautifulSoup(part_body, 'html5lib')
                                unsubscribe_links.extend([link['href'] for link in part_soup.find_all('a', href=True) if "unsubscribe" in link.text.lower()])
                                if soup is None:
                                    soup = part_soup
                                    # Extract comments from the soup
                                    comments = [str(comment) for comment in soup.find_all(string=lambda text: isinstance(text, Comment))]
                            except Exception as e:
                                print(f"Failed to parse part of {file_path} with html5lib due to: {e}")

        else:
            body, unknown_count = decode_payload(msg.get_payload(decode=True))
            unknown_chars_count += unknown_count

            try:
                soup = BeautifulSoup(body, 'html5lib')
                unsubscribe_links.extend([link['href'] for link in soup.find_all('a', href=True) if "unsubscribe" in link.text.lower()])
                # Extract comments from the soup
                comments = [str(comment) for comment in soup.find_all(string=lambda text: isinstance(text, Comment))]
            except Exception as e:
                print(f"Failed to parse {file_path} with html5lib due to: {e}")

        if not body:
            print(f"No body content found for {file_path}")
            return None

        if soup:
            tracking_pixel = len(soup.find_all('img', width='1', height='1')) > 0
            total_links = len(soup.find_all('a'))

        else:
            tracking_pixel = False
            total_links = 0

        dkim_signature = 'Present' if msg.get('DKIM-Signature') else 'Absent'


    return {
        'filename': os.path.basename(file_path),
        'body': mail.body,
        'subject': mail.subject,
        'comments': ' '.join(comments),
        'text_plain': mail.text_plain,
        'text_html': mail.text_html,
        'text_not_managed': mail.text_not_managed,
        'defects': str(mail.defects),
        'defects_categories': str(mail.defects_categories),
        'number of unsubscribe links': len(unsubscribe_links),
        'number of undecodable characters': unknown_chars_count,
        'tracking pixel present': tracking_pixel,
        'total links in email': total_links,
        'email size (bytes)': len(body),
        'dkim-signature': dkim_signature
    }



# Directory to search
# directory = 'C:\\Users\\ericb\\Desktop\\Research\\Primary@gmail.com\\Cleaned_Mail\\2023_test\\08\\'
# output_csv = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\mailparser_test_output.csv'
directory = 'C:\\Users\\ericb\\Desktop\\Research\\Bruce\\test\\'
output_csv = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\unwarranted_data_test_output\\mailparser_test_output.csv'



infos = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        info = extract_info_from_email(file_path)
        infos.append(info)

# Convert to DataFrame and save to CSV
if infos:
    df = pd.DataFrame(infos)
    df.to_csv(output_csv, index=False)

print(f"Data saved to {output_csv}")

Data saved to C:\Users\ericb\Desktop\Research\542_Project\data\test\unwarranted_data_test_output\mailparser_test_output.csv




In [37]:
# Now that I have the data, I can clean the textual data for use in the model
# print(df['body'][0])
df.head()

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,number of undecodable characters,tracking pixel present,total links in email,email size (bytes),dkim-signature
0,1680369071.M301864P1688939.txt,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",New Styles for 2023. Louis Vuitton bags only $99,Peanut butter: add a spoonful of peanut butte...,[],"[<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 T...",[],[],set(),1,16,False,8,16106,Absent
1,1680385869.1693585_1.txt,\r\n\r\n\r\nنقدم لكم الخطة التدريبية لعام 202...,نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط ...,,[ \r\n\r\n\r\nنقدم لكم الخطة التدريبية لعام 20...,"[<html xmlns:v=""urn:schemas-microsoft-com:vml""...",[],[],set(),0,87,False,0,4696,Present
2,1680385869.1693585_11.txt,"<p><span style=""font-size:18px;""><span style=""...",fruit mixture triggers hard wood in 93% of men,,[],"[<p><span style=""font-size:18px;""><span style=...",[],[],set(),1,0,False,5,11153,Absent
3,1680385869.1693585_13.txt,<html>\n <head> \n <title>recipes</title> \n ...,Keep Gunk Out Of Your Gutters,,[],[<html>\n <head> \n <title>recipes</title> \n...,[],[],set(),0,0,False,4,4144,Present
4,1680385869.1693585_15.txt,<html>\n <head> \n <title>Malaysia</title> \n...,Keep Your Gutters Clean Forever,,[],[<html>\n <head> \n <title>Malaysia</title> \...,[],[],set(),0,0,False,4,4128,Present


In [38]:
# Open the CSV file and read it into a DataFrame
df_to_clean = pd.read_csv(output_csv)

In [42]:
import re
import emoji
from bs4 import BeautifulSoup, NavigableString
import quopri
import base64

def replace_emojis(text):
    return emoji.demojize(text, delimiters=("", ""))

def replace_urls_based_on_context(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href', '')
        url_type = 'UNSAFE_' if href.startswith('http://') else ''
        if a_tag.img:
            a_tag.string = f'{url_type}IMAGE_URL'
        elif isinstance(a_tag.next, NavigableString) and a_tag.next.strip():
            a_tag.string = f'{url_type}LINK_URL'
        else:
            a_tag.string = f'{url_type}BUTTON_URL'
    return str(soup)

def replace_urls_in_text(text):
    http_url_pattern = re.compile(r'http://\S+')
    text = http_url_pattern.sub('UNSAFE_LINK_URL', text)
    https_url_pattern = re.compile(r'https://\S+')
    text = https_url_pattern.sub('LINK_URL', text)
    return text

def decode_quoted_printable(input_data):
    if isinstance(input_data, bytes):
        return quopri.decodestring(input_data).decode('utf-8', errors='replace')
    else:
        return quopri.decodestring(input_data.encode()).decode('utf-8', errors='replace')

def decode_base64(text):
    return base64.b64decode(text).decode('utf-8', errors='replace')

def clean_text(raw_text):
    # If raw_text is None, return an empty string or you could return None depending on your requirement
    if raw_text is None:
        return ''
        
    if not isinstance(raw_text, str):
        # If raw_text is not a string, return it as is or convert it to a string
        return raw_text

    #Remove line breaks and continuation equals signs
    raw_text = re.sub(r'=\n', '', raw_text)
    # Decode any quoted-printable text
    raw_text = quopri.decodestring(raw_text.encode()).decode('utf-8', errors='replace')

    # Create a BeautifulSoup object
    soup = BeautifulSoup(raw_text, 'lxml')
    
    # Remove style and script tags and their content
    for tag in soup(['style', 'script', 'img']):
        tag.decompose()

    # Replace URLs in 'a' tags
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href', '')
        url_type = 'UNSAFE ' if href.startswith('http://') else ''
        if a_tag.img:
            a_tag.string = f'{url_type}IMAGE URL'
        elif isinstance(a_tag.next, NavigableString) and a_tag.next.strip():
            a_tag.string = f'{url_type}LINK URL'
        else:
            a_tag.string = f'{url_type}BUTTON URL'

    # Now proceed with extracting text and further cleaning
    text = soup.get_text(separator=' ', strip=True)
    text = replace_emojis(text)

    text = replace_urls_in_text(text)

    # Remove any remaining HTML encoded characters
    text = re.sub(r'&[a-zA-Z0-9#]+;', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove Zero Width Non-Joiner characters
    text = text.replace('\u200b', '')  # Unicode for ZWNBSP
    text = text.replace('\u200c', '')  # Unicode for ZWNJ
    text = text.replace('\u200d', '')  # Unicode for ZWJ
    text = text.replace('\u200e', '')  # Unicode for LEFT-TO-RIGHT MARK
    text = text.replace('\u200f', '')  # Unicode for RIGHT-TO-LEFT MARK


    # Strip string of leading/trailing whitespace
    return text.strip()


In [43]:


# run the clean_text function on the subject and body columns
df_to_clean['subject'] = df_to_clean['subject'].apply(lambda x: clean_text(x) if x is not None else '')
df_to_clean['body'] = df_to_clean['body'].apply(lambda x: clean_text(x) if x is not None else '')
df_to_clean['comments'] = df_to_clean['comments'].apply(lambda x: clean_text(x) if x is not None else '')
print(df_to_clean['body'][1])


نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK_URL --- mail_boundary --- Content-Type: image/jpeg; name="image001.jpg" Content-Transfer-Encoding: base64 Content-ID: --- mail_boundary --- نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK URL


In [44]:
df_to_clean.head()


Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,number of undecodable characters,tracking pixel present,total links in email,email size (bytes),dkim-signature
0,1680369071.M301864P1688939.txt,No images? LINK URL LINK URL LINK URL LINK URL...,New Styles for 2023. Louis Vuitton bags only $99,Peanut butter: add a spoonful of peanut butter...,[],"['<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 ...",[],[],set(),1,16,False,8,16106,Absent
1,1680385869.1693585_1.txt,نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط ...,نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط ...,,[' \r\n\r\n\r\nنقدم لكم الخطة التدريبية لعام 2...,"['<html xmlns:v=""urn:schemas-microsoft-com:vml...",[],[],set(),0,87,False,0,4696,Present
2,1680385869.1693585_11.txt,Research shows that nearly 60% of men between ...,fruit mixture triggers hard wood in 93% of men,,[],"['<p><span style=""font-size:18px;""><span style...",[],[],set(),1,0,False,5,11153,Absent
3,1680385869.1693585_13.txt,recipes UNSAFE LINK URL UNSAFE BUTTON URL *For...,Keep Gunk Out Of Your Gutters,,[],['<html>\n <head> \n <title>recipes</title> \...,[],[],set(),0,0,False,4,4144,Present
4,1680385869.1693585_15.txt,Malaysia UNSAFE LINK URL UNSAFE BUTTON URL *Fo...,Keep Your Gutters Clean Forever,,[],['<html>\n <head> \n <title>Malaysia</title> ...,[],[],set(),0,0,False,4,4128,Present


In [None]:
print(df_to_clean['body'][1])

# save df_to_clean['body'][i] to txt file
for i in range (0, len(df_to_clean['body'])):
    with open('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\mailparser_test_output_body' + str(i) + '.txt', 'w') as f:
        f.write(df_to_clean['body'][i])

# save df_to_clean['text_not_managed'][i] to txt file
for i in range(0, len(df_to_clean['text_not_managed'])):
    with open('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\mailparser_test_output_text_not_managed' + str(i) + '.txt', 'w') as f:
        f.write(df_to_clean['text_not_managed'][i])


نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK_URL --- mail_boundary --- Content-Type: image/jpeg; name="image001.jpg" Content-Transfer-Encoding: base64 Content-ID: --- mail_boundary --- نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK URL
