# Mail Preprocessor

In [40]:
import os
import email
import pandas as pd
import re
import emoji
import mailparser
from bs4 import BeautifulSoup
import quopri
import mailparser
from bs4 import Comment
import time


def decode_payload(payload):
    if payload is None:
        return None, 0

    encodings = ['utf-8', 'cp437', 'ISO-8859-1']
    for encoding in encodings:
        try:
            decoded_text = quopri.decodestring(payload).decode(encoding, errors='replace')
            return decoded_text, decoded_text.count('�')
        except UnicodeDecodeError:
            pass
    return None, 0

def extract_info_from_email(file_path):

    # mail = mailparser.parse_from_file(file_path)
    encodings = [
                'utf-8',          # Most common encoding for web content
                'cp437',          # Original character set of IBM PC (DOS)
                'ISO-8859-1',     # Western European/Latin-1
                'ISO-8859-2',     # Central and Eastern European/Latin-2
                'ISO-8859-3',     # South European/Latin-3
                'ISO-8859-4',     # North European/Latin-4
                'ISO-8859-5',     # Latin/Cyrillic
                'ISO-8859-6',     # Latin/Arabic
                'windows-1256',   # Arabic
                'koi8-r',         # Russian
                'koi8-u',         # Ukrainian
                'big5',           # Traditional Chinese
                'gb2312',         # Simplified Chinese
                'euc-kr',         # Korean
                'shift_jis',      # Japanese
            ]
    mail = None

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding, errors='replace') as file:
                file_content = file.read()
            mail = mailparser.parse_from_string(file_content)
            break  # Break the loop if parsing is successful
        except (UnicodeDecodeError, Exception) as e:
            print(f"Failed to parse {file_path} with encoding {encoding} due to: {e}")
            # Optionally, you can log the error or handle it as required

    if not mail:
        print(f"Failed to parse {file_path} with all tried encodings")
        return None
    
    with open(file_path, 'r', encoding='cp437', errors='replace') as f:
        msg = email.message_from_file(f)

        unknown_chars_count = 0
        body = ''
        unsubscribe_links = []
        comments = []
        soup = None  # Initialize soup here

        if msg.is_multipart():
            for part in msg.walk():
                content_disposition = str(part.get("Content-Disposition"))
                if "attachment" not in content_disposition:
                    part_body, unknown_count = decode_payload(part.get_payload(decode=True))
                    if part_body:
                        body += part_body
                        unknown_chars_count += unknown_count

                        if part_body.strip():  # Check if part_body is not empty
                            try:
                                part_soup = BeautifulSoup(part_body, 'html5lib')
                                unsubscribe_links.extend([link['href'] for link in part_soup.find_all('a', href=True) if "unsubscribe" in link.text.lower()])
                                if soup is None:
                                    soup = part_soup
                                    # Extract comments from the soup
                                    comments = [str(comment) for comment in soup.find_all(string=lambda text: isinstance(text, Comment))]
                            except Exception as e:
                                print(f"Failed to parse part of {file_path} with html5lib due to: {e}")

        else:
            body, unknown_count = decode_payload(msg.get_payload(decode=True))
            unknown_chars_count += unknown_count

            try:
                soup = BeautifulSoup(body, 'html5lib')

                # Define a list of possible unsubscribe indicators
                unsubscribe_indicators = ["unsubscribe", "opt out", "opt-out", "stop receiving", "email preferences", "subscription settings"]

                # Search for links that contain any of the indicators in their text or title attribute
                unsubscribe_links = [link['href'] for link in soup.find_all('a', href=True) if any(indicator in link.get_text().lower() or indicator in link.get('title', '').lower() for indicator in unsubscribe_indicators)]

                # Additionally, check in the href attribute for some common patterns
                unsubscribe_links.extend([link['href'] for link in soup.find_all('a', href=True) if "unsubscribe" in link['href'].lower() or "optout" in link['href'].lower()])

                comments = [str(comment) for comment in soup.find_all(string=lambda text: isinstance(text, Comment))]
            except Exception as e:
                print(f"Failed to parse {file_path} with html5lib due to: {e}")

        if not body:
            print(f"No body content found for {file_path}")
            return None

        # Count all occurrences of "http:" and "https:" in the email's body and subject
        link_pattern = re.compile(r'https?:')
        total_links_in_body = len(link_pattern.findall(body))
        total_links_in_subject = len(link_pattern.findall(mail.subject))
        total_links_in_comments = len(link_pattern.findall(' '.join(comments)))

        total_links = total_links_in_body + total_links_in_subject + total_links_in_comments

        if soup:
            tracking_pixel = len(soup.find_all('img', width='1', height='1')) > 0
            # total_links = len(soup.find_all('a'))

        else:
            tracking_pixel = False
            total_links = 0

        dkim_signature = 'Present' if msg.get('DKIM-Signature') else 'Absent'


    return {
        'filename': os.path.basename(file_path),
        'body': mail.body,
        'subject': mail.subject,
        'comments': ' '.join(comments),
        'text_plain': 1 if mail.text_plain else 0,
        'text_html': 1 if mail.text_html else 0,
        'text_not_managed': 1 if mail.text_not_managed else 0,
        'defects': str(mail.defects),
        'defects_categories': str(mail.defects_categories),
        'number of unsubscribe links': len(unsubscribe_links),
        'number of undecodable characters': unknown_chars_count,
        'tracking pixel present': tracking_pixel,
        'total num of images': len(soup.find_all('img')) if soup else 0,
        'total links in email': total_links,
        'email size (bytes)': len(body),
        'dkim-signature': dkim_signature
    }




warranted = False
unwarranted = True

infos = []
failed_files = []



start_time = time.time()

# Use for warranted
if warranted:

    # Directory to search
    directory = 'C:\\Users\\ericb\\Desktop\\Research\\Primary@gmail.com\\Cleaned_Mail\\2023\\'
    output_directory = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\'

    # Ensure output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for dirpath, dirnames, filenames in os.walk(directory):
        print(f"Processing {dirpath}")
        for file_name in filenames:
            file_path = os.path.join(dirpath, file_name)
            info = extract_info_from_email(file_path)
            if info:
                infos.append(info)

if unwarranted:

    directory = 'C:\\Users\\ericb\\Desktop\\Research\\Bruce\\'
    output_directory = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\'

    # Ensure output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for dirpath, dirnames, filenames in os.walk(directory):
        # Split the directory path to get individual parts
        path_parts = dirpath.split(os.sep)

        # Check if the path has at least two parts (to avoid index error)
        # and if the second last part is a year between 2018 and 2023
        if len(path_parts) > 2 and path_parts[-2].isdigit() and 2018 <= int(path_parts[-2]) <= 2023:
            print(f"Processing {dirpath}")
            for file_name in filenames:
                file_path = os.path.join(dirpath, file_name)
                info = extract_info_from_email(file_path)
                if info:
                    infos.append(info)
        else:
            print(f"Skipping {dirpath}")

if infos:
    df = pd.DataFrame(infos)
    if not df.empty:
        if warranted:
            csv_path = os.path.join(output_directory, f"warranted_preprocessed_files_v2.csv")
        elif unwarranted:
            csv_path = os.path.join(output_directory, f"unwarranted_preprocessed_files_v2.csv")
        print('saving to excel')
        df.to_csv(csv_path, index=False)

# Saving the failed filenames to a CSV
if failed_files:
    failed_df = pd.DataFrame({'Failed Files': failed_files})
    failed_df.to_csv("failed_files.csv", index=False)
    print(f"Failed to parse {len(failed_files)} files. See failed_files.csv for details.")

print("Script execution completed.")
print(f"Time taken: {time.time() - start_time} seconds")

Skipping C:\Users\ericb\Desktop\Research\Bruce\
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\01
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\02
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\03
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\04
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\05
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\06
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\07
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\08
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\09
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\10
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\11
Skipping C:\Users\ericb\Desktop\Research\Bruce\2013\12
Skipping C:\Users\ericb\Desktop\Research\Bruce\2014
Skipping C:\Users\ericb\Desktop\Research\Bruce\2014\2014
Skipping C:\Users\ericb\Desktop\Research\Bruce\2014\2014\01
Skipping C:\Users\ericb\Desktop\Research\Bruce\2014\2014\02
Skipping C:

More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from 61.206.120.243 153.127.216.225 by 103.30.243.137 Postfix with ESMTP 20151028 S8257F224EB04475AA2B8A21F37B22469 id 196042D6 for <bruce@bruce-guenter.dyndns.org> from <mailmaster@fut-----ure2o50.com>; Mon, 1 Jan 2018 21:31:52 +0900 JST
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from 61.206.120.243 153.127.216.225 by 103.30.243.137 Postfix with ESMTP 20151028 S8257F224EB04475AA2B8A21F37B22469 id 196042D6 for <bruce@bruce-guenter.dyndns.org> from <mailmaster@fut-----ure2o50.com>; Mon, 1 Jan 2018 21:31:52 +0900 JST
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from 61.206.120.243 153.127.216.225 by 103

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\02


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in by mail.three.co.id Postfix, from userid 359 id DEAD6B16CBA; Fri, 02 Feb 2018 11:14:06 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in by mail.three.co.id Postfix, from userid 359 id DEAD6B16CBA; Fri, 02 Feb 2018 11:14:06 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in by mail.unifa.ac.id Postfix, from userid 952 id 8047EA4DD14; Fri, 02 Feb 2018 10:58:00 +0530
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in by mail.unifa.ac.id Postfix, from userid 952 id 8047EA4DD14; Fri, 02 Feb 2018 10:58:00 +0530
More tha

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\03


More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from sd103-198-4-13.tl-host.com 103.198.4.13 by 120.143.54.114 Postfix with ESMTP 20180104-64 S2693E581B6E64484891CB9C4CC99EA0C id F56A902A for <lists-bikini@bruce-guenter.dyndns.org> from <error_mail-lists-bikini=bruce-guenter.dyndns.org@eme-rald.net>; Sun, 4 Mar 2018 19:57:49 +0900 JST
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from sd103-198-4-13.tl-host.com 103.198.4.13 by 120.143.54.114 Postfix with ESMTP 20180104-64 S2693E581B6E64484891CB9C4CC99EA0C id F56A902A for <lists-bikini@bruce-guenter.dyndns.org> from <error_mail-lists-bikini=bruce-guenter.dyndns.org@eme-rald.net>; Sun, 4 Mar 2018 19:57:49 +0900 JST
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-s

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\04


More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from W-51-3 192.168.169.128 by m.ranking-search.jp with XMail 1.12 Win32/Ix86 ESMTP Server id <S6ABD9989> for <bruce@bruce-guenter.dyndns.org> from <error@ranking-search.jp>; Sun, 01 Apr 2018 21:39:08 +0900
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from W-51-3 192.168.169.128 by m.ranking-search.jp with XMail 1.12 Win32/Ix86 ESMTP Server id <S6ABD9989> for <bruce@bruce-guenter.dyndns.org> from <error@ranking-search.jp>; Sun, 01 Apr 2018 21:39:08 +0900
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from W-51-3 192.168.169.128 by m.ranking-search.jp with XMail 1.12 Win32/Ix86 ESMTP Server id <S6AC25E48> 

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2018\04\1524796568.16000_39.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2018\05


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mailhost8.tgg.net.id Postfix with ESMTP id B87233E26E9 for <bruce@untroubled.org>; Tue, 8 May 2018 17:56:47 +0700 WIT
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mailhost8.tgg.net.id 127.0.0.1 by localhost mailhost8.tgg.net.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id UA-B00_HOTVk for <bruce@untroubled.org>; Tue, 8 May 2018 17:56:46 +0700 WIT
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mailhost8.tgg.net.id Postfix with ESMTP id E6C833E26F2 for <bruce@untroubled.org>; Tue, 8 May 2018 17:56:45 +0700 WIT
More than one match found for

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2018\05\1525900162.18576_5.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2018\06


Email content 'alternative' not handled


Processing C:\Users\ericb\Desktop\Research\Bruce\2018\07


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 0to10rm.com by mail.0to10rm.com with esmtpsa TLSv1.2:ULI-KBG-AES256-AXU-SHA207:506 with local Exim 4.84_2 envelope-from <xnueorwcvfo@0to10rm.com> id dS07ta-u1745a-zE for cvs@bruce-guenter.dyndns.org; id YeLvMS-8blt70-s3; Mon, 16 Jul 2018 17:51:42 -0800
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 0to10rm.com by mail.0to10rm.com with esmtpsa TLSv1.2:ULI-KBG-AES256-AXU-SHA207:506 with local Exim 4.84_2 envelope-from <xnueorwcvfo@0to10rm.com> id dS07ta-u1745a-zE for cvs@bruce-guenter.dyndns.org; id YeLvMS-8blt70-s3; Mon, 16 Jul 2018 17:51:42 -0800
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\08


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 03.com by mail.03.com with esmtpsa TLSv1.2:UVQ-RAX-AES256-MHI-SHA774:663 with local Exim 4.84_2 envelope-from <vogeseqayn@03.com> id 6u3Ec4-5lTtIv-i2 for cvs@bruce-guenter.dyndns.org; id T4gvvQ-C1H756-K1; Thu, 02 Aug 2018 03:44:39 -0800
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 03.com by mail.03.com with esmtpsa TLSv1.2:UVQ-RAX-AES256-MHI-SHA774:663 with local Exim 4.84_2 envelope-from <vogeseqayn@03.com> id 6u3Ec4-5lTtIv-i2 for cvs@bruce-guenter.dyndns.org; id T4gvvQ-C1H756-K1; Thu, 02 Aug 2018 03:44:39 -0800
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 03.com by 

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\09
Processing C:\Users\ericb\Desktop\Research\Bruce\2018\10


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost.localdomain 127.0.0.1 by mail.rs-jih.co.id Postfix with ESMTP id 2C7055AE4B9B for <bgware-owner@lists.untroubled.org>; Tue, 19 Jun 2018 15:22:10 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.rs-jih.co.id 127.0.0.1 by localhost mail.rs-jih.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id PQyCcFY1jl02 for <bgware-owner@lists.untroubled.org>; Tue, 19 Jun 2018 15:22:08 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost.localdomain 127.0.0.1 by mail.rs-jih.co.id Postfix with ESMTP id 7D8D2502896E for <bgware-owner@lists.untroubled.org>; Mon, 23 A

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\11


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1732237AbeKVEBy ORCPT <rfc822;bruce@untroubled.org> ; Wed, 21 Nov 2018 23:01:54 -0500
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from mail-ot1-f65.google.com 209.85.210.65 :42437 "EHLO mail-ot1-f65.google.com" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1726711AbeKVEBy ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Wed, 21 Nov 2018 23:01:54 -0500
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1732237AbeKVEBy ORCPT <rfc822;bruce@untroubled.org> ; Wed, 21 Nov 2018 23:01:54 -0500
More than one mat

Processing C:\Users\ericb\Desktop\Research\Bruce\2018\12
Skipping C:\Users\ericb\Desktop\Research\Bruce\2019
Processing C:\Users\ericb\Desktop\Research\Bruce\2019\01
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2019\01\1548690882.12485_3.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2019\02


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 114.129.17.74 port=46264 helo= ip-17-74.hypernet.co.id by belle.uairs.com with esmtpsa TLSv1:ECDHE-RSA-AES256-SHA:256 Exim 4.91 envelope-from <reservations@uairs.com> id 1gw4Os-0001xQ-KO for bruceg@untroubled.org; Tue, 19 Feb 2019 07:21:27 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 114.129.17.74 port=46264 helo= ip-17-74.hypernet.co.id by belle.uairs.com with esmtpsa TLSv1:ECDHE-RSA-AES256-SHA:256 Exim 4.91 envelope-from <reservations@uairs.com> id 1gw4Os-0001xQ-KO for bruceg@untroubled.org; Tue, 19 Feb 2019 07:21:27 -0500


Processing C:\Users\ericb\Desktop\Research\Bruce\2019\03
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2019\03\1552425548.19820_27.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2019\04


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726931AbfDLKby convert rfc822-to-8bit ORCPT <rfc822;bruce@untroubled.org> ; Fri, 12 Apr 2019 06:31:54 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726931AbfDLKby convert rfc822-to-8bit ORCPT <rfc822;bruce@untroubled.org> ; Fri, 12 Apr 2019 06:31:54 -0400
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from mail.funed.mg.gov.br 172.16.110.15 by funed.mg.gov.br stage1 with esmtps Exim MailCleaner id 1hHtkO-0003z8-Bm from <andre.moreti@funed.mg.gov.br>; Sat, 20 Apr 2019 14:25:52 -0300
More than 

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\05
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2019\05\1558182193.7123_61.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2019\06


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from ip69-151.cbn.net.id 202.158.69.151 :40772 "EHLO mail.seinoindomobil.co.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S2405070AbfFKT6D ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Tue, 11 Jun 2019 15:58:03 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.seinoindomobil.co.id Postfix with ESMTP id 708EC6B13F0; Tue, 11 Jun 2019 22:16:11 -0400 EDT
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.seinoindomobil.co.id 127.0.0.1 by localhost mail.seinoindomobil.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id MLGpIpF7h-kW; Tue, 11 Jun 2019 22:16:11 

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\07


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from nig-proxy-02.zimbra.id 10.69.112.7 by mx2.innovazionedigitale.it with ESMTP id x670cKe1021214-x670cKe3021214 version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=NO ; Sun, 7 Jul 2019 02:38:22 +0200
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by nig-proxy-02.zimbra.id Postfix with ESMTP id C83C0849AD; Sat, 6 Jul 2019 05:59:29 +0200 CEST
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from nig-proxy-02.zimbra.id 127.0.0.1 by localhost nig-proxy-02.zimbra.id 127.0.0.1 amavisd-new, port 10026 with ESMTP id SJZztTTxHC_V; Sat, 6 Jul 2019 05:59:29 +0200 CEST
More than one m

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\08


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from cnappc-proxy-03.zimbra.id relay.awn.local 10.111.1.15 using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 256/256 bits No client certificate requested by mx2-cnappc.innovazionedigitale.it Postfix with ESMTPS id C301C419D9; Mon, 12 Aug 2019 06:02:42 +0200 CEST
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by cnappc-proxy-03.zimbra.id Postfix with ESMTP id 3931F2812D5; Mon, 12 Aug 2019 06:03:02 +0200 CEST
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from cnappc-proxy-03.zimbra.id 127.0.0.1 by localhost cnappc-proxy-03.zimbra.id 127.0.0.1 amavisd-new, port 10026 with ESMTP id e_0z

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\09


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from kudraw.host 10.1.173.173 by kudraw.host id g3XmhoQK2IBY for <bruce@untroubled.org>; Sun, 08 Sep 2019 13:23:05 +0200 envelope-from <return@kudraw.host>
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from kudraw.host 10.1.173.173 by kudraw.host id g3XmhoQK2IBY for <bruce@untroubled.org>; Sun, 08 Sep 2019 13:23:05 +0200 envelope-from <return@kudraw.host>


No body content found for C:\Users\ericb\Desktop\Research\Bruce\2019\09\1568127121.10878_3.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from werkudoro.jatengprov.go.id 103.9.227.34 :45448 "EHLO werkudoro.jatengprov.go.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S2404481AbfISTvF ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Thu, 19 Sep 2019 15:51:05 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost 127.0.0.1 :40052 helo=werkudoro.jatengprov.go.id by werkudoro.jatengprov.go.id with esmtpa Exim 4.92 envelope-from <bpsdmd@jatengprov.go.id> id 1iAuPJ-0000hD-4w; Thu, 19 Sep 2019 18:15:30 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from werkudoro.jatengprov.go.id 103.9.227.34 :45448 "EHLO werkudoro.jatengprov.go.id" rh

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\10


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mailgate.kemenperin.go.id 202.47.80.81 :44568 "EHLO mailgate.kemenperin.go.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1727535AbfJAASc ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 30 Sep 2019 20:18:32 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mailgate.kemenperin.go.id Postfix with ESMTP id D679337E8A37; Tue, 1 Oct 2019 06:19:04 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mailgate.kemenperin.go.id 127.0.0.1 by localhost mailgate.kemenperin.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id qvW3UKqE6MrA; Tue, 1 Oct 2019 06:19:

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\11


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from werkudoro.jatengprov.go.id 103.9.227.34 :37156 "EHLO werkudoro.jatengprov.go.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1725806AbfKFGc6 ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Wed, 6 Nov 2019 01:32:58 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost 127.0.0.1 :42104 helo=werkudoro.jatengprov.go.id by werkudoro.jatengprov.go.id with esmtpa Exim 4.92 envelope-from <bpsdmd@jatengprov.go.id> id 1iSErM-0002Il-9G; Wed, 06 Nov 2019 13:32:05 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from werkudoro.jatengprov.go.id 103.9.227.34 :37156 "EHLO werkudoro.jatengprov.go.id" rho

Processing C:\Users\ericb\Desktop\Research\Bruce\2019\12
Skipping C:\Users\ericb\Desktop\Research\Bruce\2020
Processing C:\Users\ericb\Desktop\Research\Bruce\2020\01
Processing C:\Users\ericb\Desktop\Research\Bruce\2020\02


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from riverson.tk 10.1.243.91 by riverson.tk id yF3xRgR8xdBY for <bruce@untroubled.org>; Sun, 02 Feb 2020 13:53:38 +0100 envelope-from <return@riverson.tk>
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from riverson.tk 10.1.243.91 by riverson.tk id yF3xRgR8xdBY for <bruce@untroubled.org>; Sun, 02 Feb 2020 13:53:38 +0100 envelope-from <return@riverson.tk>
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mailgate.kemenperin.go.id 202.47.80.81 :60278 "EHLO mailgate.kemenperin.go.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1726928AbgBDAol ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 3 Feb 2020 19:44:41 -0500


Processing C:\Users\ericb\Desktop\Research\Bruce\2020\03


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :25248 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1725446AbgCBH1P ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 2 Mar 2020 02:27:15 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps02.indosat.net.id 202.155.50.130 by avas11dua.indosat.net.id with ESMTP; 02 Mar 2020 14:18:31 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :25248 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1725446AbgCBH1P ORCPT <rfc822;linux-kernel@vge

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\03\1583721353.3350611_7.txt


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726749AbgCIMBY ORCPT <rfc822;bruce@untroubled.org> ; Mon, 9 Mar 2020 08:01:24 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726749AbgCIMBY ORCPT <rfc822;bruce@untroubled.org> ; Mon, 9 Mar 2020 08:01:24 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from bevenage.cf 10.65.96.125 by bevenage.cf id yEuMutglqOby for <bruce@untroubled.org>; Mon, 16 Mar 2020 15:39:35 +0100 envelope-from <return@bevenage.cf>
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\04
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\04\1586106509.519182.11394.txt


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726437AbgDEIby ORCPT <rfc822;bruce@untroubled.org> ; Sun, 5 Apr 2020 04:31:54 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from mail-vs1-f66.google.com 209.85.217.66 :42117 "EHLO mail-vs1-f66.google.com" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1726308AbgDEIby ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Sun, 5 Apr 2020 04:31:54 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S1726437AbgDEIby ORCPT <rfc822;bruce@untroubled.org> ; Sun, 5 Apr 2020 04:31:54 -0400
More than one match 

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\04\1587135810.501316_1.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.lintas.net.id 103.242.106.93 :52582 "EHLO mail.lintas.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1725830AbgDQEtC ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Fri, 17 Apr 2020 00:49:02 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.lintas.net.id Postfix with ESMTP id EF8FB303A30C4; Fri, 17 Apr 2020 11:49:48 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.lintas.net.id 127.0.0.1 by localhost mail.lintas.net.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id s5bwoeZhBxtd; Fri, 17 Apr 2020 11:49:48 +0700 WIB
More than one mat

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\05


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 1011thefarm.com by mail.1011thefarm.com with esmtpsa TLSv1.2:LWO-PBG-AES256-VZZ-SHA723:428 with local Exim 4.84_2 envelope-from <vnbielh@1011thefarm.com> id tK1slo-Cm5oh7-Q8 for bruce@untroubled.org; id 5fpXW3-ZAJ109-5L; Thu, 07 May 2020 12:04:15 -0800
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from www-data by 1011thefarm.com by mail.1011thefarm.com with esmtpsa TLSv1.2:LWO-PBG-AES256-VZZ-SHA723:428 with local Exim 4.84_2 envelope-from <vnbielh@1011thefarm.com> id tK1slo-Cm5oh7-Q8 for bruce@untroubled.org; id 5fpXW3-ZAJ109-5L; Thu, 07 May 2020 12:04:15 -0800
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in 

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\06


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from hs-162.6.buanalintas.co.id 223.165.6.162 :38232 "EHLO mx.bestprofit-futures.co.id" rhost-flags-OK-FAIL-OK-OK by vger.kernel.org with ESMTP id S1725290AbgFAEkF ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 1 Jun 2020 00:40:05 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mx.bestprofit-futures.co.id Postfix with ESMTP id 9E2F152556E; Mon, 1 Jun 2020 07:42:11 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mx.bestprofit-futures.co.id 127.0.0.1 by localhost mx.bestprofit-futures.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id hNCmi7HFPN8j; Mon, 1 Jun 2

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\07


Email content 'alternative' not handled
Email content 'alternative' not handled
Email content 'alternative' not handled
Email content 'alternative' not handled


No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\07\1593808256.385615.25986.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 181.199.123.9 port=55702 by kelud.vnt.net.id with esmtpsa TLS1.2 tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 Exim 4.93 envelope-from <cc@gtvhotel.co.id> id 1jxaGQ-0008DP-J0 for bruce@untroubled.org; Tue, 21 Jul 2020 01:11:47 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 181.199.123.9 port=55702 by kelud.vnt.net.id with esmtpsa TLS1.2 tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 Exim 4.93 envelope-from <cc@gtvhotel.co.id> id 1jxaGQ-0008DP-J0 for bruce@untroubled.org; Tue, 21 Jul 2020 01:11:47 +0700
Email content 'alternative' not handled
Email content 'alternative' not handled
Email content 'alternative' not handled


Processing C:\Users\ericb\Desktop\Research\Bruce\2020\08


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from sonic312-20.consmr.mail.bf2.yahoo.com 74.6.128.82 :33740 "EHLO sonic312-20.consmr.mail.bf2.yahoo.com" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1729410AbgH1NbY ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Fri, 28 Aug 2020 09:31:24 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from sonic312-20.consmr.mail.bf2.yahoo.com 74.6.128.82 :33740 "EHLO sonic312-20.consmr.mail.bf2.yahoo.com" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1729410AbgH1NbY ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Fri, 28 Aug 2020 09:31:24 -0400


Processing C:\Users\ericb\Desktop\Research\Bruce\2020\09


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.metaglobal.biz.id Postfix with ESMTP id E75F211D013; Mon, 7 Sep 2020 19:57:13 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.metaglobal.biz.id 127.0.0.1 by localhost mail.metaglobal.biz.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id slvCoF2H4zvT; Mon, 7 Sep 2020 19:57:13 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.metaglobal.biz.id Postfix with ESMTP id 8FBF4130EA3; Mon, 7 Sep 2020 19:57:13 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\09\1601492297.781943.19213.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2020\10
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\10\1601692972.976289.88848.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\10\1601822210.638935.114294.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\10\1601910065.392701.130197.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\10\1601939847.883654.138423.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :59399 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S2437714AbgJQIp6 ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Sat, 17 Oct 2020 04:45:58 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps02.indosat.net.id 202.155.50.130 by avas11dua.indosat.net.id with ESMTP; 17 Oct 2020 15:31:43 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :59399 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S2437714AbgJQIp6 ORCPT <rfc822;linux-kernel@vg

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\11


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bata.co.id 117.54.3.130 :48402 "EHLO mail.bata.co.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1726855AbgKKOHb ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Wed, 11 Nov 2020 09:07:31 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bata.co.id Postfix with ESMTP id 3865E6C84B5; Wed, 11 Nov 2020 20:41:54 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bata.co.id 127.0.0.1 by localhost mail.bata.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id kG1Zw_4GK8hI; Wed, 11 Nov 2020 20:41:53 +0700 WIB
More than one match found for [^\w](

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\11\1606083484.565839.51364.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\11\1606101463.575981.53412.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bata.co.id 117.54.3.130 :34890 "EHLO mail.bata.co.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1728868AbgKWTEF ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 23 Nov 2020 14:04:05 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bata.co.id Postfix with ESMTP id D48C26C8DDC; Tue, 24 Nov 2020 01:16:17 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bata.co.id 127.0.0.1 by localhost mail.bata.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id VALUw7-zoyqR; Tue, 24 Nov 2020 01:16:17 +0700 WIB
More than one match found for [^\w](

Processing C:\Users\ericb\Desktop\Research\Bruce\2020\12


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps01.indosat.net.id 202.155.50.151 by avas11dua.indosat.net.id with ESMTP; 13 Dec 2020 12:52:37 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps01.indosat.net.id 202.155.50.151 by avas11dua.indosat.net.id with ESMTP; 13 Dec 2020 12:52:37 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.blitar.go.id 103.148.208.194 :54206 "EHLO mail.blitarkota.go.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S2388136AbgLNUeF ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 14 Dec 2020 15:34:05 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\12\1608214578.5352_9.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\12\1608214579.5352_11.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\12\1608312437.2506378_19.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2020\12\1609054227.159348.128447.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.pim.co.id 118.97.151.36 :33054 "EHLO mail.pim.co.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S1726289AbgL3Ukr ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Wed, 30 Dec 2020 15:40:47 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.pim.co.id Postfix with ESMTP id 81120900CFB0C; Wed, 30 Dec 2020 15:47:25 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.pim.co.id 127.0.0.1 by localhost mail.pim.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id o5ShcRzcapVs; Wed, 30 Dec 2020 15:47:19 +0700 WIB
More than one match found for [^\w](?:

Skipping C:\Users\ericb\Desktop\Research\Bruce\2021
Processing C:\Users\ericb\Desktop\Research\Bruce\2021\01


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.pim.co.id Postfix with ESMTP id 44F34903F9501; Thu, 31 Dec 2020 07:12:13 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.pim.co.id 127.0.0.1 by localhost mail.pim.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id MbETIMD5PwCc; Thu, 31 Dec 2020 07:12:13 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.pim.co.id Postfix with ESMTP id 9E3B8901C3B0F; Wed, 30 Dec 2020 23:27:58 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! c

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\02


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 157.subnet-59.matrixglobal.net.id 103.28.59.157 :42034 "EHLO mail.rsudmajalaya.com" rhost-flags-OK-OK-OK-FAIL by vger.kernel.org with ESMTP id S229522AbhBGAtf ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Sat, 6 Feb 2021 19:49:35 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 157.subnet-59.matrixglobal.net.id 103.28.59.157 :42034 "EHLO mail.rsudmajalaya.com" rhost-flags-OK-OK-OK-FAIL by vger.kernel.org with ESMTP id S229522AbhBGAtf ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Sat, 6 Feb 2021 19:49:35 -0500
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 157.subnet-59.matrixglobal.net.id 103.28.59.157 

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\02\1614089310.3092219_9.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2021\03


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from User gateway 10.10.1.1 by serverkejari.trenggalekkab.go.id Postfix with SMTP id 2EA2BC7D1257; Tue, 2 Mar 2021 06:02:32 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from User gateway 10.10.1.1 by serverkejari.trenggalekkab.go.id Postfix with SMTP id 2EA2BC7D1257; Tue, 2 Mar 2021 06:02:32 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from User gateway 10.10.1.1 by serverkejari.trenggalekkab.go.id Postfix with SMTP id B1160CBDBC89; Wed, 3 Mar 2021 06:09:21 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\04


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :53803 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S229448AbhDLEck ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Mon, 12 Apr 2021 00:32:40 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps01.indosat.net.id 202.155.50.151 by avas11dua.indosat.net.id with ESMTP; 12 Apr 2021 11:29:33 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :53803 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S229448AbhDLEck ORCPT <rfc822;linux-kernel@vger

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\04\1618329571.605075.48552.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\04\1618415397.246582.66703.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :28112 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S229519AbhD3AP4 ORCPT <rfc822;linux-kernel@vger.kernel.org> ; Thu, 29 Apr 2021 20:15:56 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from im2mailps01.indosat.net.id 202.155.50.151 by avas11dua.indosat.net.id with ESMTP; 30 Apr 2021 07:12:12 +0700
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from avas10dua.indosat.net.id 219.83.54.102 :28112 "EHLO avas10dua.indosat.net.id" rhost-flags-OK-OK-OK-OK by vger.kernel.org with ESMTP id S229519AbhD3AP4 ORCPT <rfc822;linux-kernel@vger

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\05


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from asav.customs.go.id localhost 127.0.0.1 by asav.customs.go.id Postfix with ESMTPS id D84D762F18B2; Mon, 3 May 2021 12:38:35 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from asav.customs.go.id localhost 127.0.0.1 by asav.customs.go.id Postfix with ESMTPS id D2BE06295D88; Mon, 3 May 2021 12:24:29 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 100.94.132.215 unknown 10.0.16.50 by asav.customs.go.id Postfix with ESMTPSA id 4CAD2626C635; Mon, 3 May 2021 12:24:14 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?!

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\06


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S230171AbhFJIbY ORCPT <rfc822;bruce@untroubled.org> ; Thu, 10 Jun 2021 04:31:24 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S230171AbhFJIbY ORCPT <rfc822;bruce@untroubled.org> ; Thu, 10 Jun 2021 04:31:24 -0400
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.ombudsman.go.id mail.ombudsman.go.id 103.234.209.129 using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 256/256 bits key-exchange X25519 server-signature RSA-PSS 2048 bits server-digest SHA256 No client certificate requested by mx.spamdor.

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\07
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\07\1625533557.523781_1.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bata.co.id Postfix with ESMTP id F182B6E2FB1; Wed, 28 Jul 2021 16:10:28 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bata.co.id 127.0.0.1 by localhost mail.bata.co.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id yFzRd6VKFlCH; Wed, 28 Jul 2021 16:10:28 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bata.co.id Postfix with ESMTP id A33866E31D9; Wed, 28 Jul 2021 16:10:27 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! c

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\08


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.blitarkota.go.id Postfix with ESMTP id EF3C05FA8D8; Tue, 3 Aug 2021 05:21:40 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.blitarkota.go.id 127.0.0.1 by localhost mail.blitarkota.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id VzyhC2ekmUgr; Tue, 3 Aug 2021 05:21:40 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.blitarkota.go.id Postfix with ESMTP id AB26E5FA8CA; Tue, 3 Aug 2021 05:21:40 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+fr

Processing C:\Users\ericb\Desktop\Research\Bruce\2021\09


Email content 'related' not handled


No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\09\1631235278.118096.10535.txt


More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S233409AbhIVIBy ORCPT <rfc822;bruce@untroubled.org> ; Wed, 22 Sep 2021 04:01:54 -0400
More than one match found for (?:by\s+(?P<by>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in majordomo@vger.kernel.org by vger.kernel.org via listexpand id S233409AbhIVIBy ORCPT <rfc822;bruce@untroubled.org> ; Wed, 22 Sep 2021 04:01:54 -0400


Processing C:\Users\ericb\Desktop\Research\Bruce\2021\10
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\10\1635480402.284793_1.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2021\11
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\11\1636139456.3293378_1.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2021\12
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2021\12\1638745292.821789_1.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.blitarkota.go.id Postfix with ESMTP id 54F5B755BA8; Thu, 9 Dec 2021 11:34:25 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.blitarkota.go.id 127.0.0.1 by localhost mail.blitarkota.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id vHrn8Gbdj7iK; Thu, 9 Dec 2021 11:34:25 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.blitarkota.go.id Postfix with ESMTP id 135DF766326; Thu, 9 Dec 2021 11:34:25 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+fr

Skipping C:\Users\ericb\Desktop\Research\Bruce\2022
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\01


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 192.168.0.23 unknown 200.115.30.148 by ukdc.ac.id Postfix with ESMTP id 1FEE64CE8426 for <bruce@untroubled.org>; Tue, 28 Dec 2021 01:31:53 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 192.168.0.23 unknown 200.115.30.148 by ukdc.ac.id Postfix with ESMTP id 1FEE64CE8426 for <bruce@untroubled.org>; Tue, 28 Dec 2021 01:31:53 +0700 WIB


No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\01\1641772483.133539_1.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 192.168.0.23 unknown 200.115.30.148 by ukdc.ac.id Postfix with ESMTP id 9E2EA48A472D for <bruce@untroubled.org>; Mon, 27 Dec 2021 19:27:52 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from 192.168.0.23 unknown 200.115.30.148 by ukdc.ac.id Postfix with ESMTP id 9E2EA48A472D for <bruce@untroubled.org>; Mon, 27 Dec 2021 19:27:52 +0700 WIB


Processing C:\Users\ericb\Desktop\Research\Bruce\2022\02
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\03
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\03\1646665776.874688_1.txt


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.belitungtimurkab.go.id localhost.localdomain 127.0.0.1 by mail.belitungtimurkab.go.id Postfix with ESMTPS id EC4408A4965; Thu, 17 Mar 2022 20:16:04 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost.localdomain 127.0.0.1 by mail.belitungtimurkab.go.id Postfix with ESMTP id 157FF8A4983; Thu, 17 Mar 2022 20:12:15 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.belitungtimurkab.go.id 127.0.0.1 by localhost mail.belitungtimurkab.go.id 127.0.0.1 amavisd-new, port 10026 with ESMTP id g5WCg_o_VSho; Thu, 17 Mar 2022 20:12:14 +0700 WIB
More than one match found for [^\w](

Processing C:\Users\ericb\Desktop\Research\Bruce\2022\04


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mta.kemenpora.go.id mail.kemenpora.go.id 202.52.49.154 by smtp-out-n04.prod.us-west-2.postgun.com with SMTP id 6247e95a62d5dae618ab5ff1 version=TLS1.3, cipher=TLS_AES_128_GCM_SHA256 ; Sat, 02 Apr 2022 06:12:42 GMT
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mta.kemenpora.go.id Postfix with ESMTP id 11A7E7F867; Sat, 2 Apr 2022 13:12:39 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mta.kemenpora.go.id 127.0.0.1 by localhost mta.kemenpora.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id XdDmD9K1Ycp2; Sat, 2 Apr 2022 13:12:38 +0700 WIB
More than one match foun

No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\04\1649122098.373328_3.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\04\1651172209.507253_1.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\05


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost.localdomain 127.0.0.1 by mail.pekanbaru.go.id Postfix with ESMTP id 3B351994E9C; Thu, 19 May 2022 10:10:58 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.pekanbaru.go.id 127.0.0.1 by localhost mail.pekanbaru.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id LWPH_nemlOjJ; Thu, 19 May 2022 10:10:57 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost.localdomain 127.0.0.1 by mail.pekanbaru.go.id Postfix with ESMTP id 46F09994E4E; Thu, 19 May 2022 10:10:42 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(

Processing C:\Users\ericb\Desktop\Research\Bruce\2022\06
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\06\1655173915.580807_3.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\07
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\08
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\09
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\10


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.lintas.net.id Postfix with ESMTP id 51E3D17336DD; Mon, 10 Oct 2022 13:03:35 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.lintas.net.id 127.0.0.1 by localhost mail.lintas.net.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id egwnaTgtvboZ; Mon, 10 Oct 2022 13:03:34 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.lintas.net.id Postfix with ESMTP id AF5A617336CF; Mon, 10 Oct 2022 13:03:32 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+b

Processing C:\Users\ericb\Desktop\Research\Bruce\2022\11
Processing C:\Users\ericb\Desktop\Research\Bruce\2022\12
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\12\1670307128.1385316_261.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2022\12\1670989087.487487_81.txt
Skipping C:\Users\ericb\Desktop\Research\Bruce\2023
Processing C:\Users\ericb\Desktop\Research\Bruce\2023\01
Processing C:\Users\ericb\Desktop\Research\Bruce\2023\02
Processing C:\Users\ericb\Desktop\Research\Bruce\2023\03


Email content 'rfc822-headers' not handled


Processing C:\Users\ericb\Desktop\Research\Bruce\2023\04


More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bekasikab.go.id Postfix with ESMTP id A1BDC24779EC8; Sat, 22 Apr 2023 11:47:38 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from mail.bekasikab.go.id 127.0.0.1 by localhost mail.bekasikab.go.id 127.0.0.1 amavisd-new, port 10032 with ESMTP id hdofGn3MIDcD; Sat, 22 Apr 2023 11:47:38 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from localhost localhost 127.0.0.1 by mail.bekasikab.go.id Postfix with ESMTP id 868F93681B948; Sat, 22 Apr 2023 03:00:43 +0700 WIB
More than one match found for [^\w](?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s

Processing C:\Users\ericb\Desktop\Research\Bruce\2023\05
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1684124888.868340_1.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1684771329.1077101_3.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1684771341.1077103_61.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1685167221.1203693_1.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1685335739.1244660_1.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\05\1685335745.1244664_19.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2023\06
Processing C:\Users\ericb\Desktop\Research\Bruce\2023\07
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\07\1689703443.378259_1.txt
No body content found for C:\Users\ericb\Desktop\Research\Bruce\2023\07\1690784733.1132874_5.txt
Processing C:\Users\ericb\Desktop\Research\Bruce\2023

In [41]:
# Now that I have the data, I can clean the textual data for use in the model
# print(df['body'][0])
df.head(50)

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,number of undecodable characters,tracking pixel present,total num of images,total links in email,email size (bytes),dkim-signature
0,1514898318.13268_223.txt,"<html><body><span style=""font-family:Verdana; ...",HI,,0,1,0,[],set(),0,0,False,0,0,318,Absent
1,1514898318.13268_225.txt,Science of Numerological Analysis\n\n\nhttp://...,Science of Numerological Analysis,,1,1,0,[],set(),1,2,False,0,23,6929,Present
2,1514898318.13268_227.txt,Science of Numerological Analysis\n\n\nhttp://...,Science of Numerological Analysis,,1,1,0,[],set(),1,2,False,0,23,6945,Present
3,1514898318.13268_229.txt,\n\nThese chicks are looking for casual sex an...,These chicks are looking for a booty call.,,1,1,0,[],set(),0,0,False,0,11,6812,Absent
4,1514911925.5197_1.txt,附件为18年计划表 \n--- mail_boundary ---\nContent-Typ...,回复：2018年计划表,,1,1,0,[],set(),0,0,False,0,0,511,Absent
5,1514911941.5199_1.txt,Voted Top Gift for 2017! Stealth SmartCam! F...,Voted Top Gift for 2017! Stealth SmartCam! F...,,1,1,0,[],set(),0,0,False,0,13,3131,Present
6,1514911941.5199_101.txt,"<a href=""http://vf6nasru.com/log_index.php?pag...",お早う御座います。届いてますか？,,0,1,0,[],set(),0,0,False,0,2,174,Absent
7,1514911941.5199_103.txt,"<a href=""http://p8mxvmft.com/log_index.php?pag...",このメールが最後のご連絡になる可能性も少なくはありません。誤解を招かない為にも、ちゃんと事情...,,0,1,0,[],set(),0,0,False,0,2,189,Absent
8,1514911941.5199_105.txt,"<a href=""http://p8mxvmft.com/log_index.php?pag...",貴方の口座を教えて下さいますか？その口座に「1億円」の支援金、私から貴方への「返済不要の支援...,,0,1,0,[],set(),0,0,False,0,2,188,Absent
9,1514911941.5199_107.txt,"<a href=""http://sm2ecizx.com/log_index.php?pag...",※ご入金がありましたので至急ご確認下さい※,,0,1,0,[],set(),0,0,False,0,2,182,Absent


In [42]:
# output_csv = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_preprocessed_files.csv'
output_csv = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_preprocessed_files_v2.csv'
# Open the CSV file and read it into a DataFrame
df_to_clean = pd.read_csv(output_csv)

In [43]:
import re
import emoji
from bs4 import BeautifulSoup, NavigableString, Tag
import quopri
import base64

def replace_emojis(text):
    # Replace emojis with their text description
    return emoji.demojize(text, delimiters=("", ""))

# def replace_urls_based_on_context(html_content):
#     soup = BeautifulSoup(html_content, 'lxml')
#     for a_tag in soup.find_all('a'):
#         href = a_tag.get('href', '')
#         url_type = 'UNSAFE_' if href.startswith('http://') else ''
#         if a_tag.img:
#             a_tag.string = f'{url_type}IMAGE_URL'
#         elif isinstance(a_tag.next, NavigableString) and a_tag.next.strip():
#             a_tag.string = f'{url_type}LINK_URL'
#         else:
#             a_tag.string = f'{url_type}BUTTON_URL'
#     return str(soup)

def replace_elements_in_html(soup):

    # Replace URLs in 'a' tags
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href', '')
        url_type = ' UNSAFE_' if href.startswith('http://') else ' '
        if a_tag.img:
            a_tag.string = f'{url_type}IMAGE_URL '
        elif isinstance(a_tag.next, NavigableString) and a_tag.next.strip():
            a_tag.string = f'{url_type}LINK_URL '
        else:
            a_tag.string = f'{url_type}BUTTON_URL '

    # Replace src in 'img' tags not wrapped in 'a' tags
    for img_tag in soup.find_all('img'):
        if img_tag.parent.name != 'a':  # Checks if the img tag is not a child of an 'a' tag
            src = img_tag.get('src', '')
            url_type = ' UNSAFE_' if src.startswith('http://') else ''
            new_tag = soup.new_tag("span")
            new_tag.string = f'{url_type}IMAGE_URL '
            img_tag.replace_with(new_tag)

    # Replace buttons - you need to define what you consider as a button
    for button_tag in soup.find_all(['button', 'input']):
        if button_tag.get('type') in ['button', 'submit']:
            button_tag.string = ' BUTTON_URL '

    # Optionally, handle styled divs or other elements used as buttons
    for div_tag in soup.find_all('div', class_='button-style'):  # Example class name
        div_tag.string = ' BUTTON_URL '

    return soup

def replace_urls_in_text(text):
    http_url_pattern = re.compile(r'http://\S+')
    text = http_url_pattern.sub(' UNSAFE_LINK_URL ', text)
    https_url_pattern = re.compile(r'https://\S+')
    text = https_url_pattern.sub(' LINK_URL ', text)
    return text

def decode_quoted_printable(input_data):
    if isinstance(input_data, bytes):
        return quopri.decodestring(input_data).decode('utf-8', errors='replace')
    else:
        return quopri.decodestring(input_data.encode()).decode('utf-8', errors='replace')

def decode_base64(text):
    return base64.b64decode(text).decode('utf-8', errors='replace')

def clean_text(raw_text):
    # If raw_text is None, return an empty string or you could return None depending on your requirement
    if raw_text is None:
        return ''
        
    if not isinstance(raw_text, str):
        # If raw_text is not a string, return it as is or convert it to a string
        return raw_text

    #Remove line breaks and continuation equals signs
    raw_text = re.sub(r'=\n', '', raw_text)
    # Decode any quoted-printable text
    raw_text = quopri.decodestring(raw_text.encode()).decode('utf-8', errors='replace')

    # Create a BeautifulSoup object
    soup = BeautifulSoup(raw_text, 'lxml')

    # Apply replacements for elements in HTML content
    soup = replace_elements_in_html(soup)

    # Remove style and script tags and their content
    for tag in soup(['style', 'script', 'img']):
        tag.decompose()

    # # Replace URLs in 'a' tags
    # for a_tag in soup.find_all('a'):
    #     href = a_tag.get('href', '')
    #     url_type = 'UNSAFE ' if href.startswith('http://') else ''
    #     if a_tag.img:
    #         a_tag.string = f'{url_type}IMAGE URL'
    #     elif isinstance(a_tag.next, NavigableString) and a_tag.next.strip():
    #         a_tag.string = f'{url_type}LINK URL'
    #     else:
    #         a_tag.string = f'{url_type}BUTTON URL'

    # Now proceed with extracting text and further cleaning
    text = soup.get_text(separator=' ', strip=True)
    text = replace_emojis(text)

    text = replace_urls_in_text(text)

    # Remove any remaining HTML encoded characters
    text = re.sub(r'&[a-zA-Z0-9#]+;', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove Zero Width Non-Joiner characters
    text = text.replace('\u200b', '')  # Unicode for ZWNBSP
    text = text.replace('\u200c', '')  # Unicode for ZWNJ
    text = text.replace('\u200d', '')  # Unicode for ZWJ
    text = text.replace('\u200e', '')  # Unicode for LEFT-TO-RIGHT MARK
    text = text.replace('\u200f', '')  # Unicode for RIGHT-TO-LEFT MARK


    # Strip string of leading/trailing whitespace
    return text.strip()


In [45]:


# run the clean_text function on the subject and body columns
df_to_clean['subject'] = df_to_clean['subject'].apply(lambda x: clean_text(x) if x is not None else 'No Subject Found in email.')
df_to_clean['body'] = df_to_clean['body'].apply(lambda x: clean_text(x) if x is not None else 'No Body Found in email.')
df_to_clean['comments'] = df_to_clean['comments'].apply(lambda x: clean_text(x) if x is not None else'No Comments Found in email.')
print(df_to_clean['body'][0])




Can we talk ?


In [46]:
# Save the cleaned data to a new CSV
# df_to_clean.to_csv('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_v2.csv', index=False)
df_to_clean.to_csv('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_v2.csv', index=False)

In [47]:

df_to_clean.head(10)

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,number of undecodable characters,tracking pixel present,total num of images,total links in email,email size (bytes),dkim-signature
0,1514898318.13268_223.txt,Can we talk ?,HI,,0,1,0,[],set(),0,0,False,0,0,318,Absent
1,1514898318.13268_225.txt,Science of Numerological Analysis UNSAFE_LINK_...,Science of Numerological Analysis,,1,1,0,[],set(),1,2,False,0,23,6929,Present
2,1514898318.13268_227.txt,Science of Numerological Analysis UNSAFE_LINK_...,Science of Numerological Analysis,,1,1,0,[],set(),1,2,False,0,23,6945,Present
3,1514898318.13268_229.txt,These chicks are looking for casual sex and di...,These chicks are looking for a booty call.,,1,1,0,[],set(),0,0,False,0,11,6812,Absent
4,1514911925.5197_1.txt,附件为18年计划表 --- mail_boundary --- Content-Type: ...,回复：2018年计划表,,1,1,0,[],set(),0,0,False,0,0,511,Absent
5,1514911941.5199_1.txt,Voted Top Gift for 2017! Stealth SmartCam! FRE...,Voted Top Gift for 2017! Stealth SmartCam! FRE...,,1,1,0,[],set(),0,0,False,0,13,3131,Present
6,1514911941.5199_101.txt,UNSAFE_LINK_URL 】｀●┠◎´□⌒´∴㎡┳′〔★,お早う御座います。届いてますか？,,0,1,0,[],set(),0,0,False,0,2,174,Absent
7,1514911941.5199_103.txt,UNSAFE_LINK_URL male_sign〔√┘´＾⊂┓｝male_sign∫⊆┸┠...,このメールが最後のご連絡になる可能性も少なくはありません。誤解を招かない為にも、ちゃんと事情...,,0,1,0,[],set(),0,0,False,0,2,189,Absent
8,1514911941.5199_105.txt,UNSAFE_LINK_URL ゝ｛〃→⌒[Å∪≒〒×┣∇Å℃┛＼〓∈,貴方の口座を教えて下さいますか？その口座に「1億円」の支援金、私から貴方への「返済不要の支援...,,0,1,0,[],set(),0,0,False,0,2,188,Absent
9,1514911941.5199_107.txt,UNSAFE_LINK_URL 「≦∬∥％┴▲☆′,※ご入金がありましたので至急ご確認下さい※,,0,1,0,[],set(),0,0,False,0,2,182,Absent


# Drop the duplicate emails from the list of emails

In [65]:
# Path to your CSV file
input_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_v2.csv'
output_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_no_duplicates_v2.csv'
# input_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_v2.csv'
# output_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_no_duplicates_v2.csv'


# Load the CSV file into a DataFrame
df = pd.read_csv(input_csv_path)

# Store the original number of rows
original_row_count = df.shape[0]

# Remove duplicate rows based on both 'subject' and 'body' columns
df_cleaned = df.drop_duplicates(subset=['subject', 'body'], keep='first')

# Calculate the number of removed rows
removed_rows_count = original_row_count - df_cleaned.shape[0]

# Save the cleaned DataFrame back to a new CSV file
df_cleaned.to_csv(output_csv_path, index=False)

print(f"Cleaned CSV saved to {output_csv_path}")
print(f"Number of duplicate rows removed based on 'subject' and 'body': {removed_rows_count}")

Cleaned CSV saved to C:\Users\ericb\Desktop\Research\542_Project\cleaned_data\unwarranted_data\unwarranted_cleaned_data_no_duplicates_v2.csv
Number of duplicate rows removed based on 'subject' and 'body': 53336


In [None]:
print(df_to_clean['body'][1])

# save df_to_clean['body'][i] to txt file
for i in range (0, len(df_to_clean['body'])):
    with open('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\mailparser_test_output_body' + str(i) + '.txt', 'w') as f:
        f.write(df_to_clean['body'][i])

# save df_to_clean['text_not_managed'][i] to txt file
for i in range(0, len(df_to_clean['text_not_managed'])):
    with open('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\mailparser_test_output_text_not_managed' + str(i) + '.txt', 'w') as f:
        f.write(df_to_clean['text_not_managed'][i])


نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK_URL --- mail_boundary --- Content-Type: image/jpeg; name="image001.jpg" Content-Transfer-Encoding: base64 Content-ID: --- mail_boundary --- نقدم لكم الخطة التدريبية لعام 2023 عبر الرابط التالي LINK URL


# Adding Labels

- 0 for warranted
- 1 for unwarranted

In [66]:
# add labels to warranted and unwarranted data

input_directories = ['C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_no_duplicates_v2.csv', 
                     'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_no_duplicates_v2.csv']

output_csvs = ['C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_no_duplicates_with_labels_v2.csv',
                'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv']

for i in range(0, len(input_directories)):
    df = pd.read_csv(input_directories[i])
    df['label'] = i
    df.to_csv(output_csvs[i], index=False)

# Adding a few more features

- Number of unsafe links (http) ("UNSAFE_LINK_URL", "UNSAFE_IMAGE_URL", "UNSAFE_BUTTON_URL")
- Number of safe links (https) ("LINK_URL", "IMAGE_URL", " BUTTON_URL")
- Ratio of unsafe links to total links

In [67]:
import pandas as pd

# Path to your CSV file
# csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\warranted_cleaned_data_no_duplicates_with_labels_v2.csv'
# output_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\modified_warranted_cleaned_data_no_duplicates_with_labels_v2.csv'
csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv'
output_csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\modified_unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv'



# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Define the URL types to count
url_types = ["UNSAFE_LINK_URL", "UNSAFE_IMAGE_URL", "UNSAFE_BUTTON_URL", 
             "LINK_URL", "IMAGE_URL", "BUTTON_URL"]

# Initialize columns for each URL type and for the percentage of unsafe links
for url_type in url_types:
    df[url_type + '_count'] = 0

df['unsafe_to_safe_link_ratio'] = 0.0

# Function to count occurrences of a substring in a string
def count_occurrences(text, substring):
    return text.count(substring) if pd.notna(text) else 0

# Iterate through each row to count occurrences and calculate percentages
for index, row in df.iterrows():
    total_unsafe_links = 0

    for url_type in url_types:
        count_body = count_occurrences(row['body'], url_type)
        count_subject = count_occurrences(row['subject'], url_type)
        total_count = count_body + count_subject

        # Update the count in the DataFrame
        df.at[index, url_type + '_count'] = total_count

        # Update total unsafe links count
        if "UNSAFE" in url_type:
            total_unsafe_links += total_count

    # Calculate the percentage of unsafe links
    total_links = row['total links in email']
    if total_links > 0:
        ratio = (total_unsafe_links / total_links)
        df.at[index, 'unsafe_to_safe_link_ratio'] = ratio

# Save the modified DataFrame back to CSV
df.to_csv(output_csv_path, index=False)

print(f"Modified CSV saved to {output_csv_path}")


Modified CSV saved to C:\Users\ericb\Desktop\Research\542_Project\cleaned_data\unwarranted_data\modified_unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv


In [68]:
df.head(50)

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,...,email size (bytes),dkim-signature,label,UNSAFE_LINK_URL_count,UNSAFE_IMAGE_URL_count,UNSAFE_BUTTON_URL_count,LINK_URL_count,IMAGE_URL_count,BUTTON_URL_count,unsafe_to_safe_link_ratio
0,1514898318.13268_223.txt,Can we talk ?,HI,,0,1,0,[],set(),0,...,318,Absent,1,0,0,0,0,0,0,0.0
1,1514898318.13268_225.txt,Science of Numerological Analysis UNSAFE_LINK_...,Science of Numerological Analysis,,1,1,0,[],set(),1,...,6929,Present,1,6,3,2,6,3,2,0.478261
2,1514898318.13268_229.txt,These chicks are looking for casual sex and di...,These chicks are looking for a booty call.,,1,1,0,[],set(),0,...,6812,Absent,1,2,4,1,2,4,1,0.636364
3,1514911925.5197_1.txt,附件为18年计划表 --- mail_boundary --- Content-Type: ...,回复：2018年计划表,,1,1,0,[],set(),0,...,511,Absent,1,0,0,0,0,0,0,0.0
4,1514911941.5199_1.txt,Voted Top Gift for 2017! Stealth SmartCam! FRE...,Voted Top Gift for 2017! Stealth SmartCam! FRE...,,1,1,0,[],set(),0,...,3131,Present,1,3,5,1,3,5,1,0.692308
5,1514911941.5199_101.txt,UNSAFE_LINK_URL 】｀●┠◎´□⌒´∴㎡┳′〔★,お早う御座います。届いてますか？,,0,1,0,[],set(),0,...,174,Absent,1,1,0,0,1,0,0,0.5
6,1514911941.5199_103.txt,UNSAFE_LINK_URL male_sign〔√┘´＾⊂┓｝male_sign∫⊆┸┠...,このメールが最後のご連絡になる可能性も少なくはありません。誤解を招かない為にも、ちゃんと事情...,,0,1,0,[],set(),0,...,189,Absent,1,1,0,0,1,0,0,0.5
7,1514911941.5199_105.txt,UNSAFE_LINK_URL ゝ｛〃→⌒[Å∪≒〒×┣∇Å℃┛＼〓∈,貴方の口座を教えて下さいますか？その口座に「1億円」の支援金、私から貴方への「返済不要の支援...,,0,1,0,[],set(),0,...,188,Absent,1,1,0,0,1,0,0,0.5
8,1514911941.5199_107.txt,UNSAFE_LINK_URL 「≦∬∥％┴▲☆′,※ご入金がありましたので至急ご確認下さい※,,0,1,0,[],set(),0,...,182,Absent,1,1,0,0,1,0,0,0.5
9,1514911941.5199_109.txt,UNSAFE_LINK_URL ┝］『＝│┌＞¨∵√♯￡┴◆∈→∵×,ご入金のお知らせ,,0,1,0,[],set(),0,...,186,Absent,1,1,0,0,1,0,0,0.5


In [73]:
# Path to your CSV file
# csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\modified_warranted_cleaned_data_no_duplicates_with_labels_v2.csv'
csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\modified_unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv'



# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# count number of NaN values in each comments column
df['comments'].isna().sum()

# fill NaN values with empty string
df['comments'] = df['comments'].fillna('')

# save df
df.to_csv('C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\modified_unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv', index=False)

In [63]:
df.head()

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,...,email size (bytes),dkim-signature,label,UNSAFE_LINK_URL_count,UNSAFE_IMAGE_URL_count,UNSAFE_BUTTON_URL_count,LINK_URL_count,IMAGE_URL_count,BUTTON_URL_count,unsafe_to_safe_link_ratio
0,1514898318.13268_223.txt,Can we talk ?,HI,,[],"['<html><body><span style=""font-family:Verdana...",[],[],set(),0,...,318,Absent,1,0,0,0,0,0,0,0.0
1,1514898318.13268_225.txt,Science of Numerological Analysis UNSAFE_LINK_...,Science of Numerological Analysis,,['Science of Numerological Analysis\n\n\nhttp:...,['<html>\n<head>\n\t<title></title>\n</head>\n...,[],[],set(),1,...,6929,Present,1,4,0,0,4,0,0,0.0
2,1514898318.13268_229.txt,These chicks are looking for casual sex and di...,These chicks are looking for a booty call.,,['\n\nThese chicks are looking for casual sex ...,['<html>\n<head>\n\t<title></title>\n</head>\n...,[],[],set(),0,...,6812,Absent,1,2,0,0,2,0,0,0.0
3,1514911925.5197_1.txt,附件为18年计划表 --- mail_boundary --- Content-Type: ...,回复：2018年计划表,,"['附件为18年计划表 ', 'Content-Type: application/octe...","['<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 ...",[],[],set(),0,...,511,Absent,1,0,0,0,0,0,0,0.0
4,1514911941.5199_1.txt,Voted Top Gift for 2017! Stealth SmartCam! FRE...,Voted Top Gift for 2017! Stealth SmartCam! FRE...,,['Voted Top Gift for 2017! Stealth SmartCam! ...,['<html>\n <head> \n <title></title> \n <met...,[],[],set(),0,...,3131,Present,1,2,0,0,2,0,0,0.0


In [64]:
csv_path = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\unwarranted_pre_shuffle\\modified_unwarranted_cleaned_data_no_duplicates_with_labels_v2.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,...,email size (bytes),dkim-signature,label,UNSAFE_LINK_URL_count,UNSAFE_IMAGE_URL_count,UNSAFE_BUTTON_URL_count,LINK_URL_count,IMAGE_URL_count,BUTTON_URL_count,unsafe_to_safe_link_ratio
0,1514898318.13268_223.txt,Can we talk ?,HI,,[],"['<html><body><span style=""font-family:Verdana...",[],[],set(),0,...,318,Absent,1,0,0,0,0,0,0,0.0
1,1514898318.13268_225.txt,Science of Numerological Analysis UNSAFE_LINK_...,Science of Numerological Analysis,,['Science of Numerological Analysis\n\n\nhttp:...,['<html>\n<head>\n\t<title></title>\n</head>\n...,[],[],set(),1,...,6929,Present,1,4,0,0,4,0,0,0.0
2,1514898318.13268_229.txt,These chicks are looking for casual sex and di...,These chicks are looking for a booty call.,,['\n\nThese chicks are looking for casual sex ...,['<html>\n<head>\n\t<title></title>\n</head>\n...,[],[],set(),0,...,6812,Absent,1,2,0,0,2,0,0,0.0
3,1514911925.5197_1.txt,附件为18年计划表 --- mail_boundary --- Content-Type: ...,回复：2018年计划表,,"['附件为18年计划表 ', 'Content-Type: application/octe...","['<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 ...",[],[],set(),0,...,511,Absent,1,0,0,0,0,0,0,0.0
4,1514911941.5199_1.txt,Voted Top Gift for 2017! Stealth SmartCam! FRE...,Voted Top Gift for 2017! Stealth SmartCam! FRE...,,['Voted Top Gift for 2017! Stealth SmartCam! ...,['<html>\n <head> \n <title></title> \n <met...,[],[],set(),0,...,3131,Present,1,2,0,0,2,0,0,0.0
