In [1]:
# Prints a marker string; no other effect.
print("start")

start


In [6]:
import os
# Root of the maildir tree, relative to the notebook's working directory.
DATASET_PATH = "../maildir" 

# Print every directory path found under DATASET_PATH (one line per directory).
# os.walk yields (directory, sub-directory names, file names); only the
# directory path is used here.
for root, dirs, files in os.walk(DATASET_PATH):
    print(root)


../maildir
../maildir/blair-l
../maildir/blair-l/personnel___promotions
../maildir/blair-l/meetings___nng_customer_mtg
../maildir/blair-l/nng___sla_settlement
../maildir/blair-l/tw___negotiated_rates_issues
../maildir/blair-l/customer___uses
../maildir/blair-l/training___business_objects
../maildir/blair-l/tw___imbalances___netting_trading
../maildir/blair-l/customer___oneok
../maildir/blair-l/customer___virginia_power_dominion
../maildir/blair-l/vacations_2001
../maildir/blair-l/itinerary_off_duty
../maildir/blair-l/presentations
../maildir/blair-l/move
../maildir/blair-l/demarc_allocation
../maildir/blair-l/daily_updates___12_1_01
../maildir/blair-l/tw___invoicing
../maildir/blair-l/expense_reports
../maildir/blair-l/weekly_reports___2001
../maildir/blair-l/merger_with_dynegy
../maildir/blair-l/dra_info
../maildir/blair-l/christmas_cards_list
../maildir/blair-l/ipayit_information
../maildir/blair-l/inbox
../maildir/blair-l/tropical_storm_allison
../maildir/blair-l/nng___sba
../maildi

In [None]:
# Progress markers printed before and after importing sqlite3.
print("start")
import sqlite3
print("sqlite3 loaded")
import os
from email.parser import Parser
from datetime import datetime

# --- Configuration ---
# Path to the root of the Enron dataset (the 'maildir' folder)
DATASET_PATH = "../maildir" 
# Name for the output database
# NOTE(review): the cleanup cell further down opens "enron.db", not this
# file name — confirm which database is the intended one.
DB_PATH = "enron2.db"
# ---------------------

def create_database(db_path):
    """
    Create the SQLite database at ``db_path`` with the four project tables.

    Tables (each created only if it does not exist yet, so the call is
    safe to repeat):
      * Employeelist  - one row per mailbox owner (user_name is unique)
      * Message       - one row per parsed e-mail, references Employeelist.eid
      * Recipientinfo - one row per To/Cc/Bcc recipient, references Message.mid
      * Referenceinfo - one row per message reference, references Message.mid

    The connection is always closed, even if one of the statements fails.
    """
    schema_statements = (
        # 1. Employeelist Table
        '''
        CREATE TABLE IF NOT EXISTS Employeelist (
            eid INTEGER PRIMARY KEY AUTOINCREMENT,
            user_name TEXT UNIQUE NOT NULL,
            firstName TEXT,
            lastName TEXT,
            status TEXT
        )
        ''',
        # 2. Message Table
        '''
        CREATE TABLE IF NOT EXISTS Message (
            mid INTEGER PRIMARY KEY AUTOINCREMENT,
            sender TEXT,
            eid INTEGER,
            date TEXT,
            message_id TEXT,
            subject TEXT,
            body TEXT,
            folder TEXT,
            length_character INTEGER,
            length_word INTEGER,
            FOREIGN KEY (eid) REFERENCES Employeelist (eid)
        )
        ''',
        # 3. Recipientinfo Table
        '''
        CREATE TABLE IF NOT EXISTS Recipientinfo (
            rid INTEGER PRIMARY KEY AUTOINCREMENT,
            mid INTEGER,
            rtype TEXT,
            rvalue TEXT,
            FOREIGN KEY (mid) REFERENCES Message (mid)
        )
        ''',
        # 4. Referenceinfo Table
        '''
        CREATE TABLE IF NOT EXISTS Referenceinfo (
            rfid INTEGER PRIMARY KEY AUTOINCREMENT,
            mid INTEGER,
            reference TEXT,
            FOREIGN KEY (mid) REFERENCES Message (mid)
        )
        ''',
    )

    print(f"Creating database schema at {db_path}...")
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        for statement in schema_statements:
            c.execute(statement)
        conn.commit()
    finally:
        # Release the file handle even when a DDL statement raises.
        conn.close()
    print("Database schema created successfully.")

def parse_date(input_string: str) -> str:
    """
    Convert an e-mail ``Date`` header into a ``DD/MM/YYYY`` string.

    Accepts headers such as ``"Mon, 14 May 2001 16:39:00 -0700 (PDT)"``.
    The trailing parenthesised timezone name is optional: it is removed
    only when it is actually present, so a header that ends with the
    numeric offset (``"... -0700"``) is parsed as well.

    Raises:
        ValueError: if the remaining text does not match the expected
            ``%a, %d %b %Y %H:%M:%S %z`` layout.
        AttributeError: if ``input_string`` is not a string (e.g. ``None``
            for a message without a Date header).
    """
    parse_format = "%a, %d %b %Y %H:%M:%S %z"
    output_format = "%d/%m/%Y"

    date_part = input_string.strip()

    # Drop a trailing "(PDT)"-style comment, but leave the numeric offset
    # alone. Unconditionally cutting the last token would remove "-0700"
    # from headers that carry no timezone name and make strptime fail.
    if date_part.endswith(')') and '(' in date_part:
        date_part = date_part[:date_part.rindex('(')].rstrip()

    dt = datetime.strptime(date_part, parse_format)
    return dt.strftime(output_format)

def parse_recipients(header_string):
    """
    Split a 'To', 'Cc' or 'Bcc' header value into its individual entries.

    The header is cut at every comma; each piece is stripped of
    surrounding whitespace and empty pieces are dropped. A missing or
    empty header yields an empty list.
    """
    if not header_string:
        return []

    addresses = []
    for piece in header_string.split(','):
        trimmed = piece.strip()
        if trimmed:
            addresses.append(trimmed)
    return addresses

def _decode_text_payload(part):
    """Return the decoded text of a non-multipart message part (UTF-8, falling back to Latin-1)."""
    payload = part.get_payload(decode=True) or b""
    try:
        return payload.decode('utf-8')
    except UnicodeDecodeError:
        return payload.decode('latin-1')


def parse_maildir(dataset_path, db_path):
    """
    Walk the maildir directory, parse each email, and insert data into the
    Employeelist, Message, Recipientinfo, and Referenceinfo tables.

    Expected layout: ``<dataset_path>/<user>/<folder>/[...]/<file>``. The
    first path component below ``dataset_path`` is the mailbox owner and
    the second one is stored as the message's folder (messages in nested
    sub-folders keep the top-level folder name). Files that sit directly
    in ``dataset_path`` belong to no mailbox and are skipped.

    Only files whose name ends with '.' are treated as e-mails. A file
    that cannot be parsed is reported on stdout and skipped; it does not
    abort the run. Work is committed every 5000 e-mails and once at the
    end; the connection is closed even if the walk is interrupted.

    Args:
        dataset_path: root of the maildir tree.
        db_path: SQLite file previously initialised by create_database().
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()

        # Use the built-in email Parser
        parser = Parser()

        # user_name -> eid, so each mailbox owner is looked up only once.
        eid_by_user = {}

        print(f"Starting to parse email files from {dataset_path}...")
        file_count = 0

        for root, _dirs, files in os.walk(dataset_path):
            # Work with the path relative to the dataset root so the logic
            # does not depend on how many components dataset_path itself has.
            rel_root = os.path.relpath(root, dataset_path)
            if rel_root == os.curdir:
                continue

            rel_parts = rel_root.split(os.sep)
            user_name = rel_parts[0]
            folder_name = rel_parts[1] if len(rel_parts) > 1 else None

            current_eid = eid_by_user.get(user_name)
            if current_eid is None:
                c.execute('''
                INSERT OR IGNORE INTO Employeelist (user_name, firstName, lastName, status)
                VALUES (?, Null, ?, Null)
                ''', (user_name, user_name.split('-')[0]))
                # lastrowid is not reliable after INSERT OR IGNORE (an ignored
                # insert leaves an unrelated row id behind), so read the key back.
                c.execute('SELECT eid FROM Employeelist WHERE user_name = ?', (user_name,))
                current_eid = c.fetchone()[0]
                eid_by_user[user_name] = current_eid

            for file_name in files:
                # E-mail files in the dataset end with a period
                if not file_name.endswith('.'):
                    continue

                file_path = os.path.join(root, file_name)

                try:
                    with open(file_path, 'r', encoding='latin-1') as f:
                        raw_email = f.read()

                    # Parse the raw email text
                    msg = parser.parsestr(raw_email)

                    # 1. Insert into Message table
                    sender = msg.get('From')
                    date = parse_date(msg.get('Date'))
                    message_id = msg.get('Message-ID')
                    subject = msg.get('Subject')

                    # Body: first text/plain part of a multipart message,
                    # otherwise the message payload itself.
                    body = ""
                    if msg.is_multipart():
                        for part in msg.walk():
                            if part.get_content_type() == 'text/plain':
                                body = _decode_text_payload(part)
                                break
                    else:
                        body = _decode_text_payload(msg)

                    body = body.strip()
                    length_character = len(body)
                    length_word = len(body.split())

                    c.execute('''
                    INSERT INTO Message (sender, eid, date, message_id, subject, body, folder, length_character, length_word)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (sender, current_eid, date, message_id, subject, body, folder_name, length_character, length_word))

                    # Primary key of the message we just inserted
                    current_mid = c.lastrowid

                    # 2. Insert into Recipientinfo table
                    for rtype, header_name in (('to', 'To'), ('cc', 'Cc'), ('bcc', 'Bcc')):
                        for recipient in parse_recipients(msg.get(header_name)):
                            c.execute('''
                            INSERT INTO Recipientinfo (mid, rtype, rvalue)
                            VALUES (?, ?, ?)
                            ''', (current_mid, rtype, recipient))

                    # 3. Insert into Referenceinfo table
                    # Store the entire raw email for reference
                    c.execute('''
                    INSERT INTO Referenceinfo (mid, reference)
                    VALUES (?, ?)
                    ''', (current_mid, raw_email))

                    file_count += 1
                    if file_count % 5000 == 0:
                        print(f"Processed {file_count} emails. Committing to database...")
                        conn.commit()

                except Exception as e:
                    # Best effort: report the bad file and keep going.
                    print(f"Error processing file {file_path}: {e}")

        # Final commit
        conn.commit()
    finally:
        conn.close()

    print(f"\n--- Processing Complete ---")
    print(f"Total emails parsed and inserted: {file_count}")
    print(f"Database saved to {db_path}")

# Build the database only when the dataset directory is actually there;
# otherwise explain what is missing.
if os.path.isdir(DATASET_PATH):
    create_database(DB_PATH)
    parse_maildir(DATASET_PATH, DB_PATH)
else:
    print(f"Error: Dataset path '{DATASET_PATH}' not found.")
    print("Please download the Enron dataset and place it in that folder,")
    print("or update the DATASET_PATH variable in the script.")


start
sqlite3 loaded
Creating database schema at enron2.db...
Database schema created successfully.
Starting to parse email files from ../maildir...
Processed 5000 emails. Committing to database...
Processed 10000 emails. Committing to database...
Processed 15000 emails. Committing to database...
Processed 20000 emails. Committing to database...
Processed 25000 emails. Committing to database...
Processed 30000 emails. Committing to database...
Processed 35000 emails. Committing to database...
Processed 40000 emails. Committing to database...
Processed 45000 emails. Committing to database...
Processed 50000 emails. Committing to database...
Processed 55000 emails. Committing to database...
Processed 60000 emails. Committing to database...
Processed 65000 emails. Committing to database...
Processed 70000 emails. Committing to database...
Processed 75000 emails. Committing to database...
Processed 80000 emails. Committing to database...
Processed 85000 emails. Committing to database...
Pr

# Clean up data

In [6]:
import sqlite3
import pandas as pd

# NOTE(review): the ingestion cell above writes to "enron2.db" — confirm
# that "enron.db" is the database this cell is meant to read.
DB_PATH = "enron.db"

sql_query = "SELECT * FROM Message"

# Load the whole Message table into a DataFrame. The read needs no cursor
# and no commit; the connection is closed even if the query fails.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

In [None]:
import pandas as pd
import re


# The specific confidentiality notice that remove_legal_disclaimer() strips.
_LEGAL_DISCLAIMER_TEXT = (
    "This e-mail message may contain legally privileged and/or confidential "
    "information. If you are not the intended recipient(s), or the employee "
    "or agent responsible for delivery of this message to the intended "
    "recipient(s), you are hereby notified that any dissemination, "
    "distribution or copying of this e-mail message is strictly prohibited. "
    "If you have received this message in error, please immediately notify "
    "the sender and delete this e-mail message from your computer."
)

# Built once at definition time instead of on every call (the function is
# applied to every message body):
#   1. split the disclaimer into words,
#   2. escape regex special characters (e.g. the parens in 'recipient(s)'),
#   3. join with \s* so any amount of whitespace (spaces, tabs, newlines,
#      or none) between words is accepted, however the text was wrapped.
# IGNORECASE also catches capitalisation variants.
_LEGAL_DISCLAIMER_RE = re.compile(
    r'\s*'.join(re.escape(word) for word in _LEGAL_DISCLAIMER_TEXT.split()),
    re.IGNORECASE,
)


def remove_legal_disclaimer(text: str) -> str:
    """
    Remove every occurrence of the legal disclaimer from ``text``.

    Matching ignores case and the whitespace between the disclaimer's
    words. The result is returned with leading and trailing whitespace
    stripped; text without the disclaimer is only stripped.
    """
    return _LEGAL_DISCLAIMER_RE.sub('', text).strip()

def clean_email_body(text):
    """
    Clean an e-mail body by removing disclaimers, reply separators, quote
    markers and quoted header lines, while keeping the message text.

    Anything that is not a string (None, NaN, pd.NA, ...) yields "".

    Steps, in order:
      0. remove the legal disclaimer via remove_legal_disclaimer()
      1. remove blocks delimited by two rows of 10+ asterisks
      2. remove "-----Original Message-----" separators
      3. strip leading spaces, tabs and '>' quote markers from each line
      4. blank out From/Sent/To/Cc/Subject header lines
      5. collapse runs of 3+ newlines into one blank line
    """
    # Non-string input covers missing values as well, so no separate NA test.
    if not isinstance(text, str):
        return ""

    text = remove_legal_disclaimer(text)

    # 1. Remove the Enron Disclaimer Block
    # Matches a block starting and ending with 10+ asterisks
    text = re.sub(r'\*{10,}[\s\S]*?\*{10,}', '', text)

    # 2. Remove "Original Message" separator lines
    # The separator is replaced with an empty string; the text is not split.
    text = re.sub(r'-+\s*Original\s*Message\s*-+', '', text, flags=re.IGNORECASE)

    # 3. Remove quoted-text markers (>)
    # Only spaces, tabs and '>' are stripped. Using \s here would also match
    # newlines and silently delete every blank line between paragraphs.
    text = re.sub(r'^[ \t>]+', '', text, flags=re.MULTILINE)

    # 4. Remove header lines (From, Sent, To, Cc, Subject)
    # Done after step 3 so headers inside quoted text are caught too.
    # The value part is limited to the header's own line: with \s+ after the
    # colon, a header with an empty value would swallow the next line.
    header_pattern = r'^[ \t]*(?:From|Sent|To|Cc|Subject):(?:[ \t]+.*)?$'
    text = re.sub(header_pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)

    # 5. Clean up extra whitespace created by removals
    # Collapse multiple newlines into two
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

# --- Example Usage ---

# The sample text provided
raw_email = """i am not kidding the kids were using the pot i had cleaned it and 
forgot to dry it. i have no idea are next trip. like i said we will pay 
if it does not dry

-----Original Message-----
From: 	"Danna Burkett" <dmburkett@houston.rr.com>@ENRON 
Sent:	Monday, November 12, 2001 4:01 PM
To:	Mckay, Brad
Subject:	Re: POC

Are you kidding me...what were you using a pot for...I thought you ate every
meal with Walter.......I hope you are kidding....give me the dates you are
going to be down there...I doubt we are going anytime soon but I would like
to know when you are going in case I have anything for you to take....I have
2 chairs that might need to go...I want Tiff to look at them first...
----- Original Message -----
From: <Brad.Mckay@enron.com>
To: <dmburkett@houston.rr.com>
Sent: Monday, November 12, 2001 3:33 PM
Subject: RE: POC


> the couch is just right, i did make the beds to the best of my ability. i
> did not use wally wally. i did however leave a pot that was wet on the
> bottom that created a stain i forgot that granite is a porous rock it
> absorbed the water quite well i hope that it will dry out if not we will
do
> what is necessary to fix it. i am totally serious about the granite
>
>     -----Original Message-----
>    From:   "Danna Burkett" <dmburkett@houston.rr.com>@ENRON
>    Sent:   Monday, November 12, 2001 3:19 PM
>    To:     Mckay, Brad
>    Subject:  Re: POC
>
>    Do you think the couch looks too big?....did you make the beds
>    good?....did
>    you use Wally Wally on the walls???....ha!!!
>    ----- Original Message -----
>    From: <Brad.Mckay@enron.com>
>    To: <dmburkett@houston.rr.com>
>    Sent: Monday, November 12, 2001 2:45 PM
>    Subject: RE: POC
>
>
>    > weekend was good,the place looks good,the boat was fun, we caught
>    trash
>    > fish on bait.
>    >
>    >     -----Original Message-----
>    >    From:   "Danna Burkett" <dmburkett@houston.rr.com>@ENRON
>    >    Sent:   Monday, November 12, 2001 2:41 PM
>    >    To:     Mckay, Brad
>    >    Subject:  POC
>    >
>    >
>    >    How was your weekend? Did you think the place  looked good? How
was
>    the
>    >    boat?......did you catch any fish? Back  atcha.....
>    >
>    >
>    >
>    >
**********************************************************************
>    > This e-mail is the property of Enron Corp. and/or its relevant
>    affiliate
>    and may contain confidential and privileged material for the sole use
of
>    the
>    intended recipient (s). Any review, use, distribution or disclosure by
>    others is strictly prohibited. If you are not the intended recipient
(or
>    authorized to receive for the recipient), please contact the sender or
>    reply
>    to Enron Corp. at enron.messaging.administration@enron.com and delete
>    all
>    copies of the message. This e-mail (and any attachments hereto) are not
>    intended to be an offer (or an acceptance) and do not create or
evidence
>    a
>    binding and enforceable contract between Enron Corp. (or any of its
>    affiliates) and the intended recipient or any other party, and may not
>    be
>    relied on by anyone as the basis of a contract by estoppel or
otherwise.
>    Thank you.
>    >
**********************************************************************
>
"""

# Apply cleaning: run clean_email_body on every value of the 'body' column
# and store the result in a new 'body_clean' column of df.
df['body_clean'] = df['body'].apply(clean_email_body)


# Display Results for the first row
print("--- ORIGINAL ---")
print(df['body'].iloc[0] + "...") # Prints the full first body followed by "..." (it is not truncated)
print("\n--- CLEANED ---")
print(df['body_clean'].iloc[0])
# Row position used by the browsing cell below.
i = 1

KeyError: 'body_clean'

In [10]:
# (Re)set the row position used by the browsing cell below.
i = 1

In [37]:

# Display Results: original and cleaned body of the row at position i.
# NOTE: this cell is not idempotent — it increments i at the end, so each
# re-run shows the next row.
print("--- ORIGINAL ---")
print(df['body'].iloc[i]) # Prints the full body of row i (it is not truncated)
print("\n--- CLEANED ---")
print(df['body_clean'].iloc[i])
print(i)

i += 1

--- ORIGINAL ---
Toby,
would you please make sure that some one on your team adds this enhancement item to the TMS worklist and that some one is assigned to writing up the requirements.

thanks
 -----Original Message-----
From: 	Kuehl, Toby  
Sent:	Friday, September 14, 2001 12:43 PM
To:	Hoang, Joe; Kedwaii, Hasan
Cc:	Blair, Lynn; Holmes, Bradley; Dietz, Rick; Medeles, Gerry; Lee, Dennis
Subject:	RE: Email Notifications

Joe, Hasan, 

Just a thought on this issue.......

In light of the concern from the customers that they are not receiving all of their cut notices, (which was a major topic up during the NNG winter ops meeting) we need to take a closer look at "why" the customers are not receiving them.  Is it the cache, wrong address, e-mail issues etc....  Is there something we do systematically to assure ourselves that these cut notifications are hitting their destinations?  I realize that once the e-mail is sent we have no control of it and there are times when customers are having

In [43]:
# 100x faster
df['clean_length_character'] = df['body_clean'].str.len()
df['clean_length_word'] = df['body_clean'].str.split().str.len()

In [44]:
conn = sqlite3.connect(DB_PATH)
try:
    # Overwrite the existing 'Message' table with the DataFrame, which now
    # also carries the cleaned body and its length columns.
    # if_exists='replace' drops the table if it exists and creates a new one
    # if_exists='append' adds to it
    # NOTE(review): after 'replace' the table is re-created by pandas from
    # the DataFrame, so the PRIMARY KEY / FOREIGN KEY declarations of the
    # original schema are presumably lost — confirm this is acceptable.
    df.to_sql(
        name='Message',
        con=conn,
        if_exists='replace',
        index=False,
        chunksize=10000  # Write in batches to save memory
    )

    # Re-create a lookup index on mid for the rewritten table.
    cursor = conn.cursor()
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_sim_mid ON Message (mid)")
    conn.commit()
finally:
    # Close the connection even if the write or the index creation fails.
    conn.close()

In [1]:
# TODO: turn this into a Python script that can run on the other computer and write its results back over SSH.

In [None]:
# TODO: automatically back up enron.db to Google Drive.

In [3]:
# TODO: when the script is run and there are changes, have it run `git add .` and commit automatically,
# prompting for a reason to use as the commit message; also make the server run `git pull`.

In [25]:
# Inputs of the compute estimate below.
# NOTE(review): presumably N is the number of messages and mu the mean
# message length (unit not shown here) — confirm.
N = 170_589
mu = 309.74

In [26]:
# mu * N scaled by the factors 4 and 7.
# NOTE(review): the meaning of the two factors is not documented — TODO explain.
D = mu * N * 4 * 7

In [27]:
# 2.2e8. NOTE(review): presumably the model's parameter count — confirm.
P = 2.2 * 10 ** 8 

In [28]:
# NOTE(review): looks like the common "6 * parameters * tokens" estimate of
# training FLOPs, with D as the token count — confirm.
train_flops = D * P * 6

In [29]:
# Extra FLOPs for the search step: 2 * P * N * id_lenght * beam_width.
# NOTE(review): "id_lenght" is a misspelling of "id_length"; it is left
# unchanged here because other cells may reference the name.
id_lenght = 10
beam_width = 10
search_flops = 2 * P * N * id_lenght * beam_width

In [30]:
# Assumed hardware throughput in FLOP/s. The name is a misspelling of
# "throughput", kept because the next cell reads it.
troughput = 124 * 10 ** 12  # 124 TFLOPS

In [31]:
# Estimated run time in seconds: training alone, and training plus search.
train_time = train_flops / troughput
train_beam_time = (train_flops + search_flops) / troughput

# Report both estimates in seconds, minutes and hours.
for heading, seconds_per_unit in (
    ("in seconds:", 1),
    ("in minutes:", 60),
    ("in hours:", 60 * 60),
):
    print(heading)
    print(train_time / seconds_per_unit)
    print(train_beam_time / seconds_per_unit)

in seconds:
15749.203502787097
15809.735083432259
in minutes:
262.4867250464516
263.495584723871
in hours:
4.374778750774194
4.391593078731183


In [11]:
# One-off estimate: 27.72e16 divided by 124e12 (the throughput figure used
# above), converted from seconds to hours, then multiplied by 3 and 5.
# NOTE(review): the origin of 27.72e16 and of the factors 3 and 5 is not shown — TODO document.
print(27.72 * 10 ** 16 / (124 * 10 ** 12) / (60* 60) * 3 * 5)

9.314516129032258


In [None]:
# Placeholder cell; has no effect.
pass