# Imports

In [2]:
import pandas as pd
from pymongo import MongoClient
import sys

# Determine the base directory (e.g., the workspace directory)
#base_directory = SET TO BASE DIRECTORY FULL PATH (phish-gen) 
# Add the base directory to the system path
#sys.path.append(base_directory)

# Attachments

In [5]:
connection = MongoClient('localhost', 27017)
#get all messages with attachments from database
#group them by attachment format and count them

db = connection['enron_emails']
collection = db['step2_single']

pipeline = [
    {"$unwind": "$messages"},
    {
        "$match": {
            "$and": [
                {"messages.attachment_formats": {"$exists": True}},
                {"messages.attachment_formats": {"$ne": None}}
            ]
        }
    },
    {"$unwind": "$messages.attachment_formats"},
    {
        "$project": {
            "_id": 1,
            "attachment_format": "$messages.attachment_formats"
        }
    }
]

cursor = collection.aggregate(pipeline)

for i in cursor:
    print(i)

In [6]:
#count all threads where at least one messages has an attachment format field
collection.find_one()

{'_id': ObjectId('668fc1c0551e847488546dea'),
 'file_path': 'offline_finetuning/data_processing/enron/dataset/maildir/quenet-j/inbox/6.',
 'messages': [{'_id': ObjectId('668fc1c0551e847488546deb'),
   'is_main': True,
   'headers': {'Message-ID': '<8319012.1075861649188.JavaMail.evans@thyme>',
    'Date': 'Mon, 12 Nov 2001 13:55:20 -0800 (PST)',
    'From': 'jae.black@enron.com',
    'To': 'j..broderick@enron.com, robert.benson@enron.com, gautam.gupta@enron.com, \n\tjoe.quenet@enron.com, d..thomas@enron.com, lisa.burnett@enron.com, \n\tpatrick.hanse@enron.com, peter.makkai@enron.com, \n\tbrian.terp@enron.com, benjamin.rogers@enron.com, \n\tjason.choate@enron.com, willis.philip@enron.com, \n\tbryce.schneider@enron.com, cory.willis@enron.com, \n\tdavid.ingram@enron.com, anubhav.aggarwal@enron.com, \n\treagan.mathews@enron.com, carl.tricoli@enron.com, \n\tjohn.llodra@enron.com, george.wood@enron.com, rob.wheeler@enron.com, \n\tnick.politis@enron.com, eric.irani@enron.com, \n\tpalmer.letze

# Senders

In [7]:
connection = MongoClient('localhost', 27017)
db = connection['enron_emails']
db = db['step2_single']
#retrieve a list of unique senders
senders = db.distinct("messages.headers.From")
print(len(senders))
print(len([sender for sender in senders if "enron" not in sender.lower()]))

7064
4310


In [19]:
#retrieve messages containing an email
email_regex = r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"
import re
cursor = db.find({"messages.body": {"$regex": email_regex}})

for i in cursor[1]["messages"]:
    print(i["body"])

The Enron Health Center has received its shipment of flu vaccine.  Shots will be given on a first-come, first-served basis.  NO APPOINTMENTS WILL BE ACCEPTED.  

	When:	Beginning Tuesday, November 13
	Time:	8 a.m. until 4 p.m.
	Where:	Enron Health Center, EB-312
	Cost:	Free to Enron employees, spouses, retirees, and EDS
		$10 for contractors

For more information email:  mailto:health.center@enron.com.


# Entities

In [9]:
collection = connection["enron_emails"]["step2_single"]
#retrieve a list of all entities
threads = collection.find({"messages.entities.auto": {"$exists": True}})
people_set = set()
org_set = set()
location_set = set()
misc_set = set()
for thread in threads:
    for message in thread["messages"]:
        if "PER" in message["entities"]["auto"]:
            people = set([person[0] for person in message["entities"]["auto"]["PER"]])
            people_set.update(people)
        if "ORG" in message["entities"]["auto"]:
            organizations = [org[0] for org in message["entities"]["auto"]["ORG"]]
            org_set.update(organizations)
        if "LOC" in message["entities"]["auto"]:
            locations = [loc[0] for loc in message["entities"]["auto"]["LOC"]]
            location_set.update(locations)
        
print("Unique People: ", len(people_set))
print("Unique Organizations: ", len(org_set))
print("Unique Locations: ", len(location_set))

Unique People:  457
Unique Organizations:  848
Unique Locations:  294


# Topic Modeling

# Torch Dataset

In [6]:
from datasets import load_from_disk

dataset = load_from_disk("../offline_finetuning/datasets/pytorch/enron")

In [8]:
#print random sample of entities
import random
for i in range(10):
    print(random.choice(dataset["text"]))

subject: EOL and Clickpaper Approvals for 10-31-01
sentiment: neutral
->
body: Please see attached.

 << File: EOL <DATE>.xls >> 
Regards,

Wendi Lebrocq
3-3835
subject: funds flow
sentiment: neutral
->
body: Sorry <PER>, I can't come up with the reason behind the July change of $11 
million in "Other, Net". Carolyn who completes the schedules had surgery 
yesterday so will be out until Sept. 11. I can't find the source of the 
amount that is hardcoded in the spreadsheet - she must have combined 
something. One thought was TOLI, but it was flat in July. I have two other 
people who might be able to help me on vacation this week as well. 

>>> Geaccone, <PER> <DATE> 04:24PM >>> 
There was a big change in the Funds Flow number you submitted for the 3Q in 
August from the 3Q you submitted last month. Can you tell me what the change 
is due to? 

Thanks 

<PER>
subject: External E-Mail Sites
sentiment: neutral
->
body: Everyone,

In an effort to protect our computing systems from viruses a