In [1]:
# Pick up edits to imported project modules live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [3]:
# Dataset selector. Swap the two lines below to switch datasets.
# The raw-file loader cell further down only runs when str(DATASET) == 'test'.
# DATASET = DatasetPath('test')
DATASET = DatasetPath('enron-mail-20150507')

In [17]:
import pandas as pd

# Read emails.csv from the dataset's raw location into a pandas frame.
# NOTE(review): the recorded output of this cell is an IsADirectoryError pointing at
# the .../raw/test directory, so it was last run against the 'test' dataset, where
# the path handed to read_csv is a directory. Confirm what DATASET.raw('emails.csv')
# returns for each dataset before relying on this cell.
df = pd.read_csv(DATASET.raw('emails.csv'))
df.head(5)

IsADirectoryError: [Errno 21] Is a directory: '/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test'

In [18]:
import pathlib
import pandas as pd
from tqdm import tqdm
import os

# Build `df` (columns: file, message) by reading every regular file below the raw
# directory. Only runs for the 'test' dataset; for any other dataset `df` must
# already have been created by an earlier cell.
if str(DATASET) == 'test':
    files = []
    path = pathlib.Path(DATASET.raw())
    for file in tqdm(path.glob('**/*')):
        if not os.path.isfile(file):
            continue

        # NOTE(review): text mode without an explicit encoding uses the platform
        # default, so files that do not decode under it are skipped below.
        try:
            with open(file, 'r') as f:
                message = f.read()
        except (OSError, UnicodeError) as e:
            # Best-effort load: report which file was skipped and why, then go on.
            print(f'Skipping {file}: {e}')
            continue

        files.append({
            'file': str(file),
            'message': message
        })

    # Explicit columns keep the frame usable downstream even when nothing was read.
    df = pd.DataFrame(files, columns=['file', 'message'])

117it [00:00, 134.77it/s]


In [19]:
# Normalise Windows (CRLF) line endings to Unix (LF) in every message.

WINDOWS_LINE_ENDING = '\r\n'
UNIX_LINE_ENDING = '\n'


def _to_unix_line_endings(text):
    """Return `text` with each Windows line ending replaced by a Unix one."""
    return text.replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING)


df['message'] = df['message'].map(_to_unix_line_endings)

# Enron Dataset Preprocessing
Credit: https://www.kaggle.com/oalvay/enron-emails-complete-preprocessing

In [20]:
import re
import pandas as pd


def info_part(i):
    """Return the header block of a raw message: everything before the first blank line.

    When the message contains no blank line, the whole message is returned.
    """
    header_block, _, _ = i.partition('\n\n')
    return header_block


def content_part(i):
    """Return the body of a raw message: everything after the first blank line.

    A message without a blank line (headers only, or an empty/truncated file) has
    no body; an empty string is returned for it instead of raising IndexError, so
    a single malformed file cannot abort the whole mapping.
    """
    _, _, body = i.partition('\n\n')
    return body


# Split every raw message into its header block and its body.
df['pre_info'] = df['message'].map(info_part)
df['content'] = df['message'].map(content_part)
df['test_true'] = True  # helper column, dropped again at the end of this cell

# Header labels, in the order used to split the header block into fields.
words2split = [
    'Message-ID: ',
    'Date: ',
    'From: ',
    'To: ',
    'Subject: ',
    'Cc: ',
    'Mime-Version: ',
    'Content-Type: ',
    'Content-Transfer-Encoding: ',
    'Bcc: ',
    'X-From: ',
    'X-To: ',
    'X-cc: ',
    'X-bcc: ',
    'X-Folder: ',
    'X-Origin: ',
    'X-FileName: ',
]
# Column names are the labels without their trailing ': '.
features_naming = [label[:-2] for label in words2split]
# Alternation pattern that matches any one of the labels.
split_condition = '|'.join(words2split)


# Header-like text inside a header value (for example a subject that quotes
# " Date: ") would confuse the label-based splitting, so it is rewritten first.
def duplicated_info(i):
    """Rewrite space-prefixed header look-alikes so they no longer match a label."""
    substitutions = (
        (' Date: ', ' Date- '),
        (' Subject: ', ' Subject2: '),
        (' To: ', ' To- '),
        (' (Subject: ', ' (Subject- '),
    )
    cleaned = i
    for needle, stand_in in substitutions:
        cleaned = cleaned.replace(needle, stand_in)
    return cleaned


# Defuse header look-alikes before any label-based splitting happens.
df['pre_info'] = df['pre_info'].map(duplicated_info)


# Count how many pieces each header block falls into when cut at the known labels.
def num_part(i):
    """Return the number of pieces produced by splitting `i` on `split_condition`."""
    pieces = re.split(split_condition, i)
    return len(pieces)


df['num_info'] = df['pre_info'].map(num_part)


# Some emails (around 20k, per the original author) lack a 'To: ' header; add an empty one.
def add_to(i):
    """Insert an empty 'To: ' header line in front of every 'Subject: ' header line."""
    subject_header = '\nSubject: '
    return i.replace(subject_header, '\nTo: ' + subject_header)


# Rows with 17 or 15 pieces get an empty 'To: ' header added.
missing_to = df['num_info'].isin([17, 15])
df.loc[missing_to, 'pre_info'] = df.loc[missing_to, 'pre_info'].map(add_to)

# Rows with 16 or 15 pieces get the "Cc:" and "Bcc:" headers added the same way.
temp_condition = df['num_info'].isin([16, 15])


def add_bcc(i):
    """Insert an empty 'Bcc: ' header line in front of every 'X-From: ' header line."""
    x_from_header = '\nX-From: '
    return i.replace(x_from_header, '\nBcc: ' + x_from_header)


# Insert the empty 'Bcc: ' header into the rows selected by temp_condition.
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_bcc)


def add_cc(i):
    """Insert an empty 'Cc: ' header line in front of every 'Mime-Version: ' header line."""
    mime_header = '\nMime-Version: '
    return i.replace(mime_header, '\nCc: ' + mime_header)


# Insert the empty 'Cc: ' header into the rows selected by temp_condition.
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_cc)

# Recount the pieces now that the missing headers have been added.
df['num_info'] = df['pre_info'].map(num_part)
df['num_info'].value_counts()  # mid-cell expression: evaluated but not displayed

# 17 labels split a header block into 18 pieces; anything else is set aside.
has_all_headers = df['num_info'] == 18
df_remove = df.loc[~has_all_headers].copy()
df = df.loc[has_all_headers].copy()

# At module level this `global` declaration has no effect; `feature_idx` is simply
# the module-level loop variable that the header-splitting helpers read.
global feature_idx


def info_split(i):
    """Return the value of header number `feature_idx` without its last character.

    The header block is split on `split_condition`; piece 0 precedes the first
    label, so the wanted value sits at position `feature_idx + 1`. The dropped
    last character is the newline that ends the header line.
    """
    pieces = re.split(split_condition, i)
    value_with_newline = pieces[feature_idx + 1]
    return value_with_newline[:-1]


def info_split_last(i):
    """Return the value of header number `feature_idx` unchanged.

    Used for the last label (X-FileName), whose value has no trailing newline
    to remove.
    """
    pieces = re.split(split_condition, i)
    return pieces[feature_idx + 1]


# Split every header block once up front. Re-running the same regex split for each
# of the labels would repeat identical work for every row.
header_pieces = df['pre_info'].map(lambda header: re.split(split_condition, header))
last_feature_idx = len(words2split) - 1

for feature_idx in range(len(words2split)):
    # Piece 0 precedes the first label, so feature k lives at position k + 1.
    values = header_pieces.map(lambda pieces: pieces[feature_idx + 1])
    if feature_idx != last_feature_idx:
        # Every value except the last one ends with the newline of its header line.
        values = values.map(lambda value: value[:-1])
    df[features_naming[feature_idx]] = values

# Set aside rows whose Content-Transfer-Encoding column holds a content-type string
# (presumably headers that were split at the wrong place; verify on the data).
# .copy() matches the earlier num_info filter and gives `df` its own data, so the
# column assignments that follow do not raise SettingWithCopyWarning.
bad_transfer_encoding = df['Content-Transfer-Encoding'] == 'text/plain; charset=us-asci'
df_remove2 = df.loc[bad_transfer_encoding].copy()
df = df.loc[~bad_transfer_encoding].copy()

# Mid-cell expression: evaluated but not displayed.
df.loc[df["content"].str.contains("-------------"), "content"]


def split_other_content(i):
    """Return the text before the first run of 13 dashes, or all of it if there is none."""
    own_text, _, _ = i.partition('-------------')
    return own_text


# The flags must be computed before the content is trimmed at the dash separator.
dash_separator = "-------------"
df["has_other_content"] = df["content"].str.contains(dash_separator)
df["if_forwarded"] = df["content"].str.contains("------------- Forwarded")
df['content'] = df['content'].map(split_other_content)

# Drop the helper columns and index the frame by file path.
helper_columns = ['pre_info', 'test_true', 'num_info']
df = df.drop(columns=helper_columns).set_index("file")

# Parse what we can from the emails

In [21]:
# NOTE(review): the preview output further down shows both -08:00 and -07:00
# offsets in Date; consider utc=True if a single datetime dtype is required.
df['Date'] = pd.to_datetime(df['Date'])


def _parent_folder(file_path):
    """Return the second-to-last '/'-separated component of `file_path`."""
    return file_path.rsplit('/', 2)[-2]


df['folder'] = df.index.map(_parent_folder)


def clean_cc(x):
    """Strip surrounding whitespace from each comma-separated entry and re-join with ','."""
    return ','.join(entry.strip() for entry in x.split(','))


# Tidy and lower-case every address column the same way.
for address_column in ('From', 'To', 'Cc', 'Bcc'):
    df[address_column] = df[address_column].map(clean_cc).str.lower()

# The raw message text is no longer needed once the fields are extracted.
df = df.drop(columns=['message'])

In [22]:
# Inspect the cleaned, file-indexed frame.
df

Unnamed: 0_level_0,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,...,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,folder
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/inbox/3.,Phillip\nMy interpretation of this is that we ...,<10326858.1075855377484.JavaMail.evans@thyme>,2001-12-10 15:31:51-08:00,david.port@enron.com,k..allen@enron.com,FW: Gas P&L by day,,1.0,text/plain; charset=us-ascii,7bit,...,"Port, David </O=ENRON/OU=NA/CN=RECIPIENTS/CN=D...","Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\Inbox",Allen-P,pallen (Non-Privileged).pst,False,False,inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/inbox/1.,\nPlease let me know if you still need Curve ...,<16159836.1075855377439.JavaMail.evans@thyme>,2001-12-07 10:06:42-08:00,heather.dunton@enron.com,k..allen@enron.com,RE: West Position,,1.0,text/plain; charset=us-ascii,7bit,...,"Dunton, Heather </O=ENRON/OU=NA/CN=RECIPIENTS/...","Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...",,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\Inbox",Allen-P,pallen (Non-Privileged).pst,False,False,inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/inbox/10.,\n\n_____________________DAVID COURSEY________...,<14955894.1075855377681.JavaMail.evans@thyme>,2001-12-30 22:49:42-08:00,anchordesk_daily@anchordesk.zdlists.com,pallen@enron.com,ANCHORDESK: Hope ahead: What I learned from 20...,,1.0,text/plain; charset=ANSI_X3.4-1968,7bit,...,"""AnchorDesk"" <AnchorDesk_Daily@anchordesk.zdli...",pallen@ENRON.COM,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\Inbox",Allen-P,pallen (Non-Privileged).pst,False,False,inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/inbox/11.,"Dear phillip,\n\n\nThis e-mail is automated no...",<7462038.1075855377703.JavaMail.evans@thyme>,2001-12-30 23:42:30-08:00,subscriptions@intelligencepress.com,pallen@enron.com,"NGI Publications - Monday, December 31st 2001",,1.0,text/plain; charset=us-ascii,7bit,...,subscriptions@intelligencepress.com@ENRON,pallen@enron.com,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\Inbox",Allen-P,pallen (Non-Privileged).pst,False,False,inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/inbox/12.,"\n[IMAGE] [IMAGE] [IMAGE] [IMAGE] $ 2,500 ...",<21572157.1075855377726.JavaMail.evans@thyme>,2001-12-31 02:24:51-08:00,prizemachine@feedback.iwon.com,pallen@enron.com,"Click. Spin. Chances to Win up to $10,000!",,1.0,text/plain; charset=us-ascii,7bit,...,Prize Machine<PrizeMachine@feedback.iwon.com>@...,pallen@enron.com,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\Inbox",Allen-P,pallen (Non-Privileged).pst,False,False,inbox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/notes_inbox/50.,"Due to some problems with my email yesterday, ...",<18640335.1075855713056.JavaMail.evans@thyme>,2001-05-10 06:05:00-07:00,lisa.jacobson@enron.com,"lisa.jacobson@enron.com,kevin.mcgowan@enron.co...",RSVP REQUESTED - Emissions Strategy Meeting....,,1.0,text/plain; charset=us-ascii,7bit,...,Lisa Jacobson,"Lisa Jacobson, Kevin McGowan, Daniel Reck, Mat...",,,\Phillip_Allen_June2001\Notes Folders\Notes inbox,Allen-P,pallen.nsf,False,False,notes_inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/notes_inbox/6.,"Attached is davis.doc, a quick & dirty report ...",<8522013.1075855678733.JavaMail.evans@thyme>,2000-12-13 07:57:00-08:00,aod@newsdata.com,western.price.survey.contacts@ren-3.cais.net,Report on News Conference,alb@cpuc.ca.gov,1.0,text/plain; charset=us-ascii,7bit,...,"""Arthur O'Donnell"" <aod@newsdata.com>",Western.Price.Survey.contacts@ren-3.cais.net,"""'alb@cpuc.ca.gov'"" <alb@cpuc.ca.gov>",,\Phillip_Allen_Dec2000\Notes Folders\Notes inbox,Allen-P,pallen.nsf,False,False,notes_inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/notes_inbox/7.,----- Forwarded by Sarah Novosel/Corp/Enron on...,<19166235.1075855678756.JavaMail.evans@thyme>,2000-12-13 08:39:00-08:00,sarah.novosel@enron.com,"steven.kean@enron.com,richard.shapiro@enron.co...",Final FIled Version,,1.0,text/plain; charset=us-ascii,7bit,...,Sarah Novosel,"Steven J Kean, Richard Shapiro, James D Steffe...",,,\Phillip_Allen_Dec2000\Notes Folders\Notes inbox,Allen-P,pallen.nsf,False,False,notes_inbox
/dd_volume/Development/Python/Thesis/code/datasets/data/raw/test/allen-p/notes_inbox/8.,Transwestern Pipeline Co. posted new notice(s)...,<4375099.1075855678796.JavaMail.evans@thyme>,2000-12-13 08:34:00-08:00,critical.notice@enron.com,"ywang@enron.com,patti.sullivan@enron.com,phill...",New Notice from Transwestern Pipeline Co.,,1.0,text/plain; charset=us-ascii,7bit,...,critical.notice@Enron.com,"ywang@Enron.com, Patti.Sullivan@Enron.com, Phi...",,,\Phillip_Allen_Dec2000\Notes Folders\Notes inbox,Allen-P,pallen.nsf,False,False,notes_inbox


In [23]:
# Persist the cleaned pandas frame; the Spark section reads this same path back.
df.to_parquet('../data/tmp/emails.parquet')

# Convert back to a Spark DataFrame

In [4]:
# Session settings, applied to the builder in the order listed.
spark_settings = {
    'spark.sql.legacy.timeParserPolicy': 'LEGACY',
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
    "spark.memory.offHeap.enabled": True,
    "spark.memory.offHeap.size": "16g",
}

spark_builder = SparkSession.builder.appName(f'{DATASET}_preprocess')
for setting_name, setting_value in spark_settings.items():
    spark_builder = spark_builder.config(setting_name, setting_value)

spark = spark_builder.getOrCreate()

22/01/02 00:37:18 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/02 00:37:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/02 00:37:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/02 00:37:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Load the parquet written by the pandas section. From here on `df` is rebound to
# the object returned by spark.read.parquet, no longer the pandas frame.
df = spark.read.parquet('../data/tmp/emails.parquet')

                                                                                

In [6]:
# Remove every double-quote character from the four address columns.
for address_column in ('To', 'From', 'Cc', 'Bcc'):
    df = df.withColumn(
        address_column,
        F.regexp_replace(F.col(address_column), '"', ''),
    )

In [7]:
# Spot-check the first rows after quote stripping.
df.head(5)

                                                                                

[Row(content='Here is our forecast\n\n ', Message-ID='<18782981.1075855378110.JavaMail.evans@thyme>', Date=datetime.datetime(2001, 5, 15, 1, 39), From='phillip.allen@enron.com', To='tim.belden@enron.com', Subject='', Cc='', Mime-Version='1.0', Content-Type='text/plain; charset=us-ascii', Content-Transfer-Encoding='7bit', Bcc='', X-From='Phillip K Allen', X-To='Tim Belden <Tim Belden/Enron@EnronXGate>', X-cc='', X-bcc='', X-Folder="\\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail", X-Origin='Allen-P', X-FileName='pallen (Non-Privileged).pst', has_other_content=False, if_forwarded=False, folder='_sent_mail', file='allen-p/_sent_mail/1.'),
 Row(content="Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs f

In [8]:
# One User node per distinct, non-empty address found in To / From / Cc / Bcc.
address_lists = [
    F.split(F.col(address_column), ',')
    for address_column in ('To', 'From', 'Cc', 'Bcc')
]
all_addresses = F.explode(F.concat(*address_lists)).alias('email')

df_nodes_users = (
    df.select(all_addresses)
    .distinct()
    .filter("email != ''")
    # An address counts as internal when it contains 'enron' anywhere.
    .withColumn('is_internal', F.col('email').like('%enron%'))
    # The address itself doubles as the node id.
    .withColumn('id', F.col('email'))
    .dropDuplicates(['id'])
)
df_nodes_users.count()

                                                                                

87678

In [9]:
# Spot-check a few user nodes.
df_nodes_users.head(20)

                                                                                

[Row(email='#2.martin@enron.com', is_internal=True, id='#2.martin@enron.com'),
 Row(email='#23.training@enron.com', is_internal=True, id='#23.training@enron.com'),
 Row(email='#24.training@enron.com', is_internal=True, id='#24.training@enron.com'),
 Row(email='#25.training@enron.com', is_internal=True, id='#25.training@enron.com'),
 Row(email='#26.training@enron.com', is_internal=True, id='#26.training@enron.com'),
 Row(email='#28.training@enron.com', is_internal=True, id='#28.training@enron.com'),
 Row(email='#29.training@enron.com', is_internal=True, id='#29.training@enron.com'),
 Row(email='#30.training@enron.com', is_internal=True, id='#30.training@enron.com'),
 Row(email="''blanchard@enron.com", is_internal=True, id="''blanchard@enron.com"),
 Row(email="''collins@enron.com", is_internal=True, id="''collins@enron.com"),
 Row(email="''hunt@enron.com", is_internal=True, id="''hunt@enron.com"),
 Row(email="'.''ann@enron.com", is_internal=True, id="'.''ann@enron.com"),
 Row(email="'.''

In [10]:
# Address columns become arrays by splitting on ','; each keeps its column name.
address_arrays = [
    F.split(F.col(address_column), ',').alias(address_column)
    for address_column in ('From', 'To', 'Cc', 'Bcc')
]

# One Email node per Message-ID.
df_nodes_emails = (
    df.select(
        F.col('Message-ID').alias('id'),
        F.col('content'),
        # Internal when the (still unsplit) From string contains 'enron'.
        F.col('From').like('%enron%').alias('is_internal'),
        *address_arrays,
        F.col('Date'),
        F.col('Subject'),
        F.col('folder'),
        F.col('if_forwarded'),
    )
    .distinct()
    .dropDuplicates(['id'])
)
df_nodes_emails.count()

517398

In [11]:
def _address_edges(emails, address_column):
    """Build the distinct (src, dst, weight) edges for one address column.

    `emails` is the email-node frame whose `address_column` holds an array of
    addresses. Each non-empty address becomes `src`, the email's `id` becomes
    `dst`, and `weight` is the constant 1.
    """
    return (
        emails.select(
            F.explode(F.col(address_column)).alias('src'),
            F.col('id').alias('dst'),
            F.lit(1).alias('weight'),
        )
        .filter("src != ''")
        .distinct()
    )


df_edges_from = _address_edges(df_nodes_emails, 'From')
print(df_edges_from.count())

df_edges_to = _address_edges(df_nodes_emails, 'To')
print(df_edges_to.count())

df_edges_cc = _address_edges(df_nodes_emails, 'Cc')
print(df_edges_cc.count())

# Bcc edges are currently disabled.
# df_edges_bcc = _address_edges(df_nodes_emails, 'Bcc')
# print(df_edges_bcc.count())

                                                                                

517398


                                                                                

3101175




561305


                                                                                

In [12]:
# Export every node and edge frame as parquet, replacing any previous export.
processed_outputs = [
    (df_nodes_users, 'nodes_User'),
    (df_nodes_emails, 'nodes_Email'),
    (df_edges_from, 'edges_SENT'),
    (df_edges_to, 'edges_TO'),
    (df_edges_cc, 'edges_CC'),
]
for frame, table_name in processed_outputs:
    frame.write.parquet(DATASET.processed_str(table_name), mode='overwrite')

                                                                                

In [13]:
from datasets.build_schema import build_schema

# (label, parquet path) for every node table.
node_tables = [
    ('User', DATASET.processed_str('nodes_User')),
    ('Email', DATASET.processed_str('nodes_Email')),
]

# Presumably (edge type, source label, target label, parquet path); confirm
# against build_schema.
# NOTE(review): the edge cell builds edges_TO and edges_CC with src = address and
# dst = Message-ID, the same orientation as edges_SENT, yet they are declared here
# as Email -> User. Confirm whether build_schema expects src/dst in this order.
edge_tables = [
    ('Sent', 'User', 'Email', DATASET.processed_str('edges_SENT')),
    ('Received', 'Email', 'User', DATASET.processed_str('edges_TO')),
    ('ReceivedCc', 'Email', 'User', DATASET.processed_str('edges_CC')),
]

build_schema(spark, name=str(DATASET), nodes=node_tables, edges=edge_tables)

DatasetSchema(name='enron-mail-20150507', prefix='EnronMail_20150507', database='enron-mail-20150507', description=None, nodes=[NodeSchema(label='User', id='id', path='data/processed/enron-mail-20150507/nodes_User', properties=[Property(name='email', type='string', ignore=False), Property(name='is_internal', type='boolean', ignore=False), Property(name='id', type='string', ignore=False)]), NodeSchema(label='Email', id='id', path='data/processed/enron-mail-20150507/nodes_Email', properties=[Property(name='id', type='string', ignore=False), Property(name='content', type='string', ignore=False), Property(name='is_internal', type='boolean', ignore=False), Property(name='From', type='string[]', ignore=False), Property(name='To', type='string[]', ignore=False), Property(name='Cc', type='string[]', ignore=False), Property(name='Bcc', type='string[]', ignore=False), Property(name='Date', type='datetime', ignore=False), Property(name='Subject', type='string', ignore=False), Property(name='folde