In [None]:
%load_ext autoreload
%autoreload 2

In [4]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession


In [5]:
from shared.schema import DatasetSchema

# Look up the dataset descriptor for the Enron mail dump by its name.
DATASET = DatasetSchema.load_schema('enron-mail-20150507')
# NOTE(review): the descriptor is written back right after loading — presumably to
# materialise it on disk; confirm against shared.schema.
DATASET.save_schema()

In [6]:
import pandas as pd

# Load the raw e-mail dump; the preview shows one row per mail with a 'file' path and the
# full raw 'message' text.
df = pd.read_csv(DATASET.raw('emails.csv'))
df.head(5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [7]:
import pathlib
import pandas as pd
from tqdm import tqdm
import os

if str(DATASET) == 'test':
    # For the 'test' dataset the frame is rebuilt from the individual files found
    # under the raw directory, replacing the frame read from emails.csv.
    path = pathlib.Path(DATASET.raw())
    files = []
    for file in tqdm(path.glob('**/*')):
        if os.path.isfile(file):
            try:
                with open(file, 'r') as handle:
                    body = handle.read()
                files.append({'file': str(file), 'message': body})
            except Exception as err:
                # Best effort: report the unreadable file and carry on with the rest
                print(err)

    df = pd.DataFrame(files)

In [8]:
# Normalise Windows (CRLF) line endings to Unix (LF) in every raw message

WINDOWS_LINE_ENDING = '\r\n'
UNIX_LINE_ENDING = '\n'

df['message'] = df['message'].map(
    lambda raw_text: raw_text.replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING)
)

# Enron Dataset Preprocessing
Credit: https://www.kaggle.com/oalvay/enron-emails-complete-preprocessing

In [9]:
import re
import pandas as pd


def info_part(i):
    """Return the header block of a raw message: the text before the first blank line.

    If the message contains no blank line the whole text is returned.
    """
    header, _, _ = i.partition('\n\n')
    return header


def content_part(i):
    """Return the body of a raw message: the text after the first blank line.

    A message without any blank line has no body; an empty string is returned for it
    (indexing the split result, as before, raised IndexError on such input).
    """
    _, _, body = i.partition('\n\n')
    return body


# Header labels, in the order the extraction code expects them to appear in a message.
words2split = [
    'Message-ID: ',
    'Date: ',
    'From: ',
    'To: ',
    'Subject: ',
    'Cc: ',
    'Mime-Version: ',
    'Content-Type: ',
    'Content-Transfer-Encoding: ',
    'Bcc: ',
    'X-From: ',
    'X-To: ',
    'X-cc: ',
    'X-bcc: ',
    'X-Folder: ',
    'X-Origin: ',
    'X-FileName: ',
]
# Column names: each label without its trailing ': '
features_naming = [label[:-2] for label in words2split]
# Regex alternation that matches any one of the labels
split_condition = '|'.join(words2split)

# Separate every raw message into its header block and its body
df['pre_info'] = df['message'].map(info_part)
df['content'] = df['message'].map(content_part)
df['test_true'] = True


# Some subject lines contain text that looks like a header label (e.g. ' Date: '), which
# confuses the label-based splitting, so those occurrences are rewritten first.
def duplicated_info(i):
    """Rewrite space-prefixed look-alike labels so they no longer match a header label.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        (' Date: ', ' Date- '),
        (' Subject: ', ' Subject2: '),
        (' To: ', ' To- '),
        (' (Subject: ', ' (Subject- '),
    )
    for needle, replacement in substitutions:
        i = i.replace(needle, replacement)
    return i


# Rewrite look-alike labels in every header block before any label-based splitting happens
df['pre_info'] = df['pre_info'].map(duplicated_info)


# Count how many pieces each header block breaks into when split on the known labels
def num_part(i):
    """Return the number of pieces obtained by splitting `i` on `split_condition`."""
    pieces = re.split(split_condition, i)
    return len(pieces)


# Piece count per header block; used below to decide which labels have to be inserted
df['num_info'] = df['pre_info'].map(num_part)


# Around 20k emails have no 'To: ' label, so an empty one is inserted for them
def add_to(i):
    """Insert an empty 'To: ' line directly before every line-initial 'Subject: ' label."""
    subject_label = '\nSubject: '
    return i.replace(subject_label, '\nTo: ' + subject_label)


# Header blocks with 17 or 15 pieces get an empty 'To: ' label inserted.
# NOTE(review): presumably 17 = only 'To: ' missing and 15 = 'To: ', 'Cc: ' and 'Bcc: '
# all missing — confirm against the data.
temp_condition = df['num_info'].isin([15, 17])
df.loc[temp_condition, 'pre_info'] = (
    df.loc[temp_condition, 'pre_info'].map(add_to)
)

# The "Cc:" and "Bcc:" labels are handled the same way, for blocks with 16 or 15 pieces
temp_condition = df['num_info'].isin([15, 16])


def add_bcc(i):
    """Insert an empty 'Bcc: ' line directly before every line-initial 'X-From: ' label."""
    x_from_label = '\nX-From: '
    return i.replace(x_from_label, '\nBcc: ' + x_from_label)


# Insert the empty 'Bcc: ' label into the header blocks selected by temp_condition
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_bcc)


def add_cc(i):
    """Insert an empty 'Cc: ' line directly before every line-initial 'Mime-Version: ' label."""
    mime_label = '\nMime-Version: '
    return i.replace(mime_label, '\nCc: ' + mime_label)


# Insert the empty 'Cc: ' label into the header blocks selected by temp_condition
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_cc)

# Re-count the pieces now that the empty To/Cc/Bcc labels have been inserted
df['num_info'] = df['pre_info'].map(num_part)

# A complete header block splits into one piece per label plus the piece in front of the
# first label (the field extraction skips piece 0), i.e. 18 pieces for the 17 labels.
expected_parts = len(words2split) + 1
well_formed = df['num_info'] == expected_parts

# Keep the rejected rows around for inspection; continue with the complete ones only
df_remove = df.loc[~well_formed].copy()
df = df.loc[well_formed].copy()


def info_split(i):
    """Return the value of the field selected by the module-level `feature_idx`.

    The header block is split on the known labels; the piece at position
    `feature_idx + 1` is returned without its last character (the newline in front
    of the next label).
    """
    pieces = re.split(split_condition, i)
    value = pieces[feature_idx + 1]
    return value[:-1]


def info_split_last(i):
    """Return the value of the field selected by the module-level `feature_idx`, unmodified.

    Used for the last label (X-FileName), whose value has no trailing character to remove.
    """
    pieces = re.split(split_condition, i)
    return pieces[feature_idx + 1]


# Split every header block once and reuse the pieces for all fields. Previously the same
# regex split was re-run over the whole column once per field.
header_pieces = df['pre_info'].map(lambda header: re.split(split_condition, header))
last_idx = len(words2split) - 1

for feature_idx in range(len(words2split)):
    piece_pos = feature_idx + 1  # piece 0 is the text in front of the first label
    if feature_idx != last_idx:
        # The value runs up to the newline in front of the next label; drop that character
        df[features_naming[feature_idx]] = header_pieces.map(lambda pieces: pieces[piece_pos][:-1])
    else:
        # Last field (X-FileName): keep the value as it is
        df[features_naming[feature_idx]] = header_pieces.map(lambda pieces: pieces[piece_pos])

# The per-row piece lists are only needed for the extraction above
del header_pieces

# Set aside rows whose Content-Transfer-Encoding holds what looks like a (truncated)
# Content-Type value — presumably headers that were split at the wrong place.
bad_encoding = df['Content-Transfer-Encoding'] == 'text/plain; charset=us-asci'

# Take explicit copies (as done for df_remove / df earlier) so that the column
# assignments that follow operate on an independent frame rather than on a filtered slice.
df_remove2 = df.loc[bad_encoding].copy()
df = df.loc[~bad_encoding].copy()


def split_other_content(i):
    """Return the part of `i` that precedes the first run of thirteen dashes.

    Text without such a run is returned unchanged.
    """
    own_text, _, _ = i.partition('-------------')
    return own_text


# The flags have to be computed from the untrimmed body, so take it before it is overwritten
body_text = df["content"]
df["has_other_content"] = body_text.str.contains("-------------")
df["if_forwarded"] = body_text.str.contains("------------- Forwarded")

# Keep only the text in front of the first dash separator
df['content'] = body_text.map(split_other_content)

# Drop the helper columns and index the frame by the mail's file path
df = df.drop(columns=['pre_info', 'test_true', 'num_info'])
df = df.set_index("file")

# Parse what we can from the emails

In [10]:
# Parse the raw 'Date' header strings into datetimes.
# NOTE(review): the displayed frame shows differing UTC offsets (-07:00 / -08:00) in this
# column — confirm the resulting dtype is what the parquet writer and Spark expect.
df['Date'] = pd.to_datetime(df['Date'])
# The index holds the mail's file path; its second-to-last component is the mailbox folder
df['folder'] = df.index.map(lambda x: x.split('/')[-2])


def clean_cc(x):
    """Strip surrounding whitespace from each comma-separated entry of `x`.

    The cleaned entries are joined again with a bare comma; empty entries are kept.
    """
    return ','.join(map(str.strip, x.split(',')))


# Normalise every address column the same way: trim each entry, then lower-case
for address_column in ('From', 'To', 'Cc', 'Bcc'):
    df[address_column] = df[address_column].map(clean_cc).str.lower()

# The raw message text is no longer needed once all fields have been extracted
df.drop(columns=['message'], inplace=True)

In [11]:
df

Unnamed: 0_level_0,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,...,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,folder
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
allen-p/_sent_mail/1.,Here is our forecast\n\n,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,,1.0,text/plain; charset=us-ascii,7bit,...,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,False,False,_sent_mail
allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,,1.0,text/plain; charset=us-ascii,7bit,...,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,False,False,_sent_mail
allen-p/_sent_mail/100.,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,,1.0,text/plain; charset=us-ascii,7bit,...,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,False,False,_sent_mail
allen-p/_sent_mail/1000.,"Randy,\n\n Can you send me a schedule of the s...",<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,,1.0,text/plain; charset=us-ascii,7bit,...,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,False,False,_sent_mail
allen-p/_sent_mail/1001.,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,,1.0,text/plain; charset=us-ascii,7bit,...,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,False,False,_sent_mail
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zufferli-j/sent_items/95.,This is a trade with OIL-SPEC-HEDGE-NG (John L...,<26807948.1075842029936.JavaMail.evans@thyme>,2001-11-28 13:30:11-08:00,john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,,1.0,text/plain; charset=us-ascii,7bit,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,False,False,sent_items
zufferli-j/sent_items/96.,Some of my position is with the Alberta Term b...,<25835861.1075842029959.JavaMail.evans@thyme>,2001-11-28 12:47:48-08:00,john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,,1.0,text/plain; charset=us-ascii,7bit,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,False,False,sent_items
zufferli-j/sent_items/97.,2\n\n -----Original Message-----\nFrom: \tDouc...,<28979867.1075842029988.JavaMail.evans@thyme>,2001-11-28 07:20:00-08:00,john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,,1.0,text/plain; charset=us-ascii,7bit,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,False,False,sent_items
zufferli-j/sent_items/98.,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,<22052556.1075842030013.JavaMail.evans@thyme>,2001-11-27 11:52:45-08:00,john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,,1.0,text/plain; charset=us-ascii,7bit,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,False,False,sent_items


In [13]:
# Persist the cleaned frame as parquet; the Spark part of the notebook reads it from this path
df.to_parquet('../../data/tmp/emails.parquet')

# Load the cleaned emails into a Spark DataFrame

In [14]:
# Session settings, applied to the builder in the order listed
spark_settings = [
    ('spark.sql.legacy.timeParserPolicy', 'LEGACY'),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "16g"),
]

session_builder = SparkSession.builder.appName(f'{DATASET}_preprocess')
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already running session if there is one, otherwise starts a new one
spark = session_builder.getOrCreate()

22/01/22 22:24:23 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/22 22:24:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/22 22:24:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/22 22:24:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/22 22:24:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/22 22:24:25 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/01/22 22:24:25 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/01/22 22:24:25

In [15]:
# From here on `df` is a Spark DataFrame read from the parquet file written by the pandas step
df = spark.read.parquet('../../data/tmp/emails.parquet')

                                                                                

In [16]:
# Remove every double quote from the address columns
for address_column in ('To', 'From', 'Cc', 'Bcc'):
    df = df.withColumn(
        address_column,
        F.regexp_replace(F.col(address_column), '"', ''),
    )

In [17]:
df.head(5)

                                                                                

[Row(content='Here is our forecast\n\n ', Message-ID='<18782981.1075855378110.JavaMail.evans@thyme>', Date=datetime.datetime(2001, 5, 15, 1, 39), From='phillip.allen@enron.com', To='tim.belden@enron.com', Subject='', Cc='', Mime-Version='1.0', Content-Type='text/plain; charset=us-ascii', Content-Transfer-Encoding='7bit', Bcc='', X-From='Phillip K Allen', X-To='Tim Belden <Tim Belden/Enron@EnronXGate>', X-cc='', X-bcc='', X-Folder="\\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail", X-Origin='Allen-P', X-FileName='pallen (Non-Privileged).pst', has_other_content=False, if_forwarded=False, folder='_sent_mail', file='allen-p/_sent_mail/1.'),
 Row(content="Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs f

In [18]:
# User nodes: one row per distinct, non-empty address seen in To / From / Cc / Bcc.
address_lists = [F.split(F.col(column), ',') for column in ('To', 'From', 'Cc', 'Bcc')]

df_nodes_users = (
    df
    # one output row per address occurrence across all four columns
    .select(F.explode(F.concat(*address_lists)).alias('email'))
    .distinct()
    # empty header fields leave behind '' entries
    .filter("email != ''")
    # an address counts as internal when it contains 'enron' anywhere
    .withColumn('is_internal', F.col('email').like('%enron%'))
    # `id` is a copy of `email`, which distinct() above already made unique, so no
    # further de-duplication on `id` is needed
    .withColumn('id', F.col('email'))
)
df_nodes_users.count()

                                                                                

87678

In [19]:
df_nodes_users.head(20)

                                                                                

[Row(email='#2.martin@enron.com', is_internal=True, id='#2.martin@enron.com'),
 Row(email='#23.training@enron.com', is_internal=True, id='#23.training@enron.com'),
 Row(email='#24.training@enron.com', is_internal=True, id='#24.training@enron.com'),
 Row(email='#25.training@enron.com', is_internal=True, id='#25.training@enron.com'),
 Row(email='#26.training@enron.com', is_internal=True, id='#26.training@enron.com'),
 Row(email='#28.training@enron.com', is_internal=True, id='#28.training@enron.com'),
 Row(email='#29.training@enron.com', is_internal=True, id='#29.training@enron.com'),
 Row(email='#30.training@enron.com', is_internal=True, id='#30.training@enron.com'),
 Row(email="''blanchard@enron.com", is_internal=True, id="''blanchard@enron.com"),
 Row(email="''collins@enron.com", is_internal=True, id="''collins@enron.com"),
 Row(email="''hunt@enron.com", is_internal=True, id="''hunt@enron.com"),
 Row(email="'.''ann@enron.com", is_internal=True, id="'.''ann@enron.com"),
 Row(email="'.''

In [20]:
# Email nodes: one row per Message-ID, with the address fields split into arrays.
df_nodes_emails = (
    df
    .select(
        F.col('Message-ID').alias('id'),
        F.col('content'),
        # internal when the sender field contains 'enron' anywhere
        F.col('From').like('%enron%').alias('is_internal'),
        F.split(F.col('From'), ',').alias('From'),
        F.split(F.col('To'), ',').alias('To'),
        F.split(F.col('Cc'), ',').alias('Cc'),
        F.split(F.col('Bcc'), ',').alias('Bcc'),
        F.col('Date'),
        F.col('Subject'),
        F.col('folder'),
        F.col('if_forwarded'),
    )
    # De-duplicating on `id` already leaves exactly one row per message, so a preceding
    # full-row distinct() (a shuffle over every column, including `content`) adds nothing.
    # Which of several rows sharing an id survives is not pinned down by this code.
    .dropDuplicates(['id'])
)
df_nodes_emails.count()

                                                                                

517398

In [21]:
def _address_edges(address_column, email_is_source):
    """Build one edge per distinct (email, address) pair found in `address_column`.

    address_column: name of an array column of df_nodes_emails ('From', 'To', 'Cc').
    email_is_source: True for edges pointing Email -> User (`src` is the email id, `dst`
        the address); False for User -> Email (`src` is the address, `dst` the email id).
    Returns a DataFrame with the columns src, dst and weight (always 1). Entries whose
    address is the empty string are dropped.
    """
    address = F.explode(F.col(address_column))
    email_id = F.col('id')
    if email_is_source:
        src, dst, address_side = email_id, address, 'dst'
    else:
        src, dst, address_side = address, email_id, 'src'
    return (
        df_nodes_emails
        .select(src.alias('src'), dst.alias('dst'), F.lit(1).alias('weight'))
        .filter(f"{address_side} != ''")
        .distinct()
    )


# SENT is declared as User -> Email in the graph schema: the sender is the source
df_edges_from = _address_edges('From', email_is_source=False)
print(df_edges_from.count())

# ADDRESSED_TO / ADDRESSED_CC are declared as Email -> User in the graph schema, so the
# email id has to be `src` and the recipient `dst`. Earlier these frames carried the
# recipient in `src`, which contradicted the declared source/target types.
df_edges_to = _address_edges('To', email_is_source=True)
print(df_edges_to.count())

df_edges_cc = _address_edges('Cc', email_is_source=True)
print(df_edges_cc.count())

# Bcc edges are intentionally not generated.

                                                                                

517398


                                                                                

3101175




561305


                                                                                

In [22]:
# Output name -> frame, written in this order; existing output is overwritten
processed_outputs = {
    'nodes_User': df_nodes_users,
    'nodes_Email': df_nodes_emails,
    'edges_SENT': df_edges_from,
    'edges_ADDRESSED_TO': df_edges_to,
    'edges_ADDRESSED_CC': df_edges_cc,
}

for output_name, frame in processed_outputs.items():
    frame.write.parquet(DATASET.processed_str(output_name), mode='overwrite')

                                                                                

In [26]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

# Describe the exported graph and save that description under the processed-data location.
# Node and edge property lists are derived from the Spark schemas of the frames written above.
(
    GraphSchema()
        # Node types: users are labelled by their address, emails by their subject
        .add_node_schema('User', NodeSchema.from_spark(df_nodes_users.schema, label='email'))
        .add_node_schema('Email', NodeSchema.from_spark(df_nodes_emails.schema, label='Subject', timestamp='Date', interaction=True))
        # Edge types, each declared as directed with explicit source and target node types
        .add_edge_schema('SENT', EdgeSchema.from_spark(df_edges_from.schema, source_type='User', target_type='Email', directed=True))
        .add_edge_schema('ADDRESSED_TO', EdgeSchema.from_spark(df_edges_to.schema, source_type='Email', target_type='User', directed=True))
        .add_edge_schema('ADDRESSED_CC', EdgeSchema.from_spark(df_edges_cc.schema, source_type='Email', target_type='User', directed=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/enron-mail-20150507'), nodes={'User': NodeSchema(_type='User', _schema=..., label='email', properties={'email': GraphProperty(_name='email', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'is_internal': GraphProperty(_name='is_internal', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Email': NodeSchema(_type='Email', _schema=..., label='Subject', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'content': GraphProperty(_name='content', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'is_internal': GraphProperty(_name='is_internal', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'From': GraphProperty(_name='From', dtype=DType(atomic=<DTypeAtomic.STR