In [2]:
!pip install opendatasets --quiet

In [19]:
import email
import os
import random
import re

import numpy as np
import opendatasets as od
import pandas as pd

# Widen text columns so long e-mail bodies are not cut off early in previews.
pd.options.display.max_colwidth = 300

In [4]:
# Fetch the Enron e-mail dataset from Kaggle (lands in ./enron-email-dataset).
url = "https://www.kaggle.com/datasets/wcukierski/enron-email-dataset"
od.download(url)

Downloading enron-email-dataset.zip to ./enron-email-dataset


100%|██████████| 358M/358M [00:04<00:00, 79.1MB/s]





In [5]:
# Confirm the download: list the dataset directory with human-readable sizes.
!ls -lh ./enron-email-dataset/

total 1.4G
-rw-r--r-- 1 root root 1.4G Jan  7 09:54 emails.csv


# Enron Emails

In [6]:
# Read the whole Enron CSV into memory and preview the first five rows.
emails = pd.read_csv(filepath_or_buffer="./enron-email-dataset/emails.csv")
emails.head(n=5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [7]:
# (rows, columns) of the loaded Enron frame.
emails.shape

(517401, 2)

In [9]:
# Print (rather than display) the first raw message so its line breaks render.
print(emails.loc[0, 'message'])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


In [10]:
# Extract messages
def extract_msg(df):
  """Return the payload of every raw e-mail stored in ``df['message']``.

  Each entry is parsed with ``email.message_from_string`` and the result of
  its ``get_payload()`` call is collected. The output list has one element per
  input message, in the same order. Payloads are returned exactly as the
  parser yields them; no transfer-encoding decoding is applied.
  """
  return [
    email.message_from_string(raw_text).get_payload()
    for raw_text in df['message']
  ]

In [12]:
# Parse every Enron message and keep only its payload (one entry per row).
bodies = extract_msg(emails)

In [13]:
# Draw a 10,000-message subset of the Enron bodies. A dedicated, seeded
# generator makes the sample identical on every Restart & Run All without
# altering the state of the global `random` module.
bodies_df = pd.DataFrame(random.Random(42).sample(bodies, 10000))
bodies_df.head()

Unnamed: 0,0
0,"Kim:\n\n(See attached file: EPECONb BASE.doc)\n\nThis is EPE's base gas contract - from the GISB S-T contract model.\n\nEPE's lawyer working on the contract with ENRON in 2000 was Mr. Will\nGuerant\n at 512-495-8832.\n\nThanks,\n\nBarry\n - EPECONb BASE.doc"
1,"What is the status of this?\n---------------------- Forwarded by Richard B Sanders/HOU/ECT on 09/17/99 \n12:22 PM ---------------------------\n\n\nDale Snyder\n09/13/99 03:41 PM\nTo: Mark J Leskowitz/HOU/ECT@ECT\ncc: Chad Pennix/HOU/ECT@ECT, Tomas Tellez/HOU/ECT@ECT, Michael \nPhilips/HOU/ECT@EC..."
2,"Please find attached the final program for an exciting conference on ""REAL\nOPTIONS VALUATION IN THE NEW ECONOMY: Internet/E-commerce,\nR&D/Pharmaceuticals, Energy."" The conference, organised in New York City\nMarch 13-15 by the Real Options Group and co-sponsored by Ernst & Young\nLLP, is addre..."
3,Carolyn George - 3-3439
4,"I think there's probably a lot of truth to the Eric theory. The retaliatio=\nn/numbing afforded by using people and tossing them aside can be tempting. =\n My mini-rampage after Ted was totally based upon treating any representat=\nive of Ted's sex like dirt. Fortunately, my behavior was very..."


# Fraudulent Emails
Having loaded the Enron emails, let’s do the same for the “419” fraudulent email corpus, so that we can have some example data in our training set representing the spam class.

In [14]:
# Fetch the fraudulent e-mail corpus from Kaggle (lands in ./fraudulent-email-corpus).
url = "https://www.kaggle.com/datasets/rtatman/fraudulent-email-corpus"
od.download(url)

Downloading fraudulent-email-corpus.zip to ./fraudulent-email-corpus


100%|██████████| 5.52M/5.52M [00:00<00:00, 41.0MB/s]







In [15]:
# Confirm the download: list the contents of the corpus directory.
!ls -l ./fraudulent-email-corpus/

total 16940
-rw-r--r-- 1 root root 17344435 Jan  7 10:26 fradulent_emails.txt


In [22]:
# The file name really is spelled "fradulent" in the downloaded dataset.
fp = os.path.join("./fraudulent-email-corpus", "fradulent_emails.txt")

# latin-1 maps every byte to a character, so reading cannot fail on odd bytes.
with open(fp, encoding="latin-1") as f:
  data = f.read()

# Each message starts with a "From r" separator line. Anchor the split to the
# start of a line so that the same words occurring in the middle of a line
# (e.g. "... From reliable sources ...") do not cut a message in two.
fraud_emails = re.split(r"^From r", data, flags=re.MULTILINE)

In [23]:
# Number of chunks produced by the split (includes the chunk before the first separator).
len(fraud_emails)

3978

In [25]:
# Inspect one raw chunk (headers followed by the body) to check the split.
print(fraud_emails[3])

  Thu Oct 31 17:27:16 2002
Return-Path: <obong_715@epatra.com>
X-Sieve: cmu-sieve 2.0
Return-Path: <obong_715@epatra.com>
Message-Id: <200210312227.g9VMQvDj017948@bluewhale.cs.CU>
From: "PRINCE OBONG ELEME" <obong_715@epatra.com>
Reply-To: obong_715@epatra.com
To: webmaster@aclweb.org
Date: Thu, 31 Oct 2002 22:17:55 +0100
Subject: GOOD DAY TO YOU
X-Mailer: Microsoft Outlook Express 5.00.2919.6900DM
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 8bit
X-MIME-Autoconverted: from quoted-printable to 8bit by sideshowmel.si.UM id g9VMRBW20642
Status: RO

FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM 
CHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL 
ADDRESS:obong_715@epatra.com  

ATTENTION:PRESIDENT,CEO Sir/ Madam. 

This letter might surprise you because we have met
neither in person nor by correspondence. But I believe
it is one day that you got to know somebody either in
physical or through correspondence. 

I got your contact through 

In [27]:
# Wrap the raw chunks in a one-column frame because extract_msg reads df['message'].
fraud = pd.DataFrame(fraud_emails, columns=["message"], dtype=str)
fraud_bodies = extract_msg(fraud)
# Skip the first entry: it comes from the text preceding the first "From r"
# separator, not from a message (presumably empty -- verify against the file).
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])
fraud_bodies_df.head()

Unnamed: 0,0
0,"FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-27-587908.\nE-MAIL: (james_ngola2002@maktoob.com).\n\nURGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\n\nDEAR FRIEND,\n\nI AM ( DR.) JAMES NGOLA, THE PERSONAL ASSISTANCE TO THE LATE CONGOLESE (PRESIDENT LAURENT KABILA) WHO WAS ASSASSINATED BY HIS BODY G..."
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom officer and work as Assistant controller of the Customs and Excise department Of the Federal Ministry of Internal Affairs stationed at the Murtala Mohammed International Airport, Ikeja, Lagos-Nigeria.\n\nAfter the sudden death of the former Head of s..."
2,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
3,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
4,"Dear sir, \n \nIt is with a heart full of hope that I write to seek your help in respect of the context below. I am Mrs. Maryam Abacha the former first lady of the former Military Head of State of Nigeria General Sani Abacha whose sudden death occurred on 8th of June 1998 as a result of cardiac ..."


## Define Tokenization, Stop-word and Punctuation Removal Functions
Before proceeding, we must decide how many samples to draw from each class. We must also decide the maximum number of tokens per email and the maximum length of each token. This is done by setting the following overarching hyperparameters:

In [28]:
# Overarching preprocessing hyperparameters.
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document (used as the cut-off in Tokenize)
maxtokenlen = 100 # the maximum length of each token

### Tokenization

In [None]:
def Tokenize(row, limit=None):
  """Split a document into a list of space-separated tokens.

  Args:
    row: The document to tokenize. Non-string values are converted with
      ``str()`` before splitting.
    limit: Maximum number of tokens to keep. When ``None`` (the default) the
      notebook-level ``maxtokens`` setting is used.

  Returns:
    A list of at most ``limit`` tokens. ``None`` and the empty string give an
    empty list, so the return type is always a list.

  Note:
    The split is on a single space character only: consecutive spaces produce
    empty-string tokens, and newlines or tabs are not treated as separators.
  """
  if row is None or row == '':
    # Return an empty list (not an empty string) so callers always get a list.
    return []
  if limit is None:
    # Resolved at call time so later changes to the global setting are honoured.
    limit = maxtokens
  return str(row).split(" ")[:limit]