## Pre-processing and Modelling of Enron Dataset
<br>
The following outlines the methods used to pre-process the Enron dataset and create the Doc2Vec model that forms the basis of the annotation application.

In [1]:
import re, string, logging

import pandas as pd
import numpy as np
from nltk import sent_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Configure the root logger at INFO level with a timestamped format so that
# progress messages are shown in the console output while Doc2Vec is training:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### 1. Load the Enron Dataset

In [2]:
%%time
# Read the raw Enron CSV (path is relative to the working directory) and preview the first rows.
# The preview shows two columns: 'file' and 'message' (the full raw message text).
master_df = pd.read_csv('./data/enron/emails.csv')
master_df.head()

Wall time: 26.7 s


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


### 2. Pre-processing
#### 2.a Separate the desired features within the text of the raw message

In [3]:
def parse_raw_message(raw_message):
    """Split one raw message string into the wanted header fields and its body.

    Lines before the first empty line are treated as ``key: value`` headers.
    Everything after that first empty line is joined with single spaces and
    stripped to form the body.

    Parameters
    ----------
    raw_message : str
        Full text of a single message, newline-separated.

    Returns
    -------
    dict
        Maps each lower-cased key from ``keys_to_extract`` that was found to its
        lower-cased, stripped value, plus ``'body'`` when the message contains an
        empty separator line. Keys that were not found are simply absent.
    """
    lines = raw_message.split('\n')
    email = {}
    # These keys appear in the header section of the messages.
    # Not all messages contain the same keys nor in equal quantity.
    # For example, in forwarded messages, 'to' and 'from' appear more than once, and so can become overwritten.
    keys_to_extract = ['message-id']
    hits = 0
    for i, line in enumerate(lines):
        if line == '':  # this approach cannot separate forwarded messages.
            email['body'] = ' '.join(lines[i+1:]).strip()
            break
        if hits >= len(keys_to_extract):
            # Every wanted key has been found; keep scanning only for the body separator.
            continue
        # Split on the FIRST colon only, so values that themselves contain ':' are kept whole.
        key, sep, val = line.partition(':')
        if not sep:
            # No colon at all (e.g. a folded continuation line): not a header, and
            # it must not count as a hit or the wanted keys below it would be skipped.
            continue
        key = key.lower()
        if key in keys_to_extract:
            email[key] = val.lower().strip()
            hits += 1
    return email

def map_to_list(emails, key):
    """Collect the value stored under `key` from every parsed email.

    Returns a list with one entry per email, in order; emails that do not
    contain `key` contribute an empty string.
    """
    return [email[key] if key in email else '' for email in emails]

def parse_into_emails(messages):
    """Parse every raw message and return the results in column-oriented form.

    Returns a dict with two equally long lists: 'id' (the extracted
    'message-id' values) and 'body' (the message bodies). Fields missing from
    a message are represented by an empty string.
    """
    parsed = list(map(parse_raw_message, messages))
    columns = {}
    for column, key in (('id', 'message-id'), ('body', 'body')):
        columns[column] = map_to_list(parsed, key)
    return columns

def test_nulls_and_empties(df):
    print("Null values:", df.isnull().values.any()) # Doesn't find empty strings.
    x = [len(df[df[header]==''].values) for header in list(df)]
    if sum(x):
        print("Empty strings:")
        for a in zip(list(df), x):      
            print(a[0], ":", a[1])
    else:
        print("Empty strings: False") 

In [4]:
# This demonstration uses a very small subsample to speed up the subsequent pre-processing and training steps.
# For reference, the master contains 517,401 entries, so feel free to increase n for realistic results.
n = 1000
# A fixed random_state makes the subsample (and everything derived from it) the same on every run.
email_df = pd.DataFrame(parse_into_emails(master_df.sample(n, random_state=1).message))
test_nulls_and_empties(email_df)
email_df.head(5)

Null values: False
Empty strings: False


Unnamed: 0,id,body
0,<12437885.1075852968487.javamail.evans@thyme>,finally read it - I thought it was pretty crap...
1,<6631704.1075851015333.javamail.evans@thyme>,What about the questions I asked about whether...
2,<22940715.1075840010060.javamail.evans@thyme>,I've killed the Cal-Imbalance deal I entered u...
3,<5148853.1075841066243.javamail.evans@thyme>,There have been many occasions where individua...
4,<1122724.1075846012741.javamail.evans@thyme>,---------------------- Forwarded by Kay Mann/C...


#### 2.b Tokenise into sentences and clean text

In [5]:
def clean_text(s):
    """Lower-case a sentence and strip punctuation and formatting noise from it.

    Steps, in order: collapse runs of a repeated punctuation character,
    lower-case, drop dataset-specific encoding artifacts, remove punctuation
    that touches whitespace or the ends of the text (two passes), and finally
    squeeze repeated spaces.
    """
    # Regexes below were worked out with https://regex101.com/
    # Punctuation directly after whitespace, or directly before whitespace / end of text.
    edge_punct = r"((?<=\s)[^\w\s])|([^\w\s](?=$|\s))"

    def strip_edge_punct(text):
        # Also trims any punctuation left at the very beginning of the text.
        return re.sub(edge_punct, "", text).lstrip(string.punctuation)

    # Collapse contiguous repeats of the same punctuation character into one:
    collapsed = re.sub(r"(([^\w\s])\2+)", r"\2", s)
    lowered = collapsed.lower()
    # Remove some bizarre formatting artifacts specific to this dataset:
    without_artifacts = re.sub(r'(= |=0f\\|=0f|0f|=\d+)', "", lowered)
    # Two passes are needed to catch e.g. "(UWE)," -> "UWE)" -> "UWE" - need proper solution:
    stripped = strip_edge_punct(strip_edge_punct(without_artifacts))
    return re.sub(r' +', ' ', stripped).strip()

In [6]:
%%time
# Each document is tokenised into sentences, then those sentences are given an identifier and are cleaned.
# Three parallel lists are filled, one entry per sentence:
#   doc_id - positional index of the parent document within email_df.body
#   sen_id - position of the sentence within its parent document
#   text   - the sentence after clean_text() has been applied
doc_id, sen_id, text = [], [], []
for i, doc in enumerate(email_df.body):
    sent_tokens = sent_tokenize(doc)
    for j, sent in enumerate(sent_tokens):
        doc_id.append(i)
        sen_id.append(j)
        text.append(clean_text(sent))

Wall time: 1.31 s


In [7]:
# Assemble the three parallel lists into one row per sentence and preview the result.
sentence_df = pd.DataFrame({'doc_id': doc_id, 'sen_id': sen_id, 'text':text})
sentence_df.head()

Unnamed: 0,doc_id,sen_id,text
0,0,0,finally read it i thought it was pretty crappy
1,0,1,i am spending all my days in credit
2,0,2,head above water but only just
3,1,0,what about the questions i asked about whether...
4,1,1,2000 especially if the state has the verticall...


### 3. Train Doc2Vec model

In [8]:
%%time
# Gensim requires that data be converted into TaggedDocument format prior to training.
# The tag associated with each sentence is simply an integer denoting the order in which it is stored in the dataframe.
# Each cleaned sentence is split on whitespace to obtain its word tokens.
# NOTE(review): seed is fixed, but training uses workers=4; the gensim documentation states that a fully
# reproducible run also requires a single worker thread - confirm whether exact reproducibility matters here.
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(sentence_df.text.values)]
d2v_model = Doc2Vec(documents, 
                    vector_size=100,
                    alpha=0.025, 
                    min_alpha=0.00025,
                    min_count=2,
#                     sample=(0, 1e-5),
                    dm=1, 
                    window=4,
                    workers=4,
                    seed=1,
                    epochs=16)

2020-04-11 13:06:37,981 : INFO : collecting all words and their counts
2020-04-11 13:06:37,982 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-11 13:06:38,034 : INFO : PROGRESS: at example #10000, processed 192427 words (3733112/s), 22707 word types, 10000 tags
2020-04-11 13:06:38,038 : INFO : collected 23815 word types and 10699 unique tags from a corpus of 10699 examples and 205162 words
2020-04-11 13:06:38,040 : INFO : Loading a fresh vocabulary
2020-04-11 13:06:38,061 : INFO : effective_min_count=2 retains 10776 unique words (45% of original 23815, drops 13039)
2020-04-11 13:06:38,062 : INFO : effective_min_count=2 leaves 192123 word corpus (93% of original 205162, drops 13039)
2020-04-11 13:06:38,108 : INFO : deleting the raw counts dictionary of 23815 items
2020-04-11 13:06:38,112 : INFO : sample=0.001 downsamples 39 most-common words
2020-04-11 13:06:38,113 : INFO : downsampling leaves estimated 157246 word corpus (81.8% of prior 192123)
2

2020-04-11 13:06:54,493 : INFO : EPOCH - 13 : training on 205162 raw words (168054 effective words) took 1.8s, 94204 effective words/s
2020-04-11 13:06:55,565 : INFO : EPOCH 14 - PROGRESS: at 66.18% examples, 112576 words/s, in_qsize 7, out_qsize 0
2020-04-11 13:06:55,714 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-11 13:06:55,724 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-11 13:06:55,735 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-11 13:06:55,735 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-11 13:06:55,735 : INFO : EPOCH - 14 : training on 205162 raw words (168184 effective words) took 1.2s, 141429 effective words/s
2020-04-11 13:06:56,494 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-11 13:06:56,494 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-11 13:06:56,504 : INFO : worker thread finished; awaiting fin

Wall time: 19.5 s


### 4. Save/Load

#### 4.a Doc2Vec/KeyedVectors
<i>Uncomment blocks as required</i>

In [9]:
# # <!> SAVE:
# # Saves to C:\Users\<username>\AppData\Local\Temp
# from gensim.test.utils import get_tmpfile
# fname = get_tmpfile("enron_100k_d2v_model")
# d2v_model.save(fname)

# # Get word vectors:
# word_vectors = d2v_model.wv
# fname = get_tmpfile("enron_100k_vectors.kv")
# word_vectors.save(fname)

# # # <!> LOAD:
# # from gensim.models import KeyedVectors
# # fname = './models/enron/enron_100k_d2v_model'
# # model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

# # fname = './models/enron/enron_100k_vectors.kv'
# # word_vectors = KeyedVectors.load(fname, mmap='r')

#### 4.b Save DataFrames used within the application
<i>Uncomment as required</i>

In [10]:
# Assign the default IOB tag to the dataframe containing all vocabulary
# word_df = pd.DataFrame({'word':list(d2v_model.wv.vocab.keys()), 'iob' : 'O'})

# prefix = "./data/app/"
# email_df.to_csv(prefix + 'email_df_<n>.csv', index=False)
# sentence_df.to_csv(prefix + 'sentence_df_<n>.csv', index=False)
# word_df.to_csv(prefix + 'word_df_<n>.csv', index=False)