In [147]:
import pandas as pd
from os.path import join
import regex as re
import string
import seaborn as sns
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

DATA_PATH = 'data/'
EMAIL_DATA = 'Emails.csv'

Read dataframe

In [79]:
df = pd.read_csv(join(DATA_PATH, EMAIL_DATA))

# Subset useful fields
df = df[['Id', 'SenderPersonId', 'MetadataDateSent',
         'ExtractedSubject', 'ExtractedBodyText']]
df = df.astype({'Id': int})
df = df.set_index('Id')

# Drop na values based on Extracted body
len_before = len(df)
df = df[df['ExtractedBodyText'].notna()]
print(f"Number of NA values in body: {len_before - len(df)}.\nNumber of valid email: {len(df)}")
df.head()

Number of NA values in body: 1203.
Number of valid email: 6742


Unnamed: 0_level_0,SenderPersonId,MetadataDateSent,ExtractedSubject,ExtractedBodyText
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,,2011-03-03T05:00:00+00:00,,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest..."
3,32.0,2012-09-12T04:00:00+00:00,Re: Chris Stevens,Thx
5,80.0,2011-03-11T05:00:00+00:00,,"H <hrod17@clintonemail.com>\nFriday, March 11,..."
6,80.0,2012-09-12T04:00:00+00:00,Meet The Right Wing Extremist Behind Anti-Musl...,Pis print.\n-•-...-^\nH < hrod17@clintonernail...
8,80.0,2011-03-11T05:00:00+00:00,,"H <hrod17@clintonemail.corn>\nFriday, March 11..."


Preprocessing

In [186]:
# Body preprocessing
def clean_body(body):
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    email_header = re.compile(r'.+[^<]+<[^>]+>', re.IGNORECASE)
    re_header = re.compile(r'(Delivered:\s+)?RE:[^\n]+\n', re.IGNORECASE)
    fw_header = re.compile(r'FW:[^\n]+\n', re.IGNORECASE)
    date_header = re.compile(r'\w+,\s\w+\s\d+,\s\d+[^\n]+\n', re.IGNORECASE)
    # Convert to lowercase
    body = body.lower()
    # Remove email
    body = re.sub(email_header, '', body).strip()
    # Remove "FW:"
    body = re.sub(fw_header, '', body).strip()
    # Remove "RE:"
    body = re.sub(re_header, '', body).strip()
    # Remove date
    body = re.sub(date_header, '', body).strip()
    return body

sample = df['ExtractedBodyText'].loc[230]
print('>>> Raw:\n' + sample)
print('>>> Cleaned:\n' + clean_body(sample))

>>> Raw:
Brennan, John 0.
Subject: RE: Google and YouTube
Sue just called back and the block will stay through Monday. They will not/not be unblocking it before then.
Nora Toiv
Office of the Secretary
202-647-8633
>>> Cleaned:
brennan, john 0.
subject: sue just called back and the block will stay through monday. they will not/not be unblocking it before then.
nora toiv
office of the secretary
202-647-8633


Drop the emails that are too short

In [187]:
def tokenize_body(body):
    tokenized = word_tokenize(body)
    # Strip tokens
    tokenized = [token.strip() for token in tokenized]

    # --------- STRICT RULE ---------
    # Strict regex rule
    tokenized = [token for token in tokenized if re.match('\w+', token)]
    # --------- STRICT RULE ---------

    # Remove punctuation
    tokenized = [token for token in tokenized if token not in string.punctuation]
    # Remove stopwords
    stop = stopwords.words('english') + [':', '.', '@']
    tokenized = [token for token in tokenized if token not in stop]
    # Remove numbers
    tokenized = [token for token in tokenized if not re.search(r'\d', token)]
    return tokenized

to_tokenize = clean_body(sample)
tokenize_body(to_tokenize)

['brennan',
 'john',
 'subject',
 'sue',
 'called',
 'back',
 'block',
 'stay',
 'monday',
 'not/not',
 'unblocking',
 'nora',
 'toiv',
 'office',
 'secretary']

In [188]:
def process_body(body):
    body = clean_body(body)
    tokenized = tokenize_body(body)
    return tokenized



In [189]:
df['Tokenized'] = df['ExtractedBodyText'].apply(process_body)

In [194]:
tokens_length = [len(x) for x in df['Tokenized']]
tokens_length

[18,
 1,
 9,
 40,
 9,
 1,
 11,
 1,
 11,
 1,
 650,
 39,
 62,
 5,
 2,
 1,
 13,
 10,
 3,
 2,
 7,
 0,
 16,
 0,
 2,
 624,
 2,
 1,
 2,
 635,
 11,
 1,
 278,
 5,
 20,
 1,
 5,
 2,
 4,
 1,
 7,
 628,
 7,
 0,
 9,
 8,
 36,
 63,
 14,
 43,
 70,
 482,
 17,
 75,
 6,
 378,
 3,
 4,
 7,
 1,
 2,
 2,
 89,
 302,
 4,
 1,
 123,
 8,
 6,
 3,
 0,
 29,
 59,
 1,
 22,
 530,
 52,
 7,
 16,
 6,
 6,
 8,
 10,
 19,
 84,
 6,
 0,
 20,
 1,
 12,
 4,
 0,
 5,
 1,
 135,
 6,
 4,
 4,
 1,
 2,
 19,
 7,
 1,
 1,
 19,
 1,
 6,
 20,
 55,
 2,
 29,
 962,
 2,
 0,
 1,
 62,
 2,
 3,
 6,
 1,
 2,
 3,
 1,
 0,
 44,
 14,
 8,
 8,
 0,
 1049,
 37,
 2,
 104,
 214,
 2,
 4,
 1,
 3,
 9,
 7,
 9,
 1,
 299,
 2,
 2,
 4,
 25,
 150,
 33,
 3,
 835,
 595,
 2,
 2,
 4,
 1,
 2,
 114,
 2,
 2,
 7,
 2,
 10,
 2,
 13,
 1,
 2,
 10,
 7,
 1,
 10,
 2,
 9,
 8,
 12,
 16,
 6,
 1,
 689,
 0,
 15,
 1,
 91,
 3,
 231,
 6832,
 2,
 74,
 1,
 22,
 6,
 2,
 11,
 39,
 22,
 12,
 3,
 0,
 2,
 15,
 259,
 1,
 3,
 2,
 260,
 191,
 22,
 1,
 456,
 3,
 3,
 494,
 43,
 2,
 12,
 11,
 10,
 86,
 11,
 0,


In [197]:
sns.displot(tokens_length)

NameError: name 'sns' is not defined