In [14]:
import numpy as np
import pandas as pd
import nltk
import os, gc, re

In [15]:
content_df = pd.read_csv(r'emails_cleaned.csv')
content_df.head()

Unnamed: 0,file,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,...,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,<18782981.1075855378110.JavaMail.evans@thyme,"Mon, 14 May 2001 16:39:00 -0700 (PDT",phillip.allen@enron.co,tim.belden@enron.co,,,1.0,...,,Phillip K Alle,Tim Belden <Tim Belden/Enron@EnronXGate,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,"Fri, 4 May 2001 13:51:00 -0700 (PDT",phillip.allen@enron.co,john.lavorato@enron.co,Re,,1.0,...,,Phillip K Alle,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT",phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes,,1.0,...,,Phillip K Alle,Leah Van Arsdal,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...",<13505866.1075863688222.JavaMail.evans@thyme,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT",phillip.allen@enron.co,randall.gay@enron.co,,,1.0,...,,Phillip K Alle,Randall L Ga,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT",phillip.allen@enron.co,greg.piper@enron.co,Re: Hell,,1.0,...,,Phillip K Alle,Greg Pipe,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False


In [16]:
for p in range(0,5):
    print(content_df['content'])

0                                 Here is our forecast\n\n 
1         Traveling to have a business meeting takes the...
2                            test successful.  way to go!!!
3         Randy,\n\n Can you send me a schedule of the s...
4                       Let's shoot for Tuesday at 11:45.  
                                ...                        
517392    This is a trade with OIL-SPEC-HEDGE-NG (John L...
517393    Some of my position is with the Alberta Term b...
517394    2\n\n -----Original Message-----\nFrom: \tDouc...
517395    Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...
517396    i think the YMCA has a class that is for peopl...
Name: content, Length: 517397, dtype: object
0                                 Here is our forecast\n\n 
1         Traveling to have a business meeting takes the...
2                            test successful.  way to go!!!
3         Randy,\n\n Can you send me a schedule of the s...
4                       Let's shoot for Tuesday at 11:4

### Content Preprocessing

Tokenization (Maybe we will have multiple tokenization methods; you can put how you wana tokenize down here)

In [17]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

# hyperparameters 
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

In [18]:
# Tokenization method
# split of white space AND punctuation $3.88 --> '3', '.', '88'
def tokenize_2(row):
    return wordpunct_tokenize(str(row))[:maxtokens]

Regular Expression to remove unnecessary characters (removing \n new lines, symbols?, this could also include links)

In [19]:
# this covers lower() tokens
def reg_expressions(row):
    row = re.sub(r'[\r\n]', "", row)
    return row

Stop-word removal (removing unimportant words)

In [20]:
nltk.download('wordnet')
nltk.download('omw-1.4')

stopwords = nltk.corpus.stopwords.words('english')
# print(stopwords[:10])

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    return token

[nltk_data] Downloading package wordnet to /home/giacomo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/giacomo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Stemming (removing endings of words, -ing, -ly...)

In [21]:
def stemming(row):
    port_stemmer = nltk.stem.porter.PorterStemmer()
    token = [port_stemmer.stem(token) for token in row]
    return token

Lemmatization (convert into root word)

In [22]:
def lemmatization(row):
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    token = [lem.lemmatize(token) for token in row]
    return token

Final utility in preprocessing data connecting all these preprocessing techniques

In [23]:
def utils_preprocess_text(text, flg_tokenize=1,flg_stemm=False, flg_lemm=True, flg_stopwords=True):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = reg_expressions(text)

    ## Tokenize (convert from string to list)
    if flg_tokenize == 1:
        text = tokenize_1(text)

    elif flg_tokenize == 2:
        text = tokenize_2(text)
    
    # remove Stopwords
    if flg_stopwords == True:
        text = stop_word_removal(text)
        
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        text = stemming(text)
        
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        text = lemmatization(text)
            
    ## back to string from list
    text = " ".join(text)
    return text

In [24]:
print(content_df["content"])

0                                 Here is our forecast\n\n 
1         Traveling to have a business meeting takes the...
2                            test successful.  way to go!!!
3         Randy,\n\n Can you send me a schedule of the s...
4                       Let's shoot for Tuesday at 11:45.  
                                ...                        
517392    This is a trade with OIL-SPEC-HEDGE-NG (John L...
517393    Some of my position is with the Alberta Term b...
517394    2\n\n -----Original Message-----\nFrom: \tDouc...
517395    Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...
517396    i think the YMCA has a class that is for peopl...
Name: content, Length: 517397, dtype: object


In [25]:
# content_df["text_clean"] = content_df["content"].apply(lambda x: utils_preprocess_text(x, flg_tokenize=2, flg_stemm=True, flg_lemm=True, flg_stopwords=True))
# content_df

# print(content_df('text_clean'))
#print(emails_df.iloc[22,1])

In [1]:
del content_df
gc.collect()


NameError: name 'content_df' is not defined