In [16]:
# import all the libraries
import pandas as pd 
import numpy as np 
import nltk
import spacy
import matplotlib.pyplot as plt
import os
import random
import requests
import tarfile
from bs4 import BeautifulSoup
from bs4.element import Comment
from helperfunctions import get_enron_raw_dataset, get_lingspam_dataset
import re
import html
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pickle
# html reader




%matplotlib inline

## Data Collection

In [2]:
# Load the two corpora through the project's helper functions.
# NOTE(review): judging by the "file exists" output, the helpers appear to
# download and cache the data on first use -- confirm in helperfunctions.
enron_emails = get_enron_raw_dataset()
lingspam_df = get_lingspam_dataset()

# The two datasets are not merged here: the merge happens further down, once
# the Enron emails have been cleaned.

file exists
file exists



## Data Cleaning and Natural Language Preprocessing

In [3]:
# Directory the notebook runs from; get_clean_enron uses it to locate the
# cached, preprocessed Enron CSV.
path = os.getcwd()


# keep-filter for emails: False for missing values, True for everything else
def check_nan(x):
    """Tell whether ``x`` is a usable (non-missing) value.

    Despite the name, the result is True when ``x`` is *not* NaN, so the
    function can be applied directly to build a keep-mask. A value counts as
    missing when its string form is exactly "nan".
    """
    return str(x) != "nan"

    
# Drop the transport headers of a raw email and
# keep just the subject line and the body
def isolate_subject_body(x):
    """Return the subject line and the body of a raw email as one string.

    Lines before the first empty line are treated as headers and discarded,
    except for the first one containing "subject:" (case-insensitive). The
    first empty line and everything after it are kept verbatim.

    Parameters
    ----------
    x : str
        Raw email text with "\n" line separators.

    Returns
    -------
    str
        The kept lines joined with "\n"; empty when the text has neither a
        subject line nor an empty line.
    """
    kept_lines = []
    in_body = False
    subject_seen = False
    for line in x.split("\n"):
        if in_body:
            # Body lines are copied exactly once, even when they happen to
            # contain "subject:" (previously such a line could be duplicated).
            kept_lines.append(line)
            continue
        if line == "":
            # the first empty line separates the headers from the body
            in_body = True
            kept_lines.append(line)
            continue
        if not subject_seen and "subject:" in line.lower():
            kept_lines.append(line)
            subject_seen = True
    return "\n".join(kept_lines)




# Extract the visible text of an HTML document,
# leaving out <script> and <style> content
def get_text_bs(html):
    """Return the text of ``html`` with script and style elements removed.

    Parameters
    ----------
    html : str
        Markup to parse (parsed with BeautifulSoup's 'lxml' parser).

    Returns
    -------
    str
        Text of the document body, or of the whole parsed tree when the
        document has no <body> element.
    """
    tree = BeautifulSoup(html, 'lxml')
    # ``tree.body`` is None when the parsed document has no <body> (e.g. empty
    # input); fall back to the whole tree instead of failing on None.
    root = tree.body if tree.body is not None else tree
    for tag_name in ('script', 'style'):
        for tag in root.select(tag_name):
            tag.decompose()
    return root.get_text()



def remove_html_imperfections(x):
    """Clean up encoding residue and stray attribute fragments in an email.

    Steps, in order:
      1. literal replacements ("&amp;" becomes "&"; soft line breaks and the
         escape-like fragments "=20", "3D", "=B7", "=09", "=0D" are removed),
      2. HTML entities are unescaped,
      3. the text is lower-cased,
      4. leftover href/attribute/<br> fragments matched by the pattern below
         are deleted.

    Returns the cleaned, lower-cased string.
    """
    # applied one after another, in this order
    substitutions = (
        ("&amp;", "&"),
        ("=\n", ""),
        ("=20", ""),
        ("3D", ""),
        ("=B7", ""),
        ("=09", ""),
        ("=0D", ""),
    )
    for old, new in substitutions:
        x = x.replace(old, new)

    lowered = html.unescape(x).lower()

    leftover_markup = re.compile(r'(href).+"|href.+(\w.org|\w.[.]com|\/[a-z0-9]*\/|\w[.]html)|[a-z]*=[a-z0-9]*<br>|[a-z0-9]*="[a-z0-9]*"<br>*>|<[^a-z]*<br>')
    return leftover_markup.sub('', lowered)





# remove all html tags and leave only
# visible text
def parse_html(x):
    """Return the visible text of an email, or None when cleaning fails.

    The text is first passed through remove_html_imperfections. If the result
    still looks like markup it is reduced to its visible text with
    get_text_bs; otherwise the cleaned text is returned as is.

    Cleaning is best effort: on any error the offending value is printed and
    None is returned so the caller can drop the row later.
    """
    try:
        x = remove_html_imperfections(x)
        # crude "contains markup" check: an opening bracket, a closing tag and
        # a closing bracket must all be present
        if "<" in x and "</" in x and ">" in x:
            return get_text_bs(x)
        return x
    except Exception:
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop a long-running apply().
        print(x)
        return None
    
    

    
    
    
# remove the word "subject" from
# the email text
def remove_subject(x):
    """Delete every occurrence of the lowercase string "subject" from ``x``.

    The match is a plain substring match, so it also removes "subject" inside
    longer words. Values that cannot be searched (e.g. None) are printed and
    None is returned for them.
    """
    try:
        if "subject" in x:
            return x.replace("subject", "")
        return x
    except Exception:
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(x)
        return None
    
    

    
    

# Text normalisation applied to every email before vectorisation:
#   - replace e-mail addresses, URLs, money symbols, phone numbers and
#     numbers with placeholder tokens
#   - replace punctuation with spaces and collapse whitespace
#   - lower-case, drop English stop words and stem the remaining terms

stop_words = nltk.corpus.stopwords.words('english')
porter = nltk.PorterStemmer()
def preprocess_text(messy_string):
    """Normalise one email into a space-separated string of stemmed terms.

    Parameters
    ----------
    messy_string : str
        Email text to clean.

    Returns
    -------
    str or None
        The normalised text. When the input cannot be processed (e.g. it is
        not a string) the value is printed and None is returned.
    """
    try:
        cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', messy_string)
        cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', cleaned)
        # Match the pound sign both correctly decoded ("£") and in its
        # mis-decoded form ("Â£"); the previous pattern only caught the latter.
        cleaned = re.sub(r'Â?£|\$', 'moneysymb', cleaned)
        cleaned = re.sub(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b','phonenumbr', cleaned)
        cleaned = re.sub(r'\d+(\.\d+)?', 'numbr', cleaned)
        cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned)
        cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
        # build the lookup set once per call instead of once per term
        stop_word_set = set(stop_words)
        return ' '.join(
            porter.stem(term)
            for term in cleaned.split()
            if term not in stop_word_set
        )
    except Exception:
        # Best effort: report the value that could not be cleaned and return
        # None. `except Exception` keeps KeyboardInterrupt/SystemExit working.
        print(messy_string)
        return None
    
    


    

# Load the cleaned Enron emails from the CSV cache, or build
# the cache from the raw emails when it does not exist yet
def get_clean_enron(enron_emails=None):
    """Return the cleaned Enron emails, using a CSV cache under ``path``.

    Parameters
    ----------
    enron_emails : pandas.DataFrame, optional
        Raw emails with an "emails" column. Only used when the cache file
        does not exist.

    Returns
    -------
    pandas.DataFrame or None
        The cached frame when the cache file exists; otherwise the cleaned
        version of ``enron_emails`` (which is also written to the cache), or
        None when no frame was supplied.
    """
    filename = "{}/enron_raw/enron_raw_preprocessed.csv".format(path)
    if os.path.isfile(filename):
        return pd.read_csv(filename)

    # `is None` instead of `!= None`: comparing a DataFrame with `!=` yields a
    # DataFrame, whose truth value is ambiguous and raises ValueError.
    if enron_emails is None:
        return None

    # keep only rows that actually contain an email; copy so the column
    # assignments below neither touch the caller's frame nor trigger
    # SettingWithCopyWarning
    has_email = enron_emails["emails"].apply(check_nan)
    enron_emails = enron_emails.loc[has_email].copy()
    # get only subject and body, then strip the html
    enron_emails["emails"] = enron_emails["emails"].apply(isolate_subject_body)
    enron_emails["emails"] = enron_emails["emails"].apply(parse_html)
    # make sure the cache directory exists before writing into it
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    enron_emails.to_csv(filename, index=False)
    return enron_emails

In [4]:
# Replace the raw Enron frame with its cleaned version (read from the CSV
# cache when the cache file exists).
enron_emails = get_clean_enron(enron_emails)

In [5]:
# The Enron emails no longer contain html, so they can now be combined with
# the Ling-Spam emails into a single frame.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append's default, keeps each frame's original index.
emails = pd.concat([enron_emails, lingspam_df])

In [6]:
# Normalise every email. Values that preprocess_text cannot handle are printed
# by it (see the output below) and end up as None in the column.
emails["emails"] = emails["emails"].apply(preprocess_text)

nan
nan
nan


In [7]:
# Class names ordered so that the list position is the numeric label:
# 0 = ham, 1 = spam
labels = ["ham", "spam"]

# NOTE: not re-runnable -- once the column holds integers, labels.index()
# raises ValueError on them.
emails["types"] = emails["types"].apply(lambda name: labels.index(name))

In [8]:
# Strip the literal word "subject" from the preprocessed text. None entries
# are printed by remove_subject (see the output below) and stay None.

emails["emails"] = emails["emails"].apply(remove_subject)

None
None
None


In [9]:
# Drop the emails that could not be cleaned (left as None by the steps above).
# notna() replaces the `x != None` lambda and also filters out NaN values.
emails = emails.loc[emails["emails"].notna()]

### Feature Extraction

In [10]:
# NOTE(review): this vectorizer is not used by any later cell visible here;
# the training pipeline below builds its own CountVectorizer + TfidfTransformer.
vectorizer = TfidfVectorizer()
X = emails["emails"]
y = emails["types"]

# Hold out 33% of the emails for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Model Training: Naive Bayes Algorithm

In [17]:
# Bag-of-words counts -> tf-idf weighting -> multinomial Naive Bayes,
# chained so that raw text goes in and a label comes out.
clf = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

### Model Evaluation

In [22]:
# Predict labels for the held-out emails with the fitted pipeline
y_pred = clf.predict(X_test)

In [23]:
# Accuracy of the predictions on the held-out emails
accuracy_score(y_test, y_pred)

0.9789909015715468

In [24]:
# F1 score of the predictions, computed with f1_score's default settings
# NOTE(review): presumably this scores the spam class (label 1) -- confirm
# against the scikit-learn defaults.
f1_score(y_test, y_pred)

0.9829017636763452

### Model Persistence

In [25]:
# Save the fitted pipeline (vectorizer, tf-idf transform and classifier) as a
# single pickle file in the current working directory
pkl_filename = "spamClassifier.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(clf, file)

# Load the pipeline back from the file that was just written.
# SECURITY: pickle.load can execute arbitrary code -- only unpickle files that
# come from a trusted source.
with open(pkl_filename, 'rb') as file:  
    pickle_model = pickle.load(file)