# Spam Email Classification Model

In [1]:
# importing system libraries
from os import walk
from pathlib import Path
from string import punctuation
from random import shuffle
from collections import Counter

# importing additional libraries
import pandas as pd
import sklearn as sk
import nltk

In [2]:
# Read the whole data from the Enron Dataset into a variable allData.
rawpath = r"/kaggle/input/enron-spam/"
path = Path(rawpath)
pathwalk = walk(path)

allHamData, allSpamData = [], []
for root, dr, file in pathwalk:
    if 'ham' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                allHamData.append(" ".join(ip.readlines()))
                
    elif 'spam' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                allSpamData.append(" ".join(ip.readlines()))

In [3]:
# remove all redundent data
allHamData = list(set(allHamData))
allSpamData = list(set(allSpamData))

In [4]:
# get an overview of the data
print("number of ham emails:", len(allHamData))
print("number of spam emails:", len(allSpamData))

number of ham emails: 15910
number of spam emails: 14583


# NLTK's Naive Bayes Classifier

In [5]:
# creating a preprocessing function
# to tokenize and lemmatize the data using NLTK library
def preprocess(data):
    # tokenization
    tokens = nltk.word_tokenize(data)
    tokens = [w.lower() for w in tokens if w.isalpha()]

    # finding uncommon words
    cnt = Counter(tokens)
    uncommons = cnt.most_common()[:-int(len(cnt)*0.1):-1]
    
    # listing stopwords from NLTK
    stops = set(nltk.corpus.stopwords.words('english'))

    # removing stop words and uncommon words
    tokens = [w for w in tokens if (w not in stops and w not in uncommons)]

    # lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return tokens

In [6]:
# tokenize and lemmatize the data
allHamDataProcessed = [preprocess(data) for data in allHamData]
allSpamDataProcessed = [preprocess(data) for data in allSpamData]

# create a feature from this data by combining it, called X
X = allHamDataProcessed + allSpamDataProcessed

# create a label array called y to differenciate between spam and ham
# True will denote it as spam, and False will denote ham
y = [False]*len(allHamDataProcessed) + [True]*len(allSpamDataProcessed)

In [7]:
# Creating a dataframe out of the processed data
dataframeProcessed = pd.DataFrame({"email": X, "label": y})

# Get an overview of the processed data
print(dataframeProcessed.head())

                                               email  label
0  [subject, million, dollar, content, louise, tr...  False
1  [subject, mitsubishi, turbine, louise, spoke, ...  False
2  [subject, start, date, hourahead, hour, start,...  False
3  [subject, enpwer, eol, data, eol, deal, enpowe...  False
4  [subject, caiso, notice, upcoming, mif, stakeh...  False


In [8]:
# Splitting the data into (X, y) with train:test = 70:30
X, y = dataframeProcessed["email"], dataframeProcessed["label"]

X_featurized = [Counter(i) for i in X]
allDataProcessed = [(X_featurized[i], y[i]) for i in range(len(X))]
shuffle(allDataProcessed)
trainData, testData = allDataProcessed[:int(len(allDataProcessed)*0.7)], allDataProcessed[int(len(allDataProcessed)*0.7):]

In [9]:
# Training the model
model_nltkNaiveBayes = nltk.classify.NaiveBayesClassifier.train(trainData)

# Testing the model
testing_accuracy = nltk.classify.accuracy(model_nltkNaiveBayes, testData)
print("Accuracy with NLTK's Naive Bayes classifier is:", testing_accuracy)

# Checking accuracy for the whole dataset
whole_accuracy = nltk.classify.accuracy(model_nltkNaiveBayes, allDataProcessed)
print("Accuracy over the whole dataset is:", whole_accuracy)

Accuracy with NLTK's Naive Bayes classifier is: 0.9881941407958024
Accuracy over the whole dataset is: 0.9920965467484341


# Scikit-learn's Random Forest Classifier

In [10]:
# Creating features and labels from raw, unprocessed data
X = allHamData + allSpamData
y = [0]*len(allHamData) + [1]*len(allSpamData)

In [11]:
# getting an overview of the number of different words 
# used all over the dataset

all = []
for i in X:
    all += list(set(i.split()))
    
all = list(set(all))
numDiffWords = len(all)

print("Number of different words used in the dataset:", numDiffWords)

Number of different words used in the dataset: 159211


In [12]:
# Vectorize the features using CountVectorizer
vec = sk.feature_extraction.text.CountVectorizer(max_features = int(numDiffWords*0.3))
X_vectorized = vec.fit_transform(X)

In [13]:
# Splitting the data into test and train datasets
# using sklearn's train_test_split 
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X_vectorized, y, test_size=0.3)

In [14]:
# Training the model
from sklearn.ensemble import RandomForestClassifier
model_rfclassifier = RandomForestClassifier(random_state=0)
model_rfclassifier.fit(X_train, y_train)

# Testing the model
y_predicted = model_rfclassifier.predict(X_test)

# Checking accuracy
accuracy_rfclassifier = sk.metrics.accuracy_score(y_test, y_predicted)
print("Accuracy with Sklearn's Random Forest Classifier:", accuracy_rfclassifier)

print("Accuracy over the whole dataset:", sk.metrics.accuracy_score(model_rfclassifier.predict(X_vectorized), y))

Accuracy with Sklearn's Random Forest Classifier: 0.984696108439003
Accuracy over the whole dataset: 0.9954087823434886


### Hyperparameters tuning of RandomForestClassifier using GridSearchCV

In [15]:
# Declaring different possible values for the parameters
n_estimators = [int(x) for x in range(10, 100, 10)]
max_features = ['auto', 'sqrt']
max_depth = [2,4]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
bootstrap = [True, False]

In [16]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Getting an overview of the grid
print("The grid looks like:\n")
for key, val in param_grid.items():
    print(key + ":", val)

The grid looks like:

n_estimators: [10, 20, 30, 40, 50, 60, 70, 80, 90]
max_features: ['auto', 'sqrt']
max_depth: [2, 4]
min_samples_split: [2, 5]
min_samples_leaf: [1, 2]
bootstrap: [True, False]


In [17]:
# Loading the grid with parameters and the model
rf_Grid = sk.model_selection.GridSearchCV(estimator = model_rfclassifier, param_grid = param_grid, cv = 3, n_jobs = 4)

# Fitting with data
rf_Grid.fit(X_train, y_train)

# Getting the tuned parameters
print("The tuned parameters:\n", rf_Grid.best_params_)

The tuned parameters:
 {'bootstrap': True, 'max_depth': 4, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 90}


In [18]:
# Using the tuned parameters on the model
model_tuned_rfclassifier = RandomForestClassifier(random_state=0, bootstrap= False, max_depth=4, max_features='auto', min_samples_leaf= 1, min_samples_split= 5, n_estimators= 90)

# Training the model
model_tuned_rfclassifier.fit(X_train, y_train)

# Testing the model
y_predicted = model_tuned_rfclassifier.predict(X_test)

# Checking accuracy
accuracy_tuned_rfclassifier = sk.metrics.accuracy_score(y_test, y_predicted)
print("Accuracy with hyperparameters tuned Random Forest Classifier:", accuracy_tuned_rfclassifier)

print("Accuracy over the whole dataset, with tuned model:", sk.metrics.accuracy_score(model_tuned_rfclassifier.predict(X_vectorized), y))

Accuracy with hyperparameters tuned Random Forest Classifier: 0.9209663314385658
Accuracy over the whole dataset, with tuned model: 0.9223756271931263
