In [1]:
######################################################
# Imports
######################################################

import numpy as np
import json
import glob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.model_selection import KFold

In [3]:
def read_folder(folder):
    mails = []
    file_list = glob.glob(folder)  # List mails in folder
    num_files = len(file_list)
    for i in range(0, num_files):
        i_path = file_list[i]
        i_file = open(i_path, 'rb')
        i_str = i_file.read()
        i_text = i_str.decode('utf-8', errors='ignore')  # Convert to Unicode
        mails.append(i_text)  # Append to the mail structure
        i_file.close()
    return mails


def load_enron_folders(datasets):
    path = './dataset/'
    ham = []
    spam = []
    for j in datasets:
        ham  = ham  + read_folder(path + 'enron' + str(j) + '/ham/*.txt')
        spam = spam + read_folder(path + 'enron' + str(j) + '/spam/*.txt')
    num_ham  = len(ham)
    num_spam = len(spam)
    print("mails:", num_ham+num_spam)
    print("ham  :", num_ham)
    print("spam :", num_spam)

    mails = ham + spam
    labels = [0]*num_ham + [1]*num_spam
    mails, labels = shuffle(mails, labels, random_state=0)
    return mails, labels


In [4]:
def load_stopwords():
    stopwords = []
    path = './NLTKstopwords.txt'
    file = open(path, 'r')
    stopwords = file.read().splitlines()
    file.close()
    print("stopwords: ", len(stopwords))
    return stopwords

In [69]:
def KfoldCrossValidation(X, Y, stopwords, K):
    Y = np.array(Y)
    extractors = [CountVectorizer(ngram_range=(1,1)),
                  CountVectorizer(ngram_range=(1,2)),
                  CountVectorizer(ngram_range=(1,1), stop_words=stopwords),
                  CountVectorizer(ngram_range=(1,2), stop_words=stopwords),
                  CountVectorizer(ngram_range=(1,1), stop_words='english'),
                  CountVectorizer(ngram_range=(1,2), stop_words='english')]
    types = ['Multinomial', 'Bernoulli']
    alphas = [0.01,0.05,0.1,0.25,0.5,0.75,1,2,5,10,25,50,100]
    
    print("Generating classifiers...")
    classifiers = []
    for t in types:
        for alpha in alphas:
            if t == 'Multinomial':
                classifiers.append(MultinomialNB(alpha=alpha))
            elif t == 'Bernoulli':
                classifiers.append(BernoulliNB(alpha=alpha))
    
    print("Starting training...")
    res = []
    best_errV = np.inf
    best_model = 0
    for extractor in extractors:
        X_train = extractor.fit_transform(X)
        for clf in classifiers:
            err_T = 0
            err_V = 0
            kf = KFold(n_splits=K)
            for Xindex, Vindex in kf.split(X_train, Y):
                Xtrain, Xvalidation = X_train[Xindex], X_train[Vindex]
                Ytrain, Yvalidation = Y[Xindex], Y[Vindex]
                clf.fit(Xtrain, Ytrain)
                err_T += 1-clf.score(Xtrain, Ytrain)
                err_V += 1-clf.score(Xvalidation, Yvalidation)
            err_T = err_T/K
            err_V = err_V/K
            if err_V < best_errV:
                print("New best: ", extractor, clf, err_V)
                best_model = (extractor, clf)
                best_errV = err_V
            res.append( (extractor, clf, err_T, err_V) )

    return res, best_model

In [74]:
print("Loading files...")

print("------Loading train and validation data--------")
mails, y = load_enron_folders([1,2,3,4,5])

print("--------------Loading Test data----------------")
mails_test, y_test = load_enron_folders([6])

Loading files...
------Loading train and validation data--------
mails: 27716
ham  : 15045
spam : 12671
--------------Loading Test data----------------
mails: 6000
ham  : 1500
spam : 4500


In [70]:
print("--------------Loading Stopwords----------------")
stopwords = load_stopwords()

--------------Loading Stopwords----------------
stopwords:  127


In [24]:
print("-----Initializing BOW structure-----")
vectorizer  = CountVectorizer(ngram_range=(1, 1))  # Initialize BOW structure
X = vectorizer.fit_transform(mails)                # BOW with word counts
X_test = vectorizer.transform(mails_test)          # BOW with word counts
print("Train size: ",X.getnnz())
print("Test size: ",X_test.getnnz())

-----Initializing BOW structure-----
Train size:  3260159
Test size:  726926


In [71]:
print("----- K-fold CrossValidation -----")
Results, Classifier = KfoldCrossValidation(mails, y, stopwords, 5)
print(Classifier)

----- K-fold CrossValidation -----
Generating classifiers...
Starting training...
New best:  CountVectorizer() MultinomialNB(alpha=0.01) 0.011906499598182818
New best:  CountVectorizer() MultinomialNB(alpha=0.05) 0.01187043107032284
New best:  CountVectorizer(ngram_range=(1, 2)) MultinomialNB(alpha=0.01) 0.0076851476544783194
New best:  CountVectorizer(ngram_range=(1, 2),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', 'your', 'yours', 'yourself',
                            'yourselves', 'he', 'him', 'his', 'himself', 'she',
                            'her', 'hers', 'herself', 'it', 'its', 'itself',
                            'they', 'them', 'their', 'theirs', 'themselves',
                            'what', ...]) MultinomialNB(alpha=0.01) 0.007649046585543084
(CountVectorizer(ngram_range=(1, 2),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselv

In [75]:
x_test = Classifier[0].fit_transform(mails_test)
Classifier[1].score(x_test, y_test)

ValueError: dimension mismatch

In [78]:
print(x_test.getnnz(), len(y_test))

1403738 6000
