In [3]:
######################################################
# Imports
######################################################

import numpy as np
import json
import glob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.utils import shuffle
from sklearn import metrics

In [6]:
def read_folder(folder):
    """Read every file matching a glob pattern and return the decoded texts.

    Parameters
    ----------
    folder : str
        Glob pattern selecting the mail files,
        e.g. ``'./dataset/enron1/ham/*.txt'``.

    Returns
    -------
    list of str
        One decoded text per matched file, ordered by sorted file path.
        Empty when the pattern matches nothing.
    """
    mails = []
    # glob.glob makes no ordering guarantee; sorting keeps the returned list
    # (and any seeded shuffling applied to it later) identical across
    # machines and runs.
    for mail_path in sorted(glob.glob(folder)):
        # The context manager closes the handle even if read() raises.
        with open(mail_path, 'rb') as mail_file:
            raw_bytes = mail_file.read()
        # Byte sequences that are not valid UTF-8 are dropped, not raised on.
        mails.append(raw_bytes.decode('utf-8', errors='ignore'))
    return mails


def load_enron_folders(datasets, path='./dataset/'):
    """Load ham and spam mails from the selected Enron folders.

    For every identifier ``j`` in ``datasets`` the files matching
    ``<path>enron<j>/ham/*.txt`` and ``<path>enron<j>/spam/*.txt`` are read
    with ``read_folder``. The total, ham and spam counts are printed.

    Parameters
    ----------
    datasets : iterable
        Folder identifiers; each is converted with ``str`` and appended to
        ``'enron'`` to form the folder name.
    path : str, optional
        Root directory holding the ``enron<j>`` folders. It is concatenated
        as-is, so it must end with a path separator. Defaults to
        ``'./dataset/'``.

    Returns
    -------
    mails, labels
        The mail texts and their labels (0 for ham, 1 for spam), shuffled
        together with ``random_state=0``.
    """
    ham = []
    spam = []
    for j in datasets:
        folder = path + 'enron' + str(j)
        # extend() grows the lists in place instead of rebuilding them with
        # `+` on every iteration.
        ham.extend(read_folder(folder + '/ham/*.txt'))
        spam.extend(read_folder(folder + '/spam/*.txt'))
    num_ham = len(ham)
    num_spam = len(spam)
    print("mails:", num_ham+num_spam)
    print("ham  :", num_ham)
    print("spam :", num_spam)

    mails = ham + spam
    labels = [0]*num_ham + [1]*num_spam
    # Shuffle texts and labels in unison; the fixed seed keeps the
    # permutation the same for the same input order.
    mails, labels = shuffle(mails, labels, random_state=0)
    return mails, labels


In [21]:
def load_stopwords(path='./NLTKstopwords.txt'):
    """Read a stop-word list from a text file, one word per line.

    Prints the number of entries read.

    Parameters
    ----------
    path : str, optional
        Location of the stop-word file. Defaults to
        ``'./NLTKstopwords.txt'``.

    Returns
    -------
    list of str
        The lines of the file with their line endings removed.
    """
    # Explicit encoding avoids depending on the platform default; the context
    # manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as stopword_file:
        stopwords = stopword_file.read().splitlines()
    print("stopwords: ", len(stopwords))
    return stopwords

In [15]:
# Enron folders 1-5 supply the training/validation mails; folder 6 is kept
# aside as the test set.
train_val_folders = [1, 2, 3, 4, 5]
test_folders = [6]

print("Loading files...")

print("------Loading train and validation data--------")
mails, y = load_enron_folders(train_val_folders)

print("--------------Loading Test data----------------")
mails_test, y_test = load_enron_folders(test_folders)

Loading files...
------Loading train and validation data--------
mails: 27716
ham  : 15045
spam : 12671
--------------Loading Test data----------------
mails: 6000
ham  : 1500
spam : 4500


In [23]:
# Load the stop-word list into the notebook namespace; load_stopwords() also
# prints how many entries it read.
print("--------------Loading Stopwords----------------")
stopwords = load_stopwords()

--------------Loading Stopwords----------------
stopwords:  127


In [8]:
# Build bag-of-words count matrices. The vocabulary is learned from `mails`
# only; `mails_test` is mapped onto that same vocabulary with transform().
print("-----Example of obtaining BOWs from emails-----")
vectorizer  = CountVectorizer(ngram_range=(1, 1))  # Unigram (single-token) features only
X = vectorizer.fit_transform(mails)                # Learn the vocabulary and count tokens per mail
X_test = vectorizer.transform(mails_test)          # Count tokens using the already-fitted vocabulary
print("A Bag of Words is represented as a sparse matrix:" )
# Printing a sparse matrix lists its stored (row, column) -> count entries.
print(X)

-----Example of obtaining BOWs from emails-----
A Bag of Words is represented as a sparse matrix:
  (0, 115746)	1
  (0, 92993)	1
  (0, 107430)	2
  (0, 37776)	2
  (0, 128168)	1
  (0, 77556)	1
  (0, 18670)	1
  (0, 129577)	4
  (0, 96923)	1
  (0, 120766)	7
  (0, 89393)	1
  (0, 46580)	2
  (0, 30287)	4
  (0, 28130)	4
  (0, 29231)	2
  (0, 119732)	2
  (0, 49734)	1
  (0, 66712)	1
  (0, 134248)	3
  (0, 69138)	1
  (0, 30930)	1
  (0, 88913)	1
  (0, 22813)	1
  (0, 120215)	1
  (0, 16597)	2
  :	:
  (27715, 40122)	1
  (27715, 2707)	1
  (27715, 96621)	3
  (27715, 38110)	1
  (27715, 4131)	1
  (27715, 90304)	1
  (27715, 36044)	2
  (27715, 4150)	1
  (27715, 23515)	1
  (27715, 121894)	1
  (27715, 42363)	1
  (27715, 121994)	1
  (27715, 28422)	1
  (27715, 36935)	1
  (27715, 4779)	1
  (27715, 6418)	1
  (27715, 54705)	2
  (27715, 91511)	1
  (27715, 42677)	3
  (27715, 134619)	1
  (27715, 81579)	1
  (27715, 94217)	1
  (27715, 81812)	1
  (27715, 77788)	1
  (27715, 69304)	1
