In [8]:
import email
import os
import random
import nltk
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

### Load and Label Email Objects

In [2]:
def loadEmails(label):
    """Parse every entry of the directory ``label`` into a labelled email.

    Args:
        label: Path of a directory containing one email per file. The same
            string is attached to every parsed message as its class label.

    Returns:
        A list of ``(message, label)`` tuples, one per directory entry, where
        ``message`` is the object returned by ``email.message_from_file``.
    """
    labeledEmailObjects = []
    for fileName in os.listdir(label):
        # The context manager closes the handle even when parsing raises,
        # which the previous manual open()/close() pair did not guarantee.
        with open(os.path.join(label, fileName)) as file:
            labeledEmailObjects.append((email.message_from_file(file), label))
    return labeledEmailObjects

# Each folder name doubles as the class label attached to its messages.
promotions, inbox = (loadEmails(folder) for folder in ('promotions', 'inbox'))
# Pool both classes into a fresh list, then shuffle that list in place.
labeledEmailObjects = [*promotions, *inbox]
random.shuffle(labeledEmailObjects)

### Generate Feature Sets on Each Email Object

In [3]:
def generateFeatures(mailObject):
    """Build the keyword-presence feature dict for one email.

    Args:
        mailObject: Any object exposing ``as_string()``; in this notebook, the
            message objects produced by ``loadEmails``.

    Returns:
        A dict of boolean features. ``sale``, ``free`` and ``save`` are True
        when that case-sensitive substring occurs anywhere in the serialized
        message; ``notSale`` is True when the literal text ``notSale`` is
        absent from it.
    """
    # Serialize the message once instead of once per feature.
    text = mailObject.as_string()
    # Header-based features ('subject', 'from') and a 'special' keyword
    # feature are currently disabled.
    return {
        'sale': 'sale' in text,
        'free': 'free' in text,
        'save': 'save' in text,
        # NOTE(review): this checks for the literal token 'notSale', so it is
        # True for almost any email - confirm this is the intended feature.
        'notSale': 'notSale' not in text,
    }
# Swap each parsed message for its feature dict; the label is carried through.
# The loop variable is named `message` so it does not reuse the `email` module name.
labeledFeatureSets = [
    (generateFeatures(message), category)
    for message, category in labeledEmailObjects
]

### Build a Classifier and Evaluate It over Repeated Random Train/Test Splits

In [19]:
def getScores(labeledFeatureSets):
    """Train and score a Naive Bayes classifier on one random train/test split.

    Args:
        labeledFeatureSets: Sequence of ``(featureDict, label)`` pairs.

    Returns:
        The ``(precision, recall, fscore, support)`` tuple produced by
        ``precision_recall_fscore_support`` with ``average=None``: one entry
        per class, ordered by sorted label over the whole input.
    """
    # Pin the class order to every label in the full data set. Without an
    # explicit ``labels`` argument the result only covers classes that occur
    # in this split's truth or predictions, so a one-class test split would
    # shrink the arrays and break callers that index them by fixed position.
    classLabels = sorted({label for (_, label) in labeledFeatureSets})
    # build the classifier
    trainingSet, testingSet = train_test_split(labeledFeatureSets)
    classifier = nltk.NaiveBayesClassifier.train(trainingSet)
    # test the classifier
    y_true = [label for (_, label) in testingSet]
    y_predict = [classifier.classify(features) for (features, _) in testingSet]
    return precision_recall_fscore_support(
        y_true, y_predict, labels=classLabels, average=None
    )

NUM_TRIALS = 100
# Positions of each class inside the per-class arrays returned by getScores.
INBOX_INDEX = 0
PROMOTION_INDEX = 1

# One list per (class, metric); every trial appends one value to each list.
promotionF1, promotionPrec, promotionRec = [], [], []
inboxF1, inboxPrec, inboxRec = [], [], []

for trial in range(NUM_TRIALS):
    precision, recall, fscore, support = getScores(labeledFeatureSets)
    for classIndex, f1List, precList, recList in (
        (PROMOTION_INDEX, promotionF1, promotionPrec, promotionRec),
        (INBOX_INDEX, inboxF1, inboxPrec, inboxRec),
    ):
        f1List.append(fscore[classIndex])
        precList.append(precision[classIndex])
        recList.append(recall[classIndex])

def average(array):
    """Return the arithmetic mean of the values in ``array``.

    Raises:
        ZeroDivisionError: if ``array`` is empty.
    """
    return sum(array) / len(array)

# Format the six per-class averages first, then emit them one line at a time.
summaryLines = [
    f'Average F1 for promotions for {NUM_TRIALS} trials: {average(promotionF1)}',
    f'Average Prec for promotions for {NUM_TRIALS} trials: {average(promotionPrec)}',
    f'Average Rec for promotions for {NUM_TRIALS} trials: {average(promotionRec)}',
    f'Average F1 for inbox for {NUM_TRIALS} trials: {average(inboxF1)}',
    f'Average Prec for inbox for {NUM_TRIALS} trials: {average(inboxPrec)}',
    f'Average Rec for inbox for {NUM_TRIALS} trials: {average(inboxRec)}',
]
for summaryLine in summaryLines:
    print(summaryLine)


Average F1 for promotions for 100 trials: 0.5416975193290579
Average Prec for promotions for 100 trials: 0.7571403318903319
Average Rec for promotions for 100 trials: 0.45321919420448836
Average F1 for inbox for 100 trials: 0.6904481143237645
Average Prec for inbox for 100 trials: 0.5922568519801493
Average Rec for inbox for 100 trials: 0.8531346431346429
