In [2]:
from __future__ import division
import pandas as pd
import numpy  as np
import os
import re
from collections import Counter
import time, sys
import math

# Code made by Brain Khuu: https://stackoverflow.com/questions/3160699/python-progress-bar
# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [3]:
directory = "/home/daniel/Documents/homework/Senior/Fall_2019/CS6375/Homework_2/Code/"

In [27]:
# Condense multiple whitespaces into one, grab only alphabetic words, convert to lowercase
# and return array of words.
def getAllWordsFromString(words):
    return re.sub('\s+', ' ',re.sub('[^a-zA-Z]+', ' ', words)).strip().lower().split(" ")

def getCountOfWords(words, allUniqueWords):
    dictionary = {}
    for word in allUniqueWords:
        dictionary = mergeTwoDicts(dictionary, {word: words.count(word)})
    return dictionary

def getBernoulliWords(words, allUniqueWords):
    dictionary = {}
    for word in allUniqueWords:
        dictionary = mergeTwoDicts(dictionary, {word: int(word in words)})
    return dictionary
        
def getCountOfWordsWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getCountOfWords(words, allUniqueWords)

def getBernoulliWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getBernoulliWords(words, allUniqueWords)

def mergeTwoDicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [25]:
def naiveBayesOnModel(text, T):
    p_0 = np.log(len(T[T['isSpam'] == 0]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 0]).drop('isSpam', axis=1))
    p_1 = np.log(len(T[T['isSpam'] == 1]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 1]).drop('isSpam', axis=1))
    return 0 if p_0 > p_1 else 1

def getDirectoryContents(dataDirectory):
    contents = np.array([])
    for fileName in os.listdir(dataDirectory):
        contents = np.append(contents, [open(dataDirectory + fileName).read()])
    return contents

def getBagOfWordsDataFrame(data, isSpam, allUniqueWords):
    print "Creating DataFrame with Bag Of Words as the feature..."
    attributes = ['isSpam'] + allUniqueWords
    i = 0
    df = pd.DataFrame([getCountOfWordsWithProgressBar(textFile, allUniqueWords, i / (len(data) - 1))
                       for i,textFile in enumerate(data)])
    df.insert(0, 'isSpam', isSpam)
    return df

def getBernoulliDataFrame(data, isSpam, allUniqueWords):
    print "Creating DataFrame with Bernoulli model as the feature..."
    attributes = ['isSpam'] + allUniqueWords
    i = 0
    df = pd.DataFrame([getBernoulliWithProgressBar(textFile, allUniqueWords, i / (len(data) - 1))
                       for i,textFile in enumerate(data)])
    df.insert(0, 'isSpam', isSpam)
    return df

def getProductOfProbabities(text, T):
    featureSums = T.sum().loc[[w for w in getAllWordsFromString(text) if w in T.columns]]
    totalWords  = T.sum().sum()
    return np.log(featureSums + 1 / (totalWords + len(T.columns))).sum()

def getAccuracyOnNaiveBayes(Test, Train):
    return (sum([t[1] == naiveBayesOnModel(t[0], Train) for t in Test]) / len(Test))

In [6]:
trainHamData  = getDirectoryContents(directory + "train/ham/")
trainSpamData = getDirectoryContents(directory + "train/spam/")
testHamData   = getDirectoryContents(directory + "test/ham/")
testSpamData  = getDirectoryContents(directory + "test/spam/")

In [7]:
#Transform all files into a single string.
allTrainWords = ''.join(trainHamData) + ''.join(trainSpamData)
#Retrieve all unique WORDS - Remove all words with numbers/punctuation and replace with space.
allUniqueWords = np.unique(getAllWordsFromString(allTrainWords))

In [28]:
trainB = getBernoulliDataFrame(trainHamData, 0, list(allUniqueWords))

Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...


In [10]:
#Get a dataframe with bag of words as a feature for training
trainBOW = getBagOfWordsDataFrame(trainHamData, 0, list(allUniqueWords)).append(getBagOfWordsDataFrame(trainSpamData, 1, list(allUniqueWords)))

Creating DataFrame with Bag Of Words as the feature..
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature..
Percent: [##########] 100%  
Done...


In [11]:
hamData = [naiveBayesOnModel(t, trainBOW) for t in testHamData]
spamData = [naiveBayesOnModel(t, trainBOW) for t in testSpamData]

In [18]:
getAccuracyOnNaiveBayes([[h, 0] for h in testHamData] + [[h,1] for h in testSpamData], trainBOW)

0.8765690376569037

In [None]:
getAccuracyOnNaiveBayes([[h, 0] for h in testHamData], trainB)

  This is separate from the ipykernel package so we can avoid doing imports until
