In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import os
import re
from collections import Counter
import time, sys
import math

# Code made by Brain Khuu: https://stackoverflow.com/questions/3160699/python-progress-bar
# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [2]:
directory = "/home/daniel/Documents/homework/Senior/Fall_2019/CS6375/Homework_2/Code/"

In [3]:
# Condense multiple whitespaces into one, grab only alphabetic words, convert to lowercase
# and return array of words.
def getAllWordsFromString(words):
    return re.sub('\s+', ' ',re.sub('[^a-zA-Z1-9]+', ' ', words)).strip().lower().split(" ")

# getCountOfWords
# Create a dictionary with the count of each word in a string.
def getCountOfWords(words, allUniqueWords):
    dictionary = {}
    for word in allUniqueWords:
        dictionary = mergeTwoDicts(dictionary, {word: words.count(word)})
    return dictionary

#getBernoulliWords
# Create a dictionary that shows the existence of words as 1 or 0.
def getBernoulliWords(words, allUniqueWords):
    dictionary = {}
    uniqueFileWords = set(getAllWordsFromString(words))
    for word in allUniqueWords:
        dictionary = mergeTwoDicts(dictionary, {word: int(word in uniqueFileWords)})
    return dictionary
        
def getCountOfWordsWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getCountOfWords(words, allUniqueWords)

def getBernoulliWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getBernoulliWords(words, allUniqueWords)

#getProduct of Probabilities
# returns the log probability sum of all of the elements based on bayes
# works with both bernoulli and bag of words model
def getProductOfProbabities(text, T):
    featureSums = T.sum().loc[[w for w in getAllWordsFromString(text) if w in T.columns]]
    totalWords  = T.sum().sum()
    return np.log( (featureSums + 1) / (totalWords + len(T.columns))).sum()

def mergeTwoDicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [33]:
def naiveBayesOnModel(text, T):
    p_0 = np.log(len(T[T['isSpam'] == 0]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 0]).drop('isSpam', axis=1))
    p_1 = np.log(len(T[T['isSpam'] == 1]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 1]).drop('isSpam', axis=1))
    return 0 if p_0 > p_1 else 1

def MCAPUpdateWeights(T, learnRate, W):
    
    changeW = learnRate * np.array([sum([T[x][i] * (y - (T[T['isSpam'] == 1][x].sum()) / T[x].sum())
                    for i,y in enumerate(T['isSpam'])])
               for x in ['a']])
    
    return W + changeW
    
def getDirectoryContents(dataDirectory):
    contents = np.array([])
    for fileName in os.listdir(dataDirectory):
        contents = np.append(contents, [open(dataDirectory + fileName).read()])
    return contents

def getBagOfWordsDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bag Of Words as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([mergeTwoDicts({"isSpam": d[0]}, 
                                     getCountOfWordsWithProgressBar(d[1], 
                                                                    attributes, 
                                                                    i / (len(data) - 1)))
                       for i,d in enumerate(data)])
    return df

def getBernoulliDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bernoulli model as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([mergeTwoDicts({"isSpam": d[0]}, 
                                     getBernoulliWithProgressBar(d[1], 
                                                                 attributes, 
                                                                 i / (len(data) - 1)))
                       for i,d in enumerate(data)])
    return df

def getAccuracyOnNaiveBayes(Test, Train):
    return sum(Test.apply(lambda x: naiveBayesOnModel(x['text'], Train) == x['isSpam'], axis=1)) / len(Test)    

def PredictWithLR(T, W):
    bias = W[0]
    PY_1 = 1 / (1 + math.exp(bias + 
                             T.apply(lambda x: (T[x].sum() / T.sum().sum()) * W[x]).sum()))
    PY_0 = 1 - PY_1
    
    return 1

In [8]:
trainHamData  = [[0,f] for f in getDirectoryContents(directory + "train/ham/")]
trainSpamData = [[1,f] for f in getDirectoryContents(directory + "train/spam/")]
allTrainData  = trainHamData + trainSpamData
testHamData   = [[0,f] for f in getDirectoryContents(directory + "test/ham/")]
testSpamData  = [[1,f] for f in getDirectoryContents(directory + "test/spam/")]
allTestData   = pd.DataFrame(testHamData + testSpamData).rename(columns={0: 'isSpam', 1: 'text'})

In [9]:
#Transform all files into a single string.
allTrainWords = ''.join([f[1] for f in allTrainData])
#Retrieve all unique WORDS - Remove all words with numbers/punctuation and replace with space.
allUniqueWords = np.unique(getAllWordsFromString(allTrainWords))

In [10]:
#Get a dataframe with bernoulli as the feature
trainB = getBernoulliDataFrame(allTrainData, allUniqueWords)
#Get a dataframe with bag of words as a feature for training
trainBOW = getBagOfWordsDataFrame(allTrainData, allUniqueWords)

Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature...
Percent: [##########] 100%  
Done...


In [16]:
#MultinomialBayes
getAccuracyOnNaiveBayes(allTestData, trainBOW)

0.9597701149425287

In [94]:
#Discrete bayes?
getAccuracyOnNaiveBayes(allTestData, trainB)

0.9121338912133892

In [222]:
import random
def getProbYIsZero(X, W):
    exp = math.exp(W[0] + (W[1:len(W)] * X)).sum()
    return 1 if exp == 1 else 1 / (1 - exp)

def getProbYIsOne(X, W):
    exp = math.exp(W[0] + (W[1:len(W)] * X).sum())
    return 1 if exp == 1 else exp / (1 - exp)

def getWeight(W, T):
    P = T.drop('isSpam', axis=1).apply(lambda X: getProbYIsOne(X, W), axis=1)
    isSpam = T['isSpam']
    return T.drop('isSpam', axis=1).apply(lambda X: getNewWeights(X, isSpam, P), axis=0)
    
def getNewWeights(X, isSpam, P):
    return sum([x * (isSpam[i] - P[i]) for i, x in enumerate(X)])

W = np.full(len(trainBOW.columns), 0)
W = W + 0.01 * getWeight(W, trainBOW)
W

ValueError: operands could not be broadcast together with shapes (9266,) (9265,) 

In [220]:
W = getWeight(W, trainBOW)

In [221]:
W

a                 7359.0
aa                  18.0
aaas                 1.0
aabda                0.0
ab                 203.0
abbey                1.0
abbpge               0.0
abdul                2.0
abel                 6.0
abelmosk             1.0
abelson              1.0
abideth              1.0
ability             14.0
ablate               1.0
ablaze               1.0
able                76.0
ablished             1.0
abo                 32.0
abort                0.0
about               17.0
above                7.0
abovementioned       1.0
abscess              1.0
abscissa             1.0
absently             1.0
absolute             5.0
absolutely           3.0
absorbed             1.0
ac                 379.0
acc                 45.0
                   ...  
zeros                0.0
zevex                0.0
zgmzw                1.0
zibet                1.0
zisobath             1.0
zively               0.0
zivley               0.0
zjdsl                1.0
zjgpws               1.0
