In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import os
import re
from collections import Counter
import time, sys
import math

# Code made by Brain Khuu: https://stackoverflow.com/questions/3160699/python-progress-bar
# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [2]:
directory = "/home/daniel/Documents/homework/Senior/Fall_2019/CS6375/Homework_2/Code/"

In [130]:
# Condense multiple whitespaces into one, grab only alphabetic words, convert to lowercase
# and return array of words.
def getAllWordsFromString(words):
    return re.sub('\s+', ' ',re.sub('[^a-zA-Z1-9]+', ' ', words)).strip().lower().split(" ")

# getCountOfWords
# Create a dictionary with the count of each word in a string.
def getCountOfWords(words, allUniqueWords):
    allUniqueWordsDict = { i : 0 for i in allUniqueWords }
    counts = Counter(getAllWordsFromString(words))
    counts = {k : v for k, v in dict(counts).items() if k in allUniqueWordsDict}
    return mergeTwoDicts(allUniqueWordsDict, counts)

#getBernoulliWords
# Create a dictionary that shows the existence of words as 1 or 0.
def getBernoulliWords(words, allUniqueWords):
    counts = getCountOfWords(words, allUniqueWords)
    #Transform counts into existense
    return { k : (0 if v == 0 else 1) for k , v in counts.items()}
        
def getCountOfWordsWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getCountOfWords(words, allUniqueWords)

def getBernoulliWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getBernoulliWords(words, allUniqueWords)

#getProduct of Probabilities
# returns the log probability sum of all of the elements based on bayes
# works with both bernoulli and bag of words model
def getProductOfProbabities(text, T):
    featureSums = T.sum().loc[[w for w in getAllWordsFromString(text) if w in T.columns]]
    totalWords  = T.sum().sum()
    return np.log( (featureSums + 1) / (totalWords + len(T.columns))).sum()

def mergeTwoDicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [4]:
def naiveBayesOnModel(text, T):
    p_0 = np.log(len(T[T['isSpam'] == 0]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 0]).drop('isSpam', axis=1))
    p_1 = np.log(len(T[T['isSpam'] == 1]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 1]).drop('isSpam', axis=1))
    return 0 if p_0 > p_1 else 1

def MCAPUpdateWeights(T, learnRate, W):
    changeW = learnRate * np.array([sum([T[x][i] * (y - (T[T['isSpam'] == 1][x].sum()) / T[x].sum())
                    for i,y in enumerate(T['isSpam'])])
               for x in ['a']])
    
    return W + changeW
    
def getDirectoryContents(dataDirectory):
    contents = np.array([])
    for fileName in os.listdir(dataDirectory):
        contents = np.append(contents, [open(dataDirectory + fileName).read()])
    return contents

def getBagOfWordsDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bag Of Words as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([getCountOfWordsWithProgressBar(d[1], attributes, i / (len(data) - 1))
                       for i,d in enumerate(data)])
    df.insert(0, 'isSpam', [d[0] for d in data])
    return df

def getBernoulliDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bernoulli model as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([getBernoulliWithProgressBar(d[1], attributes, i / (len(data) - 1))
                       for i,d in enumerate(data)])
    df.insert(0, 'isSpam', [d[0] for d in data])
    return df

def getAccuracyOnNaiveBayes(Test, Train):
    return sum(Test.apply(lambda x: naiveBayesOnModel(x['text'], Train) == x['isSpam'], axis=1)) / len(Test)    

def PredictWithLR(T, W):
    bias = W[0]
    PY_1 = 1 / (1 + math.exp(bias + 
                             T.apply(lambda x: (T[x].sum() / T.sum().sum()) * W[x]).sum()))
    PY_0 = 1 - PY_1
    
    return 1

In [5]:
trainHamData  = [[0,f] for f in getDirectoryContents(directory + "train/ham/")]
trainSpamData = [[1,f] for f in getDirectoryContents(directory + "train/spam/")]
allTrainData  = trainHamData + trainSpamData
testHamData   = [[0,f] for f in getDirectoryContents(directory + "test/ham/")]
testSpamData  = [[1,f] for f in getDirectoryContents(directory + "test/spam/")]
allTestData   = pd.DataFrame(testHamData + testSpamData).rename(columns={0: 'isSpam', 1: 'text'})

In [131]:
#Transform all files into a single string.
allTrainWords = ''.join([f[1] for f in allTrainData])
#Retrieve all unique WORDS - Remove all words with numbers/punctuation and replace with space.
allUniqueWords = np.unique(getAllWordsFromString(allTrainWords))

In [358]:
#Get a dataframe with bernoulli as the feature
trainB = getBernoulliDataFrame(allTrainData, allUniqueWords)
testB  = getBernoulliDataFrame(testHamData + testSpamData, allUniqueWords)
#Get a dataframe with bag of words as a feature for training
trainBOW = getBagOfWordsDataFrame(allTrainData, allUniqueWords)
testBOW  = getBagOfWordsDataFrame(testHamData + testSpamData, allUniqueWords)


Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature...
Percent: [##########] 100%  
Done...


In [193]:
#Multinomial Bayes
getAccuracyOnNaiveBayes(allTestData, trainBOW)

0.9476987447698745

In [194]:
#Discrete bayes
getAccuracyOnNaiveBayes(allTestData, trainB)

0.8828451882845189

In [401]:
import random
def getProbYIsZero(scores):
    return 1 / (1 + np.exp(-scores))

def getProbYIsOne(scores):
    return 1 - getProbYIsZero(scores)

def getWeight(W, T):
    predictions = getPredictions(W, T)
    target      = T['isSpam']
    attributes  = T.drop('isSpam', axis=1)
    attributes.insert(0, 'x_0', 1)
    gradient    = np.dot(attributes.T, target - predictions)
    return gradient.astype(np.float64)

def getLogLikelihood(W, T):
    target = T['isSpam']
    features = T.drop('isSpam', axis=1)
    features.insert(0, 'isSpam', 1)
    scores = np.dot(features, W)
    return np.sum(target * scores - np.log(1 + np.exp(-scores)))

def getPredictions(W, T):
    features = T.drop('isSpam', axis=1)
    features.insert(0, 'isSpam', 1)
    return getProbYIsZero(np.dot(T,W)).astype(np.float64)
    
def getAccuracy(W, T):
    return np.sum([T['isSpam'][i] == prediction.round() for i, prediction in enumerate(getPredictions(W,T))]) / len(T)

def splitDataFrame(D, frac):
    return (D[0: int(math.floor(len(D) * frac))], D[int(math.floor(len(D) * frac)): len(D)])

def L2Regularization(W, V, penalty):
    target = T['isSpam']
    features = T.drop('isSpam', axis=1)
    features.insert(0,'x_0', 1)
    return (target - np.dot(features, W))**2 + (penalty / 2) * (W ** 2)
    
def logisticRegression(D, numSteps, learningRate):
    W = np.zeros(len(D.columns))
    ham1, ham2   = splitDataFrame(D[D['isSpam'] == 0], 0.7)
    spam1, spam2 = splitDataFrame(D[D['isSpam'] == 1], 0.7)
    T = ham1.append(spam1).reset_index(drop=True)
    V = ham2.append(spam2).reset_index(drop=True)
    for i in range(1, numSteps):
        W += learningRate * getWeight(W,T)
        update_progress( round(i / (numSteps - 1), 3))
    
    return W


In [402]:
W = logisticRegression(trainBOW, 1000, 0.001)

Percent: [##########] 100%  
Done...


In [408]:
getAccuracy(W, testBOW)

0.9163179916317992

In [399]:
V = trainBOW
penalty = 0.5
target = T['isSpam']
features = T.drop('isSpam', axis=1)
features.insert(0,'x_0', 1)
(target - np.dot(features, W))**2 + (penalty / 2) * np.dot(W,W)

0        396.983185
1        160.249767
2        777.700714
3      10843.135183
4         57.546126
5         42.533844
6         46.351074
7         44.975723
8        126.792341
9         48.757462
10        54.728610
11        15.003200
12       368.812539
13       728.556299
14        38.482121
15       140.535677
16        13.537973
17      8447.632837
18        30.740837
19       231.418233
20       637.150362
21        50.376071
22        47.409741
23       864.508413
24        60.955171
25        40.400834
26        96.607096
27        49.426364
28       501.382473
29       102.082237
           ...     
433        9.139084
434        9.106871
435        9.716927
436        9.955069
437        9.588446
438        9.658288
439       11.165488
440       17.482180
441       44.035748
442        9.079972
443       10.561270
444       15.676008
445       10.695207
446       10.456435
447        9.733561
448       14.228903
449       27.205083
450       49.311472
451       21.260652


In [400]:
W

array([-0.0597583 , -0.19963011,  0.09378671, ...,  0.06210696,
        0.04746371,  0.0793221 ])

In [330]:
T = trainBOW
target = T['isSpam']
attributes = trainBOW.drop('isSpam',axis=1)
attributes = T.drop('isSpam', axis=1)
attributes.insert(0, 'isSpam', 1)
scores = np.dot(attributes, W)
np.sum( (target * scores) - np.log(1 + np.exp(-scores)))

-5511.0747460628145

In [318]:
target.shape

(463,)