In [1]:
from __future__ import division
import pandas as pd
import numpy  as np
import os
import re
from collections import Counter
import time, sys
import math

# Code made by Brain Khuu: https://stackoverflow.com/questions/3160699/python-progress-bar
# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    
    barLength = 10 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "\nHalt...\r\n"
    if progress >= 1:
        progress = 1
        status = "\nDone...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

In [2]:
directory = "/home/daniel/Documents/homework/Senior/Fall_2019/CS6375/Homework_2/Code/"

In [130]:
# Condense multiple whitespaces into one, grab only alphabetic words, convert to lowercase
# and return array of words.
def getAllWordsFromString(words):
    return re.sub('\s+', ' ',re.sub('[^a-zA-Z1-9]+', ' ', words)).strip().lower().split(" ")

# getCountOfWords
# Create a dictionary with the count of each word in a string.
def getCountOfWords(words, allUniqueWords):
    allUniqueWordsDict = { i : 0 for i in allUniqueWords }
    counts = Counter(getAllWordsFromString(words))
    counts = {k : v for k, v in dict(counts).items() if k in allUniqueWordsDict}
    return mergeTwoDicts(allUniqueWordsDict, counts)

#getBernoulliWords
# Create a dictionary that shows the existence of words as 1 or 0.
def getBernoulliWords(words, allUniqueWords):
    counts = getCountOfWords(words, allUniqueWords)
    #Transform counts into existense
    return { k : (0 if v == 0 else 1) for k , v in counts.items()}
        
def getCountOfWordsWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getCountOfWords(words, allUniqueWords)

def getBernoulliWithProgressBar(words, allUniqueWords, progress):
    progress = round(progress,3)
    update_progress(progress)
    return getBernoulliWords(words, allUniqueWords)

#getProduct of Probabilities
# returns the log probability sum of all of the elements based on bayes
# works with both bernoulli and bag of words model
def getProductOfProbabities(text, T):
    featureSums = T.sum().loc[[w for w in getAllWordsFromString(text) if w in T.columns]]
    totalWords  = T.sum().sum()
    return np.log( (featureSums + 1) / (totalWords + len(T.columns))).sum()

def mergeTwoDicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [4]:
def naiveBayesOnModel(text, T):
    p_0 = np.log(len(T[T['isSpam'] == 0]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 0]).drop('isSpam', axis=1))
    p_1 = np.log(len(T[T['isSpam'] == 1]) / len(T)) + getProductOfProbabities(text, (T[T['isSpam'] == 1]).drop('isSpam', axis=1))
    return 0 if p_0 > p_1 else 1

def MCAPUpdateWeights(T, learnRate, W):
    changeW = learnRate * np.array([sum([T[x][i] * (y - (T[T['isSpam'] == 1][x].sum()) / T[x].sum())
                    for i,y in enumerate(T['isSpam'])])
               for x in ['a']])
    
    return W + changeW
    
def getDirectoryContents(dataDirectory):
    contents = np.array([])
    for fileName in os.listdir(dataDirectory):
        contents = np.append(contents, [open(dataDirectory + fileName).read()])
    return contents

def getBagOfWordsDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bag Of Words as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([getCountOfWordsWithProgressBar(d[1], attributes, i / (len(data) - 1))
                       for i,d in enumerate(data)])
    df.insert(0, 'isSpam', [d[0] for d in data])
    return df

def getBernoulliDataFrame(data, allUniqueWords):
    print "Creating DataFrame with Bernoulli model as the feature..."
    attributes = set(allUniqueWords)
    df = pd.DataFrame([getBernoulliWithProgressBar(d[1], attributes, i / (len(data) - 1))
                       for i,d in enumerate(data)])
    df.insert(0, 'isSpam', [d[0] for d in data])
    return df

def getAccuracyOnNaiveBayes(Test, Train):
    return sum(Test.apply(lambda x: naiveBayesOnModel(x['text'], Train) == x['isSpam'], axis=1)) / len(Test)    

def PredictWithLR(T, W):
    bias = W[0]
    PY_1 = 1 / (1 + math.exp(bias + 
                             T.apply(lambda x: (T[x].sum() / T.sum().sum()) * W[x]).sum()))
    PY_0 = 1 - PY_1
    
    return 1

In [5]:
trainHamData  = [[0,f] for f in getDirectoryContents(directory + "train/ham/")]
trainSpamData = [[1,f] for f in getDirectoryContents(directory + "train/spam/")]
allTrainData  = trainHamData + trainSpamData
testHamData   = [[0,f] for f in getDirectoryContents(directory + "test/ham/")]
testSpamData  = [[1,f] for f in getDirectoryContents(directory + "test/spam/")]
allTestData   = pd.DataFrame(testHamData + testSpamData).rename(columns={0: 'isSpam', 1: 'text'})

In [131]:
#Transform all files into a single string.
allTrainWords = ''.join([f[1] for f in allTrainData])
#Retrieve all unique WORDS - Remove all words with numbers/punctuation and replace with space.
allUniqueWords = np.unique(getAllWordsFromString(allTrainWords))

In [358]:
#Get a dataframe with bernoulli as the feature
trainB = getBernoulliDataFrame(allTrainData, allUniqueWords)
testB  = getBernoulliDataFrame(testHamData + testSpamData, allUniqueWords)
#Get a dataframe with bag of words as a feature for training
trainBOW = getBagOfWordsDataFrame(allTrainData, allUniqueWords)
testBOW  = getBagOfWordsDataFrame(testHamData + testSpamData, allUniqueWords)


Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bernoulli model as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature...
Percent: [##########] 100%  
Done...
Creating DataFrame with Bag Of Words as the feature...
Percent: [##########] 100%  
Done...


In [193]:
#Multinomial Bayes
getAccuracyOnNaiveBayes(allTestData, trainBOW)

0.9476987447698745

In [194]:
#Discrete bayes
getAccuracyOnNaiveBayes(allTestData, trainB)

0.8828451882845189

In [349]:
import random
def getProbYIsZero(scores):
    return 1 / (1 + np.exp(-scores))

def getProbYIsOne(scores):
    return 1 - getProbYIsZero(scores)

def getWeight(W, T):
    predictions = getPredictions(W, T)
    target      = T['isSpam']
    attributes  = T.drop('isSpam', axis=1)
    attributes.insert(0, 'x_0', 1)
    gradient    = np.dot(attributes.T, target - predictions)
    return gradient.astype(np.float64)

def getLogLikelihood(W, T):
    target = T['isSpam']
    attributes = T.drop('isSpam', axis=1)
    attributes.insert(0, 'isSpam', 1)
    scores = np.dot(attributes, W)
    return np.sum(target * scores - np.log(1 + np.exp(-scores)))

def getPredictions(W, T):
    return getProbYIsZero(np.dot(T,W)).astype(np.float64)
    
def getAccuracy(W, T):
    return np.sum([T['isSpam'][i] == prediction.round() for i, prediction in enumerate(getPredictions(W,T))]) / len(T)

def splitDataFrame(D, frac):
    return (D[0: int(math.floor(len(D) * frac))], D[int(math.floor(len(D) * frac)): len(D)])

def L2Regularization(D):
    
    
def logisticRegression(D, numSteps, learningRate):
    W = np.zeros(len(D.columns))
    ham1, ham2   = splitDataFrame(D[D['isSpam'] == 0], 0.7)
    spam1, spam2 = splitDataFrame(D[D['isSpam'] == 1], 0.7)
    T = ham1.append(spam1).reset_index(drop=True)
    V = ham2.append(spam2).reset_index(drop=True)
    for i in range(1, numSteps):
        W += learningRate * getWeight(W,T)
        update_progress( round(i / (numSteps - 1), 3))
    
    return W


In [350]:
W = logisticRegression(trainBOW, 1000, 0.001)

Percent: [##########] 100%  
Done...


In [361]:
getPredictions(W, testBOW).round()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [363]:
sum(W)

30.674050288388695

In [330]:
T = trainBOW
target = T['isSpam']
attributes = trainBOW.drop('isSpam',axis=1)
attributes = T.drop('isSpam', axis=1)
attributes.insert(0, 'isSpam', 1)
scores = np.dot(attributes, W)
np.sum( (target * scores) - np.log(1 + np.exp(-scores)))

-5511.0747460628145

In [318]:
target.shape

(463,)

In [319]:
scores.shape

(464,)

In [321]:
target.shape

(463,)

In [328]:
scores +

array([ -22.43338145,  -13.11139517,  -31.30021004, -119.1550345 ,
         -7.65096237,   -6.18366824,   -6.28645799,   -5.49369181,
        -12.38707574,   -6.39262841,   -7.10288871,   -2.30738332,
        -22.04743293,  -32.10563948,   -6.58554926,  -13.13136033,
         -2.039782  , -106.31893081,   -4.91578923,  -15.59978861,
        -26.19464116,   -6.42310172,   -6.4168706 ,  -31.52646781,
         -8.5187136 ,   -6.15530987,  -10.26144115,   -6.65015591,
        -25.55137228,   -9.86858177,  -74.44060837,   -6.88419247,
         -6.84538524,  -21.73550874,   -6.88542163,   -3.62275985,
        -66.04646064,  -15.28944505,  -14.28734202,   -7.1288302 ,
        -22.47388154,  -13.87418358,   -8.94964863,   -6.95960253,
         -5.24700091,   -9.51328878,  -13.44599337,   -6.48870624,
         -3.81524737,  -20.37050368,   -1.5732571 ,   -4.55096863,
        -24.21328783,   -8.81309411,   -6.98482107,   -7.89401157,
         -3.30992488,  -28.6334721 ,   -2.24468805,   -2.58778

array([ -22.15240083,  -12.83041456,  -31.01922942, -118.87405388,
         -7.36998176,   -5.90268762,   -6.00547737,   -5.21271119,
        -12.10609513,   -6.1116478 ,   -6.82190809,   -2.02640271,
        -21.76645232,  -31.82465886,   -6.30456864,  -12.85037971,
         -1.75880138, -106.03795019,   -4.63480861,  -15.318808  ,
        -25.91366054,   -6.1421211 ,   -6.13588998,  -31.2454872 ,
         -8.23773299,   -5.87432925,   -9.98046053,   -6.3691753 ,
        -25.27039166,   -9.58760115,  -74.15962775,   -6.60321186,
         -6.56440462,  -21.45452812,   -6.60444101,   -3.34177923,
        -65.76548003,  -15.00846444,  -14.0063614 ,   -6.84784959,
        -22.19290093,  -13.59320296,   -8.66866802,   -6.67862191,
         -4.96602029,   -9.23230816,  -13.16501275,   -6.20772562,
         -3.53426675,  -20.08952307,   -1.29227649,   -4.26998802,
        -23.93230721,   -8.53211349,   -6.70384046,   -7.61303095,
         -3.02894426,  -28.35249148,   -1.96370744,   -2.30680