In [2]:
import numpy as np, pandas as pd
import os
from crossValidate import getXVFolds
from NaiveBayes import NB_Train, NB_Pred

In [None]:
x = np.random.randint(5, size=[6,3])
wt = np.random.rand(3,4)*2
print(x)
print(wt)

In [None]:
list(zip(*  [[1,2,3],[4,5,6]]  ))

In [None]:
yhat = softMax(x, wt)
idx = np.argmax(yhat, axis=1)
y = np.zeros([6,4], int)
for r in np.arange(6):
    y[r,idx[r]] = 1

np.sum(-y*np.log(yhat),axis=1).mean()

In [103]:
def concateIntercept(x):
    return np.hstack( [np.ones((x.shape[0],1), x.dtype), x] )

################################################################################

def softMax(x, wts):
    pr = np.exp(x @ wts)
    return pr / np.sum(pr,axis=1)[:,None]


def fitLogisticNK(x, y, eta, eps=1e-7, trace=False):
    def crossEntNK(x, wts, y):
        yhat = softMax(x, wts)
        err = np.sum(-y*np.log(yhat), axis=1).mean()
        return err, yhat
    
    def updateWeightNK(eta, yhat, y, x, wt):
        d = ((y - yhat).T @ x).T / len(y)
        return wt + (eta*d)
    
    x = concateIntercept(x)
    nDim,nK = x.shape[1],y.shape[1]
    wts = np.random.rand(nDim,nK)/50 - 0.01 # init wts to be (-0.01,0.01)
    lastErr = np.inf # max error possible
    err,yhat = crossEntNK(x, wts, y)

    n = 0
    while (abs(err-lastErr) > eps) and n < 1e6:
        if n % 1000 == 0 and trace:
            print('Iter #%u, error: %f'%(n,err))
        wts = updateWeightNK(eta, yhat, y, x, wts)
        lastErr = err
        err,yhat = crossEntNK(x, wts, y)
        if err > lastErr:
            eta /= 10
        n += 1
    
    print('Final iteration #%u, error: %f' % (n-1,err) )
    return wts

def predLogisticNK(x, wts):
    x = concateIntercept(x)
    yhat = softMax(x, wts) # calc posterior prob of all classes
    return yhat.argmax(axis=1) # return class with largest probability


def sigmoid(x, wt):
    return 1.0 / ( 1 + np.exp(-x @ wt) )

def fitLogisticReg(x, y, eta, eps=1e-7, trace=False):
    def crossEntropy(x, wt, y):
        yhat = sigmoid(x, wt)
        err = -np.mean(y*np.log(yhat) + (1-y)*np.log(1-yhat))
        return err, yhat
    
    def updateWeight(eta, yhat, y, x, wt):
        d = (y - yhat) @ x / len(y)
        return wt + (eta*d)
    
    x = concateIntercept(x)
    wt = np.random.rand(x.shape[1])/50 - 0.01 # initialize weights
    lastErr = 1
    err,yhat = crossEntropy(x, wt, y)

    n = 0
    while (abs(err-lastErr) > eps) and n < 1e6:
        if n % 1000 == 0 and trace:
            print('Iter #%u, error: %f'%(n,err))
        wt = updateWeight(eta, yhat, y, x, wt)
        lastErr = err
        err,yhat = crossEntropy(x, wt, y)
        if err > lastErr:
            eta /= 10
        n += 1
    
    print('Final iteration #%u, error: %f' % (n-1,err) )
    return wt

def predLogistic(x, wt):
    x = concateIntercept(x)
    yhat = sigmoid(x, wt) # calc posterior prob of binary response
    return (yhat > 0.5)*1 # whether posterior prob > 0.5

In [5]:
def discretizeMean(inDF, useMed=False):
    outDF = pd.DataFrame()
    if useMed:
        thresh = inDF.median()
    else:
        thresh = inDF.mean()
    for v in list(inDF): # loop over all columns
        outDF[v] = (inDF[v] > thresh[v])
    return outDF.values * 1

# generate one-hot coding for issues with lots of missing votes
def oneHot(data, colNames):
    outDF = pd.DataFrame()
    for col in colNames:
        x = data[col]
        for val in x.unique():
            suff = 'q' if val=='?' else str(val)
            outDF[col+'_'+suff] = (x==val)
    return outDF

def normalizeDF(data):
    mins = data.min() # min of every col
    maxs = data.max() # max of every col
    return ((data-mins) / (maxs-mins)).values # normalize to [0,1]

def classVecToMat(classVec):
    uniqK = np.unique(classVec)

In [None]:
soyData = os.path.join('./data', 'soybean-small.data')
soyNames = ['c%02d'%(n+1) for n in range(35)] + ['class']
raw = pd.read_csv(soyData, names=soyNames)
feats = np.array(soyNames)[raw.nunique()!=1] # remove feats with only 1 value
raw = raw[feats]

In [None]:
dataMat = oneHot(raw, raw.columns[1:-1])
dataMat['class'] = raw['class']

In [None]:
glassData = os.path.join('./data/', 'glass.txt')
glassNames = ['id','RI','Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'type']
raw = pd.read_csv(glassData , names=glassNames ) # read in glass file
glassTypes = pd.concat([raw['type'].isin([1,3]), raw['type'].isin([2,4]),
                        raw['type'].isin([5,6,7])], axis=1).values * 1
feats = glassNames[1:-1]
glassMat = normalizeDF(raw[feats])
glassDiscMat = discretizeMean(raw[feats])
################################################################################

In [None]:
fitLogisticNK(glassDiscMat, glassTypes, 1)

In [None]:
irisFile = os.path.join('./data/', 'iris.data')
irisName = ['sepalLen', 'sepalWth', 'petalLen', 'petalWth', 'class']
raw = pd.read_csv(irisFile , names=irisName)  # read CSV file
irisTypes = pd.concat([raw['class']==x for x in raw['class'].unique()],
                     axis=1).values * 1
feats = irisName[:-1]
irisMat = normalizeDF(raw[feats])
irisDiscMat = discretizeMean(raw[feats])

In [None]:
fitLogisticNK(irisMat, irisTypes, 0.1)

In [109]:
bc_WI_data = os.path.join('./data/', 'breast-cancer-wisconsin.data')
bc_WI_names = ['id', 'clumpThick', 'unifSize', 'unifShape', 'margAdhsn', 
               'epithSize', 'bareNuclei', 'blandChrom', 'normNucleo', 
               'mitoses', 'class']
raw = pd.read_csv(bc_WI_data , names=bc_WI_names)  # read CSV file
missRow = (raw=='?').any(axis=1).values # rows with missing data
raw = raw[~missRow] # remove rows with missing
raw = raw.apply(pd.to_numeric, errors= 'coerce') # conv to numeric data
bcFeats = bc_WI_names[1:-1] # list of feature variables
bcMat = raw[bcFeats].values
bcDiscMat = discretizeMean(raw[bcFeats])
malign = (raw['class']==4).values *1

In [111]:
folds = getXVFolds(bcDiscMat, malign, categorical=True)
testIdx = folds[0]
trainIdx = np.hstack(folds[1:])
trainData,trainLabel = bcDiscMat[trainIdx],malign[trainIdx]
testData,testLabel = bcDiscMat[testIdx],malign[testIdx]

In [117]:
testLabel

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1])

In [114]:
wt = fitLogisticReg(trainData, trainLabel, 0.1)
np.mean(predLogistic(testData,wt)!=testLabel)

Final iteration #7734, error: 0.096037


0.0364963503649635

In [118]:
prbs = NB_Train(trainData, trainLabel)
prbs

(array([0.34981685, 0.65018315]),
 [array([0.15104167, 0.171875  , 0.140625  , 0.21875   , 0.26041667,
         0.14583333, 0.1875    , 0.19270833, 0.55729167]),
  array([0.78651685, 0.9747191 , 0.95505618, 0.90449438, 0.95786517,
         0.94662921, 0.95786517, 0.94101124, 0.96629213])])

In [119]:
sum(NB_Pred(testData, prbs)==testLabel)/len(testLabel)

0.021897810218978103

In [3]:
vote84Data = os.path.join('./data', 'house-votes-84.data')
vote84Names = ['party', 'infant', 'water', 'budget', 'doctorfee','salvador',
              'religion', 'satellite', 'contras', 'missile', 'immigration',
              'synfuels', 'education', 'superfund', 'crime', 'exports',
              'ZAF']
raw = pd.read_csv(vote84Data , names=vote84Names ) # read in vote file

In [95]:
oneHotCols = oneHot(raw,['water','education','ZAF'])
# remove variables with completed one-hot coding from list of variables
yesVars = np.setdiff1d(vote84Names[1:],['water','education','ZAF'])
yesVote = raw.loc[:,yesVars] == 'y' # boolean for vote='yes' for rest of vars
yesVote.columns = [s+'_y' for s in yesVote.columns]
repub = raw.loc[:,['party']] == 'republican' # boolean for republicans
voteData = pd.concat([yesVote,oneHotCols], axis=1) # concat two dataframes
voteMat = voteData.values * 1 # give matrixs of 0 & 1 for calculation
repubVec = repub.values.ravel() * 1 # vector of 0 & 1 for calculation

In [96]:
folds = getXVFolds(voteMat, repubVec, categorical=True)
testIdx = folds[0]
trainIdx = np.hstack(folds[1:])
trainData,trainLabel = voteMat[trainIdx],repubVec[trainIdx]
testData,testLabel = voteMat[testIdx],repubVec[testIdx]

In [150]:
def condProb(data, add): # data assumed to be class-homogenous
    ''' Calculate the conditional probability of a class-homogenous data set.
    The function returns the conditional probability with Laplace smoothing. 
    Data matrix has to be binary 0-1.
    '''
    condPr = np.zeros(data.shape[1]) # pre-allocate cond probilities
    for n,x in enumerate(data.T): # loop over the columns of the data
        condPr[n] = (sum(x==0)+add)/(len(x)+add) # laplace smooth if needed
    return condPr

def NB_Train(data, classVec, smooth=True):
    ''' Trains Naive Bayes on an input data matrix and class label. 
    If smooth = True, then Laplace smoothing is performed.

    Returns 3-tuple of probabilities, cond prob of C=0, cond prob of C=1
    '''
    smoothAdd = smooth*1 # addition to num and denom for smoothing
    if classVec.ndim==1: # binary class vector, transform into 2D
        classVec = np.vstack([classVec==0,classVec==1]).T *1 # [0's, 1's]

        
    pr_class = np.empty(classVec.shape[1], float) # probability of classes
    condPrs = list()
    for n,vec in enumerate(classVec.T): # loop over classes
        idx = (vec==1) # all data points belonging to this class
        pr_class[n] = sum(idx) / len(idx) # uncond probability
        condPrs.append( condProb(data[idx],smoothAdd) ) # calc cond probs
    
    return (pr_class,condPrs) # return class prob and cond probs

def NB_Pred(data, probs): # predicting based on conditional probs
    PrX = np.empty([len(data), len(probs[0])], float) # T-by-K matrix
    for n,(pr,cond) in enumerate(zip(*probs)): # loop over classes
        tmp = (data==0)*cond + (data==1)*(1-cond) # cond prob in class
        #PrX[:,n] = pr * tmp.prod(axis=1) # prod of uncond and cond probs
        PrX[:,n] = np.log(pr) + np.log(tmp).sum(axis=1) # sum log probabilities

    return PrX.argmax(axis=1) # return most likely classification


In [151]:
prbs = NB_Train(trainData, trainLabel)
prbs

(array([0.65018315, 0.34981685]),
 [array([0.78651685, 0.9747191 , 0.95505618, 0.90449438, 0.95786517,
         0.94662921, 0.95786517, 0.94101124, 0.96629213]),
  array([0.15104167, 0.171875  , 0.140625  , 0.21875   , 0.26041667,
         0.14583333, 0.1875    , 0.19270833, 0.55729167])])

In [152]:
sum(NB_Pred(testData, prbs)==testLabel)/len(testLabel)

0.9781021897810219

In [None]:
np.mean(predLogistic(testData,wt)!=testLabel)

In [120]:
testX = np.array([[1,1,1,0,0], [0,1,1,1,0], [0,0,1,1,1], [1,0,0,1,0], 
                  [0,1,1,0,1], [0,1,0,1,0], [1,0,0,0,0], [0,1,0,0,0], 
                  [1,0,1,0,1], [0,1,0,1,0]])
testY = np.array([0,0,0,0,0,1,1,1,1,1])
print(testX)
print(testY)

[[1 1 1 0 0]
 [0 1 1 1 0]
 [0 0 1 1 1]
 [1 0 0 1 0]
 [0 1 1 0 1]
 [0 1 0 1 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [1 0 1 0 1]
 [0 1 0 1 0]]
[0 0 0 0 0 1 1 1 1 1]


In [148]:
NB_Train(testX, testY, False)

(array([0.5, 0.5]),
 [array([0.6, 0.4, 0.2, 0.4, 0.6]), array([0.6, 0.4, 0.8, 0.6, 0.8])])

In [149]:
NB_Pred(testX, NB_Train(testX, testY, False))

[[-3.77052344 -4.46367062]
 [-2.95959323 -4.46367062]
 [-3.77052344 -6.25543009]
 [-5.1568178  -3.88830648]
 [-3.77052344 -5.44449988]
 [-4.34588759 -3.07737626]
 [-5.56228291 -3.48284137]
 [-4.7513527  -2.67191115]
 [-4.58145366 -6.25543009]
 [-4.34588759 -3.07737626]]


array([0, 0, 0, 1, 0, 1, 1, 1, 0, 1], dtype=int64)