In [4]:
import numpy as np, pandas as pd
import os

In [90]:
def concateIntercept(x):
    '''Return ``x`` with a column of ones prepended (the intercept term).

    The ones column has one entry per row of ``x`` and shares its dtype, so
    the result keeps the dtype of the input.
    '''
    nRows = x.shape[0]
    biasCol = np.ones((nRows, 1), dtype=x.dtype)
    return np.concatenate((biasCol, x), axis=1)

################################################################################
def sigmoid(x, wt):
    '''Logistic function of the linear scores ``x @ wt``.

    Parameters
    ----------
    x : ndarray
        Feature matrix (one row per sample) or a single feature vector.
    wt : ndarray
        Weight vector with one entry per column of ``x``.

    Returns
    -------
    ``1 / (1 + exp(-(x @ wt)))`` evaluated element-wise: an array with one
    probability per row of ``x``, or a scalar when ``x`` is a single vector.
    '''
    # Negate the scores rather than the inputs: ``-x`` is rejected by numpy
    # for boolean feature matrices and costs a full copy of ``x``.
    z = x @ wt
    # exp(-|z|) lies in (0, 1], so very negative scores cannot overflow.
    ez = np.exp(-np.abs(z))
    # Both branches are algebraically 1 / (1 + exp(-z)); each is used on the
    # side of zero where it only ever sees the bounded ``ez``.
    prob = np.where(z >= 0, 1.0 / (1.0 + ez), ez / (1.0 + ez))
    return prob[()] # unwrap a 0-d result so a single sample yields a scalar

def crossEntropy(x, wt, y):
    '''Mean binary cross-entropy of the logistic model on ``(x, y)``.

    Parameters
    ----------
    x : ndarray
        Feature matrix passed straight to ``sigmoid``.
    wt : ndarray
        Weight vector passed straight to ``sigmoid``.
    y : ndarray
        0/1 labels, one per row of ``x``.

    Returns
    -------
    (err, yhat) : the mean cross-entropy and the unmodified probabilities
    returned by ``sigmoid(x, wt)``.
    '''
    yhat = sigmoid(x, wt)
    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
    # then 0 * -inf = nan, turning the whole mean into nan. Clip only the
    # copy fed to the logs; the returned yhat stays untouched.
    tiny = np.finfo(float).eps
    prob = np.clip(yhat, tiny, 1 - tiny)
    err = -np.mean(y*np.log(prob) + (1-y)*np.log(1-prob))
    return err, yhat
    
def updateWeight(eta, yhat, y, x, wt):
    '''Take one gradient step on the weights and return the new vector.

    The step direction is the residual ``y - yhat`` projected onto the
    columns of ``x`` and averaged over the ``len(y)`` samples; it is scaled
    by the learning rate ``eta`` and added to ``wt``.
    '''
    residual = y - yhat
    direction = (residual @ x) / len(y)
    step = eta * direction
    return wt + step

def fitLogisticReg(x, y, eta, eps=1e-7, trace=False, maxIter=1000000,
                   seed=None):
    '''Fit logistic-regression weights by batch gradient descent.

    Parameters
    ----------
    x : ndarray
        Feature matrix without an intercept column; one is prepended here.
    y : ndarray
        0/1 labels, one per row of ``x``.
    eta : float
        Initial learning rate. It is divided by 10 every time a step makes
        the cross-entropy larger than it was before the step.
    eps : float
        Training stops once the cross-entropy changes by no more than
        ``eps`` between two consecutive iterations.
    trace : bool
        When true, print the error every 1000 iterations.
    maxIter : int
        Upper bound on the number of weight updates.
    seed : int or None
        Seed for the random weight initialisation. ``None`` draws from
        numpy's global random state.

    Returns
    -------
    Weight vector with the intercept weight first, followed by one weight
    per column of ``x``.

    The final iteration index and error are always printed, regardless of
    ``trace``.
    '''
    x = concateIntercept(x)
    rng = np.random if seed is None else np.random.RandomState(seed)
    wt = rng.rand(x.shape[1])/50 - 0.01 # initial weights in [-0.01, 0.01)
    lastErr = 0
    err,yhat = crossEntropy(x, wt, y)

    n = 0
    while (abs(err-lastErr) > eps) and n < maxIter:
        if n % 1000 == 0 and trace:
            print('Iter #%u, error: %f'%(n,err))
        wt = updateWeight(eta, yhat, y, x, wt)
        lastErr = err
        err,yhat = crossEntropy(x, wt, y)
        if err > lastErr:
            eta /= 10 # the step overshot: continue with a smaller rate
        n += 1
    
    print('Final iteration #%u, error: %f' % (n-1,err) )
    return wt

In [44]:
# path of the house-votes-84 data file, relative to the notebook
vote84Data = os.path.join('./data', 'house-votes-84.data')

# column labels: the party column followed by the sixteen issue columns
vote84Names = [
    'party',
    'infant', 'water', 'budget', 'doctorfee',
    'salvador', 'religion', 'satellite', 'contras',
    'missile', 'immigration', 'synfuels', 'education',
    'superfund', 'crime', 'exports', 'ZAF',
]

# labels are supplied through names=, so every row of the file is read as data
raw = pd.read_csv(vote84Data, names=vote84Names)

In [50]:
# generate one-hot coding for issues with lots of missing votes
def oneHot(data, colName):
    '''Expand one vote column into three boolean indicator columns.

    The returned frame has the columns ``<colName>_y``, ``<colName>_n`` and
    ``<colName>_q``, which are True where the vote equals 'y', 'n' and '?'
    respectively.
    '''
    votes = data[colName]
    codeToSuffix = (('y', 'y'), ('n', 'n'), ('?', 'q'))
    return pd.DataFrame({
        colName + '_' + suff: (votes == code) for code, suff in codeToSuffix
    })

# one-hot encode the selected issues and place the indicator columns side by side
oneHotIssues = ('water', 'education', 'ZAF')
oneHotCols = pd.concat([oneHot(raw, issue) for issue in oneHotIssues], axis=1)


In [84]:
def getXVFolds(dataMat, classVec, nFolds=5, categorical=False, seed=None):
    ''' Cut N-fold cross validation of the data set
    Given a data matrix, a class vector, and the number of folds, the function
    randomly partitions the row indices of the data into nFolds chunks. If the
    data is categorical, stratified sampling is used: the rows of every class
    are shuffled and split separately, and fold i is the concatenation of the
    i-th piece of each class.

    classVec is only read when categorical is true; it may be any sequence
    with one label per row of dataMat (list, 1-D array or column vector).
    seed, when given, makes the shuffle reproducible; None uses numpy's global
    random state.

    Returns a list of nFolds integer arrays holding row indices of dataMat.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)

    idx = np.arange(dataMat.shape[0]) # construct index of data rows
    if categorical:
        # A plain list compared with == gives one Python bool, not a mask,
        # and a column vector has the wrong rank to index idx: flatten first.
        classVec = np.asarray(classVec).ravel()
        perClass = []
        for k in np.unique(classVec):
            grpIdx = idx[classVec==k] # idx of all elems in current class
            rng.shuffle(grpIdx) # permute idx for random selection
            perClass.append(np.array_split(grpIdx, nFolds)) # N near-equal parts
        chunks = [np.hstack(parts) for parts in zip(*perClass)] # concat sub chunks
    else:
        rng.shuffle(idx) # random shuffle data
        chunks = np.array_split(idx, nFolds) # split into N near-equal chunks

    return chunks # list of index arrays, one per fold

In [51]:
# issues that already have one-hot columns are left out of the plain yes/no set
yesVars = np.setdiff1d(vote84Names[1:], ['water', 'education', 'ZAF'])
# True where the vote on an issue was 'y'; columns are tagged with '_y'
yesVote = raw.loc[:, yesVars].eq('y').add_suffix('_y')
# True for rows whose party is 'republican'
repub = raw.loc[:, ['party']].eq('republican')
# yes/no indicators followed by the one-hot columns
voteData = pd.concat([yesVote, oneHotCols], axis=1)
# multiplying by 1 turns the booleans into 0/1 integers for the arithmetic
voteMat = voteData.values * 1
repubVec = repub.values.ravel() * 1

In [94]:
# stratified folds over the party label
folds = getXVFolds(voteMat, repubVec, categorical=True)
# NOTE(review): the model is fitted on the first fold only and evaluated on
# all remaining folds, the reverse of the usual split; confirm this is intended
trainIdx, testIdx = folds[0], np.hstack(folds[1:])
trainData, testData = voteMat[trainIdx], voteMat[testIdx]
trainLabel, testLabel = repubVec[trainIdx], repubVec[testIdx]

In [95]:
# fit on the training rows with learning rate 0.1, printing progress
wts = fitLogisticReg(trainData, trainLabel, eta=0.1, trace=True)

Iter #0, error: 0.687436
Iter #1000, error: 0.073454
Iter #2000, error: 0.050947
Iter #3000, error: 0.039593
Iter #4000, error: 0.032469
Iter #5000, error: 0.027536
Iter #6000, error: 0.023910
Iter #7000, error: 0.021129
Iter #8000, error: 0.018928
Iter #9000, error: 0.017142
Iter #10000, error: 0.015665
Iter #11000, error: 0.014422
Iter #12000, error: 0.013362
Iter #13000, error: 0.012448
Iter #14000, error: 0.011650
Iter #15000, error: 0.010949
Iter #16000, error: 0.010328
Iter #17000, error: 0.009774
Iter #18000, error: 0.009276
Iter #19000, error: 0.008826
Iter #20000, error: 0.008419
Iter #21000, error: 0.008047
Iter #22000, error: 0.007707
Iter #23000, error: 0.007395
Iter #24000, error: 0.007107
Iter #25000, error: 0.006840
Iter #26000, error: 0.006593
Iter #27000, error: 0.006364
Iter #28000, error: 0.006149
Iter #29000, error: 0.005949
Iter #30000, error: 0.005762
Iter #31000, error: 0.005586
Iter #32000, error: 0.005420
Iter #33000, error: 0.005264
Iter #34000, error: 0.00511

In [105]:
# share of held-out rows whose probability thresholded at 0.5 equals the label
np.mean(
    (sigmoid(concateIntercept(testData), wts) > 0.5) * 1 == testLabel
)

0.9394812680115274