In [1]:
import numpy as np, pandas as pd
import os
from crossValidate import getXVFolds

In [2]:
def concateIntercept(x):
    """Return `x` with a leading column of ones (the intercept term).

    The ones column has one entry per row of `x` and is created with
    `x.dtype`, so it matches the dtype of the data it is stacked with.
    """
    nRows = x.shape[0]
    bias = np.ones((nRows, 1), x.dtype)
    return np.hstack([bias, x])

################################################################################
def sigmoid(x, wt):
    """Logistic function of the linear score `x @ wt`.

    Parameters
    ----------
    x : feature array whose last axis lines up with `wt`.
    wt : weight vector.

    Returns
    -------
    1 / (1 + exp(-(x @ wt))), evaluated element-wise on the score.
    """
    # Negate the product, not the features: `-x @ wt` parses as `(-x) @ wt`,
    # which copies the whole feature matrix, wraps around for unsigned
    # integer features and raises TypeError for boolean features.
    z = x @ wt
    return 1.0 / (1 + np.exp(-z))

def crossEntropy(x, wt, y):
    """Mean binary cross-entropy of the logistic model on `(x, y)`.

    Parameters
    ----------
    x : feature array, passed straight to `sigmoid`.
    wt : weight vector, passed straight to `sigmoid`.
    y : labels, one per prediction (the formula assumes values of 0 or 1).

    Returns
    -------
    (err, yhat) : the mean cross-entropy and the unclipped predictions
    from `sigmoid`.
    """
    yhat = sigmoid(x, wt)
    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
    # 0 * -inf = NaN, turning the whole error into NaN/inf.  Clip only the
    # values fed to the logs; the returned predictions stay untouched.
    tiny = np.finfo(float).eps
    p = np.clip(yhat, tiny, 1 - tiny)
    err = -np.mean(y*np.log(p) + (1-y)*np.log(1-p))
    return err, yhat
    
def updateWeight(eta, yhat, y, x, wt):
    """Take one gradient step and return the updated weight vector.

    The step direction is the residual `y - yhat` multiplied into `x` and
    divided by `len(y)`; it is scaled by the learning rate `eta` and added
    to `wt`.  A new array is returned rather than updating `wt` in place.
    """
    residual = y - yhat
    direction = (residual @ x) / len(y)
    step = eta * direction
    return wt + step

def fitLogisticReg(x, y, eta, eps=1e-7, trace=False, maxIter=1e6, seed=None):
    """Fit logistic-regression weights by repeated full-batch gradient steps.

    Parameters
    ----------
    x : feature matrix without an intercept column (one is prepended here).
    y : labels, one per row of `x`.
    eta : initial learning rate; divided by 10 every time an update makes
        the error larger than it was before the update.
    eps : stop once the error changes by no more than this between two
        consecutive iterations.
    trace : when True, print the error every 1000 iterations.
    maxIter : upper bound on the number of updates.  Defaults to 1e6, the
        limit that used to be hard-coded.
    seed : optional seed for the weight initialisation.  With the default
        None the weights are drawn from NumPy's global random state, exactly
        as before; pass an integer to make the fit reproducible.

    Returns
    -------
    The fitted weight vector, intercept weight first.
    """
    x = concateIntercept(x)
    # initialize weights uniformly in [-0.01, 0.01)
    draw = np.random.rand if seed is None else np.random.RandomState(seed).rand
    wt = draw(x.shape[1])/50 - 0.01
    lastErr = 0
    err,yhat = crossEntropy(x, wt, y)

    n = 0
    # Note: a NaN error also ends the loop, because the comparison with eps
    # is then False.
    while (abs(err-lastErr) > eps) and n < maxIter:
        if n % 1000 == 0 and trace:
            print('Iter #%u, error: %f'%(n,err))
        wt = updateWeight(eta, yhat, y, x, wt)
        lastErr = err
        err,yhat = crossEntropy(x, wt, y)
        if err > lastErr:
            # the step overshot: continue with a smaller learning rate
            eta /= 10
        n += 1

    print('Final iteration #%u, error: %f' % (n-1,err) )
    return wt

In [19]:
def discretizeMean(inDF, useMed=False):
    """Binarise every column of `inDF` against that column's own centre.

    A cell becomes 1 when it is strictly greater than its column's mean
    (or median when `useMed` is True) and 0 otherwise.  The result is a new
    DataFrame with one column per input column; `inDF` is not modified.
    """
    cutoffs = inDF.median() if useMed else inDF.mean()
    return pd.DataFrame(
        {col: (inDF[col] > cutoffs[col]) * 1 for col in list(inDF)}
    )

# generate one-hot coding for issues with lots of missing votes
def oneHot(data, colName):
    """One-hot encode the 'y' / 'n' / '?' values of one column.

    Returns a boolean DataFrame with three columns named `<colName>_y`,
    `<colName>_n` and `<colName>_q` (the last one flags '?').
    """
    votes = data.loc[:, colName]
    indicators = [votes == code for code in ('y', 'n', '?')]
    encoded = pd.concat(indicators, axis=1)
    encoded.columns = [colName + '_' + tag for tag in ('y', 'n', 'q')]
    return encoded


In [47]:
# Read the small soybean file; columns are named explicitly as c01..c35
# followed by the class label.
soyData = os.path.join('./data', 'soybean-small.data')
soyNames = ['c%02d' % num for num in range(1, 36)] + ['class']
raw = pd.read_csv(soyData, names=soyNames)

# drop the columns that hold exactly one distinct value
feats = np.array(soyNames)[raw.nunique().ne(1)]

In [48]:
# Display the soybean columns selected in `feats`.
raw[feats]

Unnamed: 0,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,...,c21,c22,c23,c24,c25,c26,c27,c28,c35,class
0,4,0,2,1,1,1,0,1,0,2,...,3,1,1,1,0,0,0,0,0,D1
1,5,0,2,1,0,3,1,1,1,2,...,3,0,1,1,0,0,0,0,0,D1
2,3,0,2,1,0,2,0,2,1,1,...,3,0,1,1,0,0,0,0,0,D1
3,6,0,2,1,0,1,1,1,0,0,...,3,1,1,1,0,0,0,0,0,D1
4,4,0,2,1,0,3,0,2,0,2,...,3,1,1,1,0,0,0,0,0,D1
5,5,0,2,1,0,2,0,1,1,0,...,3,1,1,1,0,0,0,0,0,D1
6,3,0,2,1,0,2,1,1,0,1,...,3,0,1,1,0,0,0,0,0,D1
7,3,0,2,1,0,1,0,2,1,2,...,3,0,1,1,0,0,0,0,0,D1
8,6,0,2,1,0,3,0,1,1,1,...,3,1,1,1,0,0,0,0,0,D1
9,6,0,2,1,0,1,0,1,0,2,...,3,1,1,1,0,0,0,0,0,D1


In [None]:
# Read the house-votes-84 file: a party column followed by 16 issue columns.
vote84Data = os.path.join('./data', 'house-votes-84.data')
vote84Names = [
    'party', 'infant', 'water', 'budget', 'doctorfee', 'salvador', 'religion',
    'satellite', 'contras', 'missile', 'immigration', 'synfuels', 'education',
    'superfund', 'crime', 'exports', 'ZAF',
]
raw = pd.read_csv(vote84Data, names=vote84Names)

# Three issues get the full y / n / ? one-hot encoding.
oneHotCols = pd.concat(
    [oneHot(raw, issue) for issue in ('water', 'education', 'ZAF')],
    axis=1,
)

# Every remaining issue becomes a single "voted yes" indicator, so anything
# other than 'y' maps to False.  np.setdiff1d returns the remaining names
# sorted and without duplicates.
yesVars = np.setdiff1d(vote84Names[1:], ['water', 'education', 'ZAF'])
yesVote = raw.loc[:, yesVars].eq('y')
yesVote.columns = [issue + '_y' for issue in yesVote.columns]

# Target: True for rows whose party is 'republican'.
repub = raw.loc[:, ['party']].eq('republican')

# Feature matrix and label vector as 0/1 integers for the calculations.
voteData = pd.concat([yesVote, oneHotCols], axis=1)
voteMat = voteData.values * 1
repubVec = repub.values.ravel() * 1

In [None]:
# Split the vote data with the project's cross-validation helper.
# Assumes getXVFolds returns a sequence of row-index arrays, one per fold
# -- TODO confirm against crossValidate.
folds = getXVFolds(voteMat, repubVec, categorical=True)

# NOTE(review): the model is trained on the first fold only and evaluated on
# all remaining folds, which is the reverse of the usual split -- confirm
# this is intended.
trainIdx, testIdx = folds[0], np.hstack(folds[1:])

trainData = voteMat[trainIdx]
trainLabel = repubVec[trainIdx]
testData = voteMat[testIdx]
testLabel = repubVec[testIdx]

In [None]:
# Fit on the training split with learning rate 0.1, printing progress.
wts = fitLogisticReg(trainData, trainLabel, eta=0.1, trace=True)

In [None]:
# Test accuracy: fraction of test rows whose prediction, thresholded at 0.5,
# equals the label.
(
    (sigmoid(concateIntercept(testData), wts) > 0.5) * 1 == testLabel
).mean()