In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataDir = './data/'
glassData = 'glass.txt'
glassNames = ['id','RI','Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'type']
vote84Data = os.path.join(dataDir, 'house-votes-84.data')
vote84Names = ['class', 'infant', 'water', 'budget', 'doctorfee','salvador',
              'religion', 'satellite', 'contras', 'missile', 'immigration',
              'synfuels', 'education', 'superfund', 'crime', 'exports',
              'ZAF']

In [3]:
################################################################################
raw = pd.read_csv(vote84Data , names=vote84Names ) # read in vote file
# find the counts of unique values by all the variables 
z = raw.groupby('class').agg({i:'value_counts' for i in raw.columns[1:]})

In [4]:
z.iloc[:,:8]

Unnamed: 0,Unnamed: 1,infant,water,budget,doctorfee,salvador,religion,satellite,contras
democrat,?,9,28,7,8,12,9,8,4
democrat,n,102,119,29,245,200,135,59,45
democrat,y,156,120,231,14,55,123,200,218
republican,?,3,20,4,3,3,2,6,11
republican,n,134,73,142,2,8,17,123,133
republican,y,31,75,22,163,157,149,39,24


In [5]:
z.iloc[:,9:]

Unnamed: 0,Unnamed: 1,immigration,synfuels,education,superfund,crime,exports,ZAF
democrat,?,4,12,18,15,10,16,82
democrat,n,139,126,213,179,167,91,12
democrat,y,124,129,36,73,90,160,173
republican,?,3,9,13,10,7,12,22
republican,n,73,138,20,22,3,142,50
republican,y,92,21,135,136,158,14,96


In [6]:
# generate one-hot coding for issues with lots of missing votes

def oneHot(data, colName):
    x = data.loc[:,colName]
    oneHotMat = pd.concat([(x=='y'),(x=='n'),(x=='q')], axis=1)
    oneHotMat.columns = [colName+'_'+suff for suff in ['y','n','q']]
    return oneHotMat

oneHotCols = pd.concat([oneHot(raw,'water'), oneHot(raw,'education'), 
                        oneHot(raw,'ZAF')], axis=1)

In [7]:
# remove variables with completed one-hot coding from list of variables
yesVars = np.setdiff1d(vote84Names[1:],['water','education','ZAF'])
yesVote = raw.loc[:,yesVars] == 'y' # boolean for vote='yes' for rest of vars
yesVote.columns = [s+'_y' for s in yesVote.columns]
repub = raw.loc[:,['class']] == 'republican' # boolean for republicans
voteData = pd.concat([yesVote,oneHotCols], axis=1) # concat two dataframes
voteMat = voteData.values * 1 # give matrixs of 0 & 1 for calculation
repubVec = repub.values.ravel() * 1 # vector of 0 & 1 for calculation

In [112]:
def WinnowTrain(data, classVec, prm, trace=False):
    wts = np.ones(data.shape[1]) # initialize weight vector
    if trace: # print initial weights if trace is on
        print('initial weights: %s'%wts) 
    for n,x in enumerate(data):
        pred = wts.dot(x) > prm['theta'] # prediction is if f > theta
        if pred != classVec[n]: # wrong prediction: promotion / demotion
            mult = (pred==0)*prm['alpha'] + (pred==1)/prm['alpha'] 
            wts = (x==1)*wts*mult + (x==0)*wts # update weights
            if trace: # print updated weights if its 
                print('[%d] new weights: %s'%(n,wts)) 
        else: # correct prediction, no update needed, only for tracing
            if trace: # print that no update to weights
                print('[%d] no update to weights'%n)
    if trace: # print final weights for trace
        print('Final weights: %s'%wts)
    return wts

def WinnowPred(data, wts, prm):
    return (data.dot(wts) > prm['theta'])*1 # prediction: if f > theta

def errRates(pred, actual):
    return np.sum(actual!=pred)/pred.size # return error rate

In [204]:
def condProb(data, add): # data assumed to be class-homogenous
    condPr = np.zeros(data.shape[1]) # pre-allocate cond probilities
    for n,x in enumerate(data.T): # loop over the columns of the data
        condPr[n] = (sum(x==0)+add)/(len(x)+add) # laplace smooth if needed
    return condPr

################################################################################
def NB_Train(data, classVec, smooth=True):
    smoothAdd = smooth*1 # addition to num and denom for smoothing
    
    pr_C0 = sum(classVec==0)/len(classVec) # probability of class=0
    condPr_C0 = condProb(data[classVec==0,], smoothAdd) # cond prob class=0
    condPr_C1 = condProb(data[classVec==1,], smoothAdd) # cond prob class=1
    return (pr_C0,condPr_C0,condPr_C1) # return class prob and cond probs

def NB_pred(data, probs): # predicting based on conditional probs
    pr_C0,condPr0_C0,condPr0_C1 = probs
    xCondsC0 = (data==0)*condPr0_C0 + (data==1)*(1-condPr0_C0)
    xCondsC1 = (data==0)*condPr0_C1 + (data==1)*(1-condPr0_C1)
    PrX_C0 = np.cumprod(xCondsC0,1)[:,-1] * pr_C0
    PrX_C1 = np.cumprod(xCondsC1,1)[:,-1] * (1-pr_C0)
    return (PrX_C1>PrX_C0)*1

In [548]:
def crossValidate(dataMat, classVec, hyPrm, trace=False):
    slices = 10
    idx = np.arange(dataMat.shape[0]) # construct index of data rows
    np.random.shuffle(idx) # random shuffle data
    chunks = np.array_split(idx, slices) # split into N equal sized chunks

    errsWinnow = np.zeros(slices)  # pre-allocate Winnow errors for each fold
    errsNB = np.zeros(slices) # pre-allocate NB errors for each fold
    for n in range(slices): # loop over all slices
        # get index and dataset for current fold of cross-validation
        trnIdx = np.hstack([chunks[x] for x in range(slices) if x != n])
        vldIdx = np.hstack([chunks[x] for x in range(slices) if x == n])
        dataTrain,classTrain = dataMat[trnIdx,:],classVec[trnIdx] # training
        dataVald,classVald = dataMat[vldIdx,:],classVec[vldIdx] # validation

        # train and test Winnow algorithm
        wts = WinnowTrain(dataTrain, classTrain, hyPrm, False)
        pred = WinnowPred(dataVald, wts, hyPrm)
        errsWinnow[n] = errRates(pred, classVald)
        
        # train and test Naive Bayes
        probs = NB_Train(dataTrain, classTrain, smooth=True)
        pred = NB_pred(dataVald, probs)
        errsNB[n] = errRates(pred, classVald)

    print('Average error rate for Winnow is %f.'%np.mean(errsWinnow))
    print('Std Dev of error rate for Winnow is %f.'%np.std(errsWinnow))
    print('Average error rate for NB is %f.'%np.mean(errsNB))
    print('Std Dev of error rate for NB is %f.'%np.std(errsNB))
    return pred
################################################################################

def estModels(dataMat, classVec, hyPrm, trace=False):
    wts = WinnowTrain(dataMat, classVec, hyPrm, False)
    probs = NB_Train(dataMat, classVec, smooth=True)
    return wts,probs

In [554]:
votePrm = {'theta': 0.5*6, 'alpha': 2} # hyper-parameters for vote data
z = crossValidate(voteMat, repubVec, votePrm) # cross validation test
estModels(voteMat,repubVec,votePrm)

Average error rate for Winnow is 0.073837.
Std Dev of error rate for Winnow is 0.051851.
Average error rate for NB is 0.103383.
Std Dev of error rate for NB is 0.029404.


(array([2.44140625e-04, 1.25000000e-01, 2.50000000e-01, 4.00000000e+00,
        6.25000000e-02, 2.50000000e-01, 3.12500000e-02, 1.25000000e-01,
        3.12500000e-02, 5.00000000e-01, 2.50000000e-01, 2.50000000e-01,
        1.22070312e-04, 3.12500000e-02, 2.50000000e-01, 1.00000000e+00,
        2.50000000e-01, 6.25000000e-02, 1.00000000e+00, 5.00000000e-01,
        1.25000000e-01, 1.00000000e+00]),
 (0.6137931034482759,
  array([0.1380597 , 0.18656716, 0.6641791 , 0.94776119, 0.40298507,
         0.53731343, 0.41791045, 0.29850746, 0.54104478, 0.79477612,
         0.25373134, 0.72761194, 0.51865672, 0.55223881, 0.55597015,
         1.        , 0.86567164, 0.20522388, 1.        , 0.35447761,
         0.95522388, 1.        ]),
  array([0.86982249, 0.85798817, 0.06508876, 0.03550296, 0.91715976,
         0.4556213 , 0.81656805, 0.88757396, 0.1183432 , 0.07100592,
         0.76923077, 0.19526627, 0.87573964, 0.55621302, 0.56804734,
         1.        , 0.20118343, 0.8816568 , 1.        , 0

In [314]:
bc_WI_data = os.path.join(dataDir, 'breast-cancer-wisconsin.data')
bc_WI_names = ['id', 'clumpThick', 'unifSize', 'unifShape', 'margAdhsn', 
               'epithSize', 'bareNuclei', 'blandChrom', 'normNucleo', 
               'mitoses', 'class']
raw = pd.read_csv(bc_WI_data , names=bc_WI_names)  # read CSV file
raw = raw.apply(pd.to_numeric, errors= 'coerce') # convert all to numeric
bcFeats = bc_WI_names[1:-1] # list of feature variables

In [332]:
meanVals = raw[bcFeats].mean() # mean value for every feature
bcData = pd.DataFrame() # pre-allocate data frame for data
for v in bcFeats: # create DF of features if they are > mean feat value
    bcData[v] = raw[v] > meanVals[v]
bcMat = bcData.values * 1 # feature vector of 0 & 1s
malign = raw.loc[:,["class"]]==4 # if case is malignant tumor
malignVec = malign.values.ravel()*1 # class vectors of 0 & 1s

In [555]:
brCanPrm = {'theta': 0.5*3, 'alpha': 2} # hyper-parameters for vote data
z = crossValidate(bcMat, malignVec, brCanPrm)

estModels(bcMat,malignVec,brCanPrm)

Average error rate for Winnow is 0.095776.
Std Dev of error rate for Winnow is 0.043748.
Average error rate for NB is 0.031511.
Std Dev of error rate for NB is 0.017964.


(array([0.25   , 0.5    , 1.     , 0.125  , 0.03125, 0.5    , 0.5    ,
        0.0625 , 0.5    ]),
 (0.6552217453505007,
  array([0.76906318, 0.96949891, 0.95206972, 0.89978214, 0.95642702,
         0.95206972, 0.95642702, 0.94335512, 0.97167756]),
  array([0.1322314 , 0.15702479, 0.13636364, 0.2231405 , 0.28099174,
         0.16942149, 0.19008264, 0.19834711, 0.55785124])))

In [316]:
raw.groupby('class')[bcFeats].median()

Unnamed: 0_level_0,clumpThick,unifSize,unifShape,margAdhsn,epithSize,bareNuclei,blandChrom,normNucleo,mitoses
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,3,1,1,1,2,1.0,2,1,1
4,8,6,6,5,5,10.0,7,6,1


In [317]:
raw.groupby('class')[bcFeats].mean()

Unnamed: 0_level_0,clumpThick,unifSize,unifShape,margAdhsn,epithSize,bareNuclei,blandChrom,normNucleo,mitoses
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,2.956332,1.325328,1.443231,1.364629,2.120087,1.346847,2.100437,1.290393,1.063319
4,7.195021,6.572614,6.560166,5.547718,5.298755,7.627615,5.979253,5.863071,2.589212


In [427]:
################################################################################
irisFile = os.path.join(dataDir, 'iris.data')
irisName = ['sepalLen', 'sepalWth', 'petalLen', 'petalWth', 'class']
raw = pd.read_csv(irisFile , names=irisName)  # read CSV file

In [432]:
irisFeats = irisName[:-1]
meanVals = raw[irisFeat].mean() # mean value for every feature
irisData = pd.DataFrame() # pre-allocate data frame for data
for v in irisFeats: # create DF of features if they are > mean feat value
    irisData[v] = raw[v] > meanVals[v]
irisMat = irisData.values * 1 # feature vector of 0 & 1s
setosa = raw.loc[:,["class"]]=='Iris-setosa' # Iris-setosa class
setosaVec = setosa.values.ravel()*1 # class vectors of 0 & 1s

In [556]:
irisPrm = {'theta': 0.5*1, 'alpha': 2} # hyper-parameters for vote data
z = crossValidate(irisMat, setosaVec, irisPrm)
estModels(irisMat,setosaVec,irisPrm)

Average error rate for Winnow is 0.240000.
Std Dev of error rate for Winnow is 0.095219.
Average error rate for NB is 0.040000.
Std Dev of error rate for NB is 0.044222.


(array([0.125, 0.125, 0.125, 0.125]),
 (0.6666666666666666,
  array([0.30693069, 0.75247525, 0.07920792, 0.10891089]),
  array([1.        , 0.17647059, 1.        , 1.        ])))

In [387]:
raw.groupby('class').mean()

Unnamed: 0_level_0,sepalLen,sepalWth,petalLen,petalWth
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026


In [None]:
raw = raw.apply(pd.to_numeric, errors= 'coerce') # convert all to numeric
bcFeats = bc_WI_names[1:-1] # list of feature variables

In [None]:
voteModel = pd.DataFrame(Winnow(voteMat, repubVec, modlPrm, False))
voteModel.columns = ['weight']
voteModel.index = voteData.columns

In [None]:
testX = np.array([[0,0,0],[0,0,1],[0,1,0],[0,1,1],[1,0,0],[1,0,1],[1,1,0],[1,1,1]])
testY = [0,0,0,1,0,1,0,1]
prm = {'theta':0.5, 'alpha': 2}
Winnow(testX, testY, prm, True)

In [81]:
testX = np.array([[1,1,1,0,0], [0,1,1,1,0], [0,0,1,1,1], [1,0,0,1,0], 
                  [0,1,1,0,1], [0,1,0,1,0], [1,0,0,0,0], [0,1,0,0,0], 
                  [1,0,1,0,1], [0,1,0,1,0]])
testY = np.array([0,0,0,0,0,1,1,1,1,1])
z = NB_Train(testX, testY, False)

In [379]:
################################################################################
raw = pd.read_csv( os.path.join(dataDir, glassData), names=glassNames )

# adding some binary classes
raw['window'] = raw['type'] < 5 # type 1-4 are windows
raw['buildWin'] = raw['type'] < 3 # type 1 & 2 are building windows
raw['floatProc'] = raw['type'].isin([1,3]) # 1 & 3 are float processed glass
feature = ['RI','Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

In [383]:
raw['window'].value_counts()

True     163
False     51
Name: window, dtype: int64

In [367]:
raw.groupby(['window'])[feature].mean()

Unnamed: 0_level_0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,1.517638,14.066667,0.733725,1.966667,72.85549,0.559608,9.060196,0.639216,0.023137
True,1.518593,13.201718,3.294908,1.281656,72.586933,0.477485,8.924663,0.029816,0.067607


In [374]:
raw.shape

(214, 14)

In [368]:
raw.mean()

id           107.500000
RI             1.518365
Na            13.407850
Mg             2.684533
Al             1.444907
Si            72.650935
K              0.497056
Ca             8.956963
Ba             0.175047
Fe             0.057009
type           2.780374
window         0.761682
buildWin       0.682243
floatProc      0.406542
dtype: float64