In [1]:
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# load and preprocess the data

spambase_path = "./data/spambase/spambase.data"
X = np.genfromtxt(spambase_path, delimiter=',')

Y = X[:, -1]
X = X[:, :-1]

## modify 10% data labels
tochange = np.random.choice(Y.shape[0], int(0.1*Y.shape[0]))
Y[tochange] = 1 - Y[tochange]

Y[Y < 0.01] = -1
X = (X - X.mean(axis=0)) / X.std(axis=0)
X = np.hstack((X, np.ones((X.shape[0],1))))


[1850 4329 2984 3406  950 2120 1757 4170 4233 2716 1182 1450 1196 3568
 4448  338  742 3271  417 1335 2422 1183 4527  433 4245 2950 2321 2250
 3448 3060 2913 1476  164   12  323 3731 4150 3173 2803  584  130 1807
 1444  149 1918 2666 2594 1574 3155 2013 3563 3796 4448 1264 2374  179
 3525  551 3731 1018  262 3284 2903  813  404 1252 2314 2348 3763  680
 2481    7 4527 1014 2925  161  679 4160 2675 2350 3251 3838 3738 3982
 3265  304 2705 1359 2232 3460 4104 4006 3432  136 3941 3416 3809  531
 2063 1467 2253 1596 1236 4092 3529  829 1277 3322 1664 1047 2581 4246
  338 1650  843 3498  778 3704   68  337 3263 4570 2115   80 1905 4401
 1899  435  928 3847 3116 4367 3623 1616 2213 1640 4171 2164 3813 3815
 3906 1473 3637 1163  426  635   31 3653  488 1162  164  258  254  254
  289 1129  833 1099 3398 3618 4304 3326 2572 3076 3600  707  650 1673
 1355 3035 2080 3607 1409 3362 4395 3374  256  512 1245 4031 4477 3587
 3296 3353 3172 2596 3639 3929  860 4464  923 2194 2105 2836  915 3529
 1804 

In [None]:
def perceptron(X, Y, T, eta=0.1):
    correct = np.zeros(T,)

    N, d = X.shape
    W = np.zeros(d,)

    for t in range(T):
        it = np.random.randint(N)
        x = X[it]

        y_bar = -1 if W.dot(x) < 0 else 1
        y = Y[it]

        if y*y_bar < 0:
            W = W + eta * y * x
            correct[t] = 0
        else:
            correct[t] = 1
            
    return correct


In [None]:
def winnow(X, Y, T, eta=0.1):
    correct = np.zeros(T,)
    X = np.hstack((X, -X))
    
    N, d = X.shape
    W = np.ones(d,) / d
    
    for t in range(T):
        it = np.random.randint(N)
        x = X[it]
        
        y_bar = -1 if W.dot(x) < 0 else 1
        y = Y[it]
        
        if y*y_bar < 0:
            W = W * np.exp(eta * y * x)
            W = W / W.sum()
            correct[t] = 0
        else:
            correct[t] = 1
            
    return correct
    

In [None]:
def check_accuracy(X, Y, W):
    cor = sum([1 for x, y in zip(X, Y) if y*W.dot(x) > 0])
    return 1.0 * cor / X.shape[0]

In [None]:
T = 10000
runs = 100
correct_perc1 = np.zeros((runs, T))

for i in range(runs):
    np.random.seed(i*57)
    cor = perceptron(X, Y, T, eta=0.1)
    correct_perc1[i, :] = cor


In [None]:
T = 10000
runs = 100
correct_winn1 = np.zeros((runs, T))

for i in range(runs):
    np.random.seed(i*57)
    cor = winnow(X, Y, T, eta=0.1)
    correct_winn1[i, :] = cor


In [None]:
T = 10000
runs = 100
correct_perc2 = np.zeros((runs, T))

for i in range(runs):
    np.random.seed(i*57)
    cor = perceptron(X, Y, T, eta=1.0)
    correct_perc2[i, :] = cor


In [None]:
T = 10000
runs = 100
correct_winn2 = np.zeros((runs, T))

for i in range(runs):
    np.random.seed(i*57)
    cor = winnow(X, Y, T, eta=1.0)
    correct_winn2[i, :] = cor


In [None]:
%matplotlib notebook

xlab = np.arange(T)
g = 100

mu = correct_perc1.mean(axis=0)
sig = correct_perc1.std(axis=0)**2
plt.errorbar(xlab[::g], mu[::g], yerr=sig[::g], fmt='o', label="perceptron(eta=0.1)")


plt.legend(loc=4)
plt.ylim(0, 1)
plt.ylabel("correct predictions")
plt.xlabel("time steps")
plt.title("Perceptron on Spambase database")
plt.show()

