# Forward stagewise selection

As described in the [Least Angle Regression](http://statweb.stanford.edu/~tibs/ftp/lars.pdf) paper. This is a precursor to the LAR algorithm.

In [79]:
import csv
import numpy as np
norm = np.linalg.norm
inv = np.linalg.inv

In [80]:
def normalize(v): return v / norm(v, axis=0)

def dot(a, b):
  # Returns scalar
  return np.dot(a.T, b)[0, 0]

def angle(a, b):
  # https://stackoverflow.com/a/13849249
  return np.arccos(np.clip(dot(a/norm(a), b/norm(b)), -1.0, 1.0))

def single_variate_regression(x, y):
  # Assumes x and y are column vectors
  # Returns beta
  return dot(x, y) / dot(x, x)

def multi_variate_regression(x, y):
  return inv(x.T @ x) @ x.T @ y

def compute_residual(x, z_basis):
  gammas = [single_variate_regression(z, x) for z in z_basis]
  return x - sum([gamma * z for gamma, z in zip(gammas, z_basis)]), gammas

def prepend_1(x):
  # prepend 1 to every row in x
  return np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

def standardize(x):
  return normalize(x - x.mean(axis=0))

In [152]:
with open('../datasets/winequality/winequality-white.csv') as f:
  reader = csv.reader(f, delimiter=';')
  header = next(reader)
  body = np.asarray([[float(value) for value in row] for row in reader])
x = body[:, :-1]
y = body[:, -1:]
N = 1000
train = (x[:N], y[:N])
test = (x[N:], y[N:])

# Compute stagewise selection
epsilon = 1e-3  # step size
x = prepend_1(standardize(train[0]))  # Center and normalize predictors
y_mu = train[1].mean()
y = train[1] - y_mu  # Center response
full_beta_hat = multi_variate_regression(x, y).reshape(-1)
l1_full_beta_hat = np.abs(full_beta_hat).sum()
print('L1(full beta_hat) =', l1_full_beta_hat)
beta_hat = np.zeros([x.shape[1]])
turned_on = np.zeros([x.shape[1]])
iterations = 0
while np.abs(l1_full_beta_hat - np.abs(beta_hat).sum()) > 0.2:
  c = x.T @ (y - x @ beta_hat.reshape(-1, 1))  # Correlation is the dot product
  j = np.argmax(np.abs(c))  # Maximum correlation, minimum angle
  if turned_on[j] == 0:
    print('Turned on beta_{}'.format(j))
    turned_on[j] = 1
  beta_hat[j] += epsilon * np.sign(c[j])
  if iterations % 10000 == 0:
    # Eval on test set
    y_hat = y_mu + prepend_1(standardize(test[0])) @ beta_hat
    error = test[1].reshape(-1) - y_hat
    mse = np.sqrt(error**2).mean()
    print('L1(beta_hat) =', np.abs(beta_hat).sum())
    print('Test mean squared error =', mse)
  iterations += 1

L1(full beta_hat) = 72.84742721984625
Turned on beta_11
L1(beta_hat) = 0.001
Test mean squared error = 0.6642038626070994
Turned on beta_2
Turned on beta_9
L1(beta_hat) = 10.001000000000655
Test mean squared error = 0.6197896138130714
Turned on beta_10
Turned on beta_6
Turned on beta_5
L1(beta_hat) = 20.000999999999564
Test mean squared error = 0.6062615677045224
Turned on beta_8
Turned on beta_1
Turned on beta_7
Turned on beta_4
L1(beta_hat) = 30.00099999999891
Test mean squared error = 0.6003658653026395
L1(beta_hat) = 37.50099999999944
Test mean squared error = 0.5977200791579266
L1(beta_hat) = 43.87900000000088
Test mean squared error = 0.5965069368359485
Turned on beta_3
L1(beta_hat) = 50.1729999999986
Test mean squared error = 0.5955025219155503
L1(beta_hat) = 56.410999999996235
Test mean squared error = 0.5946842408028592
L1(beta_hat) = 62.555000000000085
Test mean squared error = 0.5941169350109907
L1(beta_hat) = 68.58300000000433
Test mean squared error = 0.5938192191304464


In [153]:
np.round(beta_hat - full_beta_hat, 2)

array([ 0.  , -0.02, -0.  , -0.  , -0.05, -0.  ,  0.  , -0.01,  0.08,
       -0.01, -0.  ,  0.04])

In [154]:
y_hat = y_mu + prepend_1(standardize(test[0])) @ full_beta_hat
error = test[1].reshape(-1) - y_hat
mse = np.sqrt(error**2).mean()
print('Full regression MSE =', mse)

Full regression MSE = 0.5937635701821794
