In [4]:
import csv
import numpy as np
norm = np.linalg.norm
inv = np.linalg.inv

In [5]:
def dot(a, b):
  # Returns scalar
  return np.dot(a.T, b)[0, 0]

def angle(a, b):
  # https://stackoverflow.com/a/13849249
  return np.arccos(np.clip(dot(a/norm(a), b/norm(b)), -1.0, 1.0))

def single_variate_regression(x, y):
  # Assumes x and y are column vectors
  # Returns beta
  return dot(x, y) / dot(x, x)

def multi_variate_regression(x, y):
  return inv(x.T @ x) @ x.T @ y

def compute_residual(x, z_basis):
  gammas = [single_variate_regression(z, x) for z in z_basis]
  return x - sum([gamma * z for gamma, z in zip(gammas, z_basis)]), gammas

def prepend_1(x):
  # prepend 1 to every row in x
  return np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

In [39]:
with open('../datasets/winequality/winequality-white.csv') as f:
  reader = csv.reader(f, delimiter=';')
  header = next(reader)
  body = np.asarray([[float(value) for value in row] for row in reader])
x = body[:, :-1]
y = body[:, -1:]
N = 1000  # x.shape[0] // 2
train = (x[:N], y[:N])
test = (x[N:], y[N:])

x = prepend_1(train[0])
y = train[1]
r = y.copy()  # Residual. The unexplained portion of y.
beta_hat = np.zeros([x.shape[1]])  # Initialize beta as all 0s.
indices = list(range(x.shape[1]))  # Set of feature columns not yet used.
active_set = []  # Subset of feature columns being used for regression.
while indices:
  # Select feature column that has smallest angle to residual.
  angles = [angle(x[:, i:i+1], r) for i in indices]
  best_i = np.argmin(angles)
  print('Angle:',angles[best_i] / (2*np.pi))
  active_set.append(indices[best_i])  # Add new feature column to our active set.
  del indices[best_i]
  beta_res = multi_variate_regression(x[:, active_set], r)
  beta_hat[active_set] += beta_res.reshape(-1)
  r_proj = x[:, active_set] @ beta_res
  r -= r_proj  # Subtract off residual "explained" by additional feature column.
  
  # Sanity checks.
  beta_check = multi_variate_regression(x[:, active_set], y)
  assert np.allclose(beta_check.reshape(-1), beta_hat[active_set])
  assert np.allclose(y - x[:, active_set] @ beta_check, r)
  
  # Report MSE on test set.
  print('Subset:', active_set)
  test_x = prepend_1(test[0])[:, active_set]
  test_y = test[1]
  y_hat = test_x @ multi_variate_regression(test_x, test_y)
  print('Train MSE:', np.mean(np.square(r)))
  print('Test MSE:', np.mean(np.square(y_hat - test_y)))
  print('')

assert np.allclose(full_beta_hat.reshape(-1), beta_hat)

Angle: 0.02362164405772462
Subset: [11]
Train MSE: 0.7711826249396441
Test MSE: 0.7010798750305166

Angle: 0.23164355194545877
Subset: [11, 6]
Train MSE: 0.7239716929499962
Test MSE: 0.6489762338398364

Angle: 0.24235720206754238
Subset: [11, 6, 10]
Train MSE: 0.6938178230603371
Test MSE: 0.6361201185890205

Angle: 0.24738613512783106
Subset: [11, 6, 10, 4]
Train MSE: 0.6932382235685295
Test MSE: 0.6025399640927911

Angle: 0.2480512207819947
Subset: [11, 6, 10, 4, 9]
Train MSE: 0.6765728598565822
Test MSE: 0.5880198830548996

Angle: 0.24820358474821022
Subset: [11, 6, 10, 4, 9, 3]
Train MSE: 0.6758749193956046
Test MSE: 0.5875657748356725

Angle: 0.24996877061924885
Subset: [11, 6, 10, 4, 9, 3, 1]
Train MSE: 0.6758731194951852
Test MSE: 0.5870945925576196

Angle: 0.2504656625844256
Subset: [11, 6, 10, 4, 9, 3, 1, 0]
Train MSE: 0.6695618572048802
Test MSE: 0.5823512067786656

Angle: 0.25001312828343064
Subset: [11, 6, 10, 4, 9, 3, 1, 0, 8]
Train MSE: 0.6462007903242177
Test MSE: 0.56910