# Principal Components Regression

Similar to forward stepwise regression, but uses principal components (obtained from an SVD of the design matrix) instead of a QR decomposition.

In [2]:
import csv
import numpy as np
norm = np.linalg.norm

In [3]:
def dot(a, b):
  """Inner product of `a` and `b`, reduced over the second-to-last axis.

  The operands are multiplied element-wise and the products are summed along
  axis -2, so two (n, 1) column vectors yield a length-1 array.
  """
  elementwise = a * b
  return elementwise.sum(axis=-2)

def angle(a, b):
  """Angle (as returned by np.arccos) between the vectors `a` and `b`.

  Each operand is divided by its norm before the inner product is taken, and
  the resulting cosine is clamped to [-1, 1] so that rounding error cannot
  push it outside the domain of arccos.
  """
  # Clamping approach taken from https://stackoverflow.com/a/13849249
  unit_a = a / norm(a)
  unit_b = b / norm(b)
  cosine = dot(unit_a, unit_b)
  return np.arccos(np.clip(cosine, -1.0, 1.0))

def single_variate_regression(x, y):
  """Coefficient of the no-intercept regression of `y` on `x`.

  Assumes `x` and `y` are column vectors.
  Returns beta = <x, y> / <x, x>.
  """
  numerator = dot(x, y)
  denominator = dot(x, x)
  return numerator / denominator

def compute_residual(x, z_basis):
  """Remove from `x` its regression onto each vector in `z_basis`.

  Every basis vector z gets its own coefficient from
  single_variate_regression(z, x); the scaled basis vectors are then
  subtracted from `x`.

  Returns a tuple (residual, gammas), where gammas is the list of
  coefficients in the same order as `z_basis`.
  """
  # First pass: one coefficient per basis vector.
  gammas = []
  for z in z_basis:
    gammas.append(single_variate_regression(z, x))

  # Second pass: accumulate the fitted part, starting from 0 like sum().
  fitted = 0
  for gamma, z in zip(gammas, z_basis):
    fitted = fitted + gamma * z

  return x - fitted, gammas

def prepend_1(x):
  """Return a copy of the 2-D array `x` with a leading column of ones.

  Every row of the result starts with 1.0, followed by the original row.
  """
  n_rows = x.shape[0]
  ones_column = np.ones((n_rows, 1))
  return np.concatenate([ones_column, x], axis=1)

In [4]:
# Load the white-wine quality data: a ';'-separated CSV whose first row holds
# the column names and whose remaining rows are parsed as floats.
# newline='' is what the csv module asks for, so that it handles line endings
# (including ones embedded in quoted fields) itself.
with open('../datasets/winequality/winequality-white.csv', newline='') as f:
  reader = csv.reader(f, delimiter=';')
  header = next(reader)
  # csv.reader yields an empty list for a blank line; skipping those keeps
  # every parsed row the same length so the result is a proper 2-D array.
  body = np.asarray([[float(value) for value in row] for row in reader if row])
x = body[:, :-1]  # every column except the last: the regression inputs
y = body[:, -1:]  # last column, kept 2-D with shape (n, 1): the target
N = 1000  # number of leading rows used for training
train = (x[:N], y[:N])
test = (x[N:], y[N:])  # all remaining rows

def subset_regression(k):
  """Fit a principal components regression on the training split.

  The training inputs get a leading column of ones, the resulting design
  matrix is factored with a thin SVD, and `y` is projected onto the `k` left
  singular vectors that have the largest singular values.

  Args:
    k: number of principal components to keep; must be at least 1. A value
      larger than the number of available components uses all of them.

  Returns:
    The mean squared error of the fit on the training data. The same value
    is also printed, prefixed with 'train mse'.

  Raises:
    AssertionError: if k < 1.
  """
  assert k >= 1, 'k must be at least 1'

  x, y = train
  x = prepend_1(x)

  # NOTE(review): the columns are neither centered nor scaled before the SVD,
  # so the components are of the raw design matrix -- confirm this is intended.

  # np.linalg.svd returns the singular values already sorted in descending
  # order, so the first k columns of U are the top-k components and no
  # re-sorting is required. Only U is needed for the fitted values.
  U, _, _ = np.linalg.svd(x, full_matrices=False)
  U_k = U[:, :k]

  # Project y onto the span of U_k. Evaluating U_k.T @ y first avoids
  # materialising the n-by-n matrix U_k @ U_k.T.
  y_hat = U_k @ (U_k.T @ y)

  # Compute MSE
  train_mse = np.power(y - y_hat, 2).mean()
  print('train mse', train_mse)
  return train_mse

# Principal components regression sweep (this is not a stepwise subset
# search): fit with k = 1 .. x.shape[1] - 1 components.
# NOTE(review): subset_regression prepends a column of ones, so its design
# matrix has x.shape[1] + 1 columns; this range therefore stops before the
# last two component counts -- confirm that is intended.
for k in range(1, x.shape[1]):
  # Each call keeps the k components with the largest singular values
  # and prints the resulting training MSE.
  subset_regression(k)

train mse 4.280718170208161
train mse 4.277988098751895
train mse 2.8932457608473023
train mse 0.7255696031462348
train mse 0.7062085599999718
train mse 0.6853729143027781
train mse 0.6738583921309597
train mse 0.6611952239932581
train mse 0.6396902616849713
train mse 0.6392377320918076
