In [1]:
import csv
import numpy as np

In [9]:
# Whether to do exact regression or approximate regression with gradient descent.
# The analysis cell uses this value to pick the solver:
#   'exact'            -> solve_exact
#   'gradient_descent' -> solve_gd
mode = 'exact'  #  'exact', 'gradient_descent'

In [7]:
def prepend_1(x):
  """Return a copy of the 2-D array x with a leading column of ones.

  The extra column lets a linear model carry its intercept as the first
  coefficient. The ones are float64, so the result is at least float64.
  """
  n_rows = x.shape[0]
  ones_column = np.ones((n_rows, 1))
  return np.concatenate([ones_column, x], axis=1)

# Exact regression
def solve_exact(x, y):
  """Fit least-squares regression coefficients, including an intercept.

  Args:
    x: 2-D array of features, one row per data point.
    y: array of targets with one row per data point.

  Returns:
    The coefficients b minimising ||y - [1, x].b||^2. The first entry is the
    intercept, because a column of ones is prepended to x.
  """
  x1 = prepend_1(x)
  # Solve the least-squares problem directly rather than forming and inverting
  # x1.T @ x1: explicit inversion squares the condition number and fails when
  # features are collinear, whereas lstsq still returns a (minimum-norm)
  # solution in that case.
  beta, _residuals, _rank, _singular_values = np.linalg.lstsq(x1, y, rcond=None)
  return beta

def gradient(x, y, b):
  """Gradient with respect to b of the loss mean(||y - x.b||^2).

  x is used as given; no intercept column is added here.
  """
  n_rows = x.shape[0]
  residual = y - x @ b
  scaled = -2 * (x.T @ residual)
  return scaled / n_rows

# Gradient descent
def solve_gd(x, y, lr=1e-5, iterations=40000):
  """Approximate the regression coefficients with plain gradient descent.

  Args:
    x: 2-D array of features; a column of ones is prepended for the intercept.
    y: targets, passed unchanged to gradient().
    lr: step size applied to each gradient.
    iterations: number of update steps.

  Returns:
    Coefficient array of shape (x.shape[1] + 1, 1), intercept first.

  Every 2000 iterations a progress line is printed with the mean squared
  residual (labelled RSS) and the mean gradient norm since the last reset
  of the running list, which is cleared every 50 iterations.
  """
  design = prepend_1(x)
  n_coefficients = design.shape[1]
  # Small random starting point drawn from numpy's global random state.
  b = np.random.normal(0.0, 0.01, (n_coefficients, 1))
  recent_norms = []
  for step in range(iterations):
    g = gradient(design, y, b)
    b -= lr * g
    recent_norms.append(np.linalg.norm(g))
    if step % 2000 == 0:
      residual = y - np.matmul(design, b)
      rss = (residual**2).mean()
      print('Iteration {}: RSS={:.2f}; Avg gradient norm={:.2f}'.format(step, rss, np.mean(recent_norms)))
    # Reset after the optional report so the report still sees the latest window.
    if step % 50 == 0:
      recent_norms = []
  return b

def predict(x, y, b):
  """Apply coefficients b to x and score the predictions against y.

  Returns:
    (y_hat, se): the predictions for [1, x] and the element-wise squared
    error (y - y_hat)**2.
  """
  design = prepend_1(x)
  y_hat = np.matmul(design, b)
  se = (y - y_hat)**2  # squared error
  return y_hat, se

In [None]:
# Fail early on a mistyped mode instead of a KeyError at the solver lookup below.
assert mode in ('exact', 'gradient_descent'), 'unknown mode: {!r}'.format(mode)

# Load the semicolon-delimited dataset. newline='' is what the csv module asks
# for so that it can handle line endings itself.
with open('../datasets/winequality/winequality-white.csv', newline='') as f:
  reader = csv.reader(f, delimiter=';')
  header = next(reader)
  # csv.reader yields an empty list for a blank line; skip those so every
  # row has the same width and the result is a proper 2-D float array.
  body = np.asarray([[float(value) for value in row] for row in reader if row])

print('Columns:', header)
print('Num data points:', len(body))

# Last column is the regression target; everything before it is a feature.
x = body[:, :-1]
y = body[:, -1:]   # slice (not index) keeps y two-dimensional: shape (n, 1)

# Sequential split: first N rows for training, the remainder for testing.
N = 3500
train = (x[:N], y[:N])
test = (x[N:], y[N:])
print('Train/test split:', len(train[0]), ':', len(test[0]))

# Pick the solver selected by `mode` and fit on the training portion.
make_beta_hat = {'exact': solve_exact, 'gradient_descent': solve_gd}[mode]
beta_hat = make_beta_hat(*train)
print('beta_hat:', beta_hat.reshape(-1))

def print_stats(se):
  """Print the mean of the squared errors and their deciles (0%, 10%, ..., 100%)."""
  print('  avg RSS:', se.mean())
  decile_levels = np.arange(0, 1.01, 0.1)
  deciles = np.quantile(se, decile_levels)
  formatted = ' '.join('{:.2f}'.format(d) for d in deciles)
  print('  Squared error deciles:', formatted)

# Report squared-error statistics on the training data, then on the test data.
# After this, y_hat and se refer to the TEST predictions, which the example
# printout below relies on.
y_hat, se = predict(*train, beta_hat)
print('Train')
print_stats(se)

y_hat, se = predict(*test, beta_hat)
print('Test')
print_stats(se)
print('')

# View best and worst predictions
ranking = np.argsort(se.reshape(-1))   # test-row indices, smallest error first
best = ranking[0]
worst = ranking[-1]
middle = ranking[len(ranking)//2]

# (title, test-row index, print a blank line afterwards?)
examples = (
  ('Good prediction:', best, True),
  ('Median prediction:', middle, True),
  ('Bad prediction:', worst, False),
)
for title, idx, blank_after in examples:
  print(title)
  print('X')
  # zip stops at the shorter sequence, so any header entries beyond the
  # feature columns are ignored.
  for key, val in zip(header, test[0][idx]):
    print('  {:<24} {:.2f}'.format(key, val))
  # Printed once per example, after the feature listing (not once per feature).
  print('Y true {:.2f}'.format(test[1][idx, 0]))
  print('Y pred {:.2f}  (error {})'.format(y_hat[idx, 0], se[idx, 0]))
  if blank_after:
    print('')