In [None]:
import matplotlib.pyplot as plt
import pandas
from scipy.optimize import curve_fit

In [None]:
data = pandas.read_csv('data/winequality-red.csv', sep=';')

In [None]:
data.head()

In [None]:
data.cov()

In [None]:
data.columns

In [None]:
# Names of the predictor columns used as regressors in the fits below.
variables = [
    u'fixed acidity',
    u'volatile acidity',
    u'citric acid',
    u'residual sugar',
    u'chlorides',
    u'free sulfur dioxide',
    u'total sulfur dioxide',
    u'density',
    u'pH',
    u'sulphates',
    u'alcohol',
]

## The Numpy Way

In [None]:
import numpy as np

In [None]:
# Ordinary least squares through the normal equations:
# theta = (X^T X)^{-1} X^T y, with y the 'quality' column.
X = data[variables].values
N, p = X.shape
# Prepend a column of ones so that theta[0] plays the role of the intercept.
intercept = np.ones((N, 1), dtype=int)
X = np.hstack((intercept, X))
XTXi = np.linalg.inv(np.dot(X.T, X))
# XTXi is kept as a separate name because the covariance estimate reuses it.
theta = np.dot(XTXi.dot(X.T), data['quality'])

In [None]:
theta

In [None]:
# Unbiased estimate of the residual variance, RSS / (N - p - 1), and the
# resulting covariance matrix of the coefficients, sigma * (X^T X)^{-1}.
# The residuals are taken straight from the design matrix X (which already
# contains the intercept column), so this cell does not depend on the helper
# `f` that is only defined further down in "The Scipy Way".
residuals = np.dot(X, theta) - data['quality'].values
sigma = np.sum(residuals**2) / (N - p - 1)
var_theta = XTXi * sigma

In [None]:
d_theta = np.sqrt(var_theta.diagonal())

In [None]:
from scipy.stats import t

In [None]:
# Regression summary table: coefficient, standard error, t statistic and
# two-sided p-value for every parameter; rows with p < 0.05 are flagged "***".
# The t distribution uses the residual degrees of freedom, i.e. the number of
# observations minus the number of fitted coefficients (intercept included).
dof_resid = len(data) - len(theta)
print(" ".join(["{:>12}".format(i) for i in ('name', 'coef', 'std', 't value', 'Pr(>|t|)')]))
for c, dc, name in zip(theta, d_theta, ['const.'] + variables):
    # Each row is assembled first and emitted with a single print call.
    row = ["{:>12}".format(name[:10])]
    row.append(" ".join(["{:>12f}".format(i) for i in (c, dc, c/dc)]))
    Pz = 2*t.sf(abs(c/dc), dof_resid)
    if Pz < 1e-16:
        row.append(" {:>10}".format("< 1e-16"))
    else:
        row.append(" {:>10.2e}".format(Pz))
    if Pz < 0.05:
        row.append("***")
    print(" ".join(row))

## The Scipy Way

In [None]:
def f(x, t0, *thetas):
    """Linear model: intercept ``t0`` plus a weighted sum of predictor columns.

    The coefficients in ``thetas`` are paired positionally with the
    module-level ``variables`` list and ``x[name]`` is looked up for each
    pair; surplus entries on either side are ignored, as with ``zip``.
    """
    weighted = 0
    for coef, column in zip(thetas, variables):
        weighted = weighted + coef * x[column]
    return t0 + weighted

In [None]:
res = curve_fit(f, data, data['quality'], [0]*(1 + len(variables)))

In [None]:
beta, pcov = res

In [None]:
beta

In [None]:
pcov.diagonal()

In [None]:
sigmab = np.sqrt(pcov.diagonal())

# Subset Selection and Shrinkage

## Best Subsets

In [None]:
from itertools import combinations
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(data, test_size=0.2)

In [None]:
def LinRSS(X, y):
    """Fit a ``LinearRegression`` of ``y`` on ``X`` and return the residual
    sum of squares of that fit evaluated on the same ``X`` and ``y``."""
    model = LinearRegression()
    model.fit(X, y)
    residuals = model.predict(X) - y
    return sum(residuals**2)

In [None]:
# Exhaustive best-subset search: RSS[k] maps every k-element tuple of
# predictor names to the in-sample RSS of the linear model fitted on it.
RSS = {}
for size in range(1, len(variables) + 1):
    RSS[size] = dict(
        (subset, LinRSS(data[list(subset)], data['quality']))
        for subset in combinations(variables, size)
    )

In [None]:
# Scatter the RSS of every subset against its size (grey) and overlay the
# lowest RSS found for each size.
for size, scores in RSS.items():
    rss_values = list(scores.values())
    plt.plot([size] * len(rss_values), rss_values, 'o', color='grey')
    plt.plot([size], min(rss_values), 'o')

## Forward stepwise

In [None]:
# Greedy forward selection: starting from no predictors, repeatedly add the
# candidate whose inclusion gives the lowest RSS and record that RSS.
RSS_fwd = []
current = []
for _ in range(len(variables)):
    candidates = [v for v in variables if v not in current]
    scores = [LinRSS(data[current + [c]], data.quality) for c in candidates]
    RSS_new = pandas.Series(scores, index=candidates)
    current.append(RSS_new.idxmin())
    RSS_fwd.append(RSS_new.min())

In [None]:
current, RSS_fwd

In [None]:
# Compare the forward-stepwise RSS with the best-subset optimum per model
# size. Both curves are drawn against their list position, so the subset
# sizes are sorted explicitly instead of relying on dict iteration order.
plt.plot(RSS_fwd, '.')
plt.plot([min(RSS[i].values()) for i in sorted(RSS)], '.')

## Backward Stepwise

In [None]:
# Greedy backward elimination: starting from all predictors, repeatedly drop
# the one whose removal leaves the lowest RSS, recording that RSS, until a
# single predictor remains.
current = list(variables)
RSS_bwd = []
while len(current) > 1:
    scores = []
    for c in current:
        remaining = [i for i in current if i != c]
        scores.append(LinRSS(data[remaining], data.quality))
    RSS_current = pandas.Series(scores, index=current)
    RSS_bwd.append(RSS_current.min())
    current.remove(RSS_current.idxmin())

In [None]:
# Backward results are reversed so both curves run from small to large models.
plt.plot(list(reversed(RSS_bwd)))
plt.plot(RSS_fwd)

## Ignore This

In [None]:
def D(theta):
    """Residuals of the linear model ``f`` for the parameter vector ``theta``:
    the prediction on the module-level ``data`` minus its ``quality`` column."""
    predicted = f(data, *theta)
    return predicted - data.quality

In [None]:
from scipy.optimize import leastsq

In [None]:
res = leastsq(D, [0]*(1 + len(variables)), full_output=True)

In [None]:
# Parameter variances from leastsq: cov_x (res[1]) has to be scaled by the
# residual variance, i.e. the residual sum of squares divided by the degrees
# of freedom (observations minus fitted parameters). This is the scaling
# curve_fit applies to the pcov that the next cell shows for comparison.
(res[1] * np.sum((res[2]['fvec'])**2) / (len(data.quality) - len(res[0]))).diagonal()

In [None]:
beta, pcov.diagonal()

## Ridge

### Center data points

 name         coef          std      t value     Pr(>|t|)
      const.    21.965208    21.194575     1.036360    3.00e-01
  fixed acid     0.024991     0.025949     0.963083    3.36e-01
  volatile a    -1.083590     0.121101    -8.947802     < 1e-16 ***
  citric aci    -0.182564     0.147176    -1.240445    2.15e-01
  residual s     0.016331     0.015002     1.088599    2.76e-01
   chlorides    -1.874225     0.419283    -4.470070    8.37e-06 ***
  free sulfu     0.004361     0.002171     2.008635    4.47e-02 ***
  total sulf    -0.003265     0.000729    -4.479830    8.00e-06 ***
     density   -17.881164    21.633100    -0.826565    4.09e-01
          pH    -0.413653     0.191597    -2.158971    3.10e-02 ***
   sulphates     0.916334     0.114337     8.014297    2.12e-15 ***
     alcohol     0.276198     0.026484    10.429014     < 1e-16 ***


In [None]:
variables

In [None]:
# Reduced predictor set used for the ridge fits below.
variables = [
    u'volatile acidity',
    u'chlorides',
    u'free sulfur dioxide',
    u'total sulfur dioxide',
    u'pH',
    u'sulphates',
    u'alcohol',
]

In [None]:
# Reload the raw data and standardise every selected predictor to zero mean
# and unit (sample) standard deviation. The whole block of columns is
# transformed at once so that no loop temporaries are left behind; the
# previous per-column loop rebound the notebook-level name `sigma`, which an
# earlier cell uses for the residual variance.
data = pandas.read_csv('data/winequality-red.csv', sep=';')
predictors = data[variables]
data[variables] = (predictors - predictors.mean()) / predictors.std()

In [None]:
theta0 = np.mean(data['quality'])

In [None]:
# Fixed seed so that the split, and the dof/penalty results computed from
# `train`, are reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
def f_ridge(x, *thetas):
    """Linear model with the intercept fixed to the module-level ``theta0``.

    The coefficients in ``thetas`` are paired positionally with the
    module-level ``variables`` list and ``x[name]`` is looked up for each
    pair; surplus entries on either side are ignored, as with ``zip``.
    """
    weighted = 0
    for coef, column in zip(thetas, variables):
        weighted = weighted + coef * x[column]
    return theta0 + weighted

In [None]:
def ridge(l):
    """Build the residual function of a ridge fit, for use with ``leastsq``.

    ``leastsq`` minimises the sum of the *squared* residuals, so the ridge
    penalty ``l * sum(theta**2)`` is obtained by appending
    ``sqrt(l) * theta`` to the data residuals. (Appending ``l * theta**2``
    would instead penalise ``l**2 * sum(theta**4)``.)

    Parameters
    ----------
    l : float
        Ridge penalty. Only its magnitude is used.

    Returns
    -------
    callable
        ``D_ridge(theta)`` returning the residuals of ``f_ridge`` on the
        module-level ``data`` followed by one penalty residual per coefficient.
    """
    # Using the magnitude keeps the residuals real if a negative penalty is
    # passed in (an unbounded scalar search can produce one); the earlier
    # formula was likewise insensitive to the sign of l.
    penalty_scale = np.sqrt(abs(l))

    def D_ridge(theta):
        fev = f_ridge(data, *theta) - data['quality']
        return np.concatenate((fev, penalty_scale * np.asarray(theta, dtype=float)))
    return D_ridge

In [None]:
leastsq(ridge(1e-2), [0]*len(variables))

In [None]:
def dof_eff(l):
    """Effective degrees of freedom of a ridge fit with penalty ``l``.

    This is trace(X (X^T X + l I)^{-1} X^T), where X holds the columns
    ``variables`` of the module-level ``train`` frame. It is evaluated
    through the equivalent p-by-p expression
    trace((X^T X + l I)^{-1} X^T X), which avoids forming the N-by-N hat
    matrix and the explicit inverse.
    """
    X = train[variables].values
    p = X.shape[1]
    XTX = np.dot(X.transpose(), X)
    return np.linalg.solve(XTX + np.identity(p) * l, XTX).trace()

In [None]:
from scipy.optimize import minimize_scalar

In [None]:
# Target effective degrees of freedom and the ridge penalty reaching each.
# For l >= 0, dof_eff(l) cannot exceed the number of predictors, so the
# targets stop there and the search is restricted to non-negative penalties.
# Because dof_eff(l) <= trace(X^T X) / l, the value trace(X^T X) / d is a
# valid upper bracket for the penalty that yields d degrees of freedom.
dof = np.arange(2, len(variables) + 0.5, 0.5)
trace_XTX = np.sum(train[variables].values**2)
lambdas = [minimize_scalar(lambda l: (dof_eff(l) - d)**2,
                           bounds=(0, trace_XTX / d), method='bounded').x
           for d in dof]

In [None]:
# Penalty found for each dof target, drawn on a logarithmic penalty axis.
ax = plt.gca()
ax.plot(dof, lambdas, 'o')
ax.set_yscale('log')

In [None]:
# Fit one ridge model per penalty and record its coefficients together with
# its mean squared error on the full data set.
MSE = []
thetas = []
start = [0] * len(variables)
for l in lambdas:
    # Only the solution vector of the full leastsq output is needed here.
    theta = leastsq(ridge(l), start, full_output=True)[0]
    errors = data['quality'] - f_ridge(data, *theta)
    thetas.append(theta)
    MSE.append(np.mean(errors**2))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Coefficient paths: each predictor's ridge coefficient against the dof target.
for idx, name in enumerate(variables):
    path = [coefs[idx] for coefs in thetas]
    plt.plot(dof, path, '.', label=name)
#plt.legend(loc='lower left')

In [None]:
# Ridge MSE per dof target next to the best-subset MSE per subset size.
subset_sizes = list(RSS.keys())
best_subset_mse = [min(RSS[k].values()) / len(data.quality) for k in subset_sizes]
plt.plot(dof, MSE, 'o')
plt.plot(subset_sizes, best_subset_mse, 'o')

In [None]:
from scipy.optimize import minimize