In [None]:
import pandas

In [None]:
# Load the red-wine quality table; the file is read with ';' as the field separator.
data = pandas.read_csv('data/winequality-red.csv', sep=';')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# List the column names; the predictor names below are a subset of these.
data.columns

In [None]:
# Names of the columns used as regression inputs. The 'quality' column is
# not listed here because it is used as the response.
variables = [
    u'fixed acidity',
    u'volatile acidity',
    u'citric acid',
    u'residual sugar',
    u'chlorides',
    u'free sulfur dioxide',
    u'total sulfur dioxide',
    u'density',
    u'pH',
    u'sulphates',
    u'alcohol',
]

In [None]:
%matplotlib inline

In [None]:
# Scatter plot of quality against alcohol content.
data.plot(kind='scatter', x='alcohol', y='quality')

In [None]:
# Distribution of the alcohol column, binned into 30 bars.
data['alcohol'].hist(bins=30)

## The numpy way

In [None]:
import numpy as np

In [None]:
# Predictor matrix as a plain NumPy array, one column per entry of `variables`.
X = data[variables].values

In [None]:
# N = number of observations, p = number of predictors (without intercept).
# Read the shape from `data[variables]` rather than from `X`: a later cell
# overwrites `X` with an extra intercept column, so re-running this cell
# against that `X` would report p + 1 and skew every `N - p - 1` below.
N, p = data[variables].shape

In [None]:
# Show the number of observations and the number of predictors.
N, p

In [None]:
# Design matrix: a leading column of ones (intercept) followed by the
# predictors. It is rebuilt from `data[variables]` instead of from `X`
# itself, so re-running this cell cannot stack a second column of ones
# (which would make X^T X singular).
X = np.hstack((np.ones((N, 1)), data[variables].values))

Calculate

$$\theta = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^Ty\,.$$

In [None]:
# Normal equations: invert the Gram matrix X^T X, then apply
# (X^T X)^{-1} X^T to the response column.
gram = np.dot(X.T, X)
XTXi = np.linalg.inv(gram)
theta = np.dot(np.dot(XTXi, X.T), data['quality'])

In [None]:
# Show the estimated coefficient vector (intercept first).
theta

In [None]:
# Pair each coefficient with its name; 'const' labels the intercept term.
list(zip(['const'] + variables, theta))

Calculate

$$\operatorname{Var}(\hat \theta) = (\mathbf{X}^T\mathbf{X})^{-1}
\sigma^2\,,$$

where $\sigma^2$ is estimated by

$$\hat \sigma^2 = \frac{1}{N-p-1}\sum_{i=1}^{N} (y_i - \hat y_i)^2\,.$$

In [None]:
# Residual sum of squares divided by the N - p - 1 residual degrees of freedom.
# Note: despite its name, `sigma` holds the variance estimate (sigma squared).
residuals = np.dot(X, theta) - data['quality']
resid_dof = N - p - 1
sigma = 1./resid_dof*sum(residuals**2)

In [None]:
# Covariance matrix of the coefficient estimates: (X^T X)^{-1} scaled by
# the variance estimate stored in `sigma`.
var_theta = XTXi * sigma

In [None]:
# Standard errors: square roots of the diagonal of the covariance matrix.
d_theta = np.sqrt(np.diag(var_theta))

Calculate the z-scores

$$z_j = \frac{\hat \theta_j}{\hat\sigma \sqrt{v_j}}$$

where $v_j$ is the $j$-th diagonal element of $(\mathbf{X}^T\mathbf{X})^{-1}$, and the corresponding P values.

In [None]:
from scipy.stats import t

In [None]:
# Print one row per coefficient: estimate, standard error, z-score,
# two-sided P value and the variable name. Rows with P < 0.05 are
# flagged with '***'.
for c, dc, name in zip(theta, d_theta, ['const'] + variables):
    z = c / dc
    # two-sided tail probability of Student's t with N - p - 1 degrees of freedom
    P = 2*t.sf(abs(z), N - p - 1)
    row = "".join(["{:>12f}".format(v) for v in (c, dc, z, P)]) + " " + name
    if P < 0.05:
        row += " ***"
    # A single-argument print(...) is valid on both Python 2 and Python 3,
    # unlike the print statement, and emits the same line.
    print(row)

## The scipy way

In [None]:
from scipy.optimize import curve_fit

In [None]:
# Show the predictor names that f_lin_reg below pairs with its slope arguments.
variables

In [None]:
def f_lin_reg(X, t0, *thetas):
    """Evaluate the linear model ``t0 + sum_j thetas[j] * X[variables[j]]``.

    ``X`` is indexed by the names in the module-level ``variables`` list.
    ``t0`` is the intercept and ``thetas`` are the slopes, paired with
    ``variables`` in order (``zip`` stops at the shorter of the two).
    """
    linear_part = 0
    for slope, column in zip(thetas, variables):
        linear_part = linear_part + slope*X[column]
    return t0 + linear_part

In [None]:
# Fit the same linear model with curve_fit, starting from an all-zero
# parameter vector: one intercept plus one slope per predictor.
n_params = 1 + len(variables)
beta, pcov = curve_fit(f_lin_reg, data, data['quality'], p0=[0]*n_params)

In [None]:
# Side-by-side comparison: curve_fit estimates against the normal-equation estimates.
list(zip(beta, theta))

In [None]:
# Diagonal of the covariance matrix returned by curve_fit.
pcov.diagonal()

In [None]:
# Diagonal of the covariance matrix computed by hand above, for comparison.
var_theta.diagonal()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Observed quality against alcohol, overlaid with the values predicted by
# the linear model using the coefficients in `theta`.
alcohol = data['alcohol']
predicted = f_lin_reg(data, *theta)
plt.plot(alcohol, data['quality'], '.')
plt.plot(alcohol, predicted, '.')

## Best subsets

In [None]:
from itertools import combinations

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def LinRSS(X, y):
    """Fit a LinearRegression of ``y`` on ``X`` and return the residual sum of squares.

    The residuals are evaluated on the same ``X`` and ``y`` used for fitting.
    """
    model = LinearRegression()
    model.fit(X, y)
    residuals = model.predict(X) - y
    return sum(residuals**2)

In [None]:
# Small demonstration of combinations: all 2-element subsets of [1, 2, 3].
list(combinations([1,2,3], 2))

In [None]:
# RSS[k][subset] is the residual sum of squares of the linear fit that
# uses exactly the k predictors named in `subset`, for every subset size
# k = 1 .. len(variables).
n_vars = len(variables)
RSS = {
    k: {
        subset: LinRSS(data[list(subset)], data.quality)
        for subset in combinations(variables, k)
    }
    for k in range(1, n_vars + 1)
}

In [None]:
# For every subset size, plot the RSS of all subsets in grey and the
# smallest RSS (the best subset of that size) in red.
for i in RSS:
    # dict.values() is a view rather than a list on Python 3; materialize it
    # once so matplotlib receives a plain sequence whose length matches x.
    rss_values = list(RSS[i].values())
    plt.plot([i]*len(rss_values), rss_values, 'o', color='grey')
    plt.plot([i], min(rss_values), 'o', color='red')
plt.xlabel('subset size')
plt.ylabel('residual sum of squares');

## Ridge Regression

In [None]:
# Rescale each predictor column by subtracting its mean and dividing by its
# standard deviation (as returned by Series.std()).
# NOTE: this overwrites the raw predictor columns of `data` in place.
for column in variables:
    values = data[column]
    data[column] = (values - values.mean()) / values.std()

Define a function to calculate the effective degrees of freedom

  $$\operatorname{df}(\lambda) =
  \operatorname{tr}\left\{\mathbf{X}(\mathbf{X}^T\mathbf{X} + \lambda
  \mathbf{I})^{-1}\mathbf{X}^T\right\}$$


In [None]:
def dof_eff(l):
    """Return the effective degrees of freedom of ridge regression for penalty ``l``.

    Computes ``tr{X (X^T X + l I)^{-1} X^T}`` with ``X = data[variables].values``
    read from the module-level ``data`` and ``variables``.
    """
    X = data[variables].values
    p = X.shape[1]
    XTX = np.dot(X.transpose(), X)
    XTXi = np.linalg.inv(XTX + np.identity(p) * l)
    # The trace is invariant under cyclic permutation, so
    # tr(X A X^T) == tr(A X^T X). This needs only a p x p product instead of
    # materializing the N x N matrix, which matters because the function is
    # evaluated many times by the optimizer.
    return np.dot(XTXi, XTX).trace()

Now, we want to calculate which values for $\lambda$ correspond to a set of pre-defined values for degrees of freedom.

In [None]:
from scipy.optimize import minimize_scalar
# Target degrees of freedom: 0.5, 1.0, ..., 9.5.
dof = np.arange(0.5, 10, 0.5)
# For each target, find the penalty whose effective degrees of freedom is
# closest to it by minimizing the squared mismatch.
# NOTE(review): the search is called without bounds, so nothing here
# restricts the returned penalty to be non-negative — confirm the results.
lambdas = []
for d in dof:
    fit = minimize_scalar(lambda l: (dof_eff(l) - d)**2)
    lambdas.append(fit.x)

Define the regression function for ridge regression, fixing $\theta_0 = \bar y$ (the mean of the response).

In [None]:
# Intercept fixed at the mean of the response column.
theta0 = np.mean(data['quality'])

In [None]:
def f_ridge(X, *thetas):
    """Evaluate ``theta0 + sum_j thetas[j] * X[variables[j]]``.

    The intercept is the module-level ``theta0``; the slopes in ``thetas``
    are paired in order with the module-level ``variables`` (``zip`` stops
    at the shorter of the two).
    """
    linear_part = 0
    for slope, column in zip(thetas, variables):
        linear_part = linear_part + slope*X[column]
    return theta0 + linear_part

In [None]:
def ridge(l):
    """Build the residual function of ridge regression for penalty ``l``.

    The returned callable maps a coefficient sequence to the fit residuals
    ``f_ridge(data, *thetas) - data['quality']`` followed by the terms
    ``sqrt(l) * theta_j``, so the sum of squares of its output is the
    residual sum of squares plus ``l`` times the squared coefficients.
    """
    def D_ridge(thetas):
        residuals = f_ridge(data, *thetas) - data['quality']
        shrinkage = np.sqrt(l) * np.asarray(thetas)
        return np.concatenate((residuals, shrinkage))
    return D_ridge

Minimize the ridge regression objective using least squares.

In [None]:
from scipy.optimize import leastsq

In [None]:
from regression_params import lambdas as lds

In [None]:
# Fit one ridge model per penalty in `lambdas`, collecting the coefficient
# vectors in `thetas` and the mean squared errors in `MSE`.
MSE = []
thetas = []
for lam in lambdas:
    # Loop-local names are kept distinct from `theta` (the least-squares
    # solution computed earlier) so this cell does not overwrite it.
    theta_ridge = leastsq(ridge(lam), [0]*len(variables))[0]
    thetas.append(theta_ridge)
    fitted = f_ridge(data, *theta_ridge)
    MSE.append(np.mean((data['quality'] - fitted)**2))

In [None]:
# Coefficient paths: one series per predictor, plotted against the target
# degrees of freedom. Slicing a 2-D array avoids a comprehension variable
# named `theta`, which on Python 2 would leak and overwrite the
# least-squares `theta`.
coef_paths = np.asarray(thetas)
for i, name in enumerate(variables):
    plt.plot(dof, coef_paths[:, i], 'o', label=name)
plt.xlabel('effective degrees of freedom')
plt.ylabel('coefficient')
plt.legend();

In [None]:
# Mean squared error of each ridge fit against its target degrees of freedom.
plt.plot(dof, MSE, 'o')