In [1]:
import numpy as np
from src.helpers import load_csv_data, standardize
from src.regression.regression_impl import least_squares
from src.regression.costs import compute_loss

In [2]:
y, tx, ids = load_csv_data('data/test.csv')

tx, x_mean, x_std = standardize(tx)

In [20]:
def build_poly_matrix(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    
    powers = np.tile(np.tile(np.arange(degree + 1), x.shape[1]), (x.shape[0], 1))
    expanded_x = np.repeat(x, degree + 1, axis=1)
    
    return np.power(expanded_x, powers)

In [21]:
def build_poly_manuel(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    for d in range (0, degree+1):
        if d==0:
            psi=np.power(x,d)
        else:
            psi = np.hstack((psi, np.power(x,d)))
    return psi

In [29]:
test = np.random.randint(10, size=(300, 30))

%timeit -n 1000 build_poly_matrix(test, 2)

235 µs ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
%timeit -n 1000 build_poly_manuel(test, 2)

122 µs ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [32]:
def polynomial_regression():
    """Constructing the polynomial basis function expansion of the data,
       and then running least squares regression."""
    # define parameters
    degrees = [2]

    for ind, degree in enumerate(degrees):
        
        # Form the data to do polynomial regression
        phi_x = build_poly_manuel(tx, degree)
        print(tx.shape)
        print(phi_x.shape)

        # Least square and calculate RMSE
        mse, weights = least_squares(y, phi_x)
        
        rmse = np.math.sqrt(2 * mse)

        print("Processing {i}th experiment, degree={d}, rmse={loss}".format(
              i=ind + 1, d=degree, loss=rmse))

polynomial_regression()

(568238, 30)
(568238, 90)


LinAlgError: Singular matrix

In [33]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

In [34]:
def compute_rmse(y, x, w):
    l = compute_loss(y, x, w)
    return np.math.sqrt(2*l)

In [35]:
from src.regression.implementations import ridge_regression
from src.regression.polynomials import build_poly_matrix

def cross_validation(y, x, k_indices, k, lambda_, degree, mean=True):
    """return the loss of ridge regression."""
    # Get k'th subgroup in test, others in train
    
    losses_tr, losses_te, ws = [], [], []
    
    for k_ in range(k):
        
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(k_indices.flatten(), test_indices)

        y_train = y[train_indices]
        x_train = x[train_indices]
        y_test = y[test_indices]
        x_test = x[test_indices]

        # Form data with polynomial degree
        x_train_poly = build_poly_matrix(x_train, degree)
        x_test_poly = build_poly_matrix(x_test, degree)

        # Ridge regression
        loss_tr, w_ridge = ridge_regression(y_train, x_train_poly, lambda_)

        # Calculate the loss for test data
        loss_te = compute_loss(y_test, x_test_poly, w_ridge)
        
        losses_tr.append(np.math.sqrt(2 * loss_tr))
        losses_te.append(np.math.sqrt(2 * loss_te))
        ws.append(w_ridge)
    
        
        
    return np.mean(losses_tr), np.mean(losses_te)