# Cross validation

## I) Imports

In [1]:
from proj1_helpers import *
from custom_helpers import *
from plot import *
from implementations import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

## II) Required functions

In [2]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of ``y`` and cut them into ``k_fold`` folds.

    The global NumPy random generator is seeded with ``seed`` before the
    permutation is drawn, so the same seed always yields the same folds.
    Every fold holds ``int(len(y) / k_fold)`` indices; rows that do not fill
    a whole fold are left out.

    Returns a 2-D array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    # Keep only the indices that fill complete folds, one fold per row.
    usable = shuffled[:k_fold * fold_size]
    return usable.reshape(k_fold, fold_size)

In [7]:
def cross_validation(x, y, flag_method, degree, lambda_=0, gamma=1.e-6, max_iters=1000, k_fold=int(5), seed=143225):
    """Train a model on every fold split and average loss and accuracy.

    For each of the ``k_fold`` folds, the fold is held out as test data and
    the remaining folds are used for training. Both parts go through
    ``prepare_data`` (offset and standardisation flags on, outlier removal
    flag off, with the given ``degree``) before the selected method is
    fitted.

    Args:
        x: feature matrix; rows are selected with the fold indices.
        y: labels, indexed the same way as ``x``.
        flag_method: integer selecting the learning method:
            0 -> least_squares_GD, 1 -> least_squares_SGD,
            2 -> least_squares, 3 -> ridge_regression,
            4 -> logistic_regression, 5 -> reg_logistic_regression.
        degree: forwarded to ``prepare_data``.
        lambda_: regularisation parameter, used by methods 3 and 5 only.
        gamma: step size passed to the iterative methods (0, 1, 4, 5).
        max_iters: iteration count passed to the iterative methods.
        k_fold: number of folds.
        seed: seed handed to ``build_k_indices`` for the shuffling.

    Returns:
        Tuple ``(loss_tr, loss_te, mean_accuracy_tr, mean_accuracy_te)``:
        the fold-averaged ``sqrt(2 * compute_mse(...))`` on the training and
        test parts, and the fold-averaged fraction of labels from
        ``predict_labels`` that equal the true labels.

    Raises:
        ValueError: if ``flag_method`` is not one of 0-5.
    """
    # Reject unknown methods before any work is done; otherwise ``weight``
    # would never be assigned and the loss computation would fail with an
    # unrelated NameError.
    if flag_method not in (0, 1, 2, 3, 4, 5):
        raise ValueError("flag_method must be an integer from 0 to 5, got %r" % (flag_method,))

    rmses_tr = []
    rmses_te = []
    accuracy_tr = []
    accuracy_te = []

    flag_add_offset = True
    flag_standardize = True
    flag_remove_outliers = False

    k_indices = build_k_indices(y, k_fold, seed)
    for i in range(k_fold):
        # Fold i is the test part; all other folds are flattened into the training part.
        indices_train = np.delete(k_indices, i, 0).ravel()
        indices_test = k_indices[i]

        x_train = x[indices_train]
        y_train = y[indices_train]
        x_test = x[indices_test]
        y_test = y[indices_test]

        # Prepare data (standardisation and offset)
        training_tx, testing_tx = prepare_data(x_train, x_test, flag_add_offset, flag_standardize, flag_remove_outliers, degree)

        # Initial weights for the iterative methods
        initial_w = np.zeros(training_tx.shape[1])

        if flag_method == 0:
            # Linear regression (full gradient descent)
            weight, _ = least_squares_GD(y_train, training_tx, initial_w, max_iters, gamma)
        elif flag_method == 1:
            # Linear regression (stochastic gradient descent)
            weight, _ = least_squares_SGD(y_train, training_tx, initial_w, max_iters, gamma)
        elif flag_method == 2:
            # Least squares
            weight, _ = least_squares(y_train, training_tx)
        elif flag_method == 3:
            # Ridge regression
            weight, _ = ridge_regression(y_train, training_tx, lambda_)
        elif flag_method == 4:
            # Logistic regression
            weight, _ = logistic_regression(y_train, training_tx, initial_w, max_iters, gamma)
        else:
            # Regularized logistic regression (flag_method == 5)
            weight, _ = reg_logistic_regression(y_train, training_tx, initial_w, max_iters, gamma, lambda_)

        # Loss of this round on both parts
        rmses_tr.append(np.sqrt(2 * compute_mse(y_train, training_tx, weight)))
        rmses_te.append(np.sqrt(2 * compute_mse(y_test, testing_tx, weight)))

        # Accuracy of this round: fraction of predicted labels equal to the true ones
        y_pred_tr = predict_labels(weight, training_tx)
        y_pred_te = predict_labels(weight, testing_tx)
        accuracy_tr.append(np.sum(y_pred_tr == y_train)/len(y_train))
        accuracy_te.append(np.sum(y_pred_te == y_test)/len(y_test))

    mean_accuracy_tr = np.mean(accuracy_tr)
    mean_accuracy_te = np.mean(accuracy_te)
    loss_tr = np.mean(rmses_tr)
    loss_te = np.mean(rmses_te)
    return loss_tr, loss_te, mean_accuracy_tr, mean_accuracy_te

## III) Main
### Load the data

In [8]:
# Load the training set; a message is printed before and after the call.
print("Loading Data, please wait")
loaded_columns = load_csv_data('data/train.csv')
train_y, train_x, ids_train = loaded_columns
print("Data loaded, continue!!")

Loading Data, please wait
Data loaded, continue!!


In [9]:
"""
Methods mapping
0    Linear regression (full gradient descent)
1    Linear regression (stochastic gradient descent)
2    Least squares method
3    Ridge regression
4    Logistic regression (stochastic gradient descent)
5    Regularized logistic regression (stochastic gradient descent)

"""

'\nMethods mapping\n0    Linear regression (full gradient descent)\n1    Linear regression (stochastic gradient descent)\n2    Least squares method\n3    Ridge regression\n4    Logistic regression (stochastic gradient descent)\n5    Regularized logistic regression (stochastic gradient descent)\n\n'

### Cross validation for one set of parameters only
Get the RMSE for one method with defined parameters

In [10]:
# Choose the learning method to use (see mapping above)
flag_method = 4
degree = 1
lambda_ = 0

# Gradient descent parameters
gamma = 0.1
max_iters = 2500

# Inputs for cross validation: a copy of the labels and the features
# returned by remove_invalid
ytrain_cross_validation = train_y.copy()
xtrain = remove_invalid(train_x)

cv_result = cross_validation(
    x=xtrain,
    y=ytrain_cross_validation,
    flag_method=flag_method,
    degree=degree,
    lambda_=lambda_,
    gamma=gamma,
    max_iters=max_iters,
)
# Only the test-side values (positions 1 and 3 of the result) are reported.
loss_te, accuracy_te = cv_result[1], cv_result[3]
print("For the Degree: %d cross-validation loss is %f, and accuracy is %f" %(degree, loss_te, accuracy_te))

iteration	 0 	loss:  0.013862943611198907
iteration	 50 	loss:  0.6244300482999456
iteration	 100 	loss:  0.5947400919348644
iteration	 150 	loss:  0.5820373274626097
iteration	 200 	loss:  0.5728809514233614
iteration	 250 	loss:  0.5658750402261199
iteration	 300 	loss:  0.5604126331420326
iteration	 350 	loss:  0.5560955680418571
iteration	 400 	loss:  0.552643123927952
iteration	 450 	loss:  0.54985257884781
iteration	 500 	loss:  0.5475749687839848
iteration	 550 	loss:  0.5456991863830151
iteration	 600 	loss:  0.5441412705414723
iteration	 650 	loss:  0.5428370302379633
iteration	 700 	loss:  0.5417368710804673
iteration	 750 	loss:  0.5408021107788042
iteration	 800 	loss:  0.5400023182520437
iteration	 850 	loss:  0.5393133667423564
iteration	 900 	loss:  0.5387159919111021
iteration	 950 	loss:  0.5381947119067703
iteration	 1000 	loss:  0.5377370102125572
iteration	 1050 	loss:  0.5373327115312614
iteration	 1100 	loss:  0.5369735010272224
iteration	 1150 	loss:  0.536652551

iteration	 2150 	loss:  0.5338719596397277
iteration	 2200 	loss:  0.5337934225505472
iteration	 2250 	loss:  0.5337182097430049
iteration	 2300 	loss:  0.5336461038796156
iteration	 2350 	loss:  0.5335769088695523
iteration	 2400 	loss:  0.533510447160834
iteration	 2450 	loss:  0.5334465574317039
iteration	 0 	loss:  0.013862943611198907
iteration	 50 	loss:  0.624740661707601
iteration	 100 	loss:  0.5951854945685351
iteration	 150 	loss:  0.5825106153583675
iteration	 200 	loss:  0.5733813222837236
iteration	 250 	loss:  0.5664062266373993
iteration	 300 	loss:  0.5609753549397323
iteration	 350 	loss:  0.5566883366555754
iteration	 400 	loss:  0.5532633068497474
iteration	 450 	loss:  0.550497078282443
iteration	 500 	loss:  0.5482406269812558
iteration	 550 	loss:  0.5463830133339355
iteration	 600 	loss:  0.5448405513653972
iteration	 650 	loss:  0.5435493590801674
iteration	 700 	loss:  0.5424601463867212
iteration	 750 	loss:  0.5415345129942449
iteration	 800 	loss:  0.540742

### Grid search for methods without regularisation
Test polynomial expansion of different degrees

In [None]:
# Choose the learning method to use (see mapping above)
flag_method = 4

# Gradient descent parameters
gamma = 0.1
max_iters = 1000

# Define range for the polynomial expansion
degree_range = np.arange(1, 3)

# One slot per tested degree
train_losses = np.zeros(len(degree_range))
test_losses = np.zeros(len(degree_range))
train_accuracies = np.zeros(len(degree_range))
test_accuracies = np.zeros(len(degree_range))

# Preparing data for cross validation
ytrain_cross_validation = train_y.copy()
xtrain = remove_invalid(train_x)

for ind_degree, degree in enumerate(degree_range):
    # lambda_ is passed as 0: this section covers the methods without regularisation
    loss_tr, loss_te, accuracy_tr, accuracy_te = cross_validation(xtrain, ytrain_cross_validation, flag_method, degree, 0, gamma, max_iters)
    print("For the Degree: %d , The LOSS is : %f" %(degree, loss_te))
    train_losses[ind_degree] = loss_tr
    test_losses[ind_degree] = loss_te
    # The train accuracy belongs in train_accuracies; storing it in
    # test_accuracies would be overwritten by the next line and leave
    # train_accuracies filled with zeros.
    train_accuracies[ind_degree] = accuracy_tr
    test_accuracies[ind_degree] = accuracy_te

print("Cross Validation finished!!")
# Position of the smallest test loss, as an index tuple into degree_range
best_value = np.unravel_index(np.argmin(test_losses), test_losses.shape)
print("The best degrees are: ", degree_range[best_value])

In [None]:
# Print the test accuracies under their heading, then hand the losses
# to the visualisation helper.
print("Test accuracy:", test_accuracies, sep="\n")
cross_validation_visualization_degree(degree_range, train_losses, test_losses)

### Grid search for methods using regularisation
Grid search over different degrees of polynomial expansion and over different lambdas

In [None]:
# Choose the learning method to use (see mapping above)
flag_method = 5

# Gradient descent parameters
gamma = 0.1
max_iters = 500

# Ranges for the polynomial expansion degree and for lambda
degree_range = np.arange(1, 2)
lambda_range = np.arange(0.0, 10, 3)

# Result grids: one row per degree, one column per lambda
grid_shape = (len(degree_range), len(lambda_range))
train_losses_matrix = np.zeros(grid_shape)
test_losses_matrix = np.zeros(grid_shape)
train_accuracies_matrix = np.zeros(grid_shape)
test_accuracies_matrix = np.zeros(grid_shape)

# Inputs for cross validation
ytrain_cross_validation = train_y.copy()
xtrain = remove_invalid(train_x)

for ind_degree, degree in enumerate(degree_range):
    for ind_lambda_, lambda_ in enumerate(lambda_range):
        loss_tr, loss_te, accuracy_tr, accuracy_te = cross_validation(
            x=xtrain,
            y=ytrain_cross_validation,
            flag_method=flag_method,
            degree=degree,
            lambda_=lambda_,
            gamma=gamma,
            max_iters=max_iters,
        )
        print("For the Degree: %d and lambda %.2E, The LOSS is : %f" %(degree, lambda_, loss_te))
        test_losses_matrix[ind_degree, ind_lambda_] = loss_te
        train_losses_matrix[ind_degree, ind_lambda_] = loss_tr
        test_accuracies_matrix[ind_degree, ind_lambda_] = accuracy_te
        train_accuracies_matrix[ind_degree, ind_lambda_] = accuracy_tr

print("Cross Validation finished!!")
# Locate the smallest test loss and convert the flat position to (degree, lambda) indices
flat_best = np.argmin(test_losses_matrix)
best_value = np.unravel_index(flat_best, test_losses_matrix.shape)
print(best_value)
print("Best degree: %d, with lambda %f " %(degree_range[best_value[0]], lambda_range[best_value[1]]))

In [None]:
# Print the test accuracy grid under its heading, then pass the losses of the
# first tested degree (row 0) to the visualisation helper.
print("Test accuracy:", test_accuracies_matrix, sep="\n")
first_degree_train = train_losses_matrix[0]
first_degree_test = test_losses_matrix[0]
cross_validation_visualization_lambda(lambda_range, first_degree_train, first_degree_test)