# ML with Ridge Regression

In this notebook, we will use the functions in the file ridge_regression.py. 

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.proj1_helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *

First, we load the training data and standardize the features.

In [11]:
# Relative path to the training CSV (resolved against the notebook's working directory).
DATA_TRAIN_PATH = 'data/train.csv' 
# load_csv_data is one of the star-imported project helpers; its three return values
# are bound here as the labels (y), the feature matrix (tX) and the sample ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
# Rebind tX to the standardized features and keep the two extra return values,
# named here as the training mean and standard deviation.
tX, mean_tX, std_tX = standardize(tX)

In [12]:
# Expand the standardized features with build_poly, passing 4 as the degree argument
# (the same value that a later cell stores in degree_star).
tX_poly = build_poly(tX, 4)
# Print the expanded matrix for a quick visual check.
print(tX_poly)

[[ 1.          1.          1.         ...,  0.17016491  0.07019481
   0.0289561 ]
 [ 1.          1.          1.         ...,  0.07497737 -0.0205303
   0.00562161]
 [ 1.          1.          1.         ...,  0.08641827 -0.02540437
   0.00746812]
 ..., 
 [ 1.          1.          1.         ...,  0.10049992 -0.03186021
   0.01010023]
 [ 1.          1.          1.         ...,  0.55567992 -0.41422571
   0.30878017]
 [ 1.          1.          1.         ...,  0.55567992 -0.41422571
   0.30878017]]


In [14]:
# Fit ridge regression on the expanded features, passing 0.01 as the third argument
# (the same value that a later cell stores in lambda_star). The call returns two
# values, bound here as the loss and the fitted weights.
loss, w_star = ridge_regression(y, tX_poly, 0.01)
# Uncomment the next line to inspect the fitted weight vector.
#print(w_star)


[  5.16477250e-01   5.16303432e-01   5.16363869e-01   5.16330218e-01
   5.16330218e-01   5.16330218e-01   4.11621381e+00  -2.77130900e+00
  -7.65139638e-01   5.36964281e-01   5.16364332e-01  -2.43673671e-01
   5.04286368e-02   6.40992182e-04  -2.04028028e-04   5.16362293e-01
  -3.19917891e-01   1.00587186e-02   1.86537307e-03  -6.87069610e-05
   5.16358995e-01   6.05861112e-02   5.22699048e-02  -1.15270357e-02
   7.24867193e-04   5.16361649e-01   9.07716991e-01  -6.22626128e+00
  -5.63457818e+00   4.64434286e+00   5.16361649e-01  -1.77554711e+00
   1.11975322e+00  -2.37678094e-01   1.59681632e-02   5.16361650e-01
  -1.22914662e+01  -3.25874859e+01  -3.22540361e+01   2.30291692e+01
   5.16361672e-01   1.86394174e-01  -8.79736063e-02   1.70334495e-02
   7.11149173e-03   5.16361665e-01  -2.37268464e-02   1.25696311e-02
  -3.52668914e-04  -1.37485761e-05   5.16361670e-01   2.38820396e-02
   7.59472988e-02  -6.73741795e-03   1.15188606e-04   5.16361671e-01
  -1.63948919e-01   6.72486478e-02

In [15]:
# Hyper-parameters read by the later cells. These are the same values as the literals
# passed to build_poly (4) and ridge_regression (0.01) in the cells above.
# NOTE(review): the find_min cell further down rebinds both names when it runs.
degree_star = 4
lambda_star = 0.01

We apply 5-fold cross-validation to find the best lambda and the best degree.

In [1]:
# Hyper-parameter grid for the 5-fold cross-validation.
# Degrees are the integers 1..5. An earlier note recorded that degrees above 8 make
# the matrix singular, so the grid stays well below that limit.
# np.arange yields integer degrees; np.linspace(1, 5, 5) produced floats (1.0, 2.0, ...)
# for what is an integer quantity.
degrees = np.arange(1, 6)
# 30 log-spaced regularization strengths between 1e-5 and 1.
lambdas = np.logspace(-5, 0, 30)
# The last argument (5) is the number of folds.
# NOTE(review): the following cell calls find_min(rmse_tr, rmse_te, ...) but only
# rmse_te is bound here. If cross_validation returns both the train and test errors,
# unpack them as `rmse_tr, rmse_te = ...` — confirm against functions/ridge_regression.py.
rmse_te = cross_validation(y, tX, lambdas, degrees, 5)

NameError: name 'np' is not defined

Select the lambda and degree that give the minimum value of `rmse_te`.

In [5]:
# find_min receives the error grids together with the lambda and degree grids; its
# two return values are bound as the selected lambda and degree.
# NOTE(review): rmse_tr is not defined by any visible cell (the cross-validation cell
# only binds rmse_te), so this cell raises NameError on a fresh kernel.
lambda_star, degree_star = find_min(rmse_tr, rmse_te, lambdas, degrees)
# %f prints both values with six decimals, including the degree.
print("Lambda* = %f"%lambda_star)
print("Degree* = %f"%degree_star)

NameError: name 'rmse_tr' is not defined

Now that we have the best degree and the best lambda, we can run ridge regression and get the best weights.

In [6]:
# NOTE(review): x_train, x_test, y_train and y_test are not defined in any visible
# cell. Presumably they come from a train/test split (functions.split is imported) —
# the cell that creates them must be restored above this one.
# Build the polynomial expansion of both splits with the selected degree.
tX_train = build_poly(x_train, degree_star)
tX_test = build_poly(x_test, degree_star)
print("Polynomials done")

# Fit ridge regression on the training split with the selected lambda;
# the call returns the loss and the fitted weights.
loss, w_star = ridge_regression(y_train, tX_train, lambda_star)
print("Loss = %f"%(loss))

Polynomials done
Loss = 0.777447


In [7]:
# Score the fitted weights on y_test / tX_test with the project's prediction helper.
# NOTE(review): the recorded output shows good/wrong prediction counts followed by a
# bare number; what that number represents cannot be determined from this notebook.
prediction(y_test, tX_test, w_star) 

Good prediction: 39238/50000 (78.476000%)
Wrong prediction: 10762/50000 (21.524000%)
0.34576


## Generate predictions and save output in CSV format for submission


We retrain on all the data.

In [16]:
# Refit on the full standardized training set (y, tX) with degree_star and lambda_star.
# This rebinds tX_poly, loss and w_star that earlier cells created.
tX_poly = build_poly(tX, degree_star)
loss, w_star = ridge_regression(y, tX_poly, lambda_star)


[  5.16477250e-01   5.16303432e-01   5.16363869e-01   5.16330218e-01
   5.16330218e-01   5.16330218e-01   4.11621381e+00  -2.77130900e+00
  -7.65139638e-01   5.36964281e-01   5.16364332e-01  -2.43673671e-01
   5.04286368e-02   6.40992182e-04  -2.04028028e-04   5.16362293e-01
  -3.19917891e-01   1.00587186e-02   1.86537307e-03  -6.87069610e-05
   5.16358995e-01   6.05861112e-02   5.22699048e-02  -1.15270357e-02
   7.24867193e-04   5.16361649e-01   9.07716991e-01  -6.22626128e+00
  -5.63457818e+00   4.64434286e+00   5.16361649e-01  -1.77554711e+00
   1.11975322e+00  -2.37678094e-01   1.59681632e-02   5.16361650e-01
  -1.22914662e+01  -3.25874859e+01  -3.22540361e+01   2.30291692e+01
   5.16361672e-01   1.86394174e-01  -8.79736063e-02   1.70334495e-02
   7.11149173e-03   5.16361665e-01  -2.37268464e-02   1.25696311e-02
  -3.52668914e-04  -1.37485761e-05   5.16361670e-01   2.38820396e-02
   7.59472988e-02  -6.73741795e-03   1.15188606e-04   5.16361671e-01
  -1.63948919e-01   6.72486478e-02

In [17]:
DATA_TEST_PATH = 'data/test.csv' # TODO: download the test data and supply its path here
# The first return value (the label slot) is discarded; the features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
# NOTE(review): the test features are passed through standardize on their own, so they
# are presumably scaled with the test set's statistics rather than mean_tX / std_tX from
# the training set that w_star was fitted on — confirm this is intended.
# NOTE(review): this rebinds tX_test, a name an earlier cell used for the expanded x_test split.
tX_test, mean_tX_test, std_tX_test = standardize(tX_test)
# Apply the same polynomial degree that was used for the final fit.
tX_test_poly = build_poly(tX_test, degree_star)

In [18]:
OUTPUT_PATH = 'output/LS_RR.csv' # TODO: fill in desired name of output file for submission
# predict_labels takes the fitted weights and the expanded test features and returns the predictions.
y_pred = predict_labels(w_star, tX_test_poly)
# Writing the submission file is currently disabled; uncomment the next line to pass
# ids_test, y_pred and OUTPUT_PATH to create_csv_submission.
#create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [19]:
# Fraction of test predictions equal to -1, relative to the number of test ids.
# np.sum counts the matches in vectorized code; the built-in sum would iterate the
# boolean array element by element in Python. The value and result type are unchanged.
np.sum(y_pred == -1) / len(ids_test)

0.80048676786839312