# ML with Ridge Regression

In this notebook, we will use the functions in the file ridge_regression.py. This time, we will use the 4 data sets and see if the prediction becomes better.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.proj1_helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *

## Jet 0

In [114]:
DATA_TRAIN_PATH = 'data/train_jet_0_cv.csv' 
y_0, tX_0, ids_0 = load_csv_data(DATA_TRAIN_PATH)
tX_0, _, _ = standardize(tX_0)

We do a 5-fold cross validation to find the best lambda and best degree

In [115]:
degrees = np.arange(3, 10) 
deg_lambdas = np.arange(-10, 5)

min_loss_0, degree_star_0, lambda_star_0 = cross_validation(y_0, tX_0, 
                                            deg_lambdas, degrees, k_fold=5, digits=3)

print("Min Loss = %f"%min_loss_0)
print("Lambda* = %10.3e"%lambda_star_0)
print("Degree* = %i"%degree_star_0)    

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  3.000e+02 with RMSE 0.737740
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 2.

In [63]:
# Just to avoid retraining =)
#lambda_star_0 = 6.9e-1
#degree_star_0 = 7

We can split the data just to see if we have a good prediction.

In [128]:
ratio = 0.8
x_train_0, y_train_0, x_test_0, y_test_0 = split_data(tX_0, y_0, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [129]:
# Build poly first
tX_train_0 = build_poly(x_train_0, degree_star_0)
tX_test_0 = build_poly(x_test_0, degree_star_0)
print("Polynomials done")

# Ridge Regression
loss_0, w_star_0 = ridge_regression(y_train_0, tX_train_0, lambda_star_0)
print("Loss = %f"%(loss_0))

Polynomials done
Loss = 0.685650


In [130]:
prediction(y_test_0, tX_test_0, w_star_0)

Good prediction: 16740/19983 (83.771206%)
Wrong prediction: 3243/19983 (16.228794%)


Retrain on all the train data

In [131]:
tX_poly_0 = build_poly(tX_0, degree_star_0)
loss_0, w_star_0 = ridge_regression(y_0, tX_poly_0, lambda_star_0)
print("Loss = %f"%(loss_0))

Loss = 0.686666


Load the test data and predict.

In [237]:
DATA_TEST_PATH = 'data/test_jet_0_cv.csv' # TODO: download train data and supply path here 
_, tX_test_0, ids_test_0 = load_csv_data(DATA_TEST_PATH)
tX_test_0, _, _ = standardize(tX_test_0)
tX_test_poly_0 = build_poly(tX_test_0, degree_star_0)

y_pred_0 = predict_labels(w_star_0, tX_test_poly_0)

## Jet 1

In [155]:
DATA_TRAIN_PATH = 'data/train_jet_1_cv.csv' 
y_1, tX_1, ids_1 = load_csv_data(DATA_TRAIN_PATH)
#tX_1, _, _ = standardize(tX_1)

We do a 5-fold cross validation to find the best lambda and best degree

In [156]:
degrees = np.arange(1, 12) 
deg_lambdas = np.arange(-10, 5)

min_loss_1, degree_star_1, lambda_star_1 = cross_validation(y_1, tX_1, 
                                            deg_lambdas, degrees, k_fold=5, digits=3)

print("Min Loss = %f"%min_loss_1)
print("Lambda* = %10.3e"%lambda_star_1)
print("Degree* = %i"%degree_star_1) 

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  4.570e+02 with RMSE 0.877243
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 2.

In [157]:
# Just to avoid retraining =)
#lambda_star_1 = 9e-9
#degree_star_1 = 10

We can split the data just to see if we have a good prediction.

In [164]:
ratio = 0.8
x_train_1, y_train_1, x_test_1, y_test_1 = split_data(tX_1, y_1, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [167]:
# Build poly first
tX_train_1 = build_poly(x_train_1, degree_star_1)
tX_test_1 = build_poly(x_test_1, degree_star_1)
print("Polynomials done")

# Ridge Regression
loss_1, w_star_1 = ridge_regression(y_train_1, tX_train_1, lambda_star_1)
print("Loss = %f"%(loss_1))

Polynomials done
Loss = 0.762413
  Iter=0, loss=0.6431948757027607, diff=0.6431948757027607
  Iter=100, loss=0.6368030018930573, diff=-0.006391873809703408
  Iter=200, loss=0.6367997576149236, diff=-3.244278133696099e-06
  Iter=300, loss=0.6367999190971673, diff=1.6148224368794928e-07
  Iter=400, loss=0.6368000805770678, diff=1.614799005622558e-07
  Iter=500, loss=0.636800242053002, diff=1.6147593417947803e-07
  Iter=600, loss=0.6368004035316852, diff=1.614786832027093e-07
  Iter=700, loss=0.6368005650113883, diff=1.6147970305357973e-07
  Iter=800, loss=0.6368007794753664, diff=2.144639781320734e-07
  Iter=900, loss=0.6367941737920172, diff=-6.605683349203417e-06
  Iter=999, loss=0.6368027223842582, diff=8.548592240975594e-06
Min loss = 0.636197


In [166]:
print("Without Gradient")
prediction(y_test_1, tX_test_1, w_star_1)
print("")
print("With Gradient")
prediction(y_test_1, tX_test_1, w_star_1_GD)


Without Gradient
Good prediction: 12430/15509 (80.147011%)
Wrong prediction: 3079/15509 (19.852989%)

With Gradient
Good prediction: 12445/15509 (80.243729%)
Wrong prediction: 3064/15509 (19.756271%)


Retrain on all the train data

In [168]:
tX_poly_1 = build_poly(tX_1, degree_star_1)
loss_1, w_star_1 = ridge_regression(y_1, tX_poly_1, lambda_star_1)
print("Loss = %f"%(loss_1))

# GD
max_iters = 1000
gamma = 1

losses, ws = gradient_descent(y_1, tX_poly_1, w_star_1, max_iters, gamma, 'MAE')
w_star_1_GD, min_loss = get_best_model(losses, ws)
print("Min loss = %f"%(min_loss))

Loss = 0.763001
  Iter=0, loss=0.6439864046804787, diff=0.6439864046804787
  Iter=11, loss=0.6375636873096008, diff=-0.006422717370877962
Min loss = 0.637562


Load the test data and predict.

In [238]:
DATA_TEST_PATH = 'data/test_jet_1_cv.csv' # TODO: download train data and supply path here 
_, tX_test_1, ids_test_1 = load_csv_data(DATA_TEST_PATH)
#tX_test_1, _, _ = standardize(tX_test_1)
tX_test_poly_1 = build_poly(tX_test_1, degree_star_1)

y_pred_1 = predict_labels(w_star_1_GD, tX_test_poly_1)

## Jet 2

In [83]:
DATA_TRAIN_PATH = 'data/train_jet_2_cv.csv' 
y_2, tX_2, ids_2 = load_csv_data(DATA_TRAIN_PATH)
#tX_2, _, _ = standardize(tX_2)

We do a 5-fold cross validation to find the best lambda and best degree

In [84]:
degrees = np.arange(1, 12) 
deg_lambdas = np.arange(-10, 5)

min_loss_2, degree_star_2, lambda_star_2 = cross_validation(y_2, tX_2, 
                                            deg_lambdas, degrees, k_fold=5, digits=3)

print("Min Loss = %f"%min_loss_2)
print("Lambda* = %10.3e"%lambda_star_2)
print("Degree* = %i"%degree_star_2) 

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  7.590e-01 with RMSE 0.855689
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 2.

In [85]:
# Just to avoid retraining =)
#lambda_star_2 = 1.06e-9
#degree_star_2 = 6

We can split the data just to see if we have a good prediction.

In [181]:
ratio = 0.8
x_train_2, y_train_2, x_test_2, y_test_2 = split_data(tX_2, y_2, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [182]:
# Build poly first
tX_train_2 = build_poly(x_train_2, degree_star_2)
tX_test_2 = build_poly(x_test_2, degree_star_2)
print("Polynomials done")

# Ridge Regression
loss_2, w_star_2 = ridge_regression(y_train_2, tX_train_2, lambda_star_2)
print("Loss = %f"%(loss_2))

Polynomials done
Loss = 0.755694


In [183]:
prediction(y_test_2, tX_test_2, w_star_2)

Good prediction: 8180/10076 (81.183009%)
Wrong prediction: 1896/10076 (18.816991%)


Retrain on all the train data

In [89]:
tX_poly_2 = build_poly(tX_2, degree_star_2)
print(tX_2.shape)
loss_2, w_star_2 = ridge_regression(y_2, tX_poly_2, lambda_star_2)
print("Loss = %f"%(loss_2))

(50379, 29)
Loss = 0.756214


Load the test data and predict.

In [239]:
DATA_TEST_PATH = 'data/test_jet_2_cv.csv' # TODO: download train data and supply path here 
_, tX_test_2, ids_test_2 = load_csv_data(DATA_TEST_PATH)
#tX_test_2, _, _ = standardize(tX_test_2)
tX_test_poly_2 = build_poly(tX_test_2, degree_star_2)

y_pred_2 = predict_labels(w_star_2, tX_test_poly_2)

## Jet 3

In [225]:
DATA_TRAIN_PATH = 'data/train_jet_3_cv.csv' 
y_3, tX_3, ids_3 = load_csv_data(DATA_TRAIN_PATH)
#tX_3, _, _ = standardize(tX_3)

We do a 5-fold cross validation to find the best lambda and best degree

In [215]:
degrees = np.arange(1, 10) 
deg_lambdas = np.arange(-10, 5)

min_loss_3, degree_star_3, lambda_star_3 = cross_validation(y_3, tX_3, 
                                            deg_lambdas, degrees, k_fold=5, digits=3)

print("Min Loss = %f"%min_loss_3)
print("Lambda* = %10.3e"%lambda_star_3)
print("Degree* = %i"%degree_star_3) 

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  3.020e-10 with RMSE 0.859004
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 2.

In [226]:
# Just to avoid retraining =)
lambda_star_3 = 2
degree_star_3 = 5

We can split the data just to see if we have a good prediction.

In [231]:
ratio = 0.8
x_train_3, y_train_3, x_test_3, y_test_3 = split_data(tX_3, y_3, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [232]:
# Build poly first
tX_train_3 = build_poly(x_train_3, degree_star_3)
tX_test_3 = build_poly(x_test_3, degree_star_3)
print("Polynomials done")

# Ridge Regression
loss_3, w_star_3 = ridge_regression(y_train_3, tX_train_3, lambda_star_3)
print("Loss = %f"%(loss_3))

Polynomials done
Loss = 0.772975
  Iter=0, loss=0.6600310021490399, diff=0.6600310021490399
  Iter=100, loss=0.6458276250260786, diff=-0.01420337712296127
  Iter=200, loss=0.6458302220382908, diff=2.597012212168792e-06
  Iter=300, loss=0.6458185848989024, diff=-1.1637139388342987e-05
  Iter=400, loss=0.6458211837073551, diff=2.5988084526584743e-06
  Iter=500, loss=0.6458237804552517, diff=2.5967478965993607e-06
  Iter=600, loss=0.6458263773097115, diff=2.596854459802067e-06
  Iter=700, loss=0.6458289742707256, diff=2.5969610141229893e-06
  Iter=800, loss=0.6458315785794254, diff=2.6043086998051024e-06
  Iter=900, loss=0.645819939326193, diff=-1.1639253232442748e-05
  Iter=999, loss=0.645946454950087, diff=0.00012651562389398574
Min loss = 0.643913


In [233]:
prediction(y_test_3, tX_test_3, w_star_3)

Without Gradient
Good prediction: 3528/4433 (79.584931%)
Wrong prediction: 905/4433 (20.415069%)

With Gradient
Good prediction: 3528/4433 (79.584931%)
Wrong prediction: 905/4433 (20.415069%)


Retrain on all the train data

In [234]:
tX_poly_3 = build_poly(tX_3, degree_star_3)
loss_3, w_star_3 = ridge_regression(y_3, tX_poly_3, lambda_star_3)
print("Loss = %f"%(loss_3))

Loss = 0.774098


Load the test data and predict.

In [240]:
DATA_TEST_PATH = 'data/test_jet_3_cv.csv' # TODO: download train data and supply path here 
_, tX_test_3, ids_test_3 = load_csv_data(DATA_TEST_PATH)
#tX_test_3, _, _ = standardize(tX_test_3)
tX_test_poly_3 = build_poly(tX_test_3, degree_star_3)

y_pred_3 = predict_labels(w_star_3, tX_test_poly_3)

## Put the prediction back together

In [241]:
length = len(y_pred_0) + len(y_pred_1) + len(y_pred_2) + len(y_pred_3)
print(len(y_pred_0))
print(len(y_pred_1))
print(len(y_pred_2))
print(len(y_pred_3))
length

227458
175338
114648
50794


568238

In [242]:
ids_test = []
y_pred = []

idx = min(ids_test_0[0], ids_test_1[0], ids_test_2[0], ids_test_3[0])

for i in range(length):
    if ids_test_0[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_0[0])
        ids_test_0 = np.delete(ids_test_0, 0)
        y_pred_0 = np.delete(y_pred_0, 0)        
        
    elif ids_test_1[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_1[0])
        ids_test_1 = np.delete(ids_test_1, 0)
        y_pred_1 = np.delete(y_pred_1, 0)
        
    elif ids_test_2[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_2[0])
        ids_test_2 = np.delete(ids_test_2, 0)
        y_pred_2 = np.delete(y_pred_2, 0)
        
    elif ids_test_3[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_3[0])
        ids_test_3 = np.delete(ids_test_3, 0)
        y_pred_3 = np.delete(y_pred_3, 0)

    else:
        print("Problem with idx: %i"%idx)
        
    if (i % 100000 == 0):
        print(i)
        
    idx += 1

0
100000
200000
300000
400000
500000


In [243]:
len(y_pred)

568238

In [244]:
y_pred = np.array(y_pred)
ids_test = np.array(ids_test)

In [245]:
OUTPUT_PATH = 'output/RR.csv' # TODO: fill in desired name of output file for submission
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [246]:
sum(y_pred==1)/len(y_pred)

0.31306424420753276