# ML with Ridge Regression

In this notebook, we use the functions from the file ridge_regression.py. This time, we fit one model per jet data set (jets 0 to 3) and check whether the predictions improve.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *

## Jet 0

In [2]:
# Load the jet-0 training subset. load_csv_data returns three values, bound here
# to y_0, tX_0 and ids_0 (presumably labels, feature matrix and event ids).
DATA_TRAIN_PATH = 'data/train_jet_0.csv' 
y_0, tX_0, ids_0 = load_csv_data(DATA_TRAIN_PATH)
# Keep only the first value returned by standardize; the other two are discarded.
# NOTE(review): jet 0 is the only subset that is standardized -- the equivalent
# call is commented out for jets 1-3. Confirm that this difference is intended.
tX_0, _, _ = standardize(tX_0)

We run a 5-fold cross-validation to find the best lambda and the best polynomial degree.

In [3]:
# Search grid handed to cross_validation: candidate degrees 3..14 and the
# integer range -10..4 as `deg_lambdas`.
degrees = np.arange(3, 15)
deg_lambdas = np.arange(-10, 5)

# 5-fold cross-validation over the grid for the jet-0 subset.
min_loss_0, degree_star_0, lambda_star_0 = cross_validation(
    y_0, tX_0, deg_lambdas, degrees, k_fold=5, digits=3
)

# Report the three returned values, one per line.
print(
    "Min Loss = %f" % min_loss_0,
    "Lambda* = %10.3e" % lambda_star_0,
    "Degree* = %i" % degree_star_0,
    sep="\n",
)

Start the 5-fold Cross Validation!
Start degree 3
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 3. Best lambda is  2.970e+02 with percentage wrong pred 0.172355
--------------------
Start degree 4
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
F

In [4]:
# Uncomment to hard-code the hyper-parameters and avoid re-running the cross-validation above.
#lambda_star_0 = 6.5e-1
#degree_star_0 = 7

We can split the data just to see if we have a good prediction.

In [5]:
# Hold-out split, used only as a sanity check of the selected hyper-parameters.
# `ratio` is passed to split_data unchanged -- presumably the training fraction; confirm.
# NOTE(review): no seed is passed, so whether this split is reproducible depends
# on split_data's defaults.
ratio = 0.8
x_train_0, y_train_0, x_test_0, y_test_0 = split_data(tX_0, y_0, ratio)

Now that we have the best degree and the best lambda, we can run the ridge regression and obtain the best weights.

In [6]:
# Polynomial expansion of both halves of the hold-out split, training half first,
# using the degree selected for jet 0.
tX_train_0, tX_test_0 = (
    build_poly(part, degree_star_0) for part in (x_train_0, x_test_0)
)
print("Polynomials done")

# Ridge regression on the training half with the lambda selected for jet 0.
loss_0, w_star_0 = ridge_regression(y_train_0, tX_train_0, lambda_star_0)
print("Loss = %f" % loss_0)

Polynomials done
Loss = 0.818281


In [7]:
# Score the jet-0 hold-out split with the weights fitted on its training half;
# the helper prints the counts of good and wrong predictions.
prediction(y_test_0, tX_test_0, w_star_0)

Good prediction: 16744/19983 (83.791223%)
Wrong prediction: 3239/19983 (16.208777%)


Retrain on all the train data

In [8]:
# Polynomial expansion of the complete jet-0 training matrix.
tX_poly_0 = build_poly(tX_0, degree_star_0)

# Refit on all jet-0 training rows; this rebinds loss_0 and w_star_0.
loss_0, w_star_0 = ridge_regression(y_0, tX_poly_0, lambda_star_0)
print("Loss = %f" % loss_0)

Loss = 4.089678


Load the test data and predict.

In [9]:
# Load the jet-0 test subset; the first value returned by load_csv_data is discarded.
DATA_TEST_PATH = 'data/test_jet_0.csv' # path to the jet-0 test subset
_, tX_test_0, ids_test_0 = load_csv_data(DATA_TEST_PATH)
# NOTE(review): the test matrix gets its own, separate standardize call, just as
# the training matrix did, and the other two return values are discarded.
# If standardize derives its statistics from its input, train and test features
# end up on different scales -- confirm this is intended.
# NOTE(review): tX_test_0 is rebound here; it previously held the polynomial
# features of the hold-out split.
tX_test_0, _, _ = standardize(tX_test_0)
tX_test_poly_0 = build_poly(tX_test_0, degree_star_0)

# Predict labels for the jet-0 test subset with the current w_star_0.
y_pred_0 = predict_labels(w_star_0, tX_test_poly_0)

## Jet 1

In [10]:
# Load the jet-1 training subset. load_csv_data returns three values, bound here
# to y_1, tX_1 and ids_1 (presumably labels, feature matrix and event ids).
DATA_TRAIN_PATH = 'data/train_jet_1.csv' 
y_1, tX_1, ids_1 = load_csv_data(DATA_TRAIN_PATH)
# Standardization is disabled for this jet (it is applied for jet 0).
#tX_1, _, _ = standardize(tX_1)

We run a 5-fold cross-validation to find the best lambda and the best polynomial degree.

In [11]:
# Search grid handed to cross_validation: candidate degrees 1..14 and the
# integer range -10..4 as `deg_lambdas`.
degrees = np.arange(1, 15)
deg_lambdas = np.arange(-10, 5)

# 5-fold cross-validation over the grid for the jet-1 subset.
min_loss_1, degree_star_1, lambda_star_1 = cross_validation(
    y_1, tX_1, deg_lambdas, degrees, k_fold=5, digits=3
)

# Report the three returned values, one per line.
print(
    "Min Loss = %f" % min_loss_1,
    "Lambda* = %10.3e" % lambda_star_1,
    "Degree* = %i" % degree_star_1,
    sep="\n",
)

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  4.220e+00 with percentage wrong pred 0.305649
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
F

In [12]:
# Uncomment to hard-code the hyper-parameters and avoid re-running the cross-validation above.
#lambda_star_1 = 9e-9
#degree_star_1 = 10

We can split the data just to see if we have a good prediction.

In [13]:
# Hold-out split, used only as a sanity check of the selected hyper-parameters.
# `ratio` is passed to split_data unchanged -- presumably the training fraction; confirm.
# NOTE(review): no seed is passed, so whether this split is reproducible depends
# on split_data's defaults.
ratio = 0.8
x_train_1, y_train_1, x_test_1, y_test_1 = split_data(tX_1, y_1, ratio)

Now that we have the best degree and the best lambda, we can run the ridge regression and obtain the best weights.

In [14]:
# Polynomial expansion of both halves of the hold-out split, training half first,
# using the degree selected for jet 1.
tX_train_1, tX_test_1 = (
    build_poly(part, degree_star_1) for part in (x_train_1, x_test_1)
)
print("Polynomials done")

# Ridge regression on the training half with the lambda selected for jet 1.
loss_1, w_star_1 = ridge_regression(y_train_1, tX_train_1, lambda_star_1)
print("Loss = %f" % loss_1)

Polynomials done
Loss = 0.763316


In [15]:
# Score the jet-1 hold-out split with the weights fitted on its training half;
# the helper prints the counts of good and wrong predictions.
prediction(y_test_1, tX_test_1, w_star_1)


Good prediction: 12416/15509 (80.056741%)
Wrong prediction: 3093/15509 (19.943259%)


Retrain on all the train data

In [16]:
# Polynomial expansion of the complete jet-1 training matrix.
tX_poly_1 = build_poly(tX_1, degree_star_1)

# Refit on all jet-1 training rows; this rebinds loss_1 and w_star_1.
loss_1, w_star_1 = ridge_regression(y_1, tX_poly_1, lambda_star_1)
print("Loss = %f" % loss_1)

Loss = 0.763572


Load the test data and predict.

In [17]:
# Load the jet-1 test subset; the first value returned by load_csv_data is discarded.
DATA_TEST_PATH = 'data/test_jet_1.csv' # path to the jet-1 test subset
_, tX_test_1, ids_test_1 = load_csv_data(DATA_TEST_PATH)
# Standardization is disabled here, matching the jet-1 training cell.
# NOTE(review): tX_test_1 is rebound above; it previously held the polynomial
# features of the hold-out split.
#tX_test_1, _, _ = standardize(tX_test_1)
tX_test_poly_1 = build_poly(tX_test_1, degree_star_1)

# Predict labels for the jet-1 test subset with the current w_star_1.
y_pred_1 = predict_labels(w_star_1, tX_test_poly_1)

## Jet 2

In [None]:
# Load the jet-2 training subset. load_csv_data returns three values, bound here
# to y_2, tX_2 and ids_2 (presumably labels, feature matrix and event ids).
DATA_TRAIN_PATH = 'data/train_jet_2.csv' 
y_2, tX_2, ids_2 = load_csv_data(DATA_TRAIN_PATH)
# Standardization is disabled for this jet (it is applied for jet 0).
#tX_2, _, _ = standardize(tX_2)

We run a 5-fold cross-validation to find the best lambda and the best polynomial degree.

In [None]:
# Search grid handed to cross_validation: candidate degrees 1..14 and the
# integer range -10..4 as `deg_lambdas`.
degrees = np.arange(1, 15)
deg_lambdas = np.arange(-10, 5)

# 5-fold cross-validation over the grid for the jet-2 subset.
min_loss_2, degree_star_2, lambda_star_2 = cross_validation(
    y_2, tX_2, deg_lambdas, degrees, k_fold=5, digits=3
)

# Report the three returned values, one per line.
print(
    "Min Loss = %f" % min_loss_2,
    "Lambda* = %10.3e" % lambda_star_2,
    "Degree* = %i" % degree_star_2,
    sep="\n",
)

Start the 5-fold Cross Validation!
Start degree 1
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
Finished Degree 1. Best lambda is  9.160e+02 with percentage wrong pred 0.273350
--------------------
Start degree 2
  Start for digit 1
    Power of lambda: -10
    Power of lambda: -9
    Power of lambda: -8
    Power of lambda: -7
    Power of lambda: -6
    Power of lambda: -5
    Power of lambda: -4
    Power of lambda: -3
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0
    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
  Start for digit 2
  Start for digit 3
F

In [None]:
# Uncomment to hard-code the hyper-parameters and avoid re-running the cross-validation above.
#lambda_star_2 = 1.06e-9
#degree_star_2 = 6

We can split the data just to see if we have a good prediction.

In [None]:
# Hold-out split, used only as a sanity check of the selected hyper-parameters.
# `ratio` is passed to split_data unchanged -- presumably the training fraction; confirm.
# NOTE(review): no seed is passed, so whether this split is reproducible depends
# on split_data's defaults.
ratio = 0.8
x_train_2, y_train_2, x_test_2, y_test_2 = split_data(tX_2, y_2, ratio)

Now that we have the best degree and the best lambda, we can run the ridge regression and obtain the best weights.

In [None]:
# Polynomial expansion of both halves of the hold-out split, training half first,
# using the degree selected for jet 2.
tX_train_2, tX_test_2 = (
    build_poly(part, degree_star_2) for part in (x_train_2, x_test_2)
)
print("Polynomials done")

# Ridge regression on the training half with the lambda selected for jet 2.
loss_2, w_star_2 = ridge_regression(y_train_2, tX_train_2, lambda_star_2)
print("Loss = %f" % loss_2)

In [None]:
# Score the jet-2 hold-out split with the weights fitted on its training half.
prediction(y_test_2, tX_test_2, w_star_2)

Retrain on all the train data

In [None]:
# Polynomial expansion of the complete jet-2 training matrix.
tX_poly_2 = build_poly(tX_2, degree_star_2)

# Shape of the un-expanded jet-2 feature matrix, printed as a quick check.
print(tX_2.shape)

# Refit on all jet-2 training rows; this rebinds loss_2 and w_star_2.
loss_2, w_star_2 = ridge_regression(y_2, tX_poly_2, lambda_star_2)
print("Loss = %f" % loss_2)

Load the test data and predict.

In [None]:
# Load the jet-2 test subset; the first value returned by load_csv_data is discarded.
DATA_TEST_PATH = 'data/test_jet_2.csv' # path to the jet-2 test subset
_, tX_test_2, ids_test_2 = load_csv_data(DATA_TEST_PATH)
# Standardization is disabled here, matching the jet-2 training cell.
# NOTE(review): tX_test_2 is rebound above; it previously held the polynomial
# features of the hold-out split.
#tX_test_2, _, _ = standardize(tX_test_2)
tX_test_poly_2 = build_poly(tX_test_2, degree_star_2)

# Predict labels for the jet-2 test subset with the current w_star_2.
y_pred_2 = predict_labels(w_star_2, tX_test_poly_2)

## Jet 3

In [None]:
# Load the jet-3 training subset. load_csv_data returns three values, bound here
# to y_3, tX_3 and ids_3 (presumably labels, feature matrix and event ids).
DATA_TRAIN_PATH = 'data/train_jet_3.csv' 
y_3, tX_3, ids_3 = load_csv_data(DATA_TRAIN_PATH)
# Standardization is disabled for this jet (it is applied for jet 0).
#tX_3, _, _ = standardize(tX_3)

We run a 5-fold cross-validation to find the best lambda and the best polynomial degree.

In [None]:
# Search grid handed to cross_validation: candidate degrees 1..14 and the
# integer range -10..4 as `deg_lambdas`.
degrees = np.arange(1, 15)
deg_lambdas = np.arange(-10, 5)

# 5-fold cross-validation over the grid for the jet-3 subset.
min_loss_3, degree_star_3, lambda_star_3 = cross_validation(
    y_3, tX_3, deg_lambdas, degrees, k_fold=5, digits=3
)

# Report the three returned values, one per line.
print(
    "Min Loss = %f" % min_loss_3,
    "Lambda* = %10.3e" % lambda_star_3,
    "Degree* = %i" % degree_star_3,
    sep="\n",
)

In [None]:
# Uncomment to hard-code the hyper-parameters and avoid re-running the cross-validation above.
#lambda_star_3 = 2
#degree_star_3 = 5

We can split the data just to see if we have a good prediction.

In [None]:
# Hold-out split, used only as a sanity check of the selected hyper-parameters.
# `ratio` is passed to split_data unchanged -- presumably the training fraction; confirm.
# NOTE(review): no seed is passed, so whether this split is reproducible depends
# on split_data's defaults.
ratio = 0.8
x_train_3, y_train_3, x_test_3, y_test_3 = split_data(tX_3, y_3, ratio)

Now that we have the best degree and the best lambda, we can run the ridge regression and obtain the best weights.

In [None]:
# Polynomial expansion of both halves of the hold-out split, training half first,
# using the degree selected for jet 3.
tX_train_3, tX_test_3 = (
    build_poly(part, degree_star_3) for part in (x_train_3, x_test_3)
)
print("Polynomials done")

# Ridge regression on the training half with the lambda selected for jet 3.
loss_3, w_star_3 = ridge_regression(y_train_3, tX_train_3, lambda_star_3)
print("Loss = %f" % loss_3)

In [None]:
# Score the jet-3 hold-out split with the weights fitted on its training half.
prediction(y_test_3, tX_test_3, w_star_3)

Retrain on all the train data

In [None]:
# Polynomial expansion of the complete jet-3 training matrix.
tX_poly_3 = build_poly(tX_3, degree_star_3)

# Refit on all jet-3 training rows; this rebinds loss_3 and w_star_3.
loss_3, w_star_3 = ridge_regression(y_3, tX_poly_3, lambda_star_3)
print("Loss = %f" % loss_3)

Load the test data and predict.

In [None]:
# Load the jet-3 test subset; the first value returned by load_csv_data is discarded.
DATA_TEST_PATH = 'data/test_jet_3.csv' # path to the jet-3 test subset
_, tX_test_3, ids_test_3 = load_csv_data(DATA_TEST_PATH)
# Standardization is disabled here, matching the jet-3 training cell.
# NOTE(review): tX_test_3 is rebound above; it previously held the polynomial
# features of the hold-out split.
#tX_test_3, _, _ = standardize(tX_test_3)
tX_test_poly_3 = build_poly(tX_test_3, degree_star_3)

# Predict labels for the jet-3 test subset with the current w_star_3.
y_pred_3 = predict_labels(w_star_3, tX_test_poly_3)

## Put the prediction back together

In [None]:
# Number of predictions contributed by each jet subset, one per line.
print(len(y_pred_0), len(y_pred_1), len(y_pred_2), len(y_pred_3), sep="\n")

# Total number of predictions across the four subsets, shown as the cell output.
length = sum(map(len, (y_pred_0, y_pred_1, y_pred_2, y_pred_3)))
length

In [None]:
# Merge the four per-jet predictions into one submission ordered by event id.
#
# The previous element-by-element merge read `ids_test_k[0]` on every iteration,
# which raises IndexError as soon as a subset checked earlier in the if/elif
# chain is exhausted while other subsets still hold rows. It also consumed the
# per-jet arrays with np.delete (quadratic cost, and the cell could not be
# re-run). Concatenating and sorting by id yields the same ordering without
# mutating any input.
ids_all = np.concatenate([ids_test_0, ids_test_1, ids_test_2, ids_test_3])
y_all = np.concatenate([y_pred_0, y_pred_1, y_pred_2, y_pred_3])

# Every id must come with exactly one prediction.
if len(ids_all) != len(y_all):
    raise ValueError(
        "ids and predictions differ in length: %i vs %i" % (len(ids_all), len(y_all))
    )

# Stable sort, so equal ids (not expected) keep their jet order.
order = np.argsort(ids_all, kind="stable")
ids_test = ids_all[order]
y_pred = y_all[order]

# Same sanity check as before: the merged ids are expected to be consecutive.
# For every break in the sequence, report the id that was expected next.
for pos in np.flatnonzero(np.diff(ids_test) != 1):
    print("Problem with idx: %i" % (ids_test[pos] + 1))

In [None]:
# Number of merged predictions; compare it with the total computed before the merge.
len(y_pred)

In [None]:
# Turn the merged predictions and ids into NumPy arrays (predictions first).
y_pred, ids_test = (np.array(seq) for seq in (y_pred, ids_test))

In [None]:
# Write the merged ids and predictions to the submission file.
OUTPUT_PATH = 'output/RR.csv' # destination of the ridge-regression submission
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Fraction of predictions equal to 1. The vectorized mean avoids iterating the
# array element by element with the builtin sum, and np.asarray keeps the
# comparison element-wise even if y_pred is still a plain list.
np.mean(np.asarray(y_pred) == 1)