# ML with Ridge Regression

In this notebook, we will use the functions in the file ridge_regression.py. This time, we will use the 4 data sets and see if the prediction becomes better.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.proj1_helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *

## Jet 0

In [2]:
DATA_TRAIN_PATH = 'data/train_jet_0_with_nan.csv' 
y_0, tX_0, ids_0 = load_csv_data(DATA_TRAIN_PATH)
#tX_0, _, _ = standardize(tX_0)

We do a 5-fold cross validation to find the best lambda and best degree

In [3]:
degrees = np.linspace(3, 10, 8) 
lambdas = np.logspace(-5, 0, 20)
rmse_te_0 = cross_validation(y_0, tX_0, lambdas, degrees, 5, False, 1)

Start the 5-fold Cross Validation!
Degree 3/10 done!
Degree 4/10 done!
Degree 5/10 done!
Degree 6/10 done!
Degree 7/10 done!
Degree 8/10 done!
Degree 9/10 done!
Degree 10/10 done!
5-fold Cross Validation finished!


Get the best parameter with min value for rmse_te

In [4]:
lambda_star_0, degree_star_0 = find_min(rmse_te_0, lambdas, degrees)
print("Lambda* = %f"%lambda_star_0)
print("Degree* = %f"%degree_star_0)

Min for rmse_te: 0.700164
test = 0.700164
Lambda* = 0.014384
Degree* = 5.000000


We can split the data just to see if we have a good prediction.

In [5]:
ratio = 0.8
x_train_0, y_train_0, x_test_0, y_test_0 = split_data(tX_0, y_0, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [6]:
# Build poly first
tX_train_0 = build_poly(x_train_0, degree_star_0)
tX_test_0 = build_poly(x_test_0, degree_star_0)
print("Polynomials done")

# Ridge Regression
loss_0, w_star_0 = ridge_regression(y_train_0, tX_train_0, lambda_star_0)
print("Loss = %f"%(loss_0))

Polynomials done
Loss = 0.690801


In [7]:
prediction(y_test_0, tX_test_0, w_star_0)

Good prediction: 16745/19983 (83.796227%)
Wrong prediction: 3238/19983 (16.203773%)


Retrain on all the train data

In [8]:
tX_poly_0 = build_poly(tX_0, degree_star_0)
loss_0, w_star_0 = ridge_regression(y_0, tX_poly_0, lambda_star_0)
print("Loss = %f"%(loss_0))

Loss = 0.690535


Load the test data and predict.

In [9]:
DATA_TEST_PATH = 'data/test_jet_0_with_nan.csv' # TODO: download train data and supply path here 
_, tX_test_0, ids_test_0 = load_csv_data(DATA_TEST_PATH)
#tX_test, mean_tX_test, std_tX_test = standardize(tX_test)
tX_test_poly_0 = build_poly(tX_test_0, degree_star_0)

y_pred_0 = predict_labels(w_star_0, tX_test_poly_0)

## Jet 1

In [10]:
DATA_TRAIN_PATH = 'data/train_jet_1_with_nan.csv' 
y_1, tX_1, ids_1 = load_csv_data(DATA_TRAIN_PATH)
#tX_1, _, _ = standardize(tX_1)

We do a 5-fold cross validation to find the best lambda and best degree

In [11]:
degrees = np.linspace(3, 10, 8) 
lambdas = np.logspace(-5, 0, 20)
rmse_te_1 = cross_validation(y_1, tX_1, lambdas, degrees, 5, False, 1)

Start the 5-fold Cross Validation!
Degree 3/10 done!
Degree 4/10 done!
Degree 5/10 done!
Degree 6/10 done!
Degree 7/10 done!
Degree 8/10 done!
Degree 9/10 done!
Degree 10/10 done!
5-fold Cross Validation finished!


Get the best parameter with min value for rmse_te

In [12]:
lambda_star_1, degree_star_1 = find_min(rmse_te_1, lambdas, degrees)
print("Lambda* = %f"%lambda_star_1)
print("Degree* = %f"%degree_star_1)

Min for rmse_te: 0.778513
test = 0.778513
Lambda* = 1.000000
Degree* = 10.000000


We can split the data just to see if we have a good prediction.

In [13]:
ratio = 0.8
x_train_1, y_train_1, x_test_1, y_test_1 = split_data(tX_1, y_1, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [14]:
# Build poly first
tX_train_1 = build_poly(x_train_1, degree_star_1)
tX_test_1 = build_poly(x_test_1, degree_star_1)
print("Polynomials done")

# Ridge Regression
loss_1, w_star_1 = ridge_regression(y_train_1, tX_train_1, lambda_star_1)
print("Loss = %f"%(loss_1))

Polynomials done
Loss = 0.762857


In [15]:
prediction(y_test_1, tX_test_1, w_star_1)

Good prediction: 12430/15509 (80.147011%)
Wrong prediction: 3079/15509 (19.852989%)


Retrain on all the train data

In [16]:
tX_poly_1 = build_poly(tX_1, degree_star_1)
loss_1, w_star_1 = ridge_regression(y_1, tX_poly_1, lambda_star_1)
print("Loss = %f"%(loss_1))

Loss = 0.762937


Load the test data and predict.

In [17]:
DATA_TEST_PATH = 'data/test_jet_1_with_nan.csv' # TODO: download train data and supply path here 
_, tX_test_1, ids_test_1 = load_csv_data(DATA_TEST_PATH)
#tX_test, mean_tX_test, std_tX_test = standardize(tX_test)
tX_test_poly_1 = build_poly(tX_test_1, degree_star_1)

y_pred_1 = predict_labels(w_star_1, tX_test_poly_1)

## Jet 2

In [18]:
DATA_TRAIN_PATH = 'data/train_jet_2_with_nan.csv' 
y_2, tX_2, ids_2 = load_csv_data(DATA_TRAIN_PATH)
#tX_2, _, _ = standardize(tX_2)

We do a 5-fold cross validation to find the best lambda and best degree

In [19]:
degrees = np.linspace(3, 10, 8) 
lambdas = np.logspace(-5, 0, 20)
rmse_te_2 = cross_validation(y_2, tX_2, lambdas, degrees, 5, False, 1)

Start the 5-fold Cross Validation!
Degree 3/10 done!
Degree 4/10 done!
Degree 5/10 done!
Degree 6/10 done!
Degree 7/10 done!
Degree 8/10 done!
Degree 9/10 done!
Degree 10/10 done!
5-fold Cross Validation finished!


Get the best parameter with min value for rmse_te

In [20]:
lambda_star_2, degree_star_2 = find_min(rmse_te_2, lambdas, degrees)
print("Lambda* = %f"%lambda_star_2)
print("Degree* = %f"%degree_star_2)

Min for rmse_te: 0.783230
test = 0.783230
Lambda* = 0.000010
Degree* = 6.000000


We can split the data just to see if we have a good prediction.

In [21]:
ratio = 0.8
x_train_2, y_train_2, x_test_2, y_test_2 = split_data(tX_2, y_2, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [22]:
# Build poly first
tX_train_2 = build_poly(x_train_2, degree_star_2)
tX_test_2 = build_poly(x_test_2, degree_star_2)
print("Polynomials done")

# Ridge Regression
loss_2, w_star_2 = ridge_regression(y_train_2, tX_train_2, lambda_star_2)
print("Loss = %f"%(loss_2))

Polynomials done
Loss = 0.755808


In [23]:
prediction(y_test_2, tX_test_2, w_star_2)

Good prediction: 8164/10076 (81.024216%)
Wrong prediction: 1912/10076 (18.975784%)


Retrain on all the train data

In [24]:
tX_poly_2 = build_poly(tX_2, degree_star_2)
loss_2, w_star_2 = ridge_regression(y_2, tX_poly_2, lambda_star_2)
print("Loss = %f"%(loss_2))

Loss = 0.756214


Load the test data and predict.

In [25]:
DATA_TEST_PATH = 'data/test_jet_2_with_nan.csv' # TODO: download train data and supply path here 
_, tX_test_2, ids_test_2 = load_csv_data(DATA_TEST_PATH)
#tX_test, mean_tX_test, std_tX_test = standardize(tX_test)
tX_test_poly_2 = build_poly(tX_test_2, degree_star_2)

y_pred_2 = predict_labels(w_star_2, tX_test_poly_2)

## Jet 3

In [26]:
DATA_TRAIN_PATH = 'data/train_jet_3_with_nan.csv' 
y_3, tX_3, ids_3 = load_csv_data(DATA_TRAIN_PATH)
#tX_3, _, _ = standardize(tX_3)

We do a 5-fold cross validation to find the best lambda and best degree

In [27]:
degrees = np.linspace(3, 10, 8) 
lambdas = np.logspace(-5, 0, 20)
rmse_te_3 = cross_validation(y_3, tX_3, lambdas, degrees, 5, False, 1)

Start the 5-fold Cross Validation!
Degree 3/10 done!
Degree 4/10 done!
Degree 5/10 done!
Degree 6/10 done!
Degree 7/10 done!
Degree 8/10 done!
Degree 9/10 done!
Degree 10/10 done!
5-fold Cross Validation finished!


Get the best parameter with min value for rmse_te

In [28]:
lambda_star_3, degree_star_3 = find_min(rmse_te_3, lambdas, degrees)
print("Lambda* = %f"%lambda_star_2)
print("Degree* = %f"%degree_star_2)

Min for rmse_te: 0.788872
test = 0.788872
Lambda* = 0.000010
Degree* = 6.000000


We can split the data just to see if we have a good prediction.

In [29]:
ratio = 0.8
x_train_3, y_train_3, x_test_3, y_test_3 = split_data(tX_3, y_3, ratio)

Now, that we have the best degree and best lambda, we can do the Ridge Regression and get the best weights. 

In [30]:
# Build poly first
tX_train_3 = build_poly(x_train_3, degree_star_3)
tX_test_3 = build_poly(x_test_3, degree_star_3)
print("Polynomials done")

# Ridge Regression
loss_3, w_star_3 = ridge_regression(y_train_3, tX_train_3, lambda_star_3)
print("Loss = %f"%(loss_3))

Polynomials done
Loss = 0.771266


In [31]:
prediction(y_test_3, tX_test_3, w_star_3)

Good prediction: 3521/4433 (79.427025%)
Wrong prediction: 912/4433 (20.572975%)


Retrain on all the train data

In [32]:
tX_poly_3 = build_poly(tX_3, degree_star_3)
loss_3, w_star_3 = ridge_regression(y_3, tX_poly_3, lambda_star_3)
print("Loss = %f"%(loss_3))

Loss = 0.774088


Load the test data and predict.

In [33]:
DATA_TEST_PATH = 'data/test_jet_3_with_nan.csv' # TODO: download train data and supply path here 
_, tX_test_3, ids_test_3 = load_csv_data(DATA_TEST_PATH)
#tX_test, mean_tX_test, std_tX_test = standardize(tX_test)
tX_test_poly_3 = build_poly(tX_test_3, degree_star_3)

y_pred_3 = predict_labels(w_star_3, tX_test_poly_3)

## Put the prediction back together

In [34]:
length = len(y_pred_0) + len(y_pred_1) + len(y_pred_2) + len(y_pred_3)
length

568238

In [35]:
ids_test = []
y_pred = []

idx = min(ids_test_0[0], ids_test_1[0], ids_test_2[0], ids_test_3[0])

for i in range(length):
    if ids_test_0[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_0[0])
        ids_test_0 = np.delete(ids_test_0, 0)
        y_pred_0 = np.delete(y_pred_0, 0)        
        
    elif ids_test_1[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_1[0])
        ids_test_1 = np.delete(ids_test_1, 0)
        y_pred_1 = np.delete(y_pred_1, 0)
        
    elif ids_test_2[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_2[0])
        ids_test_2 = np.delete(ids_test_2, 0)
        y_pred_2 = np.delete(y_pred_2, 0)
        
    elif ids_test_3[0] == idx:
        ids_test.append(idx)
        y_pred.append(y_pred_3[0])
        ids_test_3 = np.delete(ids_test_3, 0)
        y_pred_3 = np.delete(y_pred_3, 0)

    else:
        print("Problem with idx: %i"%idx)
        
    if (i % 100000 == 0):
        print(i)
        
    idx += 1

0
100000
200000
300000
400000
500000


In [36]:
len(y_pred)

568238

In [37]:
y_pred = np.array(y_pred)
ids_test = np.array(ids_test)

In [38]:
OUTPUT_PATH = 'output/RR_clean_with_nan.csv' # TODO: fill in desired name of output file for submission
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [39]:
sum(y_pred==1)/len(y_pred)

0.30951115553694053