# ML with Ridge Regression (8 models)

In this notebook, we will use the functions in the file ridge_regression.py. This time, we will use the 8 data sets and see if the prediction becomes better.

In [70]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
DATA_FOLDER = 'data'

"""
TRAINING_DATA = ['train_jet_0_wout_mass_pca.csv' , 'train_jet_0_with_mass_pca.csv',
                 'train_jet_1_wout_mass_pca.csv' , 'train_jet_1_with_mass_pca.csv',
                 'train_jet_2_wout_mass_pca.csv' , 'train_jet_2_with_mass_pca.csv',
                 'train_jet_3_wout_mass_pca.csv' , 'train_jet_3_with_mass_pca.csv']

TEST_DATA = ['test_jet_0_wout_mass_pca.csv' , 'test_jet_0_with_mass_pca.csv',
             'test_jet_1_wout_mass_pca.csv' , 'test_jet_1_with_mass_pca.csv',
             'test_jet_2_wout_mass_pca.csv' , 'test_jet_2_with_mass_pca.csv',
             'test_jet_3_wout_mass_pca.csv' , 'test_jet_3_with_mass_pca.csv']
"""

TRAINING_DATA = ['train_jet_0_wout_mass.csv' , 'train_jet_0_with_mass.csv',
                 'train_jet_1_wout_mass.csv' , 'train_jet_1_with_mass.csv',
                 'train_jet_2_wout_mass.csv' , 'train_jet_2_with_mass.csv',
                 'train_jet_3_wout_mass.csv' , 'train_jet_3_with_mass.csv']

TEST_DATA = ['test_jet_0_wout_mass.csv' , 'test_jet_0_with_mass.csv',
             'test_jet_1_wout_mass.csv' , 'test_jet_1_with_mass.csv',
             'test_jet_2_wout_mass.csv' , 'test_jet_2_with_mass.csv',
             'test_jet_3_wout_mass.csv' , 'test_jet_3_with_mass.csv']

degrees_poly = np.arange(1, 14)
degrees_lambdas = np.arange(-10, 5)

#degrees_poly = np.arange(5,6)
#degrees_lambdas = np.arange(-2, 5)

k_fold = 10
digits = 3

Do the loop on all the training data.
We use CV to find best lambda and best degree and then we use the RR again to get the weights.

In [51]:
lambda_star = []
degree_star = []
perc_right_pred = 0
nbr_labels = 0

for data in TRAINING_DATA:
    # Print that we start the training
    print("Cross-validation with file %s"%data)
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + data
    # Load the file
    y_train, x_train, ids_train = load_csv_data(data_file)
    #x_train, _, _ = standardize(x_train)
    # Do the Cross Validation for the ridge regression
    min_loss, deg, lamb = cross_validation(y_train, x_train, 
                                           degrees_lambdas, degrees_poly,
                                           k_fold, digits, verbose = False)
    # Print some interesting values
    print("  Max pred = %f"%(1-min_loss))
    print("  Lambda* = %10.3e"%lamb)
    print("  Degree* = %i"%deg)
    print("\n")
    lambda_star.append(lamb)
    degree_star.append(deg)
    # RR to get the best weights
    tX_train = ct_poly(x_train, deg)
    _, w_star = ridge_regression(y_train, tX_train, lamb)
    weights.append(w_star)
    
    perc_right_pred += len(y_train)*(1-min_loss)
    nbr_labels += len(y_train)
    
perc_right_pred = perc_right_pred/nbr_labels
print("Percentage of right pred on training set: %f"%perc_right_pred)
    

Cross-validation with file train_jet_0_wout_mass.csv
-----------------------------------------------------
  Max pred = 0.950804
  Lambda* =  9.000e-06
  Degree* = 12


Cross-validation with file train_jet_0_with_mass.csv
-----------------------------------------------------
  Max pred = 0.809840
  Lambda* =  2.120e-02
  Degree* = 9


Cross-validation with file train_jet_1_wout_mass.csv
-----------------------------------------------------
  Max pred = 0.918651
  Lambda* =  1.648e-05
  Degree* = 7


Cross-validation with file train_jet_1_with_mass.csv
-----------------------------------------------------
  Max pred = 0.794441
  Lambda* =  2.700e-04
  Degree* = 9


Cross-validation with file train_jet_2_wout_mass.csv
-----------------------------------------------------
  Max pred = 0.906780
  Lambda* =  2.420e-06
  Degree* = 10


Cross-validation with file train_jet_2_with_mass.csv
-----------------------------------------------------
  Max pred = 0.834247
  Lambda* =  3.090e-04
  Degr

Write the results into *pickle* files.

In [52]:
with open('data/degrees.p', 'wb') as pickle_file:
    pickle.dump(degree_star, pickle_file)
    
with open('data/lambdas.p', 'wb') as pickle_file:
    pickle.dump(lambda_star, pickle_file)

Read the results from the *pickle* files (in case we don't want to train again)

In [29]:
with open('data/degrees.p', 'rb') as pickle_file:
    degree_star = pickle.load(pickle_file)
    
with open('data/lambdas.p', 'rb') as pickle_file:
    lambda_star = pickle.load(pickle_file)

In [119]:
print(degree_star)
print(lambda_star)

[12, 9, 7, 9, 10, 10, 8, 9]
[9.0000000000000002e-06, 0.021199999999999997, 1.6479999999999998e-05, 0.00027000000000000006, 2.4199999999999997e-06, 0.00030899999999999998, 4.000000000000001e-05, 3.6299999999999999e-10]


Do the training (get the weights)

In [120]:
weights = []
triplets = [1, 4, 6]
for i in range(len(TRAINING_DATA)):
    # Print that we start the training
    print("Training with file %s"%TRAINING_DATA[i])
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + TRAINING_DATA[i]
    # Load the file
    y_train, x_train, ids_train = load_csv_data(data_file)
    
    # RR to get the best weights
    if i in triplets:
        tX_train = ct_poly_triplet(x_train, degree_star[i])
        _, w_star = ridge_regression(y_train, tX_train, lambda_star[i]) 
    else:
        tX_train = ct_poly(x_train, degree_star[i])
        _, w_star = ridge_regression(y_train, tX_train, lambda_star[i])   
    prediction(y_train, tX_train, w_star)
    weights.append(w_star)    

Training with file train_jet_0_wout_mass.csv
-----------------------------------------------------
Good prediction: 24850/26123 (95.126900%)
Wrong prediction: 1273/26123 (4.873100%)
Training with file train_jet_0_with_mass.csv
-----------------------------------------------------
Good prediction: 60150/73789 (81.516215%)
Wrong prediction: 13639/73789 (18.483785%)
Training with file train_jet_1_wout_mass.csv
-----------------------------------------------------
Good prediction: 7001/7562 (92.581328%)
Wrong prediction: 561/7562 (7.418672%)
Training with file train_jet_1_with_mass.csv
-----------------------------------------------------
Good prediction: 55904/69982 (79.883399%)
Wrong prediction: 14078/69982 (20.116601%)
Training with file train_jet_2_wout_mass.csv
-----------------------------------------------------
Good prediction: 2952/2952 (100.000000%)
Wrong prediction: 0/2952 (0.000000%)
Training with file train_jet_2_with_mass.csv
--------------------------------------------------

Do the loop on the test data to get the predicted labels

In [133]:
y_pred = []
ids_pred = []

for idx, data in enumerate(TEST_DATA):
    # Print that we start the testing
    print("Testing with file %s"%data)
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + data
    # Load the file
    _, x_test, ids_test = load_csv_data(data_file)
    # Build the polynomial
    if idx in triplets:
        tX_test = ct_poly_triplet(x_test, degree_star[idx])
    else:
        tX_test = ct_poly(x_test, degree_star[idx])
    
    # Predict the labels
    y_pred.append(predict_labels(weights[idx], tX_test)) 
    ids_pred.append(ids_test)

Testing with file test_jet_0_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_0_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_1_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_1_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_2_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_2_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_3_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_3_with_mass.csv
-----------------------------------------------------


In [134]:
ids = []
pred = []

idx = min(ids_pred[:][0])

length = np.sum(len(i) for i in y_pred)

for i in range(length):
    for j in range(len(TEST_DATA)):
        if len(ids_pred[j]) > 0:
            if ids_pred[j][0] == idx:
                ids.append(idx)
                pred.append(y_pred[j][0])
                ids_pred[j] = np.delete(ids_pred[j], 0)
                y_pred[j] = np.delete(y_pred[j], 0)
                break

    if (i % 100000 == 0):
        print(i)
        
    idx += 1

0
100000
200000
300000
400000
500000


In [135]:
print(len(pred))
print(len(ids))

568238
568238


In [136]:
pred = np.array(pred)
ids = np.array(ids)

In [137]:
OUTPUT_PATH = 'output/RR_8_models_10_fold_ct_triplets.csv' # TODO: fill in desired name of output file for submission
create_csv_submission(ids, pred, OUTPUT_PATH)

In [138]:
sum(pred==1)/len(pred)

0.32987938152675461