# ML with Ridge Regression (8 models)

In this notebook, we will use the functions in the file ridge_regression.py. This time, we will use the 8 data sets and see if the prediction becomes better.

In [10]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.helpers import *
from functions.split import *
#from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *
from functions.logistic_regression import *
from functions.regularized_logistic_regression import *
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
DATA_FOLDER = 'data'

"""
TRAINING_DATA = ['train_jet_0_wout_mass_pca.csv' , 'train_jet_0_with_mass_pca.csv',
                 'train_jet_1_wout_mass_pca.csv' , 'train_jet_1_with_mass_pca.csv',
                 'train_jet_2_wout_mass_pca.csv' , 'train_jet_2_with_mass_pca.csv',
                 'train_jet_3_wout_mass_pca.csv' , 'train_jet_3_with_mass_pca.csv']

TEST_DATA = ['test_jet_0_wout_mass_pca.csv' , 'test_jet_0_with_mass_pca.csv',
             'test_jet_1_wout_mass_pca.csv' , 'test_jet_1_with_mass_pca.csv',
             'test_jet_2_wout_mass_pca.csv' , 'test_jet_2_with_mass_pca.csv',
             'test_jet_3_wout_mass_pca.csv' , 'test_jet_3_with_mass_pca.csv']
"""

TRAINING_DATA = ['train_jet_0_wout_mass.csv' , 'train_jet_0_with_mass_pca.csv',
                 'train_jet_1_wout_mass.csv' , 'train_jet_1_with_mass_pca.csv',
                 'train_jet_2_wout_mass.csv' , 'train_jet_2_with_mass_pca.csv',
                 'train_jet_3_wout_mass.csv' , 'train_jet_3_with_mass_pca.csv']

TEST_DATA = ['test_jet_0_wout_mass.csv' , 'test_jet_0_with_mass_pca.csv',
             'test_jet_1_wout_mass.csv' , 'test_jet_1_with_mass_pca.csv',
             'test_jet_2_wout_mass.csv' , 'test_jet_2_with_mass_pca.csv',
             'test_jet_3_wout_mass.csv' , 'test_jet_3_with_mass_pca.csv']

weights = []
lambda_star = []
degree_star = []
y_pred = []
ids_pred = []
perc_right_pred = 0
nbr_labels = 0

#degrees_poly = np.arange(1, 14)
#degrees_lambdas = np.arange(-10, 5)

degrees_poly = np.arange(5,6)
degrees_lambdas = np.arange(-2, 5)

k_fold = 5
digits = 3
#(95.1378)

Do the loop on all the training data.
We use CV to find best lambda and best degree and then we use the RR again to get the weights.

In [18]:
for data in TRAINING_DATA:
    # Print that we start the training
    print("Training with file %s"%data)
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + data
    # Load the file
    y_train, x_train, ids_train = load_csv_data(data_file)
    y_train = y_train[:100]
    x_train = x_train[:100,:]
    # Do the Cross Validation for the ridge regression
    min_loss, deg, lamb = cross_validation(y_train, x_train, 
                                           degrees_lambdas, degrees_poly,
                                           1, 1000,
                                           k_fold, digits, verbose = True)
    # Print some interesting values
    print("  Max pred = %f"%(1-min_loss))
    print("  Lambda* = %10.3e"%lamb)
    print("  Degree* = %i"%deg)
    print("\n")
    lambda_star.append(lamb)
    degree_star.append(deg)
    # RR to get the best weights
    if deg > 1:
        tX_train = build_poly(x_train, deg)
    else:
        tX_train = x_train
    _, w_star = ridge_regression(y_train, tX_train, lamb)
    weights.append(w_star)
    
    perc_right_pred += len(y_train)*(1-min_loss)
    nbr_labels += len(y_train)
    
perc_right_pred = perc_right_pred/nbr_labels
print("Percentage of right pred on training set: %f"%perc_right_pred)
    

Training with file train_jet_0_wout_mass.csv
-----------------------------------------------------
  Start the 5-fold Cross Validation!
  Start degree 5
  Start for digit 1
    Power of lambda: -2
    Power of lambda: -1
    Power of lambda: 0


  loss = calculate_loss(y, tx, w) + lambda_*np.linalg.norm(w)**2
  if n_iter > 1 and np.abs(losses[-1]-losses[-2]) < 1e-8:
  grad = calculate_gradient(y, tx, w) + 2*lambda_*w
  result[t<60] = np.log(1+np.exp(result[t<60]))
  result[t>60] = 1
  result[t<-60] = 0
  result[np.abs(t) < 60] = 1/(1+np.exp(result[np.abs(t) < 60]))
  return (np.sum(log_exp(np.dot(tx,w)))) - np.dot(y.transpose(),np.dot(tx,w))
  w = w-alpha*gradient


    Power of lambda: 1
    Power of lambda: 2
    Power of lambda: 3
    Power of lambda: 4
    Start for digit 2
    Start for digit 3
  Finished Degree 5. Best lambda is  3.100e+00 with percentage wrong pred 32.193522
  --------------------
 5-fold Cross Validation finished!

  Max pred = -31.193522
  Lambda* =  3.100e+00
  Degree* = 5




NameError: name 'ridge_regression' is not defined

Write the results into *pickle* files.

In [88]:
with open('data/weights.p', 'wb') as pickle_file:
    pickle.dump(weights, pickle_file)
    
with open('data/degrees.p', 'wb') as pickle_file:
    pickle.dump(degree_star, pickle_file)
    
with open('data/lambdas.p', 'wb') as pickle_file:
    pickle.dump(lambda_star, pickle_file)

Read the results from the *pickle* files (in case we don't want to train again)

In [62]:
with open('data/weights.p', 'rb') as pickle_file:
    weights = pickle.load(pickle_file)
    
with open('data/degrees.p', 'rb') as pickle_file:
    degree_star = pickle.load(pickle_file)
    
with open('data/lambdas.p', 'rb') as pickle_file:
    lambda_star = pickle.load(pickle_file)

Do the loop on the test data to get the predicted labels

In [89]:
for idx, data in enumerate(TEST_DATA):
    # Print that we start the testing
    print("Testing with file %s"%data)
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + data
    # Load the file
    _, x_test, ids_test = load_csv_data(data_file)
    # Build the polynomial
    if degree_star[idx] > 1:
        tX_test = build_poly(x_test, degree_star[idx])
    else:
        tX_test = x_test
    # Predict the labels
    y_pred.append(predict_labels(weights[idx], tX_test))
    ids_pred.append(ids_test)

Testing with file test_jet_0_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_0_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_1_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_1_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_2_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_2_with_mass.csv
-----------------------------------------------------
Testing with file test_jet_3_wout_mass.csv
-----------------------------------------------------
Testing with file test_jet_3_with_mass.csv
-----------------------------------------------------


In [90]:
ids = []
pred = []

idx = min(ids_pred[:][0])

length = np.sum(len(i) for i in y_pred)

for i in range(length):
    for j in range(len(TEST_DATA)):
        if len(ids_pred[j]) > 0:
            if ids_pred[j][0] == idx:
                ids.append(idx)
                pred.append(y_pred[j][0])
                ids_pred[j] = np.delete(ids_pred[j], 0)
                y_pred[j] = np.delete(y_pred[j], 0)
                break

    if (i % 100000 == 0):
        print(i)
        
    idx += 1

0
100000
200000
300000
400000
500000


In [91]:
print(len(pred))
print(len(ids))

568238
568238


In [92]:
pred = np.array(pred)
ids = np.array(ids)

In [93]:
OUTPUT_PATH = 'output/RR_8_models_10_fold.csv' # TODO: fill in desired name of output file for submission
create_csv_submission(ids, pred, OUTPUT_PATH)

In [94]:
sum(pred==1)/len(pred)

0.32050478848651448