# ML with Ridge Regression (8 models)

In this notebook, we use the functions defined in `ridge_regression.py`. This time, we train one model on each of the 8 data sets (4 jet categories, each with and without mass) and check whether the predictions improve.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *
import pickle

In [5]:
# Folder (relative to the notebook) that holds the split CSV files.
DATA_FOLDER = 'data'

# One file per (jet number, mass flag) pair, ordered jet 0..3 with the
# "wout" variant before the "with" variant. Every per-model list in this
# notebook is indexed in this same order.
TRAINING_DATA = ['train_jet_%d_%s_mass.csv' % (jet, mass)
                 for jet in range(4)
                 for mass in ('wout', 'with')]

TEST_DATA = ['test_jet_%d_%s_mass.csv' % (jet, mass)
             for jet in range(4)
             for mass in ('wout', 'with')]

# Search grids handed to cross_validation.
# NOTE(review): degrees_lambdas presumably holds exponents of the lambda
# grid rather than lambda values themselves -- confirm in cross_validation.
# For a quick run, shrink the grids, e.g. np.arange(5, 6) for the degrees
# and np.arange(-2, 5) for the lambdas.
degrees_poly = np.arange(6, 14)
degrees_lambdas = np.arange(-10, 5)

# Remaining cross_validation arguments.
# NOTE(review): the meaning of `digits` is defined by cross_validation -- confirm.
k_fold = 10
digits = 3

Loop over all the training files.
For each file, cross-validation (CV) selects the best lambda and the best polynomial degree; ridge regression (RR) is then run again with these values to obtain the weights.

In [6]:
# Hyper-parameters selected by cross-validation, one entry per file,
# stored in the same order as TRAINING_DATA.
lambda_star = []
degree_star = []
# Sample-weighted sum of (1 - min_loss) over the files; divided by
# nbr_labels after the loop to obtain the overall fraction of right predictions.
perc_right_pred = 0
nbr_labels = 0

for idx, data in enumerate(TRAINING_DATA):
    # Print that we start the training
    print("Cross-validation with file %s"%data)
    print("-----------------------------------------------------")
    # Recreate the file
    data_file = DATA_FOLDER + '/' + data
    # Load the file
    y_train, x_train, ids_train = load_csv_data(data_file)
    #x_train, _, _ = standardize(x_train)
    # Do the Cross Validation for the ridge regression
    min_loss, deg, lamb = cross_validation(y_train, x_train,
                                           degrees_lambdas, degrees_poly,
                                           k_fold, digits, verbose = True)
    # Print some interesting values
    print("  Max pred = %f"%(1-min_loss))
    print("  Lambda* = %10.3e"%lamb)
    print("  Degree* = %i"%deg)
    print("\n")
    lambda_star.append(lamb)
    degree_star.append(deg)

    # Weight each file's score by its number of samples
    perc_right_pred += len(y_train)*(1-min_loss)
    nbr_labels += len(y_train)

# perc_right_pred stays a fraction (not a percentage) for any later use
perc_right_pred = perc_right_pred/nbr_labels
# The label announces a percentage, so scale the fraction by 100 when
# printing, as the training cell does for its "Good prediction" lines.
print("Percentage of right pred on training set: %f"%(100.*perc_right_pred))
    

Cross-validation with file train_jet_0_wout_mass.csv
-----------------------------------------------------
  Start the 10-fold Cross Validation!
  Start degree 6
  Finished Degree 6. Best lambda is  1.930e+04 with percentage wrong pred 0.049770
  --------------------
  Start degree 7


AttributeError: 'list' object has no attribute 'appen'

Write the results into *pickle* files.

In [None]:
# Save the selected degrees and lambdas so that the cross-validation
# does not have to be repeated.
for pickle_path, values in (('data/degrees_ct.p', degree_star),
                            ('data/lambdas_ct.p', lambda_star)):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(values, pickle_file)

Read the results from the *pickle* files (in case we don't want to train again)

In [None]:
# Reload the hyper-parameters from the files written by the cell above
# ('degrees_ct.p' / 'lambdas_ct.p'). Reading differently named files here
# would silently replace the freshly cross-validated values.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open('data/degrees_ct.p', 'rb') as pickle_file:
    degree_star = pickle.load(pickle_file)

with open('data/lambdas_ct.p', 'rb') as pickle_file:
    lambda_star = pickle.load(pickle_file)

In [None]:
# Show the selected degrees, then the selected lambdas (one entry per file).
for selected in (degree_star, lambda_star):
    print(selected)

Do the training (get the weights)

In [None]:
# Fitted weight vectors, one per file, in TRAINING_DATA order.
weights = []
# `mean` accumulates the sample-weighted number of right predictions and
# `total` the number of samples; their ratio is printed after the loop.
mean = 0
total = 0
# Per-file switches forwarded to build_poly_cross_terms as its ct / sqrt /
# square keyword arguments (index i belongs to TRAINING_DATA[i]).
ct     = [False, True, False, True, True, True, False, True]
sqrt   = [True, True, True, True, False, True, False, True]
square = [False, True, False, True, False, True, True, False]

for idx, data in enumerate(TRAINING_DATA):
    # Announce which file is being processed
    print("Training with file %s"%data)
    print("-----------------------------------------------------")

    # Build the path and load the samples
    data_file = DATA_FOLDER + '/' + data
    y_train, x_train, ids_train = load_csv_data(data_file)

    # Hyper-parameters selected for this file
    print(lambda_star[idx])
    print(degree_star[idx])

    # Expand the features with this file's settings
    tX_train = build_poly_cross_terms(
        x_train,
        degree_star[idx],
        ct=ct[idx],
        sqrt=sqrt[idx],
        square=square[idx],
    )

    # Ridge regression with the selected lambda; only the weights are kept
    _, w_star = ridge_regression(y_train, tX_train, lambda_star[idx])
    print(tX_train.shape)

    # Share of wrong predictions on the data the model was fitted on
    val = perc_wrong_pred(y_train, tX_train, w_star)
    print("Good prediction: %f"%(100.*(1.-val)))

    # Update the sample-weighted totals
    mean += (1-val)*len(y_train)
    total += len(y_train)

    weights.append(w_star)

print("Total Good prediction: %f"%(100*mean/total))

Do the loop on the test data to get the predicted labels

In [None]:
# Predicted labels and matching ids, one array per file in TEST_DATA order.
y_pred = []
ids_pred = []

for idx, data in enumerate(TEST_DATA):
    # Announce which file is being processed
    print("Testing with file %s"%data)
    print("-----------------------------------------------------")

    # Build the path and load the samples (the label column is ignored)
    data_file = DATA_FOLDER + '/' + data
    _, x_test, ids_test = load_csv_data(data_file)

    # Same feature expansion as the one used to train model number idx
    tX_test = build_poly_cross_terms(
        x_test,
        degree_star[idx],
        ct=ct[idx],
        sqrt=sqrt[idx],
        square=square[idx],
    )

    # Predict with this file's weights and keep the ids alongside
    labels = predict_labels(weights[idx], tX_test)
    y_pred.append(labels)
    ids_pred.append(ids_test)

In [None]:
# Merge the per-file predictions into one sequence ordered by sample id.
#
# The previous implementation scanned ids one by one starting from
# min(ids_pred[:][0]), which is the smallest id of the FIRST file only, and
# assumed contiguous ids; any prediction whose id fell outside that scan was
# silently dropped. It also removed the head of an array with np.delete for
# every row (quadratic time) and emptied ids_pred / y_pred, so the cell
# could not be run twice. Sorting the concatenated arrays has none of these
# problems and leaves ids_pred / y_pred untouched.
all_ids = np.concatenate([np.ravel(chunk) for chunk in ids_pred])
all_pred = np.concatenate([np.ravel(chunk) for chunk in y_pred])

if len(all_ids) != len(all_pred):
    raise ValueError("Got %d ids but %d predictions"
                     % (len(all_ids), len(all_pred)))

# Stable sort keeps the file order for equal ids (not expected to occur)
order = np.argsort(all_ids, kind='stable')

# Plain lists, as before; a later cell converts them to arrays
ids = list(all_ids[order])
pred = list(all_pred[order])

In [None]:
# Both lengths should match: one prediction per id.
for merged in (pred, ids):
    print(len(merged))

In [None]:
# Convert the merged predictions and ids to numpy arrays.
pred, ids = np.array(pred), np.array(ids)

In [None]:
# Destination of the submission file.
# NOTE(review): the file name presumably records the setup used in this
# notebook (ridge regression, 8 models, 10-fold CV, cross terms) -- rename
# it when that setup changes.
OUTPUT_PATH = 'output/RR_8_models_10_fold_multi_ct.csv'
# Write the ids and their predicted labels to OUTPUT_PATH.
create_csv_submission(ids, pred, OUTPUT_PATH)

In [None]:
# Fraction of the predictions that are equal to 1.
(pred == 1).mean()