In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [3]:
# Load the training data: class labels (y), feature matrix (tX) and event ids (ids).
# NOTE(review): the star import is presumably what brings load_csv_data (and the
# predict_labels / create_csv_submission helpers used in later cells) into scope.
from proj1_helpers import *
DATA_TRAIN_PATH = '../../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [4]:

# Label values used for the two classes: -1 and +1.
# NOTE(review): neither constant is referenced in the cells visible here.
BINARY_CLASSIFICATOIN_0 = -1
BINARY_CLASSIFICATOIN_1 = 1


def ridge_regression(y, tx, lambda_):
    """
    Solve the ridge-regression normal equations and return the weights.

    The linear system solved is (tx^T tx + lambda_ * P) w = tx^T y, where P
    is the identity matrix with its top-left entry set to zero, so the weight
    of the first column of tx is not penalized.
    """
    n_features = tx.shape[1]
    penalty = np.eye(n_features)
    penalty[0, 0] = 0  # the first weight is left unregularized
    gram = tx.T @ tx + lambda_ * penalty
    return np.linalg.solve(gram, tx.T @ y)

In [21]:
def performance(weights, y, xT):
    """
    Return the fraction of samples whose predicted label equals the expected one.

    weights -- model weights handed to predict_labels
    y       -- expected labels, one per sample (shape (n,) or (n, 1))
    xT      -- data matrix handed to predict_labels

    The result is a float in [0, 1]. A ValueError is raised if the number of
    predictions does not match len(y).
    """
    from proj1_helpers import predict_labels

    n_samples = len(y)
    # Flatten both sides so (n,) and (n, 1) inputs compare element by element.
    predicted = predict_labels(weights, xT).reshape(n_samples)
    expected = y.reshape(n_samples)

    # Count the mismatches in one vectorized pass instead of a Python loop.
    n_wrong = np.count_nonzero(predicted != expected)
    return 1 - n_wrong / n_samples

In [6]:

def standardize_0123_helper(x):
    """
    Standardize every column of x to mean 0 and standard deviation 1.

    Entries equal to -999 are treated as missing: the column mean is computed
    from the remaining entries only, and each missing entry is replaced by
    that mean (which is 0 once the column has been centered). A column made
    up entirely of -999 is set to 0 rather than NaN.

    Columns whose standard deviation is 0 are centered but not rescaled.

    x is modified in place and is also returned.
    """
    missing_value = -999
    for col in range(x.shape[1]):
        # Compute the masks once, before any entry of the column is overwritten.
        missing = x[:, col] == missing_value
        observed = ~missing
        if observed.any():
            x[observed, col] = x[observed, col] - np.mean(x[observed, col])
        # Mean imputation followed by centering always yields 0 here.
        x[missing, col] = 0

    std_x = np.std(x, axis=0)
    scalable = std_x > 0  # skip constant columns to avoid dividing by zero
    x[:, scalable] = x[:, scalable] / std_x[scalable]

    return x


def standardize_0(x):
    """
    Standardize the samples whose PRI_jet_num is 0.

    Only the columns listed below are kept; the rest are discarded as not
    useful for this subset. The kept columns are copied into a new float
    array, which is passed through standardize_0123_helper and returned.
    """
    # columns considered meaningful for training on this subset
    kept_columns = np.array([0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21])
    selected = np.empty((x.shape[0], kept_columns.size))
    selected[...] = x[:, kept_columns]
    return standardize_0123_helper(selected)
    

def standardize_1(x):
    """
    Standardize the samples whose PRI_jet_num is 1.

    Only the columns listed below are kept; the rest are discarded as not
    useful for this subset. The kept columns are copied into a new float
    array, which is passed through standardize_0123_helper and returned.
    """
    # columns considered meaningful for training on this subset
    kept_columns = np.array([0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29])
    n_rows = x.shape[0]
    selected = np.empty((n_rows, kept_columns.size))
    selected[...] = x[:, kept_columns]
    return standardize_0123_helper(selected)
    
    
def standardize_23(x):
    """
    Standardize the samples whose PRI_jet_num is 2 or 3.

    Every one of the 30 columns is kept except column 22. The kept columns
    are copied into a new float array, which is passed through
    standardize_0123_helper and returned.
    """
    # all column indices 0..29 with index 22 removed
    all_columns = np.arange(30)
    kept_columns = np.delete(all_columns, 22)
    selected = np.empty((x.shape[0], kept_columns.size))
    selected[...] = x[:, kept_columns]
    return standardize_0123_helper(selected)

    

In [7]:
# Index of the PRI_jet_num column in the raw feature matrix
jet_num_col = 22

def split_dataset_wrt22(x):
    """
    Partition the rows of x according to column 22 (PRI_jet_num).

    Returns three index tuples, each in the form produced by np.where:
      1. rows where PRI_jet_num is 0
      2. rows where PRI_jet_num is 1
      3. rows where PRI_jet_num is 2 or more
    """
    jet_num = x[:, jet_num_col]
    row_masks = (jet_num == 0, jet_num == 1, jet_num >= 2)
    return tuple(np.where(mask) for mask in row_masks)



In [8]:

def build_poly(x, degree):
    """
    Expand x with element-wise powers 0, 1, ..., degree.

    The result has x.shape[0] rows and x.shape[1] * (degree + 1) columns,
    laid out as consecutive groups of x.shape[1] columns: group p holds
    x ** p. The first group is therefore all ones and the second is x itself.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    expanded = np.empty((n_rows, n_cols * (degree + 1)))
    for power in range(degree + 1):
        start = power * n_cols
        expanded[:, start:start + n_cols] = x ** power

    return expanded


In [9]:
def add_feature_helper(x, op, ori_shape):
    """
    Append op applied to the first ori_shape columns of x as new columns.

    x         -- current feature matrix
    op        -- element-wise function applied to x[:, :ori_shape]
    ori_shape -- number of leading (original) columns to transform

    Returns a new float matrix with x.shape[1] + ori_shape columns: the
    columns of x unchanged, followed by the transformed columns.
    """
    n_rows, n_existing = x.shape[0], x.shape[1]
    widened = np.empty((n_rows, n_existing + ori_shape))
    widened[:, :n_existing] = x
    widened[:, n_existing:] = op(x[:, :ori_shape])
    return widened


def add_feature(x):
    """
    Extend x with extra derived features.

    For each of np.sin and np.tanh, in that order, the transform of the
    original columns of x is appended through add_feature_helper. The
    widened matrix is returned.
    """
    base_width = x.shape[1]
    for transform in (np.sin, np.tanh):
        x = add_feature_helper(x, transform, base_width)
    return x
    


In [26]:

# Partition the training rows by PRI_jet_num (0, 1, and 2-or-3)
i_0, i_1, i_23 = split_dataset_wrt22(tX)
tx_0, y_0 = tX[i_0], y[i_0]
tx_1, y_1 = tX[i_1], y[i_1]
tx_23, y_23 = tX[i_23], y[i_23]

# Standardize each subset with its own column selection
std_tx_0 = standardize_0(tx_0)
std_tx_1 = standardize_1(tx_1)
std_tx_23 = standardize_23(tx_23)

# Optional feature expansion (currently disabled)
# std_tx_0 = add_feature(std_tx_0)
# std_tx_1 = add_feature(std_tx_1)
# std_tx_23 = add_feature(std_tx_23)

# Hyper-parameter grid: polynomial degrees 2..17 and 15 log-spaced lambdas in [1e-5, 1]
degrees = list(range(2, 18))
lambdas = np.logspace(-5, 0, 15)

# Best score seen so far for each subset (jet 0, jet 1, jet 2/3), plus the
# degree and lambda that produced it.
p_array = [0, 0, 0]
d_array = [0, 0, 0]
l_array = [0, 0, 0]

def update_arrays(lambda_, degree, p0, p1, p23):
    """
    Record (lambda_, degree) for every subset whose new score beats its best.

    p0, p1 and p23 are the scores of the three subsets. A slot is only
    overwritten when the new score is strictly greater, so the first setting
    to reach a given score is the one that is kept.
    """
    for slot, score in enumerate((p0, p1, p23)):
        if score > p_array[slot]:
            p_array[slot], d_array[slot], l_array[slot] = score, degree, lambda_
            
    
def print_stat(p_array, d_array, l_array):
    """
    Print, for each of the three subsets ('0 ', '1 ', '23'), the best score
    together with the degree and lambda stored alongside it.
    """
    for idx, label in enumerate(('0 ', '1 ', '23')):
        print(label, ": ")
        print("Max p: ", p_array[idx])
        print("With d: ", d_array[idx])
        print("With l: ", l_array[idx])
        

# Grid search: for every (lambda, degree) pair, fit one ridge model per jet
# subset and record its accuracy on the data it was trained on.
# NOTE(review): build_poly depends only on degree, yet it is re-run for every lambda.
for lambda_ in lambdas:
    for degree in degrees:
        print("Degree: ", degree, "\tLambda: ", lambda_)
        # Build the polynomial expansion of each standardized subset
        matrix_std_tx_0 = build_poly(std_tx_0, degree)
        matrix_std_tx_1 = build_poly(std_tx_1, degree)
        matrix_std_tx_23 = build_poly(std_tx_23, degree)

        # Fit one ridge model per subset
        weights_0 = ridge_regression(y_0, matrix_std_tx_0, lambda_)
        weights_1 = ridge_regression(y_1, matrix_std_tx_1, lambda_)
        weights_23 = ridge_regression(y_23, matrix_std_tx_23, lambda_)

        # Use the performance function to get a rough estimate of how well each
        # model does on the very data it was just trained on.
        #
        # Cross-validation would be the proper tool for this step. The score on
        # the original training set is used here only as a reference point:
        # it indicates whether something went REALLY wrong, and it is not an
        # estimate of accuracy on unseen data.
        p0 = performance(weights_0, y_0, matrix_std_tx_0)
        p1 = performance(weights_1, y_1, matrix_std_tx_1)
        p23 = performance(weights_23, y_23, matrix_std_tx_23)
        print("0  Size: ", len(y_0), "\tPerformance: ", p0)
        print("1  Size: ", len(y_1), "\tPerformance: ", p1)
        print("23 Size: ", len(y_23), "\tPerformance: ", p23)
        # Overall score = per-subset scores weighted by subset size
        print("Overall: ", (p0 * len(y_0) + len(y_1) * p1 + p23 * len(y_23)) / len(y))
        print("******* ")
        # Keep track of the best (degree, lambda) seen so far for each subset
        update_arrays(lambda_, degree, p0, p1, p23)

# Report the best setting found for each subset
print_stat(p_array, d_array, l_array)

Degree:  2 	Lambda:  1e-05
0  Size:  99913 	Performance:  0.8258584968922963
1  Size:  77544 	Performance:  0.756009491385536
23 Size:  72543 	Performance:  0.7698054946721256
Overall:  0.787928
******* 
Degree:  3 	Lambda:  1e-05
0  Size:  99913 	Performance:  0.8297518841392011
1  Size:  77544 	Performance:  0.7657072113896626
23 Size:  72543 	Performance:  0.7918338089133341
Overall:  0.798884
******* 
Degree:  4 	Lambda:  1e-05
0  Size:  99913 	Performance:  0.8326143745058201
1  Size:  77544 	Performance:  0.7764623955431755
23 Size:  72543 	Performance:  0.8025584825551741
Overall:  0.806476
******* 
Degree:  5 	Lambda:  1e-05
0  Size:  99913 	Performance:  0.8367379620269635
1  Size:  77544 	Performance:  0.7803698545341999
23 Size:  72543 	Performance:  0.8042678135726397
Overall:  0.809832
******* 
Degree:  6 	Lambda:  1e-05
0  Size:  99913 	Performance:  0.838899842863291
1  Size:  77544 	Performance:  0.7845997111317445
23 Size:  72543 	Performance:  0.8058392953145032
Overa

In [30]:
# Best settings per subset, copied by hand from the grid-search report;
# the degrees and lambdas hard-coded below repeat these numbers.
# 0  : 
# Max p:  0.8434037612723069
# With d:  9
# With l:  0.193069772888
# 1  : 
# Max p:  0.8057618900237284
# With d:  12
# With l:  1e-05
# 23 : 
# Max p:  0.8330093875356686
# With d:  13
# With l:  0.00138949549437

# Polynomial degree chosen for each subset
degree_0  = 9
degree_1  = 12
degree_23 = 13
matrix_std_tx_0 = build_poly(std_tx_0, degree_0)
matrix_std_tx_1 = build_poly(std_tx_1, degree_1)
matrix_std_tx_23 = build_poly(std_tx_23, degree_23)

# Refit each subset with its chosen lambda; these weights are the ones used for prediction
weights_0 = ridge_regression(y_0, matrix_std_tx_0, 0.193069772888)
weights_1 = ridge_regression(y_1, matrix_std_tx_1, 1e-05)
weights_23 = ridge_regression(y_23, matrix_std_tx_23, 0.00138949549437)

## Generate predictions and save output in csv format for submission:

In [31]:
# load test data (the first return value is discarded)
DATA_TEST_PATH = '../../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
i_0_test, i_1_test, i_23_test = split_dataset_wrt22(tX_test)

# split the test features into the same 3 subsets as the training data
tx_0_test = tX_test[i_0_test]
tx_1_test = tX_test[i_1_test]
tx_23_test = tX_test[i_23_test]

# standardize
# NOTE(review): each test subset is standardized with its own mean / std,
# not with the statistics of the corresponding training subset.
std_tx_0_test = standardize_0(tx_0_test)
std_tx_1_test = standardize_1(tx_1_test)
std_tx_23_test = standardize_23(tx_23_test)

# add feature (disabled, matching the training cell)
# std_tx_0_test = add_feature(std_tx_0_test)
# std_tx_1_test = add_feature(std_tx_1_test)
# std_tx_23_test = add_feature(std_tx_23_test)

# split the event ids into the same 3 subsets so they stay aligned with the predictions
ids_0_test = ids_test[i_0_test]
ids_1_test = ids_test[i_1_test]
ids_23_test = ids_test[i_23_test]

# Make predictions with each subset's own weights and polynomial degree
y_pred_0 = predict_labels(weights_0, build_poly(std_tx_0_test, degree_0))
y_pred_1 = predict_labels(weights_1, build_poly(std_tx_1_test, degree_1))
y_pred_23 = predict_labels(weights_23, build_poly(std_tx_23_test, degree_23))


In [32]:
# Concatenate the per-subset predictions and ids in the same order (0, 1, 2/3)
# so that each prediction stays paired with its id.
# NOTE: ids_test is overwritten here, so rows end up grouped by subset rather
# than in the order they were loaded.
y_pred = np.concatenate((y_pred_0, y_pred_1, y_pred_23), axis=0)
ids_test = np.concatenate((ids_0_test, ids_1_test, ids_23_test), axis=0)

In [33]:
# Write the ids and their predicted labels to the submission file
OUTPUT_PATH = '../../data/output.csv' # path of the submission file to create
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)