In [11]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [10]:
# NOTE(review): the star import hides which names come from proj1_helpers. The
# cells below call load_csv_data, predict_labels and create_csv_submission, which
# are presumably provided by it -- confirm, and consider importing them by name.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../../data/train.csv'
# y: class labels, tX: feature matrix, ids: event ids (see the section heading).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)


## Do your crazy machine learning thing here :) ...

In [12]:
def build_poly(x, degree):
    """Expand every feature of `x` with its element-wise powers 0..degree.

    Args:
        x: 2-D array of shape (n_samples, n_features).
        degree: highest power to include (inclusive).

    Returns:
        Float array of shape (n_samples, n_features * (degree + 1)). The p-th
        group of n_features consecutive columns holds `x ** p`, so the first
        group (p = 0) is all ones.
    """
    n_samples, n_features = x.shape[0], x.shape[1]
    expanded = np.zeros((n_samples, n_features * (degree + 1)))

    for power in range(degree + 1):
        first_col = power * n_features
        expanded[:, first_col:first_col + n_features] = np.power(x, power)

    return expanded


def build_k_indices(y, k_fold, seed=62):
    """Split the row indices of `y` into `k_fold` shuffled, equally sized folds.

    The global NumPy RNG is re-seeded with `seed` before shuffling, so the same
    seed always yields the same folds. Rows that do not fit into
    `k_fold * int(len(y) / k_fold)` are left out.

    Returns:
        Array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        begin = fold * fold_size
        folds.append(shuffled[begin:(fold + 1) * fold_size])
    return np.array(folds)

def batch_iter(y, tx, batch_size, num_batches=None, shuffle=True):
    """
    Yield aligned mini-batches of `y` and `tx`.

    Args:
        y: indexable targets; `len(y)` defines the dataset size.
        tx: indexable inputs, aligned row by row with `y`.
        batch_size: number of rows per batch (the last batch may be shorter).
        num_batches: upper bound on the number of batches; None means
            "as many as fit", and larger values are clamped to that amount.
        shuffle: when True, rows are permuted (using the global NumPy RNG)
            before being cut into batches.

    Yields:
        Tuples `(batch_y, batch_tx)` of matching slices.

    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    n_samples = len(y)
    max_batches = int(np.ceil(n_samples / batch_size))
    num_batches = max_batches if num_batches is None else min(num_batches, max_batches)

    if shuffle:
        order = np.random.permutation(np.arange(n_samples))
        y, tx = y[order], tx[order]

    for batch in range(num_batches):
        lo = batch * batch_size
        hi = min((batch + 1) * batch_size, n_samples)
        if lo == hi:
            # Nothing left to emit for this batch number.
            continue
        yield y[lo:hi], tx[lo:hi]

In [296]:
# Label values of the two classes. The logistic-regression loss and gradient
# below select the positive class with `y == BINARY_CLASSIFICATOIN_1`;
# BINARY_CLASSIFICATOIN_0 is not referenced by any visible cell.
# NOTE(review): "CLASSIFICATOIN" is a misspelling of "CLASSIFICATION"; the names
# are kept because other cells reference them.
BINARY_CLASSIFICATOIN_0 = -1
BINARY_CLASSIFICATOIN_1 = 1


def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise.

    The value is computed as exp(-log(1 + exp(-t))) with `np.logaddexp`, which
    is mathematically identical but does not overflow `exp` for large negative
    `t` (the direct formula emits an overflow RuntimeWarning there).

    Args:
        t: scalar or NumPy array.

    Returns:
        Sigmoid of `t`, same shape as `t`, with values in [0, 1].
    """
    # logaddexp(0, -t) == log(exp(0) + exp(-t)) == log(1 + exp(-t)), evaluated stably.
    return np.exp(-np.logaddexp(0.0, -t))


def calculate_loss_logistic_regression(y, tx, w):
    """Negative log-likelihood of logistic regression, summed over all samples.

    With scores s_n = x_n^T w the loss is

        sum_n log(1 + exp(s_n))  -  sum_{n : y_n == BINARY_CLASSIFICATOIN_1} s_n

    log(1 + exp(s)) is evaluated with `np.logaddexp(0, s)`, which is stable for
    any magnitude of `s`. The direct formula overflows `exp` for large scores,
    producing inf and a RuntimeWarning, and then has to be patched afterwards.

    Args:
        y: labels, one per row of `tx`; entries equal to
            BINARY_CLASSIFICATOIN_1 are treated as the positive class.
        tx: feature matrix of shape (n_samples, n_features).
        w: weights, shape (n_features,) or (n_features, 1).

    Returns:
        The summed loss as a scalar.
    """
    # Flatten both so the boolean mask lines up whether w is 1-D or a column.
    scores = np.asarray(tx @ w).reshape(-1)
    is_positive = np.asarray(y).reshape(-1) == BINARY_CLASSIFICATOIN_1

    return np.sum(np.logaddexp(0.0, scores)) - np.sum(scores[is_positive])


def calculate_gradient_logistic_regression(y, tx, w):
    """Gradient of the summed logistic loss with respect to `w`.

    Computes tx^T (sigmoid(tx w) - t), where t is the label value for rows whose
    label equals BINARY_CLASSIFICATOIN_1 and 0 for every other row.

    Returns:
        Column vector of shape (n_features, 1).
    """
    residual = sigmoid(tx @ w).reshape(len(y))

    # Only the positive-class rows have a non-zero target to subtract.
    positive_rows = np.where(y == BINARY_CLASSIFICATOIN_1)
    residual[positive_rows] -= y[positive_rows]

    n_features = tx.shape[1]
    return (tx.T @ residual).reshape((n_features, 1))
    
    
def line_search_gamma(y, tx, w, loss, loss_prev, loss_prev_prev, gamma, gradient, gamma_counter, iter):
    """Heuristically adapt the step size `gamma` from the recent loss history.

    * If `loss` exceeds `loss_prev * (1 + 1 / log(loss))`, the step is divided
      by 1.5 with probability 1/2 (one draw from the global NumPy RNG) and left
      unchanged otherwise.
    * Otherwise, if the latest improvement is smaller than the one before it, or
      `gamma_counter` has reached 250, the step is multiplied by
      `1 + 1 / loss`.
    * In every other case the step is returned unchanged.

    `y`, `tx`, `w`, `gradient` and `iter` are accepted but not used.
    """
    loss_jumped = loss > loss_prev * (1 + 1.0 / np.log(loss))
    if loss_jumped:
        if np.random.randint(2) == 1:
            return gamma / 1.5
        return gamma

    progress_slowing = loss_prev_prev - loss_prev > loss_prev - loss
    if progress_slowing or gamma_counter >= 250:
        return gamma * (1 + 1.0 / loss)

    return gamma
    

def logistic_regression_helper(y, tx, gamma, max_iters, lambda_):
    """Fit (optionally penalised) logistic regression by full-batch gradient descent.

    The objective is `calculate_loss_logistic_regression(y, tx, w)
    + lambda_ * ||w||_2`. The step size is re-tuned after every update by
    `line_search_gamma`.

    Compared with the previous revision:
      * the penalty now contributes to the gradient (before, `lambda_` only
        changed the reported loss and never the weights);
      * the un-penalised loss handed to the line search is the one computed at
        the same `w` as the penalty, instead of subtracting the penalty of the
        already-updated `w`.
    With `lambda_ == 0` the iterates are unchanged.

    Args:
        y: labels, one per row of `tx`.
        tx: feature matrix of shape (n_samples, n_features).
        gamma: initial step size.
        max_iters: maximum number of gradient steps.
        lambda_: penalty strength; 0 disables regularisation.

    Returns:
        Weight column vector of shape (n_features, 1).
    """
    w = np.zeros((tx.shape[1], 1))
    threshold = 1e-8
    loss_prev_prev = 0
    loss_prev = 0
    gamma_counter = 0

    for iteration in range(max_iters):
        # Loss, penalty and gradient are all evaluated at the current w.
        w_norm = np.linalg.norm(w, 2)
        data_loss = calculate_loss_logistic_regression(y, tx, w)
        loss = data_loss + lambda_ * w_norm

        gradient = calculate_gradient_logistic_regression(y, tx, w)
        if lambda_ != 0 and w_norm > 0:
            # d/dw (lambda * ||w||_2) = lambda * w / ||w||_2; at w == 0 the
            # zero vector is a valid subgradient, so nothing is added there.
            gradient = gradient + lambda_ * w / w_norm
        w = w - gradient * gamma

        # Stop once two consecutive losses are indistinguishable; the != 0
        # checks skip the first two iterations, which have no history yet.
        if (loss_prev != 0 and loss_prev_prev != 0) and np.abs(loss_prev - loss) < threshold:
            print("Reached Theshold, exit")
            break

        gamma_prev = gamma
        gamma = line_search_gamma(y, tx, w, data_loss, loss_prev, loss_prev_prev,
                                  gamma, gradient, gamma_counter, iteration)

        # Count how many consecutive iterations kept the same step size.
        if gamma == gamma_prev:
            gamma_counter += 1
        else:
            gamma_counter = 0

        loss_prev_prev = loss_prev
        loss_prev = loss

        if (iteration % 100) == 0:
            print("Gamma: ", gamma)
            print("Current iteration={i}, the loss={l}".format(i=iteration, l=loss))

    return w


def logistic_regression(y, tx, gamma, max_iters):
    """Fit un-penalised logistic regression and return the final weights.

    Thin wrapper around `logistic_regression_helper` with `lambda_` fixed to 0.
    """
    no_penalty = 0
    return logistic_regression_helper(y, tx, gamma, max_iters, lambda_=no_penalty)


def reg_logistic_regression(y, tx, lambda_, gamma, max_iters):
    """Fit penalised logistic regression and return the final weights.

    Thin wrapper around `logistic_regression_helper`; `lambda_` is the penalty
    strength and is expected to be non-zero. Note the argument order differs
    from the helper's.
    """
    return logistic_regression_helper(y, tx, gamma, max_iters, lambda_=lambda_)


In [249]:
def performance(weights, y, xT):
    """Return the fraction of rows of `xT` that `weights` classifies correctly.

    Predictions come from `proj1_helpers.predict_labels(weights, xT)` and are
    compared element-wise with `y`. Both are flattened first, so the result does
    not depend on whether either one is 1-D or a column vector, and the
    mismatches are counted with a vectorised comparison instead of a Python
    loop.

    Args:
        weights: model weights accepted by `predict_labels`.
        y: expected labels, one per row of `xT`.
        xT: feature matrix.

    Returns:
        Accuracy in [0, 1] as a float.
    """
    from proj1_helpers import predict_labels

    predicted = np.asarray(predict_labels(weights, xT)).reshape(-1)
    expected = np.asarray(y).reshape(-1)

    n_wrong = np.count_nonzero(predicted != expected)
    return 1 - n_wrong / predicted.size

In [250]:

def plot_feature_relationship(y, tX):
    """Plot every standardized feature against the label, one subplot per feature.

    Rows with `y == 1` are drawn as red dots and rows with `y == -1` as blue
    dots. The subplots are laid out on a fixed 5 x 6 grid, so at most 30
    features can be shown.

    Each column is taken with a slice instead of being copied element by
    element in a Python loop, and the two class masks are computed once rather
    than once per feature.
    """
    # NOTE(review): `standardize` only exists as commented-out code in the
    # visible notebook; confirm it is defined before calling this function.
    std_tx = standardize(tX)

    positive_rows = np.where(y == 1)
    negative_rows = np.where(y == -1)

    plt.figure(figsize=(20, 20))

    for j in range(std_tx.shape[1]):
        column = std_tx[:, j]
        plt.subplot(5, 6, j + 1)
        plt.title(j)
        plt.plot(column[positive_rows], y[positive_rows], 'ro')
        plt.plot(column[negative_rows], y[negative_rows], 'bo')

    plt.tight_layout()

    plt.show()

# plot_feature_relationship(y, tX)


In [180]:

# def standardize(x):
# #     cols = [4, 5, 6, 12, 26, 28]
# #     for i in range(len(x)):
# #         x[i][np.where(x[i] == -999)] = 0

#     # Combine feature, or do poly expansion
    
#     # Do not normalize the jet num
#     jet_num = x[:, jet_num_col]
    
#     # Replace -999 with some value that is the mean/median of the represent dataset 
#     for i in range(x.shape[1]):
#         median = np.median(x[np.where(x[:, i] != -999), i])
#         x[np.where(x[:, i] == -999), i] = median 
#         x[np.where(x[:, i] != -999), i] = x[np.where(x[:, i] != -999), i] - np.mean(x[np.where(x[:, i] != -999), i])
    
#     mean_x = np.mean(x, axis=0)
#     x = x - mean_x
    
#     std_x = np.std(x, axis=0)
#     x[:, std_x > 0] = x[:, std_x > 0] / std_x[std_x > 0]

#     x[:, jet_num_col] = jet_num
    
#     return x

# print(standardize(tX) - tX)

In [181]:
# def standardize_01(x):
# #     cols = [4, 5, 6, 12, 26, 28]
# #     for i in range(len(x)):
# #         x[i][np.where(x[i] == -999)] = 0

#     # Combine feature, or do poly expansion
    
#     # Do not normalize the jet num
    
#     # Replace -999 with some value that is the mean/median of the represent dataset 
#     for i in range(x.shape[1]):
#         x[np.where(x[:, i] == -999), i] = 1
    
#     mean_x = np.mean(x, axis=0)
#     x = x - mean_x
    
#     std_x = np.std(x, axis=0)
#     x[:, std_x > 0] = x[:, std_x > 0] / std_x[std_x > 0]

#     x[:, jet_num_col] = 1
    
#     return x


In [182]:
def standardize_0123_helper(x):
    """Impute the -999 placeholder, centre and scale every column of `x` IN PLACE.

    For each column: the mean of the entries that are not -999 is computed, the
    -999 entries are replaced by that mean, and the mean is then subtracted
    (so imputed entries end up at 0). Finally every column whose standard
    deviation is positive is divided by it; constant columns are left as is.

    Args:
        x: 2-D array; it is modified and also returned.

    Returns:
        The same array object `x`.
    """
    missing = -999

    for col in range(x.shape[1]):
        observed = x[:, col] != missing
        col_mean = np.mean(x[observed, col])

        x[~observed, col] = col_mean

        # Re-evaluated after imputation, exactly like the centring step expects.
        to_centre = x[:, col] != missing
        x[to_centre, col] = x[to_centre, col] - col_mean

    col_std = np.std(x, axis=0)
    scalable = col_std > 0
    x[:, scalable] = x[:, scalable] / col_std[scalable]

    return x


def standardize_0(x):
    """Select the 18 columns kept for the first subset and standardize them.

    A fresh float copy of the selected columns is handed to
    `standardize_0123_helper`, so `x` itself is not modified.
    """
    kept_columns = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    subset = np.ascontiguousarray(x[:, kept_columns], dtype=np.float64)
    return standardize_0123_helper(subset)
    
    

def standardize_1(x):
    """Select the 22 columns kept for the second subset and standardize them.

    A fresh float copy of the selected columns is handed to
    `standardize_0123_helper`, so `x` itself is not modified.
    """
    kept_columns = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                    23, 24, 25, 29]
    subset = np.ascontiguousarray(x[:, kept_columns], dtype=np.float64)
    return standardize_0123_helper(subset)
    
    
def standardize_23(x):
    """Keep columns 0..29 except column 22 and standardize them.

    A fresh float copy of the selected columns is handed to
    `standardize_0123_helper`, so `x` itself is not modified.
    """
    kept_columns = [col for col in range(30) if col != 22]
    subset = np.ascontiguousarray(x[:, kept_columns], dtype=np.float64)
    return standardize_0123_helper(subset)

    

In [183]:
# Index of the feature column that split_dataset_wrt22 uses to partition rows
# (presumably the jet-number feature, going by the name -- confirm).
jet_num_col = 22


def split_dataset_wrt22(x):
    """Partition the rows of `x` by the value in column `jet_num_col`.

    Returns:
        Three `np.where` results (tuples of index arrays), for rows where the
        column equals 0, equals 1, and is at least 2, in that order.
    """
    selector = x[:, jet_num_col]
    rows_eq_0 = np.where(selector == 0)
    rows_eq_1 = np.where(selector == 1)
    rows_ge_2 = np.where(selector >= 2)
    return rows_eq_0, rows_eq_1, rows_ge_2



In [184]:


# There are two parameters, lambda_ and gamma, where gamma is the step size 

# max_iter = 15000
# lambdas = np.arange(0.1, 0.4, 0.1)
# gammas = np.array([0.004])

# degree = 3

# i_0, i_1, i_23 = split_dataset_wrt22(tX)

# tx_0 =  tX[i_0]
# y_0 =   y[i_0]
# tx_1 =  tX[i_1]
# y_1 =   y[i_1]
# tx_23 = tX[i_23]
# y_23 =  y[i_23]

# std_tx_0 = standardize_01(tx_0)
# std_tx_1 = standardize_01(tx_1)
# std_tx_23 = standardize(tx_23)

# matrix_std_tx_0 = build_poly(std_tx_0, degree)
# matrix_std_tx_1 = build_poly(std_tx_1, degree)
# matrix_std_tx_23 = build_poly(std_tx_23, degree)


# print(std_tx)
# print(tX)

# k_fold = 10
# k_indices = build_k_indices(y, k_fold)
    
# k = 2

# y_test = y[k_indices[k]]
# x_test = std_tx[k_indices[k]]
# train_indices = k_indices[~(np.arange(k_indices.shape[0]) == k)]
# y_train = np.concatenate(y[train_indices])
# x_train = np.concatenate(std_tx[train_indices])
    


In [290]:
# Optimiser settings: iteration budget, candidate penalties and step sizes.
max_iter = 40000
lambdas = np.arange(0.1, 0.4, 0.1)
gammas = np.array([0.005])

# Highest power used by build_poly.
degree = 2

# Row indices of the three subsets defined by column `jet_num_col`.
i_0, i_1, i_23 = split_dataset_wrt22(tX)

# Features and labels of each subset.
tx_0, y_0 = tX[i_0], y[i_0]
tx_1, y_1 = tX[i_1], y[i_1]
tx_23, y_23 = tX[i_23], y[i_23]

# Column selection + standardization, one recipe per subset.
std_tx_0, std_tx_1, std_tx_23 = (
    standardize_0(tx_0),
    standardize_1(tx_1),
    standardize_23(tx_23),
)

# Polynomial feature expansion of each standardized subset.
matrix_std_tx_0, matrix_std_tx_1, matrix_std_tx_23 = (
    build_poly(features, degree) for features in (std_tx_0, std_tx_1, std_tx_23)
)

In [291]:
# Train the model for the first subset. This call must stay enabled: weights_0
# is consumed by the performance report and by the test-set prediction cell, so
# a fresh "Restart & Run All" fails with a NameError when it is commented out.
weights_0 = reg_logistic_regression(y_0, matrix_std_tx_0, lambdas[0], gammas[0], max_iter)


In [297]:
# Train the model for the second subset with the first candidate penalty / step size.
weights_1 = reg_logistic_regression(
    y_1,
    matrix_std_tx_1,
    lambda_=lambdas[0],
    gamma=gammas[0],
    max_iters=max_iter,
)


Gamma:  0.00333333333333
Current iteration=0, the loss=53749.4049693404




Gamma:  0.000195097861399
Current iteration=100, the loss=1359218.7276774226
Gamma:  8.67149620046e-05
Current iteration=200, the loss=662801.644624778
Gamma:  5.78156841351e-05
Current iteration=300, the loss=527609.7674706967
Gamma:  2.57011484078e-05
Current iteration=400, the loss=193036.88032858213
Gamma:  1.7140793166e-05
Current iteration=500, the loss=137056.07312432767
Gamma:  1.14319590774e-05
Current iteration=600, the loss=99399.35791908401
Gamma:  7.62571829482e-06
Current iteration=700, the loss=73931.20458814682
Gamma:  7.63095519202e-06
Current iteration=800, the loss=73408.39816457803
Gamma:  7.63608992692e-06
Current iteration=900, the loss=75117.59709950353
Gamma:  5.09507649757e-06
Current iteration=1000, the loss=55504.41763473445
Gamma:  5.0997100108e-06
Current iteration=1100, the loss=54644.04935962662
Gamma:  5.10440046093e-06
Current iteration=1200, the loss=54085.156074195205
Gamma:  5.10914351047e-06
Current iteration=1300, the loss=53569.30637312257
Gamma: 

KeyboardInterrupt: 

In [90]:
# Train the model for the third subset with the first candidate penalty / step size.
weights_23 = reg_logistic_regression(
    y_23,
    matrix_std_tx_23,
    lambda_=lambdas[0],
    gamma=gammas[0],
    max_iters=max_iter,
)


Gamma:  0.00333333333333
Current iteration=0, the loss=50282.97591936011




Gamma:  0.000130061474369
Current iteration=100, the loss=1299813.3253270006
Gamma:  5.78050997194e-05
Current iteration=200, the loss=821260.9207168942
Gamma:  5.78050997194e-05
Current iteration=300, the loss=616401.3876636017
Gamma:  5.78050997194e-05
Current iteration=400, the loss=420942.4316300268
Gamma:  3.85367331463e-05
Current iteration=500, the loss=286249.6861910225
Gamma:  1.71274369539e-05
Current iteration=600, the loss=206946.38208725894
Gamma:  1.71274369539e-05
Current iteration=700, the loss=157597.89235992503
Gamma:  1.71274369539e-05
Current iteration=800, the loss=113620.56760026883
Gamma:  7.61219420174e-06
Current iteration=900, the loss=85422.16557873185
Gamma:  7.61219420174e-06
Current iteration=1000, the loss=73176.41080177324
Gamma:  7.61219420174e-06
Current iteration=1100, the loss=63774.83872950742
Gamma:  7.61219420174e-06
Current iteration=1200, the loss=56729.12816050771
Gamma:  7.61219420174e-06
Current iteration=1300, the loss=51293.49741852834
Gamm

In [293]:
def _print_performance(tag, labels, features, weights):
    """Print the subset size and the training accuracy of `weights` on it."""
    print(tag, len(labels), "\tPerformance: ", performance(weights, labels, features))


_print_performance("1  Size: ", y_1, matrix_std_tx_1, weights_1)
_print_performance("0  Size: ", y_0, matrix_std_tx_0, weights_0)
_print_performance("23 Size: ", y_23, matrix_std_tx_23, weights_23)

1  Size:  77544 	Performance:  0.7770427112349118
0  Size:  99913 	Performance:  0.8369781710087776
23 Size:  72543 	Performance:  0.8077002605351309


## Generate predictions and save output in csv format for submission:

In [198]:
# Load the test set and partition it the same way as the training set.
DATA_TEST_PATH = '../../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
i_0_test, i_1_test, i_23_test = split_dataset_wrt22(tX_test)

# Features and event ids of each subset.
tx_0_test, ids_0_test = tX_test[i_0_test], ids_test[i_0_test]
tx_1_test, ids_1_test = tX_test[i_1_test], ids_test[i_1_test]
tx_23_test, ids_23_test = tX_test[i_23_test], ids_test[i_23_test]

# NOTE(review): the standardize_* functions compute mean/std from their own
# input, so the test subsets are scaled with test statistics rather than with
# the statistics of the training subsets.
std_tx_0_test = standardize_0(tx_0_test)
std_tx_1_test = standardize_1(tx_1_test)
std_tx_23_test = standardize_23(tx_23_test)

# One submission file per subset: predict on the expanded features, then write.
output_path = '../../data/output0.csv'
y_pred_0 = predict_labels(weights_0, build_poly(std_tx_0_test, degree))
create_csv_submission(ids_0_test, y_pred_0, output_path)

output_path = '../../data/output1_79.csv'
y_pred_1 = predict_labels(weights_1, build_poly(std_tx_1_test, degree))
create_csv_submission(ids_1_test, y_pred_1, output_path)

output_path = '../../data/output23.csv'
y_pred_23 = predict_labels(weights_23, build_poly(std_tx_23_test, degree))
create_csv_submission(ids_23_test, y_pred_23, output_path)

ValueError: shapes (227458,72) and (54,1) not aligned: 72 (dim 1) != 54 (dim 0)

In [167]:
# Single-model submission for the whole test set.
# NOTE(review): no active cell visible in this notebook defines `weights` or
# `standardize` (only weights_0 / weights_1 / weights_23 and
# standardize_0 / standardize_1 / standardize_23 exist), so this cell appears to
# rely on stale kernel state -- confirm before running.
OUTPUT_PATH = '../../data/output.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, build_poly(standardize(tX_test), degree))
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)