In [98]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [99]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)


## Do your thing crazy machine learning thing here :) ...

In [137]:
def build_poly(x, degree):
    
    matrix = np.zeros((x.shape[0], x.shape[1] * (degree + 1)))
    
    for i in range(degree + 1):
        matrix[:, (i * x.shape[1]) : ((i + 1) * x.shape[1])] = (x ** i)[:]
    return matrix


def build_k_indices(y, k_fold, seed=62):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

def batch_iter(y, tx, batch_size, num_batches=None, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """

    data_size = len(y)
    num_batches_max = int(np.ceil(data_size/batch_size))
    if num_batches is None:
        num_batches = num_batches_max
    else:
        num_batches = min(num_batches, num_batches_max)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]

In [108]:
BINARY_CLASSIFICATOIN_0 = -1
BINARY_CLASSIFICATOIN_1 = 1


def sigmoid(t):
    """apply sigmoid function on t."""
    return 1.0 / (1.0 + np.exp(-t))


def calculate_loss_logistic_regression(y, tx, w):
    """compute the cost by negative log likelihood."""
    prediction = tx @ w
    
    y1 = np.where(y == BINARY_CLASSIFICATOIN_1)

    over_700 = np.where(prediction >= 700)

    prediction_result = np.log(1 + np.exp(prediction))
    prediction_result[over_700] = prediction[over_700]
    prediction_result[y1] -= prediction[y1]
    
    result = np.sum(prediction_result)
    return result


def calculate_gradient_logistic_regression(y, tx, w):
    """compute the gradient of loss."""

    y1 = np.where(y == BINARY_CLASSIFICATOIN_1)
    sig = sigmoid(tx @ w).reshape(len(y))
    sig[y1] -= y[y1]

    return tx.T @ sig


def logistic_regression_helper(y, tx, gamma, max_iters, lambda_):

    w = np.zeros((tx.shape[1], 1))
    threshold = 1e-8
    loss_prev = 0
    w_max = w
    perf = 0
    i = 0
    
    batch_size = len(y) / 5

    for iter in range(max_iters):
        
        for mini_y, mini_x in batch_iter(y, tx, batch_size):
        
            loss = calculate_loss_logistic_regression(mini_y, mini_x, w) + lambda_ * np.linalg.norm(w, 2)
            gradient = calculate_gradient_logistic_regression(mini_y, mini_x, w)
            w -= (gradient * gamma).reshape(w.shape)

            if (loss_prev != 0) and np.abs(loss_prev - loss) < threshold:
                print("Reached Theshold, exit")
                break

            loss_prev = loss
            
        
#         if (iter % 10) == 0:
#             cur_perf = performance(w, y, tx)
#             if cur_perf >= perf:
#                 w_max = w
#                 perf = cur_perf
#                 i = iter


        if (iter % 100) == 0:
            print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))

#         if (iter % 300) == 0:
#             print(w_max)
#             print("Performance: ", perf)
#             print("Iteration: ", i)
            

    return w


def logistic_regression(y, tx, gamma, max_iters):
    """ return the final w from the logistic regression """
    return logistic_regression_helper(y, tx, gamma, max_iters, lambda_=0)


def reg_logistic_regression(y, tx, lambda_, gamma, max_iters):
    """ return the final w from the penalized logistic regression, with lambda_ as a non 0 value"""
    return logistic_regression_helper(y, tx, gamma, max_iters, lambda_)


In [89]:
def performance(weights, y, xT):
    """Returns the percentage of successful classifications for the weights,
    given the expected results (y) and data (xT)"""
    from proj1_helpers import predict_labels
    compare_pred = predict_labels(weights, xT)
    compare_pred -= y.reshape((len(y), 1))
        
    non_zero = 0
    for i in range(len(compare_pred)):
        if compare_pred[i] != 0:
            non_zero += 1
            
    return 1 - non_zero / compare_pred.size

In [90]:

def plot_feature_relationship(y, tX):

    std_tx = standardize(tX)

    row = np.zeros(std_tx.shape[0])

    # for i in range(len(tX)):
    #     row[i] = tX[i][0]

    plt.figure(figsize=(20, 20))

    for j in range(std_tx.shape[1]):
        for i in range(len(std_tx)):
            row[i] = std_tx[i][j]
        plt.subplot(5, 6, j + 1)
        plt.title(j)
        plt.plot(row[np.where(y == 1)], y[np.where(y == 1)], 'ro')
        plt.plot(row[np.where(y == -1)], y[np.where(y == -1)], 'bo')
  

    plt.tight_layout()

    plt.show()

# plot_feature_relationship(y, tX)


In [100]:
def standardize(x):
#     cols = [4, 5, 6, 12, 26, 28]
#     for i in range(len(x)):
#         x[i][np.where(x[i] == -999)] = 0

    # Combine feature, or do poly expansion
    
    # Do not normalize the jet num
    jet_num_col = 22
    jet_num = x[:, jet_num_col]
    
    # Replace -999 with some value that is the mean/median of the represent dataset 
    for i in range(x.shape[1]):
        median = np.median(x[np.where(x[:, i] != -999), i])
        x[np.where(x[:, i] == -999), i] = median 
        x[np.where(x[:, i] != -999), i] = x[np.where(x[:, i] != -999), i] - np.mean(x[np.where(x[:, i] != -999), i])
    
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    
    std_x = np.std(x, axis=0)
    x[:, std_x > 0] = x[:, std_x > 0] / std_x[std_x > 0]

    x[:, jet_num_col] = jet_num
    
    return x

# print(standardize(tX) - tX)

In [92]:
# def add_feature(tX, cols, )

In [140]:


# There are two parameters, lambda_ and gamma, where gamma is the step size 

max_iter = 15000
lambdas = np.arange(0.1, 0.4, 0.1)
gammas = [0.05]

degree = 3

std_tx = standardize(tX)
matrix_std = build_poly(std_tx, degree)
# print(std_tx)
# print(tX)

# k_fold = 10
# k_indices = build_k_indices(y, k_fold)
    
# k = 2

# y_test = y[k_indices[k]]
# x_test = std_tx[k_indices[k]]
# train_indices = k_indices[~(np.arange(k_indices.shape[0]) == k)]
# y_train = np.concatenate(y[train_indices])
# x_train = np.concatenate(std_tx[train_indices])
    
weights = reg_logistic_regression(y, matrix_std, lambdas[0], gammas[0], max_iter)

print(weights)

 




Current iteration=0, the loss=3180308687.6000843
Current iteration=100, the loss=889313007.2500957
Current iteration=200, the loss=1411474542.2793546
Current iteration=300, the loss=5656608215.101741
Current iteration=400, the loss=913416419.7535772
Current iteration=500, the loss=1208454568.5202713
Current iteration=600, the loss=1135909556.7617693
Current iteration=700, the loss=2628409051.558204
Current iteration=800, the loss=1588890502.000391
Current iteration=900, the loss=720905312.1548002
Current iteration=1000, the loss=898052316.7573014
Current iteration=1100, the loss=935839693.9204004
Current iteration=1200, the loss=793017926.9597172
Current iteration=1300, the loss=1736994288.4637105
Current iteration=1400, the loss=936578964.9458123
Current iteration=1500, the loss=519054400.51307905
Current iteration=1600, the loss=634680928.4236592
Current iteration=1700, the loss=1235288395.0958323
Current iteration=1800, the loss=579119340.2990378
Current iteration=1900, the loss=532

In [141]:



print(performance(weights, y, matrix_std))

0.779196


## Generate predictions and save ouput in csv format for submission:

In [142]:
DATA_TEST_PATH = '../../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [143]:
OUTPUT_PATH = '../../data/output.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, build_poly(standardize(tX_test), degree))
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)