In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, flatten_jet_features, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data
from src.polynomials import build_poly_matrix_vandermonde

from src.logistic.sigmoid import sigmoid

%reload_ext autoreload
%autoreload 2

In [2]:
# Load the training CSV; the three returned arrays are bound to the labels `y`,
# the raw feature matrix `x_raw` and the sample ids `ids`.
y, x_raw, ids = load_csv_data('../data/train.csv')

In [7]:
# Logistic regression works with {0, 1} targets: rewrite every -1 label to 0
# (in place, so re-running this cell is a no-op).
y[y == -1] = 0

In [25]:
# Standardize the raw features and keep the two extra values returned by `standardize`.
# NOTE(review): no column of ones is added here (the old comment said so), and
# `jet_split` below is called on `x_raw`, so `x`, `mean_x` and `std_x` appear to be
# unused in the cells shown -- confirm before relying on them.
x, mean_x, std_x = standardize(x_raw)

In [55]:
def jet_split(x, y, degree):
    """Partition the samples by the value in column 22 and build one design matrix per group.

    Three groups are formed from ``x[:, 22]``: value 0, value 1, and value 2 or 3
    (presumably the jet-number feature -- TODO confirm against the dataset docs).
    For each group the function

      1. drops a fixed list of columns (none for the 2/3 group),
      2. standardizes what is left with ``standardize``; only the first returned
         item is kept, so the statistics are computed per group and discarded,
      3. expands the result with ``build_poly_matrix_vandermonde(..., degree)``,
      4. prepends a column of ones.

    Args:
        x: 2-D feature array; column 22 selects the group.
        y: array sliced along its first axis with the same row masks as ``x``.
        degree: forwarded unchanged to ``build_poly_matrix_vandermonde``.

    Returns:
        ``(tx0, tx1, tx23, y0, y1, y23)``: the three design matrices followed by
        the matching slices of ``y``.
    """
    group_col = x[:, 22]

    # (row mask, columns removed before standardizing) for each group, in output order.
    # NOTE(review): column 22 is constant inside the first two groups yet is not
    # dropped -- check that `standardize` copes with zero-variance columns.
    group_specs = (
        (group_col == 0, [4, 5, 6, 12, 23, 24, 25, 26, 27, 28]),
        (group_col == 1, [4, 5, 6, 12, 26, 27, 28]),
        ((group_col == 2) | (group_col == 3), None),
    )

    design_matrices = []
    label_slices = []
    for row_mask, dropped_cols in group_specs:
        features = x[row_mask]
        if dropped_cols is not None:
            features = np.delete(features, dropped_cols, axis=1)

        expanded = build_poly_matrix_vandermonde(standardize(features)[0], degree)

        labels = y[row_mask]
        bias = np.ones((labels.shape[0], 1))
        design_matrices.append(np.c_[bias, expanded])
        label_slices.append(labels)

    return (*design_matrices, *label_slices)

In [84]:
# Build one design matrix and one label slice per jet group. `jet_split` is fed the
# raw features (it standardizes each group itself); the last argument is the
# polynomial degree, 1 here.
tx0, tx1, tx23, y0, y1, y23 = jet_split(x_raw, y, 1)

In [85]:
# Hold out part of every jet group for evaluation. The same `ratio` is handed to
# split_data for all three groups, in the order jet 0, jet 1, jets 2/3.
ratio = 0.8
(
    (tx0_train, y0_train, tx0_test, y0_test),
    (tx1_train, y1_train, tx1_test, y1_test),
    (tx23_train, y23_train, tx23_test, y23_test),
) = [
    split_data(features, labels, ratio)
    for features, labels in ((tx0, y0), (tx1, y1), (tx23, y23))
]

In [99]:
# Hyper-parameters handed to reg_logistic_regression for each of the three jet groups.
lambda_ = 0      # value passed as the `lambda_` argument; 0 in this run
iters = 10_000   # value passed as the iteration-count argument
gamma = 0.1      # value passed as the `gamma` argument

In [100]:
# Jet-0 model: start from an all-zero column vector with one entry per design-matrix column.
initial_w = np.zeros((tx0.shape[1], 1))

# Fit on the jet-0 training split; the call returns the fitted weights and a loss value.
# NOTE(review): method='sgd' presumably selects stochastic updates and no seed is set
# in this notebook, so the logged losses may differ from run to run -- confirm.
w0, tr_loss0 = reg_logistic_regression(y0_train, tx0_train, initial_w, lambda_, iters, gamma, method='sgd')

Current iteration=0, loss=[55403.25414216]
||d|| = 2.7324669642658286
Current iteration=1000, loss=[41830.79565029]
||d|| = 0.6879570115959988
Current iteration=2000, loss=[41690.83350776]
||d|| = 1.5427066289083498
Current iteration=3000, loss=[41674.36910082]
||d|| = 3.398105300757459
Current iteration=4000, loss=[41610.45699836]
||d|| = 3.2451107257763967
Current iteration=5000, loss=[41591.7873739]
||d|| = 1.8018504199484595
Current iteration=6000, loss=[41540.57308652]
||d|| = 0.597113855636787
Current iteration=7000, loss=[41523.27092091]
||d|| = 3.3018506774192136
Current iteration=8000, loss=[41501.99835543]
||d|| = 3.2686977474143335
Current iteration=9000, loss=[41488.79746188]
||d|| = 0.5429988393579248
loss=[41473.85638447]


In [101]:
# Jet-1 model: start from an all-zero column vector with one entry per design-matrix column.
# (`initial_w` is rebound here; the jet-0 starting point is no longer needed.)
initial_w = np.zeros((tx1.shape[1], 1))

# Fit on the jet-1 training split; the call returns the fitted weights and a loss value.
# NOTE(review): method='sgd' presumably selects stochastic updates and no seed is set
# in this notebook, so the logged losses may differ from run to run -- confirm.
w1, tr_loss1 = reg_logistic_regression(y1_train, tx1_train, initial_w, lambda_, iters, gamma, method='sgd')

Current iteration=0, loss=[42999.38534604]
||d|| = 2.844322357540556
Current iteration=1000, loss=[38296.79092955]
||d|| = 2.55456092537108
Current iteration=2000, loss=[38122.03463385]
||d|| = 3.526360208399795
Current iteration=3000, loss=[38141.83027134]
||d|| = 2.661227538377066
Current iteration=4000, loss=[37985.01491669]
||d|| = 3.563433683303491
Current iteration=5000, loss=[37963.97422366]
||d|| = 3.319494340786066
Current iteration=6000, loss=[37899.1330591]
||d|| = 1.99703989920449
Current iteration=7000, loss=[37873.45684235]
||d|| = 1.9373541027900703
Current iteration=8000, loss=[37866.58580558]
||d|| = 3.4048460217480936
Current iteration=9000, loss=[37829.27914076]
||d|| = 2.1017452462546005
loss=[37900.0503434]


In [102]:
# Jets-2/3 model: start from an all-zero column vector with one entry per design-matrix column.
# (`initial_w` is rebound here; the jet-1 starting point is no longer needed.)
initial_w = np.zeros((tx23.shape[1], 1))

# Fit on the jets-2/3 training split; the call returns the fitted weights and a loss value.
# NOTE(review): method='sgd' presumably selects stochastic updates and no seed is set
# in this notebook, so the logged losses may differ from run to run -- confirm.
w23, tr_loss23 = reg_logistic_regression(y23_train, tx23_train, initial_w, lambda_, iters, gamma, method='sgd')

Current iteration=0, loss=[40226.10347662]
||d|| = 3.106497919695004
Current iteration=1000, loss=[35255.76546146]
||d|| = 2.304779896982352
Current iteration=2000, loss=[35137.2703897]
||d|| = 3.2448625782882954
Current iteration=3000, loss=[35078.092544]
||d|| = 2.4575150274350737
Current iteration=4000, loss=[35319.9623284]
||d|| = 1.8464445130855551
Current iteration=5000, loss=[35230.16657967]
||d|| = 2.9636975559067933
Current iteration=6000, loss=[34875.12725833]
||d|| = 0.9186356060105719
Current iteration=7000, loss=[34860.25135999]
||d|| = 5.202261626706678
Current iteration=8000, loss=[34821.29173659]
||d|| = 1.2689119555751922
Current iteration=9000, loss=[34807.01489777]
||d|| = 4.483115402909467
loss=[34810.52652304]


In [103]:
# Predict on the held-out jet-0 rows.
y0_pred = predict_labels(w0, tx0_test)

# Fraction of held-out jet-0 rows predicted correctly.
# Both sides are flattened first: the weights start out as a (d, 1) column, so if the
# predictions come back as (n, 1) while the labels are (n,), a bare `==` would
# broadcast to an (n, n) matrix and the count would be meaningless.
# NOTE(review): y was remapped to {0, 1}; confirm predict_labels emits the same encoding.
acc0 = np.mean(np.ravel(y0_pred) == np.ravel(y0_test))

In [104]:
# Predict on the held-out jet-1 rows.
y1_pred = predict_labels(w1, tx1_test)

# Fraction of held-out jet-1 rows predicted correctly.
# Both sides are flattened first: the weights start out as a (d, 1) column, so if the
# predictions come back as (n, 1) while the labels are (n,), a bare `==` would
# broadcast to an (n, n) matrix and the count would be meaningless.
# NOTE(review): y was remapped to {0, 1}; confirm predict_labels emits the same encoding.
acc1 = np.mean(np.ravel(y1_pred) == np.ravel(y1_test))

In [105]:
# Predict on the held-out jets-2/3 rows.
y23_pred = predict_labels(w23, tx23_test)

# Fraction of held-out jets-2/3 rows predicted correctly.
# Both sides are flattened first: the weights start out as a (d, 1) column, so if the
# predictions come back as (n, 1) while the labels are (n,), a bare `==` would
# broadcast to an (n, n) matrix and the count would be meaningless.
# NOTE(review): y was remapped to {0, 1}; confirm predict_labels emits the same encoding.
acc23 = np.mean(np.ravel(y23_pred) == np.ravel(y23_test))

In [106]:
# Overall held-out accuracy: per-group accuracies weighted by their test-set sizes.
# The denominator is the real number of held-out rows rather than `y.shape[0] * 0.2`,
# which hard-coded the hold-out fraction and is only approximate once each group has
# been split (and rounded) separately.
n_test_total = y0_test.shape[0] + y1_test.shape[0] + y23_test.shape[0]
(acc0 * y0_test.shape[0] + acc1 * y1_test.shape[0] + acc23 * y23_test.shape[0]) / n_test_total

0.68986

- Result with penalized SGD (gamma = 0.1, lambda = 0.01, degree 2, 10k iterations): loss = 115658, test set accuracy = 0.68584
- Result with penalized SGD (gamma = 0.1, lambda = 0.01, degree 1, 10k iterations): loss = 115658, test set accuracy = 0.68904
- Result with unpenalized SGD (gamma = 0.1, lambda = 0, degree 1, 10k iterations): loss = 115658, test set accuracy = 0.68986

In [68]:
# Load the submission CSV: raw features `x_sub_raw` and ids `ids_sub`.
# `y_sub` as loaded here is overwritten with model predictions further down.
y_sub, x_sub_raw, ids_sub = load_csv_data('../data/test.csv')

In [70]:
# One model was trained per jet group (w0, w1, w23), so the submission rows go through
# the same `jet_split` pipeline, with the same degree (1) used to build the training sets.
# `jet_split` only slices its second argument with the group masks, so passing row
# positions gives back, for each group, the original positions of its rows.
# NOTE(review): `jet_split` standardizes each group with statistics computed on the
# data it is given, i.e. on the submission rows here rather than the training rows.
row_positions = np.arange(x_sub_raw.shape[0])
tx0_sub, tx1_sub, tx23_sub, rows0, rows1, rows23 = jet_split(x_sub_raw, row_positions, 1)

# Predict each group with its own weights and write the results back in file order.
# NOTE(review): confirm predict_labels emits the label encoding the submission expects.
y_sub = np.empty(x_sub_raw.shape[0])
for w_jet, tx_jet, rows_jet in ((w0, tx0_sub, rows0), (w1, tx1_sub, rows1), (w23, tx23_sub, rows23)):
    y_sub[rows_jet] = np.ravel(predict_labels(w_jet, tx_jet))

In [72]:
# Write the ids and their predicted labels to the submission file.
# NOTE(review): the output name is a hard-coded timestamp; re-running this cell
# presumably overwrites that file -- change the name for a new submission.
create_csv_submission(ids_sub, y_sub, '../submissions/10-24.14-18.csv')