In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission, compute_accuracy
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression
from src.logistic.gradient import compute_gradient

from src.helpers import remove_correlated_columns, get_jet_indexes, jet_indexes, log_indexes, get_all
from src.polynomials import build_poly_matrix_quadratic
from src.k_fold import build_k_indices

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data: load_csv_data returns three values, bound here as the targets
# (y), the untouched feature matrix (x_raw) and the row identifiers (ids).
y, x_raw, ids = load_csv_data('../data/train.csv')

# Width of the feature matrix before any feature engineering; the bare
# expression below makes the notebook display it.
init_col_n = x_raw.shape[-1]
init_col_n

30

In [3]:
# Transform y for logistic regression: every -1 entry is rewritten to 0
# in place (all other values are left as they are).
y[y == -1] = 0

In [4]:
# Row selectors per jet group, computed on the raw matrix.
# presumably a dict keyed by jet group (it is iterated and indexed by key
# in the cells below) — verify against src.helpers.get_jet_indexes.
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)

# Work on a copy: the imputation below writes into column 0, and aliasing
# x_raw would overwrite the raw data. Re-running the cell would then find no
# -999 left and silently produce an all-zero indicator column.
x = x_raw.copy()

# Flag rows whose column-0 value is the -999 placeholder BEFORE replacing it,
# so the information "this value was missing" survives as an extra feature.
missing_mass = x[:, 0] == -999
x_mass = missing_mass.astype(float)

# Replace the placeholder with the median of the valid column-0 values.
x[missing_mass, 0] = np.median(x[~missing_mass, 0])

# Append the 0/1 indicator as the last column of the feature matrix.
x = np.column_stack((x, x_mass))

In [5]:
# Sanity check: for every jet group, show the shape of its feature matrix
# once the columns listed in jet_indexes for that group have been dropped.
for jet in x_jet_indexes:
    subset = x[x_jet_indexes[jet]]
    print(np.delete(subset, jet_indexes[jet], axis=1).shape)

(99913, 21)
(77544, 24)
(72543, 31)


In [6]:
# Fit one regularised logistic-regression model per jet group.
#
# Results:
#   ws                    - fitted weights, one entry per jet group
#   te_accs / tr_accs     - accuracy multiplied by the group's row count
#   te_losses / tr_losses - sqrt(2 * loss) multiplied by the group's row count
# Storing count-weighted values lets the lists be summed and divided by the
# total number of rows to obtain an overall figure.
#
# NOTE: the model is evaluated on the very rows it was fitted on (no held-out
# split is made in this cell), so the reported "testing" accuracy is identical
# to the training accuracy.
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0  # regularisation strength handed to reg_logistic_regression

for i in x_jet_indexes:

    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    n_i = y_i.shape[0]

    # Drop the columns listed in jet_indexes for this group.
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)

    # Per-column log transform: adding 1 + |column minimum| keeps the argument
    # of the logarithm >= 1 for every entry, whatever the sign of the column.
    tx_del = np.log(1 + np.abs(tx_del.min(axis=0)) + tx_del)

    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    # Prepend a column of ones (intercept term).
    tx = np.c_[np.ones((n_i, 1)), tx_poly]

    initial_w = np.zeros((tx.shape[1], 1))

    # The original cell called build_k_indices(y_i, k, 1) here, but `k` is not
    # defined anywhere in the notebook (NameError on a fresh kernel) and the
    # result was never used. It is replaced by an explicit seed so repeated
    # runs start from the same RNG state.
    # NOTE(review): assumes the SGD sampling draws from NumPy's global RNG — confirm.
    np.random.seed(1)

    # Regularised logistic regression trained with SGD.
    w, loss_tr = reg_logistic_regression(y_i, tx, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

    # Loss and accuracy of the fitted weights on the same rows.
    loss_te = compute_loss(y_i, tx, w)
    acc = compute_accuracy(tx, w, y_i, mode='logistic')

    # np.squeeze turns the 1-element loss array into a scalar; np.sqrt replaces
    # the deprecated np.math alias.
    # NOTE(review): sqrt(2 * loss) looks like an RMSE conversion carried over
    # from a least-squares experiment — confirm it is meaningful here.
    rmse_te = np.sqrt(2 * np.squeeze(loss_te))
    rmse_tr = np.sqrt(2 * np.squeeze(loss_tr))

    te_accs.append(acc * n_i)
    tr_accs.append(acc * n_i)
    te_losses.append(rmse_te * n_i)
    tr_losses.append(rmse_tr * n_i)
    ws.append(w)

    # Report the entries just appended ([-1]) instead of indexing the lists by
    # the dict key, which only worked when keys happened to be 0, 1, 2, ...
    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[-1] / n_i, te=te_accs[-1] / n_i))

Current iteration=0, loss=[69254.41425129]
||d|| = 8.452017961819308
Current iteration=1000, loss=[48051.30847285]
||d|| = 2.342489911071729
Current iteration=2000, loss=[47322.49371571]
||d|| = 3.7508621461127314
Current iteration=3000, loss=[46528.73448391]
||d|| = 3.787202275331311
Current iteration=4000, loss=[46032.1087465]
||d|| = 4.516971082030042
Current iteration=5000, loss=[45957.94126128]
||d|| = 12.191519043170517
Current iteration=6000, loss=[45776.33259439]
||d|| = 3.091703034345654
Current iteration=7000, loss=[46172.32307359]
||d|| = 15.058401186380157
Current iteration=8000, loss=[45377.95173017]
||d|| = 3.4952133158559335
Current iteration=9000, loss=[45262.57208358]
||d|| = 7.864668019242488
loss=[45218.51880455]
lambda=0.000000, Training accuracy=0.772, Testing accuracy=0.772
Current iteration=0, loss=[53749.40496934]
||d|| = 7.568035474504469
Current iteration=1000, loss=[47239.06291202]
||d|| = 11.112483510723814
Current iteration=2000, loss=[46194.51581023]
||d||

In [7]:
# Load the submission set and give column 0 the same treatment as the
# training features: a 0/1 "value was missing" indicator plus median imputation.
y_sub, x_sub_raw, ids_sub = load_csv_data('../data/test.csv')

# Copy so that x_sub_raw keeps the untouched data.
x_sub = x_sub_raw.copy()

# Build the indicator BEFORE imputing. The previous order replaced every -999
# first, so the subsequent `== -999` test never matched and the indicator
# column was always all zeros, unlike the one used during training.
sub_missing_mass = x_sub[:, 0] == -999
x_sub_mass = sub_missing_mass.astype(float)

# Replace the placeholder with the median of the valid column-0 values.
x_sub[sub_missing_mass, 0] = np.median(x_sub[~sub_missing_mass, 0])

# Append the indicator as the last column, then split the rows by jet group.
x_sub = np.column_stack((x_sub, x_sub_mass))
sub_jet_indexes = get_jet_indexes(x_sub)

In [8]:
# Sanity check: for each trained model, show the shape of the matching
# submission subset once the columns listed in jet_indexes are dropped.
for jet in range(len(ws)):
    subset_sub = x_sub[sub_jet_indexes[jet]]
    print(np.delete(subset_sub, jet_indexes[jet], axis=1).shape)

(227458, 21)
(175338, 24)
(165442, 31)


In [9]:
# Predict the submission labels group by group, using the weights fitted for
# the corresponding jet group, then write the submission file.
#
# NOTE(review): the log shift below uses the minimum of the submission data,
# and standardize() is likewise applied to the submission data on its own
# (the training cell keeps only element [0] of its result) — confirm that
# recomputing these statistics on the test set is intended.
for jet, weights in enumerate(ws):

    rows = sub_jet_indexes[jet]
    tx_i_sub = x_sub[rows]
    tx_sub_del = np.delete(tx_i_sub, jet_indexes[jet], axis=1)

    # Per-column log transform; the 1 + |column minimum| offset keeps the
    # argument of the logarithm >= 1.
    tx_sub_del = np.log(1 + np.abs(tx_sub_del.min(axis=0)) + tx_sub_del)

    tx_sub_std = standardize(tx_sub_del)[0]
    tx_sub_poly = build_poly_matrix_quadratic(tx_sub_std)
    # Prepend a column of ones (intercept term).
    n_rows = y_sub[rows].shape[0]
    tx_sub = np.c_[np.ones((n_rows, 1)), tx_sub_poly]

    # Write this group's predictions into their slots of the full label vector.
    y_sub[rows] = predict_labels(weights, tx_sub, mode='logistic')

create_csv_submission(ids_sub, y_sub, '../submissions/10-27_19.10.csv')