In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid

%reload_ext autoreload
%autoreload 2

In [3]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')

In [3]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)

In [5]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [6]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [7]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

## Plain logistic regression

In [19]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 100000, 0.01, method='sgd')

2262984
Current iteration=73300, loss=[136962.1559508]
||d|| = 0.5941439287913605
Current iteration=73400, loss=[137321.24478922]
||d|| = 1.3720806001791432
Current iteration=73500, loss=[136879.39428901]
||d|| = 0.21088987013687352
Current iteration=73600, loss=[138473.90002365]
||d|| = 0.34067314234831886
Current iteration=73700, loss=[136274.09138344]
||d|| = 0.897425031512749
Current iteration=73800, loss=[136676.012739]
||d|| = 0.3395781969356725
Current iteration=73900, loss=[135257.75571366]
||d|| = 0.5371458495169106
Current iteration=74000, loss=[135827.87017012]
||d|| = 0.5281365873490873
Current iteration=74100, loss=[136897.46244334]
||d|| = 0.5885020672193488
Current iteration=74200, loss=[135893.05574127]
||d|| = 0.4155854003366906
Current iteration=74300, loss=[136641.44592865]
||d|| = 1.22564450074405
Current iteration=74400, loss=[135783.66237767]
||d|| = 2.093968465640055
Current iteration=74500, loss=[136596.64678365]
||d|| = 2.1149871532145914
Current iteration=7460

Result with gamma = 0.001 and 100k iter ||d|| = 0.8317845119149855 loss = 137795.11103903, test set accuracy = 0.67216

Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562



In [10]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [21]:
x_sub = x_sub_raw[:, kept_columns]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
y_sub = predict_labels(w, tx_sub)

In [22]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [20]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.67562

In [14]:
lambdas = np.logspace(-5, 0, 15)
initial_w = np.zeros((tx.shape[1], 1))

ws = []
tr_losses = []
te_losses = []

for ind, lambda_ in enumerate(lambdas):
    w, tr_loss = reg_logistic_regression(y, tx, initial_w, lambda_, 10000, 0.001, method='newton')
    ws.append(w)
    te_loss = compute_loss(y_test, tx_test, w, lambda_=lambda_)
    tr_losses.append(tr_loss[0])
    te_losses.append(te_loss[0])

    y_pred = predict_labels(w, tx_test)
    training_accuracy = (y_pred == y_test).sum() / y_test.shape[0]

    print("lambda={lambda_}, Training loss={tr:.3f}, Testing loss={te:.3f}".format(
        lambda_=lambda_, tr=tr_losses[ind], te=te_losses[ind]))
    print("Training accuracy={acc:.3f}".format(acc=training_accuracy))

Current iteration=0, loss=[173313.04513999]
||d|| = 1.26305514443864
Current iteration=1000, loss=[168283.06486387]
||d|| = 1.819023295795367
Current iteration=2000, loss=[166816.44837511]
||d|| = 1.0077769045239235
Current iteration=3000, loss=[166393.08782902]
||d|| = 0.8318294301989898
Current iteration=4000, loss=[166597.5718525]
||d|| = 2.6272222280554387
Current iteration=5000, loss=[166996.0668856]
||d|| = 0.8493192680566024
Current iteration=6000, loss=[166730.77618249]
||d|| = 1.0512566439418087
Current iteration=7000, loss=[166921.00926433]
||d|| = 3.5558377129950416
Current iteration=8000, loss=[166223.62885936]
||d|| = 0.6963285060093969
Current iteration=9000, loss=[167875.55098277]
||d|| = 0.9897859278051709
loss=[167745.57591656]
lambda=1e-05, Training loss=167800.920, Testing loss=33595.736
Training accuracy=0.655
Current iteration=0, loss=[173346.53609555]
||d|| = 0.8728545565961453
Current iteration=1000, loss=[168460.05859765]
||d|| = 1.3735031108323827
Current itera

## Logistic regression with summed jet columns

In [47]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')

In [48]:
x = flatten_jet_features(x_raw)

In [49]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [63]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]
tx_poly = build_poly_matrix_vandermonde(tx, 2)

In [64]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx_poly, y, 0.8)

In [65]:
initial_w = np.zeros((tx_poly.shape[1], 1))

w, tr_loss = reg_logistic_regression(y_train, tx_train, initial_w, 0.001, 10000, 0.1, method='sgd')

Current iteration=0, loss=[271210.8712248]
||d|| = 28.187006321784594
Current iteration=1000, loss=[473276.92933884]
||d|| = 0.0018141404419597484
Current iteration=2000, loss=[471457.80712879]
||d|| = 72.80105748739423
Current iteration=3000, loss=[413814.03478361]
||d|| = 2.20590705580624
Current iteration=4000, loss=[382755.89768559]
||d|| = 0.0011342766214547798
Current iteration=5000, loss=[362612.0083851]
||d|| = 6.656985821296442
Current iteration=6000, loss=[347606.82277806]
||d|| = 3.1595568788074604
Current iteration=7000, loss=[325844.92175665]
||d|| = 17.07173239625616
Current iteration=8000, loss=[313553.25590884]
||d|| = 3.19607849012299
Current iteration=9000, loss=[295879.7846367]
||d|| = 2.085721188752055
loss=[153269.50450747]


In [66]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.68696

- Result with SGD, gamma = 0.1, degree 2, 10k iter: loss = 115658, test set accuracy = 0.68696
- Result with Newton, gamma = 0.1, degree 2, 10k iter: loss = 126817, test set accuracy = 0.6554


In [35]:
def poly_reg_logistic_regression(tx, y, degree, ratio):
    
    tx_poly = build_poly_matrix_vandermonde(tx, degree)
    tx_train, y_train, tx_test, y_test = split_data(tx_poly, y, ratio)
 
    lambdas = np.logspace(-5, 0, 15)
    initial_w = np.zeros((tx_train.shape[1], 1))

    ws = []
    tr_losses = []
    te_losses = []

    for ind, lambda_ in enumerate(lambdas):


        w, tr_loss = reg_logistic_regression(y_train, tx_train, initial_w, lambda_, 10000, 0.1, method='newton')
        ws.append(w)
        te_loss = compute_loss(y_test, tx_test, w, lambda_=lambda_)
        tr_losses.append(tr_loss.flatten()[0])
        te_losses.append(te_loss.flatten()[0])

        y_pred = predict_labels(w, tx_test)
        training_accuracy = (y_pred == y_test).sum() / y_test.shape[0]

        print("lambda={lambda_}, Training loss={tr:.3f}, Testing loss={te:.3f}".format(
            lambda_=lambda_, tr=tr_losses[ind], te=te_losses[ind]))
        print("Training accuracy={acc:.3f}".format(acc=training_accuracy))

poly_reg_logistic_regression(tx, y, 2, 0.8)

Current iteration=0, loss=[139955.25046312]
||d|| = 2.7131764979399953
Current iteration=1000, loss=[129131.8432457]
||d|| = 5.66480554398374
Current iteration=2000, loss=[128452.4336647]
||d|| = 3.358482380966959
Current iteration=3000, loss=[128904.39914444]
||d|| = 3.1543797487187564
Current iteration=4000, loss=[128822.17365715]
||d|| = 3.870159339983603
Current iteration=5000, loss=[128707.81919923]
||d|| = 2.157741942193498
Current iteration=6000, loss=[128793.65809358]
||d|| = 5.063492435556481
Current iteration=7000, loss=[128838.25429689]
||d|| = 2.116481931366886
Current iteration=8000, loss=[128900.86620889]
||d|| = 2.117993899172692
Current iteration=9000, loss=[128713.22159776]
||d|| = 2.10311382058259
loss=[127524.0134424]
lambda=1e-05, Training loss=128856.288, Testing loss=32318.584
Training accuracy=0.655
Current iteration=0, loss=[141646.78530174]
||d|| = 28.187607308561805
Current iteration=1000, loss=[129927.61377292]
||d|| = 2.4503415278703855
Current iteration=200

KeyboardInterrupt: 

## Logistic regression with different subsets

### With quadratic polynomial expansion

In [2]:
# Import data
y, x_raw, ids = load_csv_data('data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [3]:
# create list of columns indices
curr_cols = list(range(x_raw.shape[1]))

def rem_cols(lst, mask):
    for c,x in enumerate(mask):
        if not x:
            lst.remove(c)

In [4]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)
rem_cols(curr_cols, kept_columns)

In [5]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 29]

In [6]:
print(x)

[[ 138.47    51.655   97.827 ...  258.733    2.     113.497]
 [ 160.937   68.768  103.235 ...  164.546    1.      46.226]
 [-999.     162.172  125.953 ...  260.414    1.      44.251]
 ...
 [ 105.457   60.526   75.839 ...  198.907    1.      41.992]
 [  94.951   19.362   68.812 ...  112.718    0.       0.   ]
 [-999.      72.756   70.831 ...   99.405    0.       0.   ]]


In [7]:
x, kept_columns_2 = remove_correlated_columns(x)
rem_cols(curr_cols, kept_columns_2)

In [8]:
kept_columns_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False])

In [9]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 18, 20, 21, 22, 29]

In [10]:
# log transform
to_log_cols_idxs = [2, 3, 4, 6, 7, 8, 10, 13, 16]
for i in to_log_cols_idxs:
    print(x[:,i].min())
    x[:,i] = np.apply_along_axis(lambda n: np.log(1 + abs(x[:,i].min()) + n), 0, x[:,i])

6.329
0.0
0.208
46.104
0.047
-1.414
-2.499
-2.505
-3.142


In [11]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [12]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [13]:
# poly
tx = build_poly_matrix_quadratic(tx)

In [14]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [15]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 10000, 0.05, method='sgd', ratio=0.5)

Current iteration=0, loss=[173286.79513999]
||d|| = 3.443337520133352
Current iteration=1000, loss=[169331.57214641]
||d|| = 8.587969507816975
Current iteration=2000, loss=[140566.09295898]
||d|| = 3.4237780837681195
Current iteration=3000, loss=[139128.99164782]
||d|| = 3.3300413210784403
Current iteration=4000, loss=[142754.26234536]
||d|| = 2.0571933740014594
Current iteration=5000, loss=[133270.19730013]
||d|| = 6.575845170786087
Current iteration=6000, loss=[134135.57444219]
||d|| = 5.557606082362086
Current iteration=7000, loss=[132673.5156465]
||d|| = 2.2204047658010806
Current iteration=8000, loss=[133118.57044955]
||d|| = 0.8498772901081524
Current iteration=9000, loss=[137660.72928189]
||d|| = 6.426500209839358
loss=[132741.37886452]


Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562

---

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65534

Robbins-Monroe | Result with quadratic, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.6554

---

Robbins-Monroe | quadratic, gamma = 0.01, ratio = 0.5, 10k iter | loss = [143340.81035078], test set accuracy = 0.6616

---

logistic w/ RM, log, quadratic | gamma = 0.01, ratio = 0.5, 10k iter | loss = [137580.1845174], test set accuracy = 0.69154

logistic w/ RM, log, quadratic | gamma = 0.05, ratio = 0.5, 10k iter | loss = [132741.37886452], test set accuracy = 0.74846

### Without mass column

In [2]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [3]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [5]:
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)
x = x_raw
# x_mass = np.zeros(x.shape[0])
# x_mass[x[:, 0] == -999] = 1
x[:, 0][x[:, 0] == -999] = np.median(x[:, 0][x[:, 0] != -999])
# x = np.column_stack((x, x_mass))

In [7]:
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0
k = 5

for i in x_jet_indexes:
    
    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    
    for li in log_indexes:
       # print(tx_del[:,li].min())
       tx_del[:,li] = np.apply_along_axis(lambda n: np.log(1 + abs(tx_del[:,li].min()) + n), 0, tx_del[:,li])
    
    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    tx = np.c_[np.ones((y_i.shape[0], 1)), tx_poly]
    
    initial_w = np.zeros((tx.shape[1], 1))

    k_indices = build_k_indices(y_i, k, 1)
    
    te_accs_k, tr_accs_k, te_losses_k, tr_losses_k, ws_k = [], [], [], [], []
    
    for k_ in range(k):
        
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(k_indices.flatten(), test_indices)

        y_train = y_i[train_indices]
        x_train = tx[train_indices]
        y_test = y_i[test_indices]
        x_test = tx[test_indices]

        # Ridge linear
        w, loss_tr_k = reg_logistic_regression(y_train, x_train, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

        # Calculate the loss for test data
        loss_te_k = compute_loss(y_test, x_test, w)
        
        acc_tr_k = compute_accuracy(x_train, w, y_train, mode='logistic')
        acc_te_k = compute_accuracy(x_test, w, y_test, mode='logistic')
        
        te_accs_k.append(acc_te_k)
        tr_accs_k.append(acc_tr_k)
        te_losses_k.append(np.math.sqrt(2 * loss_te_k))
        tr_losses_k.append(np.math.sqrt(2 * loss_tr_k))

        ws_k.append(w)
        

    te_accs.append(np.mean(te_accs_k) * x[x_jet_indexes[i]].shape[0])
    tr_accs.append(np.mean(acc_te_k) * x[x_jet_indexes[i]].shape[0])
    te_losses.append(np.mean(te_losses_k) * x[x_jet_indexes[i]].shape[0])
    tr_losses.append(np.mean(tr_losses_k) * x[x_jet_indexes[i]].shape[0])
    ws.append(np.mean(ws_k, axis=0))

    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[i] / y_i.shape[0], te=te_accs[i] / y_i.shape[0]))

Current iteration=0, loss=[55401.8678478]
||d|| = 2.5787159411771086
Current iteration=1000, loss=[48723.13405209]
||d|| = 5.922103991217265
Current iteration=2000, loss=[49316.23811431]
||d|| = 7.187825262124538
Current iteration=3000, loss=[47112.09800444]
||d|| = 1.2774213848064553
Current iteration=4000, loss=[47590.02996856]
||d|| = 3.5928271124780147
Current iteration=5000, loss=[46988.97556607]
||d|| = 16.16833323616141
Current iteration=6000, loss=[46356.64971734]
||d|| = 0.9629467944887913
Current iteration=7000, loss=[46352.58603949]
||d|| = 5.306026661594372
Current iteration=8000, loss=[46224.20597002]
||d|| = 3.808306576732271
Current iteration=9000, loss=[46568.19341752]
||d|| = 4.211216331407385
loss=[46164.79383636]
Current iteration=0, loss=[55401.8678478]
||d|| = 3.469929013599797
Current iteration=1000, loss=[46925.37148927]
||d|| = 7.224795563582372
Current iteration=2000, loss=[47209.80848357]
||d|| = 4.342158945304195
Current iteration=3000, loss=[46341.56359176]


### With mass column

In [8]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [9]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [10]:
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)
x = x_raw
x_mass = np.zeros(x.shape[0])
x_mass[x[:, 0] == -999] = 1
x[:, 0][x[:, 0] == -999] = np.median(x[:, 0][x[:, 0] != -999])
x = np.column_stack((x, x_mass))

In [11]:
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0
k = 5

for i in x_jet_indexes:
    
    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    
    for li in log_indexes:
       # print(tx_del[:,li].min())
       tx_del[:,li] = np.apply_along_axis(lambda n: np.log(1 + abs(tx_del[:,li].min()) + n), 0, tx_del[:,li])
    
    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    tx = np.c_[np.ones((y_i.shape[0], 1)), tx_poly]
    
    initial_w = np.zeros((tx.shape[1], 1))

    k_indices = build_k_indices(y_i, k, 1)
    
    te_accs_k, tr_accs_k, te_losses_k, tr_losses_k, ws_k = [], [], [], [], []
    
    for k_ in range(k):
        
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(k_indices.flatten(), test_indices)

        y_train = y_i[train_indices]
        x_train = tx[train_indices]
        y_test = y_i[test_indices]
        x_test = tx[test_indices]

        # Ridge linear
        w, loss_tr_k = reg_logistic_regression(y_train, x_train, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

        # Calculate the loss for test data
        loss_te_k = compute_loss(y_test, x_test, w)
        
        acc_tr_k = compute_accuracy(x_train, w, y_train, mode='logistic')
        acc_te_k = compute_accuracy(x_test, w, y_test, mode='logistic')
        
        te_accs_k.append(acc_te_k)
        tr_accs_k.append(acc_tr_k)
        te_losses_k.append(np.math.sqrt(2 * loss_te_k))
        tr_losses_k.append(np.math.sqrt(2 * loss_tr_k))

        ws_k.append(w)
        

    te_accs.append(np.mean(te_accs_k) * x[x_jet_indexes[i]].shape[0])
    tr_accs.append(np.mean(acc_te_k) * x[x_jet_indexes[i]].shape[0])
    te_losses.append(np.mean(te_losses_k) * x[x_jet_indexes[i]].shape[0])
    tr_losses.append(np.mean(tr_losses_k) * x[x_jet_indexes[i]].shape[0])
    ws.append(np.mean(ws_k, axis=0))

    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[i] / y_i.shape[0], te=te_accs[i] / y_i.shape[0]))

Current iteration=0, loss=[55401.8678478]
||d|| = 3.4241362425983177
Current iteration=1000, loss=[49820.68382129]
||d|| = 12.675298374652264
Current iteration=2000, loss=[47261.71399705]
||d|| = 2.795636048590469
Current iteration=3000, loss=[46613.63864134]
||d|| = 7.766423477049506
Current iteration=4000, loss=[47082.00467998]
||d|| = 11.5013856680073
Current iteration=5000, loss=[46583.53475557]
||d|| = 14.287333971330819
Current iteration=6000, loss=[46299.55575193]
||d|| = 3.871322223252055
Current iteration=7000, loss=[46312.30588604]
||d|| = 1.5363500856181416
Current iteration=8000, loss=[46412.66924231]
||d|| = 1.2021993539628
Current iteration=9000, loss=[46184.71128872]
||d|| = 8.766659316064718
loss=[46976.23589851]
Current iteration=0, loss=[55401.8678478]
||d|| = 10.246944292417446
Current iteration=1000, loss=[48318.82925819]
||d|| = 8.505885622731759
Current iteration=2000, loss=[47809.82206217]
||d|| = 5.960986018032392
Current iteration=3000, loss=[46996.7634807]
||d

### Without logarithm

In [20]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [21]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [22]:
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)
x = x_raw
x_mass = np.zeros(x.shape[0])
x_mass[x[:, 0] == -999] = 1
x[:, 0][x[:, 0] == -999] = np.median(x[:, 0][x[:, 0] != -999])
x = np.column_stack((x, x_mass))

In [23]:
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0
k = 5

for i in x_jet_indexes:
    
    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    
    # for li in log_indexes:
    #   # print(tx_del[:,li].min())
    #   tx_del[:,li] = np.apply_along_axis(lambda n: np.log(1 + abs(tx_del[:,li].min()) + n), 0, tx_del[:,li])
    
    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    tx = np.c_[np.ones((y_i.shape[0], 1)), tx_poly]
    
    initial_w = np.zeros((tx.shape[1], 1))

    k_indices = build_k_indices(y_i, k, 1)
    
    te_accs_k, tr_accs_k, te_losses_k, tr_losses_k, ws_k = [], [], [], [], []
    
    for k_ in range(k):
        
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(k_indices.flatten(), test_indices)

        y_train = y_i[train_indices]
        x_train = tx[train_indices]
        y_test = y_i[test_indices]
        x_test = tx[test_indices]

        # Ridge linear
        w, loss_tr_k = reg_logistic_regression(y_train, x_train, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

        # Calculate the loss for test data
        loss_te_k = compute_loss(y_test, x_test, w)
        
        acc_tr_k = compute_accuracy(x_train, w, y_train, mode='logistic')
        acc_te_k = compute_accuracy(x_test, w, y_test, mode='logistic')
        
        te_accs_k.append(acc_te_k)
        tr_accs_k.append(acc_tr_k)
        te_losses_k.append(np.math.sqrt(2 * loss_te_k))
        tr_losses_k.append(np.math.sqrt(2 * loss_tr_k))

        ws_k.append(w)
        

    te_accs.append(np.mean(te_accs_k) * x[x_jet_indexes[i]].shape[0])
    tr_accs.append(np.mean(acc_te_k) * x[x_jet_indexes[i]].shape[0])
    te_losses.append(np.mean(te_losses_k) * x[x_jet_indexes[i]].shape[0])
    tr_losses.append(np.mean(tr_losses_k) * x[x_jet_indexes[i]].shape[0])
    ws.append(np.mean(ws_k, axis=0))

    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[i] / y_i.shape[0], te=te_accs[i] / y_i.shape[0]))

Current iteration=0, loss=[55401.8678478]
||d|| = 6.476739119738675
Current iteration=1000, loss=[38069.98310069]
||d|| = 10.417890193416616
Current iteration=2000, loss=[38339.38567851]
||d|| = 4.652161093775247
Current iteration=3000, loss=[37393.65378116]
||d|| = 2.4425804190750786
Current iteration=4000, loss=[37099.2471176]
||d|| = 1.818833262249398
Current iteration=5000, loss=[36853.63971436]
||d|| = 2.1610816054541933
Current iteration=6000, loss=[36460.39148303]
||d|| = 1.0174059679392067
Current iteration=7000, loss=[36424.98055793]
||d|| = 6.904450684956423
Current iteration=8000, loss=[36196.6889689]
||d|| = 2.1274999535267787
Current iteration=9000, loss=[36449.78074782]
||d|| = 1.6229486924072816
loss=[36227.84596917]
Current iteration=0, loss=[55401.8678478]
||d|| = 4.726917060209904
Current iteration=1000, loss=[38463.39625739]
||d|| = 1.961411138204492
Current iteration=2000, loss=[38102.62244801]
||d|| = 5.856510404057953
Current iteration=3000, loss=[38376.45051467]


### Without logarithm and mass column

In [16]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [17]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [18]:
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)
x = x_raw
# x_mass = np.zeros(x.shape[0])
# x_mass[x[:, 0] == -999] = 1
x[:, 0][x[:, 0] == -999] = np.median(x[:, 0][x[:, 0] != -999])
# x = np.column_stack((x, x_mass))

In [19]:
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0
k = 5

for i in x_jet_indexes:
    
    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    
    # for li in log_indexes:
    #   # print(tx_del[:,li].min())
    #   tx_del[:,li] = np.apply_along_axis(lambda n: np.log(1 + abs(tx_del[:,li].min()) + n), 0, tx_del[:,li])
    
    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    tx = np.c_[np.ones((y_i.shape[0], 1)), tx_poly]
    
    initial_w = np.zeros((tx.shape[1], 1))

    k_indices = build_k_indices(y_i, k, 1)
    
    te_accs_k, tr_accs_k, te_losses_k, tr_losses_k, ws_k = [], [], [], [], []
    
    for k_ in range(k):
        
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(k_indices.flatten(), test_indices)

        y_train = y_i[train_indices]
        x_train = tx[train_indices]
        y_test = y_i[test_indices]
        x_test = tx[test_indices]

        # Ridge linear
        w, loss_tr_k = reg_logistic_regression(y_train, x_train, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

        # Calculate the loss for test data
        loss_te_k = compute_loss(y_test, x_test, w)
        
        acc_tr_k = compute_accuracy(x_train, w, y_train, mode='logistic')
        acc_te_k = compute_accuracy(x_test, w, y_test, mode='logistic')
        
        te_accs_k.append(acc_te_k)
        tr_accs_k.append(acc_tr_k)
        te_losses_k.append(np.math.sqrt(2 * loss_te_k))
        tr_losses_k.append(np.math.sqrt(2 * loss_tr_k))

        ws_k.append(w)
        

    te_accs.append(np.mean(te_accs_k) * x[x_jet_indexes[i]].shape[0])
    tr_accs.append(np.mean(acc_te_k) * x[x_jet_indexes[i]].shape[0])
    te_losses.append(np.mean(te_losses_k) * x[x_jet_indexes[i]].shape[0])
    tr_losses.append(np.mean(tr_losses_k) * x[x_jet_indexes[i]].shape[0])
    ws.append(np.mean(ws_k, axis=0))

    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[i] / y_i.shape[0], te=te_accs[i] / y_i.shape[0]))

Current iteration=0, loss=[55401.8678478]
||d|| = 8.625644532854063
Current iteration=1000, loss=[38411.42511902]
||d|| = 8.294951655184045
Current iteration=2000, loss=[37840.09690223]
||d|| = 3.4985747752798755
Current iteration=3000, loss=[37398.20672033]
||d|| = 1.8017233146020377
Current iteration=4000, loss=[37204.65339111]
||d|| = 10.249003794965903
Current iteration=5000, loss=[37765.08317311]
||d|| = 1.9573549262185745
Current iteration=6000, loss=[36765.68436233]
||d|| = 6.052507492681298
Current iteration=7000, loss=[36769.59367373]
||d|| = 4.6978093253947995
Current iteration=8000, loss=[36557.40860552]
||d|| = 1.2476174996270983
Current iteration=9000, loss=[36562.32876424]
||d|| = 5.455720183403461
loss=[36276.21585838]
Current iteration=0, loss=[55401.8678478]
||d|| = 19.41304749530535
Current iteration=1000, loss=[38749.29699402]
||d|| = 2.0336516351783835
Current iteration=2000, loss=[38340.02406117]
||d|| = 4.891001359336784
Current iteration=3000, loss=[37485.8866449

### Taking the logarithm of all the columns

In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission, compute_accuracy
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression
from src.logistic.gradient import compute_gradient

from src.helpers import remove_correlated_columns, get_jet_indexes, jet_indexes, log_indexes, get_all
from src.polynomials import build_poly_matrix_quadratic
from src.k_fold import build_k_indices

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [3]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [4]:
x_jet_indexes = get_jet_indexes(x_raw)
# x_jet_indexes = get_all(x_raw)
x = x_raw
x_mass = np.zeros(x.shape[0])
x_mass[x[:, 0] == -999] = 1
x[:, 0][x[:, 0] == -999] = np.median(x[:, 0][x[:, 0] != -999])
x = np.column_stack((x, x_mass))

In [5]:
for i in x_jet_indexes:
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    print(tx_del.shape)

(99913, 21)
(77544, 24)
(72543, 31)


In [6]:
ws, te_accs, tr_accs, te_losses, tr_losses = [], [], [], [], []
lambda_ = 0

for i in x_jet_indexes:
    
    y_i = y[x_jet_indexes[i]]
    tx_i = x[x_jet_indexes[i]]
    tx_del = np.delete(tx_i, jet_indexes[i], axis=1)
    
    for li in range(tx_del.shape[1]):
       # print(tx_del[:,li].min())
       tx_del[:,li] = np.apply_along_axis(lambda n: np.log(1 + abs(tx_del[:,li].min()) + n), 0, tx_del[:,li])
    
    tx_std = standardize(tx_del)[0]
    tx_poly = build_poly_matrix_quadratic(tx_std)
    tx = np.c_[np.ones((y_i.shape[0], 1)), tx_poly]
    
    initial_w = np.zeros((tx.shape[1], 1))

    k_indices = build_k_indices(y_i, k, 1)
    
    te_accs_k, tr_accs_k, te_losses_k, tr_losses_k, ws_k = [], [], [], [], []
    
    # Ridge linear
    w, loss_tr_k = reg_logistic_regression(y_i, tx, initial_w, lambda_, 10000, 0.01, method='sgd', ratio=0.5)

    # Calculate the loss for test data
    loss_te_k = compute_loss(y_i, tx, w)

    acc_tr_k = compute_accuracy(tx, w, y_i, mode='logistic')
    acc_te_k = compute_accuracy(tx, w, y_i, mode='logistic')

    te_accs_k.append(acc_te_k)
    tr_accs_k.append(acc_tr_k)
    te_losses_k.append(np.math.sqrt(2 * loss_te_k))
    tr_losses_k.append(np.math.sqrt(2 * loss_tr_k))

    ws_k.append(w)  

    te_accs.append(np.mean(te_accs_k) * x[x_jet_indexes[i]].shape[0])
    tr_accs.append(np.mean(acc_te_k) * x[x_jet_indexes[i]].shape[0])
    te_losses.append(np.mean(te_losses_k) * x[x_jet_indexes[i]].shape[0])
    tr_losses.append(np.mean(tr_losses_k) * x[x_jet_indexes[i]].shape[0])
    ws.append(np.mean(ws_k, axis=0))

    print("lambda={l:.6f}, Training accuracy={tr:.3f}, Testing accuracy={te:.3f}".format(
           l=lambda_, tr=tr_accs[i] / y_i.shape[0], te=te_accs[i] / y_i.shape[0]))

Current iteration=0, loss=[69254.41425129]
||d|| = 8.452017961819308
Current iteration=1000, loss=[48051.30847285]
||d|| = 2.342489911071729
Current iteration=2000, loss=[47322.49371571]
||d|| = 3.7508621461127314
Current iteration=3000, loss=[46528.73448391]
||d|| = 3.787202275331311
Current iteration=4000, loss=[46032.1087465]
||d|| = 4.516971082030042
Current iteration=5000, loss=[45957.94126128]
||d|| = 12.191519043170517
Current iteration=6000, loss=[45776.33259439]
||d|| = 3.091703034345654
Current iteration=7000, loss=[46172.32307359]
||d|| = 15.058401186380157
Current iteration=8000, loss=[45377.95173017]
||d|| = 3.4952133158559335
Current iteration=9000, loss=[45262.57208358]
||d|| = 7.864668019242488
loss=[45218.51880455]
lambda=0.000000, Training accuracy=0.772, Testing accuracy=0.772
Current iteration=0, loss=[53749.40496934]
||d|| = 7.568035474504469
Current iteration=1000, loss=[47239.06291202]
||d|| = 11.112483510723814
Current iteration=2000, loss=[46194.51581023]
||d||