In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data
y, x_raw, ids = load_csv_data('data/train.csv')

In [3]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)

In [4]:
kept_columns

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False, False,  True])

In [5]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [6]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [7]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [19]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 100000, 0.01, method='sgd')

2262984
Current iteration=73300, loss=[136962.1559508]
||d|| = 0.5941439287913605
Current iteration=73400, loss=[137321.24478922]
||d|| = 1.3720806001791432
Current iteration=73500, loss=[136879.39428901]
||d|| = 0.21088987013687352
Current iteration=73600, loss=[138473.90002365]
||d|| = 0.34067314234831886
Current iteration=73700, loss=[136274.09138344]
||d|| = 0.897425031512749
Current iteration=73800, loss=[136676.012739]
||d|| = 0.3395781969356725
Current iteration=73900, loss=[135257.75571366]
||d|| = 0.5371458495169106
Current iteration=74000, loss=[135827.87017012]
||d|| = 0.5281365873490873
Current iteration=74100, loss=[136897.46244334]
||d|| = 0.5885020672193488
Current iteration=74200, loss=[135893.05574127]
||d|| = 0.4155854003366906
Current iteration=74300, loss=[136641.44592865]
||d|| = 1.22564450074405
Current iteration=74400, loss=[135783.66237767]
||d|| = 2.093968465640055
Current iteration=74500, loss=[136596.64678365]
||d|| = 2.1149871532145914
Current iteration=7460

Result with gamma = 0.001 and 100k iter ||d|| = 0.8317845119149855 loss = 137795.11103903, test set accuracy = 0.67216

Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562



In [10]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [21]:
x_sub = x_sub_raw[:, kept_columns]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
y_sub = predict_labels(w, tx_sub)

In [22]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [20]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.67562

In [22]:
lambdas = np.logspace(-5, 0, 15)
initial_w = np.zeros((tx.shape[1], 1))

ws = []
tr_losses = []
te_losses = []

for ind, lambda_ in enumerate(lambdas):
    w, tr_loss = reg_logistic_regression(y, tx, initial_w, lambda_, 200000, 0.001, method='newton')
    ws.append(w)
    te_loss = compute_loss(y_test, tx_test, w, lambda_=lambda_)
    tr_losses.append(tr_loss[0])
    te_losses.append(te_loss[0])

    print(tr_loss.flatten().flatten())
    print(te_loss.flatten().flatten())

    print("lambda={l:.3f}, Training loss={tr:.3f}, Testing loss={te:.3f}".format(
        l=lambda_, tr=tr_losses[ind], te=te_losses[ind]))

||d|| = 1.7530054184663586
Current iteration=41000, loss=[167701.93314014]
||d|| = 4.218094976486765
Current iteration=42000, loss=[167358.61419643]
||d|| = 5.07458776749566
Current iteration=43000, loss=[167341.60504827]
||d|| = 1.0454517287992566
Current iteration=44000, loss=[167519.82165141]
||d|| = 1.9650167134892134
Current iteration=45000, loss=[167327.12370088]
||d|| = 1.4119022181652439
Current iteration=46000, loss=[167069.65808409]
||d|| = 0.7131083032859795
Current iteration=47000, loss=[167631.3022728]
||d|| = 0.9487529189735084
Current iteration=48000, loss=[167142.33334487]
||d|| = 0.8945975333042224
Current iteration=49000, loss=[167715.07555589]
||d|| = 0.9646077927541177
Current iteration=50000, loss=[167260.93115244]
||d|| = 2.3569976513853286
Current iteration=51000, loss=[167336.09601173]
||d|| = 0.930592135723258
Current iteration=52000, loss=[167813.25980397]
||d|| = 1.4452047003473176
Current iteration=53000, loss=[167940.73472535]
||d|| = 2.891644899710126
Curr