In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid

%reload_ext autoreload
%autoreload 2

In [3]:
# Import data
y, x_raw, ids = load_csv_data('../data/train.csv')

In [3]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)

In [4]:
kept_columns

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False, False,  True])

In [5]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [6]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [7]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [19]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 100000, 0.01, method='sgd')

2262984
Current iteration=73300, loss=[136962.1559508]
||d|| = 0.5941439287913605
Current iteration=73400, loss=[137321.24478922]
||d|| = 1.3720806001791432
Current iteration=73500, loss=[136879.39428901]
||d|| = 0.21088987013687352
Current iteration=73600, loss=[138473.90002365]
||d|| = 0.34067314234831886
Current iteration=73700, loss=[136274.09138344]
||d|| = 0.897425031512749
Current iteration=73800, loss=[136676.012739]
||d|| = 0.3395781969356725
Current iteration=73900, loss=[135257.75571366]
||d|| = 0.5371458495169106
Current iteration=74000, loss=[135827.87017012]
||d|| = 0.5281365873490873
Current iteration=74100, loss=[136897.46244334]
||d|| = 0.5885020672193488
Current iteration=74200, loss=[135893.05574127]
||d|| = 0.4155854003366906
Current iteration=74300, loss=[136641.44592865]
||d|| = 1.22564450074405
Current iteration=74400, loss=[135783.66237767]
||d|| = 2.093968465640055
Current iteration=74500, loss=[136596.64678365]
||d|| = 2.1149871532145914
Current iteration=7460

Result with gamma = 0.001 and 100k iter ||d|| = 0.8317845119149855 loss = 137795.11103903, test set accuracy = 0.67216

Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562



In [10]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [21]:
x_sub = x_sub_raw[:, kept_columns]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
y_sub = predict_labels(w, tx_sub)

In [22]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [20]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.67562

In [14]:
lambdas = np.logspace(-5, 0, 15)
initial_w = np.zeros((tx.shape[1], 1))

ws = []
tr_losses = []
te_losses = []

for ind, lambda_ in enumerate(lambdas):
    w, tr_loss = reg_logistic_regression(y, tx, initial_w, lambda_, 10000, 0.001, method='newton')
    ws.append(w)
    te_loss = compute_loss(y_test, tx_test, w, lambda_=lambda_)
    tr_losses.append(tr_loss[0])
    te_losses.append(te_loss[0])

    y_pred = predict_labels(w, tx_test)
    training_accuracy = (y_pred == y_test).sum() / y_test.shape[0]

    print("lambda={lambda_}, Training loss={tr:.3f}, Testing loss={te:.3f}".format(
        lambda_=lambda_, tr=tr_losses[ind], te=te_losses[ind]))
    print("Training accuracy={acc:.3f}".format(acc=training_accuracy))

Current iteration=0, loss=[173313.04513999]
||d|| = 1.26305514443864
Current iteration=1000, loss=[168283.06486387]
||d|| = 1.819023295795367
Current iteration=2000, loss=[166816.44837511]
||d|| = 1.0077769045239235
Current iteration=3000, loss=[166393.08782902]
||d|| = 0.8318294301989898
Current iteration=4000, loss=[166597.5718525]
||d|| = 2.6272222280554387
Current iteration=5000, loss=[166996.0668856]
||d|| = 0.8493192680566024
Current iteration=6000, loss=[166730.77618249]
||d|| = 1.0512566439418087
Current iteration=7000, loss=[166921.00926433]
||d|| = 3.5558377129950416
Current iteration=8000, loss=[166223.62885936]
||d|| = 0.6963285060093969
Current iteration=9000, loss=[167875.55098277]
||d|| = 0.9897859278051709
loss=[167745.57591656]
lambda=1e-05, Training loss=167800.920, Testing loss=33595.736
Training accuracy=0.655
Current iteration=0, loss=[173346.53609555]
||d|| = 0.8728545565961453
Current iteration=1000, loss=[168460.05859765]
||d|| = 1.3735031108323827
Current itera