In [7]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, flatten_jet_features, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data
from src.polynomials import build_poly_matrix_vandermonde

from src.logistic.sigmoid import sigmoid

%reload_ext autoreload
%autoreload 2

In [47]:
# Load the training CSV. The three returned values are presumably the labels (y),
# the raw feature matrix (x_raw) and the event ids (ids) — confirm against
# src.helpers.load_csv_data.
y, x_raw, ids = load_csv_data('../data/train.csv')

In [48]:
# Pre-process the raw features with the project's jet-feature helper.
# NOTE(review): the exact transformation lives in src.helpers.flatten_jet_features
# and is not visible from this notebook.
x = flatten_jet_features(x_raw)

In [49]:
# Logistic regression works with labels in {0, 1}: rewrite every -1 label to 0 (in place).
y[y == -1] = 0

In [63]:
# Standardize the features, prepend the all-ones (bias) column and expand to degree 2.
#
# The standardized matrix is bound to its own name instead of overwriting `x`:
# with `x = standardize(x)` a second run of this cell would standardize the
# already-standardized data and replace mean_x / std_x with the statistics of
# that data, losing the real training statistics.
x_std, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((x_std.shape[0], 1)), x_std]
# NOTE(review): tx already contains the ones column when it is handed to the
# polynomial expansion; depending on how build_poly_matrix_vandermonde treats it
# this may yield duplicated constant columns — confirm.
tx_poly = build_poly_matrix_vandermonde(tx, 2)

In [64]:
# Split the expanded design matrix and the labels into a training and a test part.
# 0.8 is passed as the split ratio (presumably the training fraction — confirm
# against src.split.split_data; also check whether the split is seeded).
tx_train, y_train, tx_test, y_test = split_data(tx_poly, y, 0.8)

In [65]:
# Train the regularised logistic regression with SGD, starting from w = 0.
#
# The run is stochastic, so the global NumPy RNG is seeded first to make the
# reported loss / accuracy repeatable across "Restart & Run All".
# NOTE(review): this assumes reg_logistic_regression draws its samples from the
# global np.random state — confirm in src.logistic.not_req_impl.
np.random.seed(1)

# Hyper-parameters, named so the positional call below is readable.
lambda_ = 0.001    # regularisation strength
max_iters = 10000  # number of iterations
gamma = 0.1        # step size

initial_w = np.zeros((tx_train.shape[1], 1))

w, tr_loss = reg_logistic_regression(y_train, tx_train, initial_w, lambda_, max_iters, gamma, method='sgd')

Current iteration=0, loss=[271210.8712248]
||d|| = 28.187006321784594
Current iteration=1000, loss=[473276.92933884]
||d|| = 0.0018141404419597484
Current iteration=2000, loss=[471457.80712879]
||d|| = 72.80105748739423
Current iteration=3000, loss=[413814.03478361]
||d|| = 2.20590705580624
Current iteration=4000, loss=[382755.89768559]
||d|| = 0.0011342766214547798
Current iteration=5000, loss=[362612.0083851]
||d|| = 6.656985821296442
Current iteration=6000, loss=[347606.82277806]
||d|| = 3.1595568788074604
Current iteration=7000, loss=[325844.92175665]
||d|| = 17.07173239625616
Current iteration=8000, loss=[313553.25590884]
||d|| = 3.19607849012299
Current iteration=9000, loss=[295879.7846367]
||d|| = 2.085721188752055
loss=[153269.50450747]


In [66]:
# Hold-out accuracy: number of test samples whose predicted label equals the
# true label, divided by the number of test samples.
y_pred = predict_labels(w, tx_test)
n_correct = (y_pred == y_test).sum()

n_correct / y_test.shape[0]

0.68696

- Result with SGD, gamma = 0.1, degree 2, 10k iter: loss = 115658, test set accuracy = 0.68696
- Result with Newton, gamma = 0.1, degree 2, 10k iter: loss = 126817, test set accuracy = 0.6554


In [35]:
def poly_reg_logistic_regression(tx, y, degree, ratio, lambdas=None, max_iters=10000,
                                 gamma=0.1, method='newton'):
    """Sweep the regularisation strength of a polynomial logistic regression.

    The design matrix is expanded with build_poly_matrix_vandermonde, split once
    with split_data, and one model is trained per value in `lambdas`. For every
    value the training loss, the test loss and the test-set accuracy are printed.

    Args:
        tx: design matrix handed to build_poly_matrix_vandermonde.
        y: labels, passed to split_data together with the expanded matrix.
        degree: degree passed to build_poly_matrix_vandermonde.
        ratio: split ratio passed to split_data.
        lambdas: iterable of regularisation strengths; defaults to
            np.logspace(-5, 0, 15), the grid that was previously hard-coded.
        max_iters: iteration count passed to reg_logistic_regression.
        gamma: step size passed to reg_logistic_regression.
        method: optimiser name passed to reg_logistic_regression.

    Returns:
        dict with keys 'lambdas', 'ws', 'tr_losses', 'te_losses' and
        'te_accuracies'; every list has one entry per lambda, in sweep order.
    """
    if lambdas is None:
        lambdas = np.logspace(-5, 0, 15)

    tx_poly = build_poly_matrix_vandermonde(tx, degree)
    tx_train, y_train, tx_test, y_test = split_data(tx_poly, y, ratio)

    ws = []
    tr_losses = []
    te_losses = []
    te_accuracies = []

    for lambda_ in lambdas:
        # Start every fit from its own zero vector so that one fit can never
        # leak into the next, even if the optimiser were to update the array
        # it is given in place.
        initial_w = np.zeros((tx_train.shape[1], 1))

        w, tr_loss = reg_logistic_regression(
            y_train, tx_train, initial_w, lambda_, max_iters, gamma, method=method)
        te_loss = compute_loss(y_test, tx_test, w, lambda_=lambda_)

        ws.append(w)
        tr_losses.append(np.ravel(tr_loss)[0])
        te_losses.append(np.ravel(te_loss)[0])

        # Accuracy is measured on the held-out split, so it is reported as
        # test accuracy (it used to be printed as "Training accuracy").
        y_pred = predict_labels(w, tx_test)
        test_accuracy = (y_pred == y_test).sum() / y_test.shape[0]
        te_accuracies.append(test_accuracy)

        # NOTE(review): the recorded run shows a train/test loss ratio close to
        # the 0.8/0.2 split, which suggests the losses are sums rather than
        # per-sample means — compare them with care.
        print("lambda={lambda_}, Training loss={tr:.3f}, Testing loss={te:.3f}".format(
            lambda_=lambda_, tr=tr_losses[-1], te=te_losses[-1]))
        print("Test accuracy={acc:.3f}".format(acc=test_accuracy))

    return {
        'lambdas': list(lambdas),
        'ws': ws,
        'tr_losses': tr_losses,
        'te_losses': te_losses,
        'te_accuracies': te_accuracies,
    }

# Keep the sweep results instead of letting the cell echo the returned dict.
sweep_results = poly_reg_logistic_regression(tx, y, 2, 0.8)

Current iteration=0, loss=[139955.25046312]
||d|| = 2.7131764979399953
Current iteration=1000, loss=[129131.8432457]
||d|| = 5.66480554398374
Current iteration=2000, loss=[128452.4336647]
||d|| = 3.358482380966959
Current iteration=3000, loss=[128904.39914444]
||d|| = 3.1543797487187564
Current iteration=4000, loss=[128822.17365715]
||d|| = 3.870159339983603
Current iteration=5000, loss=[128707.81919923]
||d|| = 2.157741942193498
Current iteration=6000, loss=[128793.65809358]
||d|| = 5.063492435556481
Current iteration=7000, loss=[128838.25429689]
||d|| = 2.116481931366886
Current iteration=8000, loss=[128900.86620889]
||d|| = 2.117993899172692
Current iteration=9000, loss=[128713.22159776]
||d|| = 2.10311382058259
loss=[127524.0134424]
lambda=1e-05, Training loss=128856.288, Testing loss=32318.584
Training accuracy=0.655
Current iteration=0, loss=[141646.78530174]
||d|| = 28.187607308561805
Current iteration=1000, loss=[129927.61377292]
||d|| = 2.4503415278703855
Current iteration=200

KeyboardInterrupt: 

In [68]:
# Load the data set used for the submission. The y_sub read here only serves to
# size the bias column further down; it is later overwritten with the model's
# predictions.
y_sub, x_sub_raw, ids_sub = load_csv_data('../data/test.csv')

In [70]:
# Build the submission design matrix exactly like the training one, then predict.
x_sub = flatten_jet_features(x_sub_raw)

# The model was fitted on standardized features, so the submission features must
# be put on the same scale using the TRAINING statistics (mean_x, std_x).
# NOTE(review): assumes standardize() computes (x - mean_x) / std_x — confirm
# against src.helpers.standardize.
x_sub_std = (x_sub - mean_x) / std_x

# Row count is taken from the feature matrix rather than from y_sub, which this
# cell overwrites below.
tx_sub = np.c_[np.ones((x_sub_std.shape[0], 1)), x_sub_std]
tx_sub_poly = build_poly_matrix_vandermonde(tx_sub, 2)

# NOTE(review): the training labels were remapped from -1 to 0; check that
# predict_labels emits the label encoding the submission format expects.
y_sub = predict_labels(w, tx_sub_poly)

In [72]:
# Write the ids and the predicted labels to the submission file.
# The output path is hard-coded and date-stamped; change it before re-running
# to avoid overwriting an earlier submission.
create_csv_submission(ids_sub, y_sub, '../submissions/10-24.14-18.csv')