In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid
from src.polynomials import build_poly_matrix_vandermonde

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data
y, x_raw, ids = load_csv_data('data/train.csv')

In [3]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)

In [4]:
kept_columns

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False, False,  True])

In [5]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [6]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [7]:
degree = 3

In [8]:
# poly
tx = build_poly_matrix_vandermonde(tx, degree)

In [9]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [10]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 20000, 0.0004, method='sgd')

Current iteration=0, loss=[173286.79513999]
||d|| = 333.98940438329527
Current iteration=1000, loss=[250267.84728534]
||d|| = 2.56701416464671
Current iteration=2000, loss=[225548.24101531]
||d|| = 6.979289039149756
Current iteration=3000, loss=[184523.56883166]
||d|| = 10.889669194820197
Current iteration=4000, loss=[161604.21771364]
||d|| = 2.903432493231668
Current iteration=5000, loss=[155704.71561939]
||d|| = 3.09544087869614
Current iteration=6000, loss=[155610.24625826]
||d|| = 31.522571596217105
Current iteration=7000, loss=[155536.68809181]
||d|| = 3.096341112671164
Current iteration=8000, loss=[156948.00737298]
||d|| = 16.06690185273196
Current iteration=9000, loss=[155791.53372586]
||d|| = 5.3392581410657804
Current iteration=10000, loss=[155420.48936548]
||d|| = 154.9436773697464
Current iteration=11000, loss=[155343.19849284]
||d|| = 2.522845262151715
Current iteration=12000, loss=[155407.42475692]
||d|| = 2.5129933770102344
Current iteration=13000, loss=[155848.33042827]


Result with gamma = 0.001 and 100k iter ||d|| = 0.8317845119149855 loss = 137795.11103903, test set accuracy = 0.67216

Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562

---

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65524

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65534

In [11]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [12]:
x_sub = x_sub_raw[:, kept_columns]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
tx_sub = tx = build_poly_matrix_vandermonde(tx_sub, degree)
y_sub = predict_labels(w, tx_sub)

In [13]:
w.shape

(84, 1)

In [14]:
tx_sub.shape

(568238, 84)

In [15]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [16]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.65534