In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid
from src.helpers import remove_correlated_columns
from src.polynomials import build_poly_matrix_quadratic

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data
y, x_raw, ids = load_csv_data('data/train.csv')
init_col_n = x_raw.shape[1]

In [3]:
# create list of columns indices
curr_cols = [x for x in range(x_raw.shape[1])]

def rem_cols(lst, mask):
    for c,x in enumerate(mask):
        if not x:
            lst.remove(c)

In [4]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)
rem_cols(curr_cols, kept_columns)

In [5]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 29]

In [6]:
x, kept_columns_2 = remove_correlated_columns(x)
rem_cols(curr_cols, kept_columns_2)

In [7]:
kept_columns_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False])

In [8]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 18, 20, 21, 22, 29]

In [9]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [10]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [11]:
# poly
tx = build_poly_matrix_quadratic(tx)

In [12]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [13]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 20000, 0.01, method='sgd', ratio=0.5)

Current iteration=0, loss=[173286.79513999]
||d|| = 4.692392330159733
Current iteration=1000, loss=[149123.86361978]
||d|| = 2.2717767281305665
Current iteration=2000, loss=[147940.21421259]
||d|| = 2.055167097109393
Current iteration=3000, loss=[146975.26733716]
||d|| = 1.8556481129181002
Current iteration=4000, loss=[146414.65681485]
||d|| = 1.3583559419878584
Current iteration=5000, loss=[144984.9959531]
||d|| = 6.246465280289287
Current iteration=6000, loss=[145208.10939339]
||d|| = 1.9187783468187753
Current iteration=7000, loss=[146152.15283906]
||d|| = 2.459796747743006
Current iteration=8000, loss=[143824.63753407]
||d|| = 5.010612293545735
Current iteration=9000, loss=[143568.34335933]
||d|| = 6.107202839879983
Current iteration=10000, loss=[143349.68148458]
||d|| = 2.809704416125998
Current iteration=11000, loss=[143309.86819708]
||d|| = 6.417065425933975
Current iteration=12000, loss=[142711.73528632]
||d|| = 2.0255473991526016
Current iteration=13000, loss=[143265.10534446]

Result with gamma = 0.001 and 100k iter ||d|| = 0.8317845119149855 loss = 137795.11103903, test set accuracy = 0.67216

Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562

---

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65524

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65534

Robbins-Monroe | Result with quadratic, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.6554

---

Robbins-Monroe | quadratic, gamma = 0.01, ratio = 0.5, 5k iter | loss = [145324.28779927], test set accuracy = 0.6589

Robbins-Monroe | quadratic, gamma = 0.01, ratio = 0.5, 10k iter | loss = [143340.81035078], test set accuracy = 0.6616

Robbins-Monroe | quadratic, gamma = 0.01, ratio = 0.5, 20k iter | loss = [141398.94468713], test set accuracy = 0.66362

In [14]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [15]:
# build mask
mask = [x in curr_cols for x in range(init_col_n)]

In [16]:
x_sub = x_sub_raw[:, mask]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
tx_sub = tx = build_poly_matrix_quadratic(tx_sub)
y_sub = predict_labels(w, tx_sub)

In [17]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [18]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.66362