In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, remove_incomplete_columns, predict_labels, create_csv_submission
from src.logistic.implementations import logistic_regression
from src.logistic.loss import compute_loss
from src.logistic.not_req_impl import reg_logistic_regression, gradient_descent_step
from src.logistic.gradient import compute_gradient
from src.split import split_data

from src.logistic.sigmoid import sigmoid
from src.helpers import remove_correlated_columns
from src.polynomials import build_poly_matrix_quadratic

%reload_ext autoreload
%autoreload 2

In [2]:
# Import data
y, x_raw, ids = load_csv_data('data/train.csv')
init_col_n = x_raw.shape[1]
init_col_n

30

In [3]:
# create list of columns indices
curr_cols = list(range(x_raw.shape[1]))

def rem_cols(lst, mask):
    for c,x in enumerate(mask):
        if not x:
            lst.remove(c)

In [4]:
# Clean the data
x, kept_columns = remove_incomplete_columns(x_raw)
rem_cols(curr_cols, kept_columns)

In [5]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 29]

In [6]:
print(x)

[[ 138.47    51.655   97.827 ...  258.733    2.     113.497]
 [ 160.937   68.768  103.235 ...  164.546    1.      46.226]
 [-999.     162.172  125.953 ...  260.414    1.      44.251]
 ...
 [ 105.457   60.526   75.839 ...  198.907    1.      41.992]
 [  94.951   19.362   68.812 ...  112.718    0.       0.   ]
 [-999.      72.756   70.831 ...   99.405    0.       0.   ]]


In [7]:
x, kept_columns_2 = remove_correlated_columns(x)
rem_cols(curr_cols, kept_columns_2)

In [8]:
kept_columns_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False])

In [9]:
curr_cols

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 18, 20, 21, 22, 29]

In [10]:
# log transform
to_log_cols_idxs = [2, 3, 4, 6, 7, 8, 10, 13, 16]
for i in to_log_cols_idxs:
    print(x[:,i].min())
    x[:,i] = np.apply_along_axis(lambda n: np.log(1 + abs(x[:,i].min()) + n), 0, x[:,i])

6.329
0.0
0.208
46.104
0.047
-1.414
-2.499
-2.505
-3.142


In [11]:
# Transform y for logistic regression
y[np.where(y == -1)] = 0

In [12]:
# Standardize x and add the 1s column
x, mean_x, std_x = standardize(x)
tx = np.c_[np.ones((y.shape[0], 1)), x]

In [13]:
# poly
tx = build_poly_matrix_quadratic(tx)

In [14]:
# Split the data
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [15]:
initial_w = np.zeros((tx.shape[1], 1))

w, tr_loss = reg_logistic_regression(y, tx, initial_w, 0, 10000, 0.05, method='sgd', ratio=0.5)

Current iteration=0, loss=[173286.79513999]
||d|| = 3.443337520133352
Current iteration=1000, loss=[169331.57214641]
||d|| = 8.587969507816975
Current iteration=2000, loss=[140566.09295898]
||d|| = 3.4237780837681195
Current iteration=3000, loss=[139128.99164782]
||d|| = 3.3300413210784403
Current iteration=4000, loss=[142754.26234536]
||d|| = 2.0571933740014594
Current iteration=5000, loss=[133270.19730013]
||d|| = 6.575845170786087
Current iteration=6000, loss=[134135.57444219]
||d|| = 5.557606082362086
Current iteration=7000, loss=[132673.5156465]
||d|| = 2.2204047658010806
Current iteration=8000, loss=[133118.57044955]
||d|| = 0.8498772901081524
Current iteration=9000, loss=[137660.72928189]
||d|| = 6.426500209839358
loss=[132741.37886452]


Result with gamma = 0.01 and 100k iter ||d|| = 0.44937032966833224 loss = [138897.157523], test set accuracy = 0.67562

---

Robbins-Monroe | Result with degree = 3, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.65534

Robbins-Monroe | Result with quadratic, gamma = 0.0004 and 10k iter loss = [155654.57878558], test set accuracy = 0.6554

---

Robbins-Monroe | quadratic, gamma = 0.01, ratio = 0.5, 10k iter | loss = [143340.81035078], test set accuracy = 0.6616

---

logistic w/ RM, log, quadratic | gamma = 0.01, ratio = 0.5, 10k iter | loss = [137580.1845174], test set accuracy = 0.69154

logistic w/ RM, log, quadratic | gamma = 0.05, ratio = 0.5, 10k iter | loss = [132741.37886452], test set accuracy = 0.74846

In [16]:
y_sub, x_sub_raw, ids_sub = load_csv_data('data/test.csv')

In [17]:
# build mask
mask = [x in curr_cols for x in range(init_col_n)]

In [18]:
x_sub = x_sub_raw[:, mask]
tx_sub = np.c_[np.ones((y_sub.shape[0], 1)), x_sub]
tx_sub = tx = build_poly_matrix_quadratic(tx_sub)
y_sub = predict_labels(w, tx_sub)

In [19]:
create_csv_submission(ids_sub, y_sub, 'submissions/10-24.00-15.csv')

In [20]:
y_pred = predict_labels(w, tx_test)

(y_pred == y_test).sum() / y_test.shape[0]

0.74846