In [1]:
import sys
sys.path.append('..')

import numpy as np
from src.helpers import load_csv_data, standardize, flatten_jet_features, predict_labels, create_csv_submission, get_jet_indexes, jet_indexes
from src.linear.implementations import ridge_regression
from src.split import split_data
from src.polynomials import build_poly_matrix_vandermonde

%reload_ext autoreload
%autoreload 2

In [2]:
# Load the training CSV; the helper returns three values that the rest of the
# notebook uses as labels (y), raw feature matrix (x_raw) and sample ids (ids).
y, x_raw, ids = load_csv_data('../data/train.csv')

## Plain ridge regression

In [4]:
# Standardise the raw features, keeping the statistics the helper returns,
# then put a column of ones in front of them as the bias term.
x, mean_x, std_x = standardize(x_raw)
bias_column = np.ones((len(y), 1))
tx = np.hstack((bias_column, x))

In [7]:
# Hold-out split of the design matrix and the labels.
# presumably `ratio` is the share of rows that goes to the training part —
# TODO confirm against src.split.split_data (also whether the split is seeded).
ratio = 0.8
tx_train, y_train, tx_test, y_test = split_data(tx, y, ratio)

In [13]:
# Fit ridge regression on the training part; the third argument (0.01) is the
# regularisation strength. The helper returns the loss first, then the weights.
loss, w = ridge_regression(y_train, tx_train, 0.01)

In [14]:
# Predict labels for the held-out part with the weights fitted above.
y_pred = predict_labels(w, tx_test, mode='linear')

In [15]:
# Held-out accuracy: share of predictions equal to the true label.
np.mean(y_pred == y_test)

0.69216

## Ridge regression with flattened jet features

In [16]:
# Run the raw features through the project's flatten_jet_features helper,
# standardise the result and prepend a bias column of ones.
x, mean_x, std_x = standardize(flatten_jet_features(x_raw))
bias_column = np.ones((len(y), 1))
tx = np.hstack((bias_column, x))

In [17]:
# Hold-out split with the same 0.8 ratio as the plain ridge section.
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [18]:
# Ridge fit on the flattened-feature training part (regularisation 0.01).
loss, w = ridge_regression(y_train, tx_train, 0.01)

In [19]:
# Predict labels for the held-out part of the flattened-feature split.
y_pred = predict_labels(w, tx_test, mode='linear')

In [20]:
# Held-out accuracy: share of predictions equal to the true label.
np.mean(y_pred == y_test)

0.70966

## Ridge regression with flattened jet features and polynomial feature expansion

In [38]:
# Flatten and standardise the raw features, expand them with the project's
# Vandermonde polynomial builder at degree 12, then prepend a bias column.
x, mean_x, std_x = standardize(flatten_jet_features(x_raw))
tx_poly = build_poly_matrix_vandermonde(x, 12)
bias_column = np.ones((len(y), 1))
tx = np.hstack((bias_column, tx_poly))

In [39]:
# Hold-out split of the polynomial design matrix (ratio 0.8).
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

In [40]:
# Ridge fit on the polynomial features; regularisation strength is 0.0001 here.
loss, w = ridge_regression(y_train, tx_train, 0.0001)

In [41]:
# Predict labels for the held-out part of the polynomial split.
y_pred = predict_labels(w, tx_test, mode='linear')

In [42]:
# Held-out accuracy: share of predictions equal to the true label.
np.mean(y_pred == y_test)

0.79294

Result with lambda = 0.0001, degree = 12 => held-out accuracy = 0.79294

In [None]:
# Build the submission from the *submission* set. The previous version loaded
# test.csv but then predicted on the validation split (tx_test) and wrote the
# training ids, so the file did not describe the rows it claimed to.
y_sub, x_sub_raw, ids_sub = load_csv_data('../data/test.csv')

# Same feature pipeline as the training cell of this section:
# flatten -> standardise -> degree-12 polynomial expansion -> bias column.
x_sub = flatten_jet_features(x_sub_raw)
# NOTE(review): this standardises with the submission set's own statistics,
# which is what the per-jet loop in this notebook also does for its test part.
# If standardize() can take precomputed statistics, reusing the training
# mean_x / std_x would be preferable — confirm against src.helpers.
x_sub = standardize(x_sub)[0]
tx_sub_poly = build_poly_matrix_vandermonde(x_sub, 12)
tx_sub = np.c_[np.ones((x_sub.shape[0], 1)), tx_sub_poly]

y_pred = predict_labels(w, tx_sub, mode='linear')
create_csv_submission(ids_sub, y_pred, '../submissions/10-24.22-32.csv')

## Ridge regression with a separate model per jet subset

In [3]:
# Hyper-parameters for the per-subset experiment: polynomial degree,
# split ratio and ridge regularisation strength.
degree, ratio, lambda_ = 7, 0.8, 0.0001

# Split the *raw* features; each subset is standardised later on its own.
x_train, y_train, x_test, y_test = split_data(x_raw, y, ratio)

# Row selectors of every jet subset, computed separately for each part
# (train first, then test — same call order as before).
train_jet_indexes, test_jet_indexes = (
    get_jet_indexes(part) for part in (x_train, x_test)
)

In [6]:
# Train one ridge model per jet subset and fill in the test predictions
# subset by subset.
# `ws` receives one weight vector per iteration, in iteration order.
ws = []
# `accuracies` is created here but never filled in this cell.
accuracies = []

# Pre-allocated as a column vector with one row per test sample.
y_pred = np.zeros((x_test.shape[0], 1))

# assumes get_jet_indexes returns a mapping whose keys identify the subset and
# whose values select the rows of that subset — TODO confirm in src.helpers.
for i in train_jet_indexes:
    
    # Rows of the current subset in each part.
    tx_train_raw = x_train[train_jet_indexes[i]]
    tx_test_raw = x_test[test_jet_indexes[i]]
    
    # Each part is standardised independently; only the standardised matrix
    # (first returned value) is kept, so the test rows do not reuse the
    # training statistics.
    tx_train_std = standardize(tx_train_raw)[0]
    tx_test_std = standardize(tx_test_raw)[0]
    
    # Drop the columns listed for this subset in the imported `jet_indexes`.
    tx_train_rem = np.delete(tx_train_std, jet_indexes[i], axis=1)
    tx_test_rem = np.delete(tx_test_std, jet_indexes[i], axis=1)
    
    # Polynomial expansion up to `degree` (no explicit bias column is added here).
    tx_train = build_poly_matrix_vandermonde(tx_train_rem, degree)
    tx_test = build_poly_matrix_vandermonde(tx_test_rem, degree)
    
    # Fit on the labels of the same training rows.
    loss, w = ridge_regression(y_train[train_jet_indexes[i]], tx_train, lambda_)
    
    # Write the predictions back into the rows of this subset.
    y_pred[test_jet_indexes[i]] = predict_labels(w, tx_test, mode='linear')
    ws.append(w)

In [7]:
# Overall test accuracy across all subsets.
# NOTE(review): y_pred was allocated with shape (n_test, 1); the comparison is
# element-wise only if y_test has that same shape, otherwise `==` broadcasts —
# confirm the shape returned by the data loader.
tot_accuracy = (y_pred == y_test).sum() / (y_test.shape[0])
tot_accuracy

0.71004

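With degree 12, lambda = 0.001 => test accuracy = 0.710 (note: the configuration cell above currently sets degree = 7 and lambda_ = 0.0001, so re-running it may not reproduce this number)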

In [9]:
# Load the submission set and split it into per-subset pieces.
# NOTE(review): `jet_split` is not among the names imported in the notebook's
# import cell, so on a fresh kernel this raises NameError unless it is defined
# elsewhere — confirm where it lives and import it explicitly.
y_sub, x_sub_raw, ids_sub = load_csv_data('../data/test.csv')
tx_sub_split, y_sub_split, ids_split = jet_split(x_sub_raw, y_sub, ids_sub, degree)

In [10]:
# One prediction array per piece; ws[i] is matched to the i-th piece purely by
# position, so tx_sub_split must be ordered like the loop that filled `ws`
# — TODO confirm against jet_split.
y_split_pred = [predict_labels(ws[i], x, mode='linear') for i, x in enumerate(tx_sub_split)]

In [12]:
# Stitch the per-subset predictions and ids back together (same piece order for
# both, so rows stay aligned) and write the submission file.
y_split_cat = np.concatenate(y_split_pred)
ids_split_cat = np.concatenate(ids_split)
create_csv_submission(ids_split_cat, y_split_cat, '../submissions/10-24.21-37.csv')

## Ridge regression with cross validation

In [None]:
def cross_validation(y, x, k_indices, k, lambda_, degree, mean=True):
    """Run k-fold cross-validation of polynomial ridge regression.

    For every fold ``k_`` in ``range(k)`` the rows ``k_indices[k_]`` are held
    out as the test set and all other indices found in ``k_indices`` form the
    training set. Both parts are expanded with ``build_poly(..., degree)``,
    ridge regression is fitted on the training part and the held-out loss is
    obtained from ``compute_mse``.

    NOTE(review): ``build_poly`` and ``compute_mse`` are not among the names
    imported in the notebook's import cell — confirm where they come from.

    Args:
        y: labels, indexed with the integer index arrays taken from
            ``k_indices``.
        x: features, indexed the same way as ``y``.
        k_indices: integer array whose row ``i`` holds the sample indices of
            fold ``i`` (it is flattened to obtain the full index set).
        k: number of folds to evaluate (rows ``0..k-1`` of ``k_indices``).
        lambda_: regularisation strength passed to ``ridge_regression``.
        degree: polynomial degree passed to ``build_poly``.
        mean: currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(train, test)`` with the mean over folds of
        ``sqrt(2 * loss)`` for the training loss reported by
        ``ridge_regression`` and for the held-out loss from ``compute_mse``.
    """
    rmse_tr, rmse_te = [], []

    # The full index set does not change between folds, so build it once.
    all_indices = k_indices.flatten()

    for k_ in range(k):
        # Fold k_ is the test set, every other index is used for training.
        test_indices = k_indices[k_]
        train_indices = np.setdiff1d(all_indices, test_indices)

        y_train, x_train = y[train_indices], x[train_indices]
        y_test, x_test = y[test_indices], x[test_indices]

        # Form data with polynomial degree
        x_train_poly = build_poly(x_train, degree)
        x_test_poly = build_poly(x_test, degree)

        # Ridge regression returns the training loss first, then the weights.
        loss_tr, w_ridge = ridge_regression(y_train, x_train_poly, lambda_)

        # Loss of the fitted weights on the held-out fold.
        loss_te = compute_mse(y_test, x_test_poly, w_ridge)

        # np.sqrt instead of np.math.sqrt: the np.math alias of the stdlib
        # math module was removed in NumPy 2.0.
        rmse_tr.append(np.sqrt(2 * loss_tr))
        rmse_te.append(np.sqrt(2 * loss_te))

    return np.mean(rmse_tr), np.mean(rmse_te)