In [1]:
%load_ext autoreload
%autoreload
from proj1_helpers import *
from implementations import *
from tests import *
from preprocessing import *
import numpy as np
import matplotlib.pyplot as plt

# Outline of the pipeline
## Import data
+ Import raw data. Split the original train data into our own test and train sets.
## Preprocessing
+ The file `preprocessing.py` is imported and contains functions to clean (impute with mean), remove columns, standardize and do PCA.
+ Preprocess train and test data separately (you can set the number of principal components with the `max_comp` parameter; it defaults to 30).
## Apply Model
+ Apply your preferred model!

In [2]:
# Load (labels, features, event ids) from train.csv and from test.csv.
# `subsamp` is forwarded unchanged as the loader's `sub_sample` flag.
subsamp = False
(y, x, id_), (y_out_test, x_out_test, id_out_test) = (
    load_csv_data(csv_path, sub_sample=subsamp)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [75]:
# Feature names: the header fields of train.csv after the first two columns
# (presumably the event id and the label column -- confirm against the file).
# Only the header line is read; parsing the whole CSV as strings just to keep
# row 0 is needlessly slow and memory hungry.
with open('../data/train.csv') as csv_file:
    features = np.array(csv_file.readline().strip().split(','), dtype=str)[2:]
def show_PC_explicit(features, transform=None):
    """Display the feature names combined through a transform matrix, symbolically.

    Each feature name becomes a real, positive sympy symbol; the vector of
    symbols is multiplied by the transform (rounded to one decimal) and the
    result is shown with the notebook's rich `display`.

    Parameters
    ----------
    features : iterable of str
        Feature names, one per row of `transform`.
    transform : array-like, optional
        Matrix applied to the symbol vector. When omitted, the notebook-level
        `transform_train` is used (the original behaviour), so that variable
        must already exist at call time. Presumably this is the PCA projection
        returned by `preprocess` -- confirm.
    """
    import sympy as sy
    if transform is None:
        # Backward-compatible fallback to the global produced by the preprocessing cell.
        transform = transform_train
    features_sym = [sy.symbols(f, real=True, positive=True) for f in features]
    display(np.array(features_sym).dot(np.around(transform, 1)))

In [78]:
# Preprocessing switches, shared by the train split and the hold-out split.
clean = True
dopca = False
remove_cols = False
cols = (4, 5, 6, 12, 26, 27, 28)
#cols=(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28)
max_comp = 30  # For cleaning, and no removing cols

# One dict so both preprocess calls are guaranteed to receive the same options.
preprocess_options = dict(
    clean=clean,
    dopca=dopca,
    max_comp=max_comp,
    remove_cols=remove_cols,
    cols=cols,
)

# 80/20 split of the labelled data with a fixed seed.
x_train, y_train, x_test, y_test = split_data(x, y, ratio=0.80, seed=42)

# NOTE(review): each split goes through preprocess on its own, so whatever
# statistics preprocess derives are computed per split -- confirm this is intended.
(y_train, x_train, x_train_mean, x_train_var,
 transform_train, eigenvals_train) = preprocess(x_train, y_train, **preprocess_options)
(y_test, x_test, x_test_mean, x_test_var,
 transform_test, eigenvals_test) = preprocess(x_test, y_test, **preprocess_options)

print(x_test.shape, x_train.shape)
print(features[list(cols)])

(50000, 30) (200000, 30)
['DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_lep_eta_centrality' 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta'
 'PRI_jet_subleading_phi']


In [61]:
degree = 10
# Build data matrix with feature expansion
tx_train = build_poly(x_train, degree)
tx_test = build_poly(x_test, degree)
# Print the shapes explicitly: a bare tuple expression in the middle of a cell
# is evaluated and discarded, so it never shows up in the output.
print(tx_train.shape, tx_test.shape)
# Standardize every column except column 0 (presumably the constant term added
# by build_poly -- confirm). Each matrix is standardized with its own statistics.
tx_train[:,1:], _, _ = standardize_features(tx_train[:,1:])
tx_test[:,1:], _, _ = standardize_features(tx_test[:,1:])

# Gradient Descent

In [41]:
# Start from an all-zero weight vector. np.zeros gives a float array, whereas
# np.array([0] * n) has integer dtype and would break (or truncate) any
# in-place float update of the weights.
w_init = np.zeros(tx_train.shape[1])
max_iter_gd = 5000
gamma_gd = 1e-3
w_gd, loss_gd = least_squares_GD(y_train,
                                 tx_train,
                                 w_init,
                                 max_iter_gd,
                                 gamma_gd,
                                 pr=True,
                                 adapt_gamma=False,
                                 kind='mse',
                                 accel=False)
# Evaluate on the hold-out split.
gd_prediction = predict_labels(w_gd, tx_test)
acc_gd = accuracy_ratio(gd_prediction, y_test)
print('Accuracy ratio = %.3f' % acc_gd)
print('Test loss = %.3f' % compute_loss(y_test, tx_test, w_gd))
print('Train loss = %.3f' % loss_gd)

GD (0/4999): loss=0.5
GD (100/4999): loss=0.442184935171644
GD (200/4999): loss=0.4115490059588939
GD (300/4999): loss=0.3919753385151225
GD (400/4999): loss=0.37818181437789616
GD (500/4999): loss=0.367904721804925
GD (600/4999): loss=0.3599855858896922
GD (700/4999): loss=0.35375000664114153
GD (800/4999): loss=0.34876542821949175
GD (900/4999): loss=0.34473459550394064
GD (1000/4999): loss=0.3414435679355703
GD (1100/4999): loss=0.3387336326074574
GD (1200/4999): loss=0.336484633077038
GD (1300/4999): loss=0.334604284110886
GD (1400/4999): loss=0.3330209257506864
GD (1500/4999): loss=0.3316784146817885
GD (1600/4999): loss=0.3305324257396797
GD (1700/4999): loss=0.3295477241393855
GD (1800/4999): loss=0.3286961258209597
GD (1900/4999): loss=0.3279549557166036
GD (2000/4999): loss=0.3273058717984965
GD (2100/4999): loss=0.3267339610171813
GD (2200/4999): loss=0.32622703931531677
GD (2300/4999): loss=0.3257751061109137
GD (2400/4999): loss=0.3253699165865967
GD (2500/4999): loss=0.325

# Write the gradient-descent weights to a plain-text file.
np.savetxt(fname='../data/w_gd_acc.dat', X=w_gd)

# Stochastic Gradient Descent

In [17]:
# Fix the global NumPy seed so the stochastic run is repeatable.
# NOTE(review): this only helps if least_squares_SGD draws its batches from
# NumPy's global RNG -- confirm against implementations.py.
np.random.seed(42)

# Float zero start vector (np.array([0] * n) would have integer dtype).
w_init = np.zeros(tx_train.shape[1])
max_iter_sgd = 500
gamma_sgd = 1e-5
batch_size = 1

w_sgd, loss_sgd = least_squares_SGD(y_train,
                                    tx_train,
                                    w_init,
                                    batch_size,
                                    max_iter_sgd,
                                    gamma_sgd,
                                    pr=False,
                                    adapt_gamma=False,
                                    choose_best=True)
# Evaluate on the hold-out split.
sgd_prediction = predict_labels(w_sgd, tx_test)
acc_sgd = accuracy_ratio(sgd_prediction, y_test)
print('Accuracy ratio = %.2f' % acc_sgd)
print('Test loss = %.2e' % compute_loss(y_test, tx_test, w_sgd))
print('Train loss = %.2e' % loss_sgd)

Accuracy ratio = 0.64
Test loss = 4.88e-01
Train loss = 4.89e-01


# Least Squares

In [57]:
# Fit least_squares on the expanded training matrix (no hyper-parameters are
# passed), then score the weights on the hold-out split.
w_lsq, loss_lsq = least_squares(y_train, tx_train)
lsq_prediction = predict_labels(w_lsq, tx_test)
acc_lsq = accuracy_ratio(lsq_prediction, y_test)
test_loss_lsq = compute_loss(y_test, tx_test, w_lsq)
print('\n'.join([
    'Accuracy ratio = %.2f' % acc_lsq,
    'Train loss = %.2f' % loss_lsq,
    'Test loss = %.2e' % test_loss_lsq,
]))


Accuracy ratio = 0.43
Train loss = 0.28
Test loss = 4.04e+08


# Write the least-squares weights to a plain-text file.
np.savetxt(fname='../data/w_lsq.dat', X=w_lsq)

# Ridge Regression

In [63]:
# Ridge regression with a fixed regularization strength, scored on the
# hold-out split.
lambda_rr = 2.7e-3
w_rr, loss_rr = ridge_regression(y_train, tx_train, lambda_rr)
rr_prediction = predict_labels(w_rr, tx_test)
acc_rr = accuracy_ratio(rr_prediction, y_test)
test_loss_rr = compute_loss(y_test, tx_test, w_rr)
print('\n'.join([
    'Accuracy ratio = %.3f' % acc_rr,
    'Test loss = %.3f' % test_loss_rr,
    'Train loss = %.3f' % loss_rr,
]))

Accuracy ratio = 0.784
Test loss = 0.315
Train loss = 0.300


# Write the ridge-regression weights to a plain-text file.
np.savetxt(fname='../data/w_rr.dat', X=w_rr)

# Logistic Regression

In [38]:
# Logistic regression is trained on relabelled targets produced by
# minus_one_2_zero (by its name, -1 becomes 0 -- confirm in the helper).
y_train_log = minus_one_2_zero(y_train)
y_test_log = minus_one_2_zero(y_test)


# Float zero start vector (np.array([0] * n) would have integer dtype).
w_init = np.zeros(tx_train.shape[1])
max_iter_lrgd = 500
gamma_lrgd = 1e-8
w_lrgd, loss_lrgd = logistic_regression(y_train_log,
                                        tx_train,
                                        w_init,
                                        max_iter_lrgd,
                                        gamma_lrgd,
                                        pr=False,
                                        adapt_gamma=False,
                                        accel=False)

# Accuracy is measured against the original labels in y_test, while the
# logistic loss uses the relabelled y_test_log.
lrgd_prediction = predict_labels(w_lrgd, tx_test)
acc_lrgd = accuracy_ratio(lrgd_prediction, y_test)

print('Accuracy ratio = %.3f' % acc_lrgd)
print('Test loss = %.3f' % compute_loss_logistic(y_test_log, tx_test, w_lrgd))
print('Train loss = %.3f' % loss_lrgd)

Accuracy ratio = 0.710
Test loss = 28862.970
Train loss = 115577.099


In [39]:
%autoreload
from implementations import *
lambda_rlrgd = 1e4
gamma_rlrgd = 1e-8
max_iter_rlrgd = 500

w_rlrgd, loss_rlrgd = reg_logistic_regression(y_train_log,
                                              tx_train,
                                              lambda_rlrgd,
                                              w_init,
                                              max_iter_rlrgd,
                                              gamma_rlrgd,
                                              pr=True,
                                              adapt_gamma=False, 
                                              accel=False)
rlrgd_prediction = predict_labels(w_rlrgd, tx_test)
acc_rlrgd = accuracy_ratio(rlrgd_prediction, y_test)
print('Accuracy ratio = %.3f' % acc_rlrgd)
print('Test loss = %.3f' % compute_loss_logistic(y_test_log, tx_test, w_rlrgd))
print('Train loss = %.3f' % loss_rlrgd)

 Regularized Logistic Regression GD (0/499): loss=138629.43611198905
 Regularized Logistic Regression GD (100/499): loss=127908.06070553078
 Regularized Logistic Regression GD (200/499): loss=123004.92086239882
 Regularized Logistic Regression GD (300/499): loss=120019.13028727792
 Regularized Logistic Regression GD (400/499): loss=117907.72498582836
Accuracy ratio = 0.710
Test loss = 28927.939
Train loss = 116290.899


# Tests

In [73]:
%autoreload
from tests import *
from implementations import *
clean = True
dopca = False
remove_cols = False
stdafter = False
cols = (4, 5, 6, 12, 26, 27, 28)
cols=range(16, 30)
max_comp = 30  # For cleaning, and no removing cols
cross_validation_demo(x,
                      y,
                      ridge_regression,
                      args_rr,
                      k_fold=4,
                      degree=10,
                      clean=clean,
                      dopca=dopca,
                      remove_cols=remove_cols,
                      lambda_min = -4,
                      lambda_max = 1,
                      stdafter=stdafter)

Using method ridge_regression
Using lambda = 1.0e-04
Using lambda = 3.6e-04
Using lambda = 1.3e-03
Using lambda = 4.6e-03
Using lambda = 1.7e-02
Using lambda = 6.0e-02
Using lambda = 2.2e-01
Using lambda = 7.7e-01
Using lambda = 2.8e+00
Using lambda = 1.0e+01
Best lambda from error: 2.15e-01
Best lambda from accuracy: 1.67e-02


# Make Submission

In [69]:
# Reload both CSV files so this cell does not depend on the train/hold-out
# split made earlier: the model is refit on the full training set.
subsamp = False
y, x, id_ = load_csv_data('../data/train.csv', sub_sample=subsamp)
y_out_test, x_out_test, id_out_test = load_csv_data('../data/test.csv', sub_sample=subsamp)
clean = True
dopca = False
remove_cols = False
stdafter=False
cols = (4, 5, 6, 12, 26, 27, 28)
#cols=(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28)
max_comp = 30  # For cleaning, and no removing cols

# Here "train" is the whole of train.csv and "test" is test.csv.
y_train, x_train, x_train_mean, x_train_var, transform_train, eigenvals_train = preprocess(
    x,
    y,
    clean=clean,
    dopca=dopca,
    max_comp=max_comp,
    remove_cols=remove_cols,
    cols=cols)
y_test, x_test, x_test_mean, x_test_var, transform_test, eigenvals_test = preprocess(
    x_out_test,
    y_out_test,
    clean=clean,
    dopca=dopca,
    max_comp=max_comp,
    remove_cols=remove_cols,
    cols=cols)
print(x_test.shape, x_train.shape)
degree = 10
# Build data matrix with feature expansion
tx_train = build_poly(x_train, degree)
tx_test = build_poly(x_test, degree)
print(tx_train.shape, tx_test.shape)
# Optionally standardize the expanded columns (all but column 0).
if stdafter:
    tx_train[:,1:], _, _ = standardize_features(tx_train[:,1:])
    tx_test[:,1:], _, _ = standardize_features(tx_test[:,1:])
lambda_rr = 1e-4
w_rr, loss_rr = ridge_regression(y_train, tx_train, lambda_rr)
rr_prediction = predict_labels(w_rr, tx_test)
print('Train loss = %.3f'%loss_rr)
# Reuse the prediction computed above instead of running predict_labels on the
# full test matrix a second time.
create_csv_submission(id_out_test, rr_prediction, '../results/rr_pred_deg10_cl1_pc0_rmcol0_stdafter0.csv')

(568238, 30) (250000, 30)
(250000, 301) (568238, 301)
Train loss = 0.276
