In [1]:
%load_ext autoreload
%autoreload
from proj1_helpers import *
from implementations import *
from tests import *
from preprocessing import *
import numpy as np
import matplotlib.pyplot as plt

# Outline of the pipeline
## Import data
+ Import raw data. Split the original train data into our own test and train sets.
## Preprocessing
+ The file `preprocessing.py` is imported and contains functions to clean (impute with mean), remove columns, standardize and do PCA.
+ Preprocess train and test data separately (you can define the number of principal components used with the max_comp parameter. Defaults to 30).
## Apply Model
+ Apply your preferred model!

In [2]:
# Load labels (y), features (x) and event ids (id_) from the training file,
# then the same triple from the separate test file (its ids are reused when
# the submission is written at the end of the notebook).
subsamp = False
train_csv, test_csv = '../data/train.csv', '../data/test.csv'
y, x, id_ = load_csv_data(train_csv, sub_sample=subsamp)
y_out_test, x_out_test, id_out_test = load_csv_data(test_csv, sub_sample=subsamp)

In [3]:
# Feature names come from the CSV header row; the first two header fields are
# skipped. Only the first line is read, instead of parsing the whole file as
# strings just to keep its first row.
with open('../data/train.csv') as header_file:
    features = np.array(header_file.readline().rstrip('\r\n').split(','))[2:]
def show_PC_explicit(features, transform=None, decimals=1):
    """Display each transformed component as a symbolic combination of features.

    Parameters
    ----------
    features : sequence of str
        Feature names; one sympy symbol is created per name.
    transform : array-like, optional
        Matrix the symbol vector is multiplied with. When omitted, the
        notebook-level `transform_train` is used (the original behaviour), so
        the preprocessing cell must have been run first.
    decimals : int, optional
        Number of decimals the matrix entries are rounded to before the
        product is formed. Defaults to 1, as before.

    Returns
    -------
    None. The symbolic result is rendered with IPython's `display`.
    """
    import sympy as sy
    if transform is None:
        # Backward-compatible fallback on the notebook global.
        transform = transform_train
    feature_symbols = [sy.symbols(name, real=True, positive=True) for name in features]
    # Rounding keeps the displayed expressions readable.
    display(np.array(feature_symbols).dot(np.around(transform, decimals)))

In [4]:
# Preprocessing switches shared by the train and test calls below.
clean = True
dopca = False
remove_cols = False
cols = (4, 5, 6, 12, 26, 27, 28)
# Alternative column selection tried previously:
# (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28)
max_comp = 30  # For cleaning, and no removing cols

# Hold out part of the labelled data as our own test set (ratio=0.80, fixed seed).
x_train, y_train, x_test, y_test = split_data(x, y, ratio=0.80, seed=42)

# Both subsets go through preprocess() with exactly the same options.
preprocess_options = dict(clean=clean,
                          dopca=dopca,
                          max_comp=max_comp,
                          remove_cols=remove_cols,
                          cols=cols)
(y_train, x_train, x_train_mean, x_train_var,
 transform_train, eigenvals_train) = preprocess(x_train, y_train, **preprocess_options)
(y_test, x_test, x_test_mean, x_test_var,
 transform_test, eigenvals_test) = preprocess(x_test, y_test, **preprocess_options)

print(x_test.shape, x_train.shape)
# Names of the columns listed in `cols`.
print(features[list(cols)])

(50000, 30) (200000, 30)
['DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_lep_eta_centrality' 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta'
 'PRI_jet_subleading_phi']


In [5]:
degree = 10
# Build data matrix with feature expansion
tx_train = build_poly(x_train, degree)
tx_test = build_poly(x_test, degree)
# Print the expanded shapes; a bare tuple expression in the middle of a cell
# is evaluated but never displayed.
print(tx_train.shape, tx_test.shape)
# Standardize every column except the first one (column 0 is left untouched;
# presumably the constant term produced by build_poly -- TODO confirm).
# NOTE(review): train and test are standardized independently, each with its
# own statistics -- confirm this is intended.
tx_train[:,1:], _, _ = standardize_features(tx_train[:,1:])
tx_test[:,1:], _, _ = standardize_features(tx_test[:,1:])

# Gradient Descent

In [41]:
# Start from a float zero vector with one weight per column of tx_train.
# np.array([0] * n) would create an integer array, which is the wrong dtype
# for weights that are updated with fractional steps.
w_init = np.zeros(tx_train.shape[1])
max_iter_gd = 5000
gamma_gd = 1e-3
w_gd, loss_gd = least_squares_GD(y_train,
                                 tx_train,
                                 w_init,
                                 max_iter_gd,
                                 gamma_gd,
                                 pr=True,
                                 adapt_gamma=False,
                                 kind='mse',
                                 accel=False)
# Evaluate on our held-out split.
gd_prediction = predict_labels(w_gd, tx_test)
acc_gd = accuracy_ratio(gd_prediction, y_test)
print('Accuracy ratio = %.3f' % acc_gd)
print('Test loss = %.3f' % compute_loss(y_test, tx_test, w_gd))
print('Train loss = %.3f' % loss_gd)

GD (0/4999): loss=0.5
GD (100/4999): loss=0.442184935171644
GD (200/4999): loss=0.4115490059588939
GD (300/4999): loss=0.3919753385151225
GD (400/4999): loss=0.37818181437789616
GD (500/4999): loss=0.367904721804925
GD (600/4999): loss=0.3599855858896922
GD (700/4999): loss=0.35375000664114153
GD (800/4999): loss=0.34876542821949175
GD (900/4999): loss=0.34473459550394064
GD (1000/4999): loss=0.3414435679355703
GD (1100/4999): loss=0.3387336326074574
GD (1200/4999): loss=0.336484633077038
GD (1300/4999): loss=0.334604284110886
GD (1400/4999): loss=0.3330209257506864
GD (1500/4999): loss=0.3316784146817885
GD (1600/4999): loss=0.3305324257396797
GD (1700/4999): loss=0.3295477241393855
GD (1800/4999): loss=0.3286961258209597
GD (1900/4999): loss=0.3279549557166036
GD (2000/4999): loss=0.3273058717984965
GD (2100/4999): loss=0.3267339610171813
GD (2200/4999): loss=0.32622703931531677
GD (2300/4999): loss=0.3257751061109137
GD (2400/4999): loss=0.3253699165865967
GD (2500/4999): loss=0.325

# Persist the gradient-descent weights as plain text.
np.savetxt(fname='../data/w_gd_acc.dat', X=w_gd)

# Stochastic Gradient Descent

In [17]:
# Start from a float zero vector with one weight per column of tx_train.
# np.array([0] * n) would create an integer array, which is the wrong dtype
# for weights that are updated with fractional steps.
w_init = np.zeros(tx_train.shape[1])
max_iter_sgd = 500
gamma_sgd = 1e-5
batch_size = 1

w_sgd, loss_sgd = least_squares_SGD(y_train,
                                    tx_train,
                                    w_init,
                                    batch_size,
                                    max_iter_sgd,
                                    gamma_sgd,
                                    pr=False,
                                    adapt_gamma=False,
                                    choose_best=True)
# Evaluate on our held-out split.
sgd_prediction = predict_labels(w_sgd, tx_test)
acc_sgd = accuracy_ratio(sgd_prediction, y_test)
print('Accuracy ratio = %.2f' % acc_sgd)
print('Test loss = %.2e' % compute_loss(y_test, tx_test, w_sgd))
print('Train loss = %.2e' % loss_sgd)

Accuracy ratio = 0.64
Test loss = 4.88e-01
Train loss = 4.89e-01


# Least Squares

In [57]:
# Least-squares fit on the expanded training features.
w_lsq, loss_lsq = least_squares(y_train, tx_train)

# Score the fit on our held-out split.
lsq_prediction = predict_labels(w_lsq, tx_test)
acc_lsq = accuracy_ratio(lsq_prediction, y_test)
print('Accuracy ratio = %.2f' % acc_lsq)
print('Train loss = %.2f' % loss_lsq)
# NOTE(review): the recorded run reports a test loss of 4.04e+08 against a
# train loss of 0.28 -- the unregularized degree-10 fit does not carry over
# to the test split.
test_loss_lsq = compute_loss(y_test, tx_test, w_lsq)
print('Test loss = %.2e' % test_loss_lsq)


Accuracy ratio = 0.43
Train loss = 0.28
Test loss = 4.04e+08


# Persist the least-squares weights as plain text.
np.savetxt(fname='../data/w_lsq.dat', X=w_lsq)

# Ridge Regression

In [8]:
# Ridge regression with a fixed regularization strength.
lambda_rr = 2.7e-3
w_rr, loss_rr = ridge_regression(y_train, tx_train, lambda_rr)

# Score the fit on our held-out split.
rr_prediction = predict_labels(w_rr, tx_test)
acc_rr = accuracy_ratio(rr_prediction, y_test)
print('Accuracy ratio = %.3f' % acc_rr)
test_loss_rr = compute_loss(y_test, tx_test, w_rr)
print('Test loss = %.3f' % test_loss_rr)
print('Train loss = %.3f' % loss_rr)

Accuracy ratio = 0.784
Test loss = 0.315
Train loss = 0.300


# Persist the ridge-regression weights as plain text.
np.savetxt(fname='../data/w_rr.dat', X=w_rr)

# Logistic Regression

In [30]:
# Label vectors converted for the logistic loss (judging by the helper's name,
# -1 labels become 0 -- TODO confirm in implementations/helpers).
y_train_log = minus_one_2_zero(y_train)
y_test_log = minus_one_2_zero(y_test)


# Start from a float zero vector with one weight per column of tx_train.
# np.array([0] * n) would create an integer array, which is the wrong dtype
# for weights that are updated with fractional steps.
w_init = np.zeros(tx_train.shape[1])
max_iter_lrgd = 5000
gamma_lrgd = 1e-6
w_lrgd, loss_lrgd = logistic_regression(y_train_log,
                                        tx_train,
                                        w_init,
                                        max_iter_lrgd,
                                        gamma_lrgd,
                                        pr=True,
                                        adapt_gamma=False,
                                        accel=False)

# Accuracy is measured against the original labels (y_test); the logistic
# loss is measured against the converted labels (y_test_log).
lrgd_prediction = predict_labels(w_lrgd, tx_test)
acc_lrgd = accuracy_ratio(lrgd_prediction, y_test)

print('Accuracy ratio = %.3f' % acc_lrgd)
print('Test loss = %.3f' % compute_loss_logistic(y_test_log, tx_test, w_lrgd))
print('Train loss = %.3f' % loss_lrgd)

Logistic Regression GD (0/4999): loss=138629.43611198905
Logistic Regression GD (100/4999): loss=93238.78320807862
Logistic Regression GD (200/4999): loss=90741.30364242186
Logistic Regression GD (300/4999): loss=89548.78185686289
Logistic Regression GD (400/4999): loss=88790.06707491117
Logistic Regression GD (500/4999): loss=88249.09943494303
Logistic Regression GD (600/4999): loss=87839.05559566483
Logistic Regression GD (700/4999): loss=87515.22787293412
Logistic Regression GD (800/4999): loss=87250.93851234576
Logistic Regression GD (900/4999): loss=87029.76908116811
Logistic Regression GD (1000/4999): loss=86840.94824060913
Logistic Regression GD (1100/4999): loss=86677.06876001206
Logistic Regression GD (1200/4999): loss=86532.8324551642
Logistic Regression GD (1300/4999): loss=86404.96579106138
Logistic Regression GD (1400/4999): loss=86290.8533741525
Logistic Regression GD (1500/4999): loss=86187.38885925693
Logistic Regression GD (1600/4999): loss=86092.72445980992
Logistic R

In [31]:
%autoreload
from implementations import *
lambda_rlrgd = 1e0
gamma_rlrgd = 1e-6
max_iter_rlrgd = 5000

w_rlrgd, loss_rlrgd = reg_logistic_regression(y_train_log,
                                              tx_train,
                                              lambda_rlrgd,
                                              w_init,
                                              max_iter_rlrgd,
                                              gamma_rlrgd,
                                              pr=True,
                                              adapt_gamma=False, 
                                              accel=False)
rlrgd_prediction = predict_labels(w_rlrgd, tx_test)
acc_rlrgd = accuracy_ratio(rlrgd_prediction, y_test)
print('Accuracy ratio = %.3f' % acc_rlrgd)
print('Test loss = %.3f' % compute_loss_logistic(y_test_log, tx_test, w_rlrgd))
print('Train loss = %.3f' % loss_rlrgd)

 Regularized Logistic Regression GD (0/4999): loss=138629.43611198905
 Regularized Logistic Regression GD (100/4999): loss=93240.1935979139
 Regularized Logistic Regression GD (200/4999): loss=90743.57200283137
 Regularized Logistic Regression GD (300/4999): loss=89551.7584977208
 Regularized Logistic Regression GD (400/4999): loss=88793.67690119703
 Regularized Logistic Regression GD (500/4999): loss=88253.28982347847
 Regularized Logistic Regression GD (600/4999): loss=87843.78386950732
 Regularized Logistic Regression GD (700/4999): loss=87520.45605496898
 Regularized Logistic Regression GD (800/4999): loss=87256.63476167026
 Regularized Logistic Regression GD (900/4999): loss=87035.90631510803
 Regularized Logistic Regression GD (1000/4999): loss=86847.50300860114
 Regularized Logistic Regression GD (1100/4999): loss=86684.02052046584
 Regularized Logistic Regression GD (1200/4999): loss=86540.16323838657
 Regularized Logistic Regression GD (1300/4999): loss=86412.66001537253
 Regu

# Tests

In [73]:
%autoreload
from tests import *
from implementations import *
clean = True
dopca = False
remove_cols = False
stdafter = False
cols = (4, 5, 6, 12, 26, 27, 28)
cols=range(16, 30)
max_comp = 30  # For cleaning, and no removing cols
args_rlrgd['pr']=True

cross_validation_demo(x,
                      y,
                      ridge_regression,
                      args_rr,
                      k_fold=4,
                      degree=10,
                      clean=clean,
                      dopca=dopca,
                      remove_cols=remove_cols,
                      lambda_min = -4,
                      lambda_max = 1,
                      stdafter=stdafter)

Using method ridge_regression
Using lambda = 1.0e-04
Using lambda = 3.6e-04
Using lambda = 1.3e-03
Using lambda = 4.6e-03
Using lambda = 1.7e-02
Using lambda = 6.0e-02
Using lambda = 2.2e-01
Using lambda = 7.7e-01
Using lambda = 2.8e+00
Using lambda = 1.0e+01
Best lambda from error: 2.15e-01
Best lambda from accuracy: 1.67e-02


In [None]:
from implementations import *
# Preprocessing switches for this experiment.
clean = True
dopca = False
remove_cols = False
stdafter = True
cols = (4, 5, 6, 12, 26, 27, 28)
max_comp = 30  # For cleaning, and no removing cols
# Run the full 5000 iterations and print progress.
args_gd['max_iters']=5000
args_gd['pr']=True

# Keyword arguments that stay the same for every degree.
cv_options = dict(k_fold=4,
                  clean=clean,
                  dopca=dopca,
                  remove_cols=remove_cols,
                  lambda_min=-4,
                  lambda_max=1,
                  stdafter=stdafter)

# Cross-validate gradient descent for polynomial degrees 1 through 9.
for deg in range(1, 10):
    cross_validation_demo(x, y, least_squares_GD, args_gd, degree=deg, **cv_options)

Using method least_squares_GD
GD (0/4999): loss=0.5
GD (100/4999): loss=0.4560206925074313
GD (200/4999): loss=0.4292641703423632
GD (300/4999): loss=0.41143348633198323
GD (400/4999): loss=0.3988199946404141
GD (500/4999): loss=0.3895291216690483
GD (600/4999): loss=0.3824820250968874
GD (700/4999): loss=0.37701397046542406
GD (800/4999): loss=0.3726918727226744
GD (900/4999): loss=0.36922175961020354
GD (1000/4999): loss=0.3663976140627746
GD (1100/4999): loss=0.3640712535796154
GD (1200/4999): loss=0.36213375251302676
GD (1300/4999): loss=0.360503559310501
GD (1400/4999): loss=0.3591186609426928
GD (1500/4999): loss=0.35793127625122767
GD (1600/4999): loss=0.3569041751151248
GD (1700/4999): loss=0.35600806997153933
GD (1800/4999): loss=0.35521973149457475
GD (1900/4999): loss=0.354520604026057
GD (2000/4999): loss=0.3538957728207588
GD (2100/4999): loss=0.35333318348106557
GD (2200/4999): loss=0.35282304514278673
GD (2300/4999): loss=0.3523573695228966
GD (2400/4999): loss=0.3519296

GD (500/4999): loss=0.3674117885519998
GD (600/4999): loss=0.35945438110764505
GD (700/4999): loss=0.353188749550242
GD (800/4999): loss=0.3481799587845124
GD (900/4999): loss=0.34412953382542394
GD (1000/4999): loss=0.3408229155005411
GD (1100/4999): loss=0.33810103976812134
GD (1200/4999): loss=0.3358434875930232
GD (1300/4999): loss=0.3339577168720511
GD (1400/4999): loss=0.3323717883001665
GD (1500/4999): loss=0.33102925614988915
GD (1600/4999): loss=0.32988548060579187
GD (1700/4999): loss=0.3289049134159444
GD (1800/4999): loss=0.32805907004522344
GD (1900/4999): loss=0.32732499658523184
GD (2000/4999): loss=0.32668409911808105
GD (2100/4999): loss=0.32612124211049953
GD (2200/4999): loss=0.32562404869608286
GD (2300/4999): loss=0.32518235390617223
GD (2400/4999): loss=0.3247877747534961
GD (2500/4999): loss=0.32443337027098035
GD (2600/4999): loss=0.32411337127840656
GD (2700/4999): loss=0.32382296453950254
GD (2800/4999): loss=0.3235581195917519
GD (2900/4999): loss=0.323315449

GD (1100/4999): loss=0.3376442529643002
GD (1200/4999): loss=0.3353635158264099
GD (1300/4999): loss=0.33343293570367477
GD (1400/4999): loss=0.33178527165535554
GD (1500/4999): loss=0.33036790787720965
GD (1600/4999): loss=0.3291392772618965
GD (1700/4999): loss=0.3280662690097418
GD (1800/4999): loss=0.3271223183030348
GD (1900/4999): loss=0.32628597763897643
GD (2000/4999): loss=0.325539834226564
GD (2100/4999): loss=0.32486968002565775
GD (2200/4999): loss=0.3242638689432229
GD (2300/4999): loss=0.3237128145253063
GD (2400/4999): loss=0.3232085943778196
GD (2500/4999): loss=0.3227446365255419
GD (2600/4999): loss=0.3223154692657284
GD (2700/4999): loss=0.321916520628051
GD (2800/4999): loss=0.32154395686861115
GD (2900/4999): loss=0.3211945518716889
GD (3000/4999): loss=0.32086558115913166
GD (3100/4999): loss=0.32055473558603603
GD (3200/4999): loss=0.3202600508527694
GD (3300/4999): loss=0.31997984977232535
GD (3400/4999): loss=0.3197126948594033
GD (3500/4999): loss=0.3194573492

# Make Submission

In [69]:
# Reload both files so this cell does not depend on the splits made earlier.
subsamp = False
y, x, id_ = load_csv_data('../data/train.csv', sub_sample=subsamp)
y_out_test, x_out_test, id_out_test = load_csv_data('../data/test.csv', sub_sample=subsamp)
clean = True
dopca = False
remove_cols = False
stdafter=False
cols = (4, 5, 6, 12, 26, 27, 28)
#cols=(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28)
max_comp = 30  # For cleaning, and no removing cols

# Train on the complete labelled set; x_out_test is the unlabelled data the
# submission is produced for.
y_train, x_train, x_train_mean, x_train_var, transform_train, eigenvals_train = preprocess(
    x,
    y,
    clean=clean,
    dopca=dopca,
    max_comp=max_comp,
    remove_cols=remove_cols,
    cols=cols)
y_test, x_test, x_test_mean, x_test_var, transform_test, eigenvals_test = preprocess(
    x_out_test,
    y_out_test,
    clean=clean,
    dopca=dopca,
    max_comp=max_comp,
    remove_cols=remove_cols,
    cols=cols)
print(x_test.shape, x_train.shape)
degree = 10
# Build data matrix with feature expansion
tx_train = build_poly(x_train, degree)
tx_test = build_poly(x_test, degree)
print(tx_train.shape, tx_test.shape)
if stdafter:
    # Optional standardization of the expanded features (first column excluded).
    tx_train[:,1:], _, _ = standardize_features(tx_train[:,1:])
    tx_test[:,1:], _, _ = standardize_features(tx_test[:,1:])
lambda_rr = 1e-4
w_rr, loss_rr = ridge_regression(y_train, tx_train, lambda_rr)
rr_prediction = predict_labels(w_rr, tx_test)
print('Train loss = %.3f'%loss_rr)
# Derive the file name from the settings above so it cannot drift from the
# configuration; with the current values this yields
# rr_pred_deg10_cl1_pc0_rmcol0_stdafter0.csv.
submission_path = '../results/rr_pred_deg%d_cl%d_pc%d_rmcol%d_stdafter%d.csv' % (
    degree, clean, dopca, remove_cols, stdafter)
# Reuse the predictions computed above instead of predicting a second time.
create_csv_submission(id_out_test, rr_prediction, submission_path)

(568238, 30) (250000, 30)
(250000, 301) (568238, 301)
Train loss = 0.276
