# Set up Environment

In [20]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%load_ext autoreload
%autoreload 2

# Import functions in scripts
import os
import sys
# Make the helper modules that live in ../scripts importable from this notebook.
module_path = os.path.abspath('../scripts')
if module_path not in sys.path:
    sys.path.append(module_path)
from proj1_helpers import *
from implementations import *
np.set_printoptions(suppress=True)
from sklearn import linear_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load the data

In [21]:
# Read the training CSV into `targets`, `traindata` and `ids`.
# NOTE(review): the meaning of the positional `False` is defined by load_csv_data — confirm.
targets, traindata, ids = load_csv_data("../data/train.csv", False)
nr_traindata, nr_columns = traindata.shape
# Show (rows, columns) of the loaded feature matrix.
print((nr_traindata, nr_columns))

(250000, 30)


In [None]:
# Read the column names from the first row of the training CSV.
# `csv` is imported explicitly here: no visible cell imports it, so the name would
# otherwise only resolve if one of the star imports happens to leak it.
import csv

with open('../data/train.csv', newline='') as f:
    reader = csv.reader(f)
    headers = next(reader)
# Keep everything after the first two columns.
# NOTE(review): presumably the first two columns are the id and the label — confirm against the CSV.
headers = np.array(headers[2:])

In [None]:
# Correlation coefficients between all columns, with the targets appended as the
# last column (rowvar=False: each column is a variable, each row an observation).
correlation = np.corrcoef(np.column_stack((traindata, targets)), rowvar=False)
plot_corr_matrix(correlation, np.append(headers, 'target'))

In [None]:
# For every row of the correlation matrix, print its two smallest and its two
# largest coefficients.
for row in correlation:
    ordered = np.sort(row)
    print(ordered[:2], ordered[-2:])

In [None]:
def plot_hists(targets, traindata, headers):
    """Plot the distribution of the targets and of each feature column.

    First draws a distribution plot of `targets`. Then, for every entry of
    `headers`, creates a new two-row figure with a shared x axis: a box plot
    of the matching `traindata` column on top and its distribution plot
    underneath, labelled with the header name.

    Args:
        targets: values passed to `sns.distplot` for the first plot.
        traindata: 2-D array; column `i` is plotted for `headers[i]`.
        headers: iterable of column names, one per plotted column.
    """
    sns.distplot(targets)

    for column, name in enumerate(headers):
        # Thin box plot above (20% of the height), histogram below (80%).
        fig, axes = plt.subplots(
            2,
            sharex=True,
            gridspec_kw={"height_ratios": (.20, .80)},
            figsize=(10, 6),
        )
        box_axis, hist_axis = axes
        values = traindata[:, column]

        sns.boxplot(values, ax=box_axis)
        sns.distplot(values, ax=hist_axis, label=name)

        # The box plot needs no y ticks; the x label goes on the lower axis only.
        box_axis.set(yticks=[])
        hist_axis.set(xlabel=name)
        sns.despine(ax=hist_axis)
        sns.despine(ax=box_axis, left=True)

# Preprocess Data

In [None]:
# The return value of insert_median_for_nan is not used, so it presumably
# modifies `traindata` in place — confirm against its implementation.
insert_median_for_nan(traindata)
# Split features and targets into four subsets (jet0..jet3 with y1..y4).
jet0, jet1, jet2, jet3, y1, y2, y3, y4 = split_data_by_jet_num(traindata, targets)
# Standardize every subset on its own, keeping only the first returned value.
jet0, jet1, jet2, jet3 = [
    standardize(subset)[0] for subset in (jet0, jet1, jet2, jet3)
]

In [15]:
# Feature set built with second_order_features; the recorded output of this
# cell shows 931 columns afterwards.
# NOTE(review): this cell and the other preprocessing cells all overwrite or
# consume `traindata`; presumably only one of them is meant to run after a fresh
# load — confirm before using Run All.
insert_median_for_nan(traindata)
traindata = second_order_features(traindata)
traindata.shape

(250000, 931)

In [23]:
# Feature set built with build_poly (second argument 3) followed by standardize;
# the recorded output of this cell shows 91 columns afterwards.
# NOTE(review): the second and third return values of standardize are discarded.
# If they are the training mean/std they would be needed to scale the test set
# consistently — confirm against standardize.
insert_median_for_nan(traindata)
traindata = build_poly(traindata, 3)
traindata, _, _ = standardize(traindata)
traindata.shape

(250000, 91)

In [36]:
# Re-encode the targets in place: every -1 becomes 0.
# NOTE(review): presumably done because the "logistic" method used below expects
# {0, 1} labels — confirm. Later cells see the modified `targets`.
targets[targets == -1] = 0

# Train Model

In [None]:
# Fit ridge regression on the preprocessed training features.
# The feature matrix in this notebook is `traindata`; the name `train` that was
# passed here before is not defined in any visible cell.
weights_ridge, loss = ridge_regression(targets, traindata, lambda_=0.002)

In [37]:
def cross_validation_demo(x, y, seed=54, k_fold=5, lambdas=None,
                          max_iters=1000, gamma=0.002):
    """Cross-validate the "logistic" method over a grid of lambda values.

    The same k-fold split is used for every lambda. For each value the
    training and test scores returned by `cross_validation` are stored, and
    the two curves are finally passed to `cross_validation_visualization`.

    Args:
        x: feature matrix, forwarded to `cross_validation`.
        y: targets, forwarded to `build_k_indices` and `cross_validation`.
        seed: seed handed to `build_k_indices` for the fold split.
        k_fold: number of folds.
        lambdas: values of lambda to try; defaults to
            `np.logspace(-3, 1, 20)` when None.
        max_iters: forwarded positionally to `cross_validation` right after
            the method name. The recorded log prints `SGD(.../999)` for the
            default of 1000, so this is presumably the iteration count.
        gamma: forwarded to `cross_validation` as `gamma`; the recorded log
            echoes it on every SGD line.

    Returns:
        None. Progress is printed and the curves are plotted.
    """
    if lambdas is None:
        lambdas = np.logspace(-3, 1, 20)
    lambdas = np.asarray(lambdas, dtype=float)

    # One split shared by all lambdas so the scores are comparable.
    k_indices = build_k_indices(y, k_fold, seed)

    # Scores per lambda, as returned by cross_validation (train, test).
    score_tr = np.zeros(len(lambdas))
    score_te = np.zeros(len(lambdas))

    for ind, lambda_ in enumerate(lambdas):
        print(ind, lambda_)
        score_tr[ind], score_te[ind] = cross_validation(
            y, x, k_indices, k_fold, lambda_, "logistic", max_iters, gamma=gamma)

    cross_validation_visualization(lambdas, score_tr, score_te)

In [38]:
# Run the lambda sweep on the preprocessed features and the re-encoded targets.
cross_validation_demo(x=traindata, y=targets)

0 0.001
SGD(0/999): loss=138512.6732485821, gamma=0.002
SGD(50/999): loss=133214.54465572687, gamma=0.002
SGD(100/999): loss=128798.44632624152, gamma=0.002
SGD(150/999): loss=125076.0088915425, gamma=0.002
SGD(200/999): loss=121913.70164087923, gamma=0.002
SGD(250/999): loss=119209.5401549888, gamma=0.002
SGD(300/999): loss=116885.4587042786, gamma=0.002
SGD(350/999): loss=114880.24534644971, gamma=0.002
SGD(400/999): loss=113144.83512044283, gamma=0.002
SGD(450/999): loss=111639.11838450807, gamma=0.002
SGD(500/999): loss=110329.71793382012, gamma=0.002
SGD(550/999): loss=109188.40259868682, gamma=0.002
SGD(600/999): loss=108190.92386279484, gamma=0.002
SGD(650/999): loss=107316.14956692254, gamma=0.002
SGD(700/999): loss=106545.44303103402, gamma=0.002
SGD(750/999): loss=105862.23713341553, gamma=0.002
SGD(800/999): loss=105251.15157317389, gamma=0.002
SGD(850/999): loss=104697.6329050872, gamma=0.002


RuntimeWarning: overflow encountered in exp

# Evaluate Training Error

In [None]:
# Predict labels for the training features with the fitted ridge weights.
# `traindata` is the feature matrix of this notebook; the previously used name
# `train` is not defined in any visible cell.
train_predictions = predict_labels(weights_ridge, traindata)

In [None]:
# Histogram of the predicted labels on the training set. An explicit axis is
# used so the figure carries a title and labels, and plt.show() keeps the cell
# from echoing the (counts, bins, patches) tuple returned by hist.
fig, ax = plt.subplots()
ax.hist(train_predictions)
ax.set(title="Predicted labels on the training set",
       xlabel="predicted label", ylabel="number of samples")
plt.show()

As the evaluation metric, we use simple classification accuracy (the fraction of correct predictions).

In [None]:
# A prediction is counted as correct when it lies on the same side of zero as
# its target. Comparing signs instead of multiplying fixes two problems of the
# previous product-based version:
#   * it summed the positive products instead of counting them, which is only
#     a count when every product is exactly 1;
#   * once the targets are re-encoded from -1 to 0, the product is 0 for every
#     0-target sample, so correct predictions for that class were never counted.
correct = (train_predictions > 0) == (targets > 0)
num_of_good_predictions = np.count_nonzero(correct)
accuracy = num_of_good_predictions / correct.size
print(accuracy)

# Create Submission File

In [None]:
# Load the test set, build its features and write the submission file.
test_targets, test_data, test_ids = load_csv_data("../data/test.csv", False)
nr_testdata, nr_testcolumns = test_data.shape
# NOTE(review): the training cells build their features with
# insert_median_for_nan / build_poly / standardize, not white_cubic_features —
# confirm that the test features have the same columns as the training features.
test_features = white_cubic_features(test_data, nr_testcolumns, nr_testdata)
# Use the weights fitted in the training section; a bare `weights` is not
# defined in any visible cell.
test_predictions = predict_labels(weights_ridge, test_features)
create_csv_submission(test_ids, test_predictions, "Kozak_Nurmi_Tsai")