In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from proj1_helpers import load_csv_data
from functions import *
%load_ext autoreload
%autoreload 2

**1. LOAD THE DATA**

In [2]:
# Load both CSV files in full (no sub-sampling). Each result is indexed
# later as [0] -> labels, [1] -> feature matrix, [2] -> sample ids.
train_set, test_set = (
    load_csv_data(csv_path, sub_sample=False)
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

**2. SET UP THE DATA**

In [3]:
# Unpack the training file: [1] -> features, [0] -> labels, [2] -> ids.
x = train_set[1]
y = train_set[0]
ids = train_set[2]

# Two ways of obtaining the (train, test) pair used by the rest of the notebook:
#   * True  -> hold out part of the training file with split_data_tr_te
#              (the meaning of 0.15 is defined by that helper, not visible here);
#   * False -> train on the whole training file and predict on Data/test.csv.
# Previously the split was always computed and then unconditionally overwritten
# by the full-data assignments, so False reproduces the old final state without
# the wasted split.
USE_HOLDOUT_SPLIT = False

if USE_HOLDOUT_SPLIT:
    x_train, x_test, y_train, y_test, ids_train, ids_test = split_data_tr_te(x, y, ids, 0.15)
else:
    x_train, y_train, ids_train = x, y, ids

    x_test = test_set[1]
    y_test = test_set[0]
    ids_test = test_set[2]

In [4]:
# Partition the test set and the training set into three subsets according to
# the feature PRI_jet_num, which takes its values in {0, 1, 2, 3}:
#   subset 0  -> PRI_jet_num == 0
#   subset 1  -> PRI_jet_num == 1
#   subset 23 -> PRI_jet_num >= 2   (exposed below under the suffix `_2`)


def _stack_and_split(labels, features, sample_ids):
    """Stack [labels | features | ids] column-wise and split on column 23.

    Column 0 of the stacked array holds the labels, so PRI_jet_num sits in
    column 23 of the stacked array.

    Returns the stacked array followed by the rows where column 23 equals 0,
    equals 1, and is at least 2.
    """
    stacked = np.column_stack((np.column_stack((labels, features)), sample_ids))
    jet_num = stacked[:, 23]
    return stacked, stacked[jet_num == 0], stacked[jet_num == 1], stacked[2 <= jet_num]


def _unpack(subset):
    """Undo the stacking: first column -> labels, middle -> features, last -> ids."""
    return subset[:, 0], subset[:, 1:-1], subset[:, -1]


# --- test set ---------------------------------------------------------------
complete_test, subset_test_0, subset_test_1, subset_test_23 = _stack_and_split(
    y_test, x_test, ids_test
)

y_test_0, x_test_0, id_test_0 = _unpack(subset_test_0)
y_test_1, x_test_1, id_test_1 = _unpack(subset_test_1)
y_test_2, x_test_2, id_test_2 = _unpack(subset_test_23)

# --- training set -----------------------------------------------------------
complete_train, subset_train_0, subset_train_1, subset_train_23 = _stack_and_split(
    y_train, x_train, ids_train
)

y_train_0, x_train_0, id_train_0 = _unpack(subset_train_0)
y_train_1, x_train_1, id_train_1 = _unpack(subset_train_1)
y_train_2, x_train_2, id_train_2 = _unpack(subset_train_23)

In [5]:
def _with_intercept(matrix):
    """Return `matrix` with a leading column of ones (one per row)."""
    return np.column_stack((np.ones(matrix.shape[0]), matrix))


# For every jet-number subset: standardize the train/test pair together,
# then prepend the intercept column to each standardized matrix.
x_train_0_std, x_test_0_std = standardize(x_train_0, x_test_0)
x_train_0_std_int = _with_intercept(x_train_0_std)
x_test_0_std_int = _with_intercept(x_test_0_std)

x_train_1_std, x_test_1_std = standardize(x_train_1, x_test_1)
x_train_1_std_int = _with_intercept(x_train_1_std)
x_test_1_std_int = _with_intercept(x_test_1_std)

x_train_2_std, x_test_2_std = standardize(x_train_2, x_test_2)
x_train_2_std_int = _with_intercept(x_train_2_std)
x_test_2_std_int = _with_intercept(x_test_2_std)

**3. CLEAN THE DATA**

**3. DEFINE THE FUNCTIONS**

In [603]:
def least_squares(y, tx):
    """Fit linear regression by solving the normal equations.

    Solves (tx^T tx) w = tx^T y with np.linalg.solve.

    Args:
        y: target vector.
        tx: design matrix.

    Returns:
        (w, loss): the solved weights and compute_mse(y, tx, w).
    """
    gram = tx.T @ tx
    moment = tx.T @ y
    weights = np.linalg.solve(gram, moment)
    return weights, compute_mse(y, tx, weights)

In [604]:
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by full-batch gradient descent.

    Args:
        y: target vector.
        tx: design matrix.
        initial_w: starting weights.
        max_iters: number of gradient steps (0 returns initial_w unchanged).
        gamma: step size.

    Returns:
        (w, loss): the final weights and compute_mse evaluated at those
        same weights.
    """
    w = initial_w
    for n_iter in range(max_iters):
        gradient = compute_gradient_least_square(y, tx, w)
        # Loss at the point the gradient was taken; used for logging only.
        loss = compute_mse(y, tx, w)
        w = w - gamma*gradient
        # w[1] does not exist for a single-weight model, so log None instead.
        print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
            bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0],
            w1=w[1] if np.size(w) > 1 else None))
    # Re-evaluate so the returned loss belongs to the returned weights; this
    # also defines `loss` when max_iters == 0.
    loss = compute_mse(y, tx, w)
    return w, loss

In [605]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma, batch_size=1):
    """Linear regression by stochastic (mini-batch) gradient descent.

    Args:
        y: target vector.
        tx: design matrix.
        initial_w: starting weights.
        max_iters: number of passes over batch_iter.
        gamma: step size.
        batch_size: number of samples requested from batch_iter per step.
            Previously this was read from an undefined free variable.

    Returns:
        (ws, losses): the list of all weight vectors visited (starting with
        initial_w) and the list of mini-batch losses, one per update. Unlike
        least_squares_GD this returns the full history, not only the last
        values.
    """
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            gradient = compute_stoch_gradient(minibatch_y, minibatch_tx, w)
            # Loss on the current mini-batch, before the update.
            loss = compute_mse(minibatch_y, minibatch_tx, w)
            w = w - gamma*gradient
            ws.append(w)
            losses.append(loss)
            # w[1] does not exist for a single-weight model, so log None instead.
            print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
                bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0],
                w1=w[1] if np.size(w) > 1 else None))
    return ws, losses

In [606]:
def ridge_regression(y, tx, lamb):
    """Ridge regression via the regularised normal equations.

    Solves (tx^T tx + lamb * I) w = tx^T y with np.linalg.solve.

    Args:
        y: target vector.
        tx: design matrix.
        lamb: penalty added to the diagonal of tx^T tx (used as given, with
            no rescaling by the number of samples).

    Returns:
        (w, loss): the solved weights and compute_mse(y, tx, w); the loss
        does not include the penalty term.
    """
    n_features = tx.shape[1]
    penalised_gram = tx.T.dot(tx) + lamb * np.identity(n_features)
    weights = np.linalg.solve(penalised_gram, tx.T.dot(y))
    return weights, compute_mse(y, tx, weights)

In [607]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Run gradient_descent_log_reg and return its result as (w, loss).

    The helper yields (loss, w); this wrapper only forwards the arguments
    and swaps the order of the returned pair.
    """
    final_loss, weights = gradient_descent_log_reg(y, tx, initial_w, max_iters, gamma)
    return weights, final_loss

In [608]:
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Run reg_gradient_descent_log_reg and return its result as (w, loss).

    The helper yields (loss, w); this wrapper only forwards the arguments
    and swaps the order of the returned pair.
    """
    final_loss, weights = reg_gradient_descent_log_reg(
        y, tx, lambda_, initial_w, max_iters, gamma
    )
    return weights, final_loss

**4. CROSS VALIDATION VISUALIZATION**

# Cross-validation runs on the per-subset design matrices (standardized, with
# intercept column). The third argument is forwarded to cross_validation_demo
# as-is; its meaning is defined in that helper (not visible here) — TODO confirm.
cross_validation_demo(y_train_0, x_train_0_std_int, 1)

cross_validation_demo(y_train_1, x_train_1_std_int, 1)

cross_validation_demo(y_train_2, x_train_2_std_int, 1)

cross_validation_demo(y_train_0, x_train_0_std_int, 4)

# NOTE(review): `x_train_std_int_` is not assigned anywhere in the visible
# notebook, so the three calls below presumably fail with NameError on a fresh
# kernel — TODO confirm and point them at an existing design matrix.
cross_validation_demo(y_train, x_train_std_int_, 5)

cross_validation_demo(y_train, x_train_std_int_, 6)

cross_validation_demo(y_train, x_train_std_int_, 7)

**5. GET THE MODEL**

In [6]:
# Degree-2 expansion of each training design matrix (one per jet-number subset).
x_train_0_2, x_train_1_2, x_train_2_2 = (
    build_poly(design, degree=2)
    for design in (x_train_0_std_int, x_train_1_std_int, x_train_2_std_int)
)

In [7]:
# Fit one ridge model per jet-number subset, all with the same penalty.
ridge_lambda = 10**(-12)
(w_0, loss_0), (w_1, loss_1), (w_2, loss_2) = (
    ridge_regression(targets, design, ridge_lambda)
    for targets, design in (
        (y_train_0, x_train_0_2),
        (y_train_1, x_train_1_2),
        (y_train_2, x_train_2_2),
    )
)

In [8]:
# Apply the same degree-2 expansion to each test design matrix.
x_test_0_2, x_test_1_2, x_test_2_2 = (
    build_poly(design, degree=2)
    for design in (x_test_0_std_int, x_test_1_std_int, x_test_2_std_int)
)

In [9]:
def _to_labels(design, weights):
    """Turn linear scores into labels: sigmoid -> round -> zero_to_neg."""
    scores = design @ weights
    return zero_to_neg(np.around(sigmoid(scores)))


# One prediction vector per jet-number subset, each from its own model.
y_0 = _to_labels(x_test_0_2, w_0)
y_1 = _to_labels(x_test_1_2, w_1)
y_2 = _to_labels(x_test_2_2, w_2)

In [10]:
# Pair every subset's ids with its predictions (two columns: id, label) ...
s_0, s_1, s_2 = (
    np.column_stack(id_and_label)
    for id_and_label in ((id_test_0, y_0), (id_test_1, y_1), (id_test_2, y_2))
)
# ... and stack the three subsets back into a single table.
s = np.vstack((s_0, s_1, s_2))

In [11]:
# Reorder the stacked predictions by ascending id (column 0).
order_by_id = np.argsort(s[:, 0])
ss = s[order_by_id]

In [12]:
# Reference table (id, label) for the test rows, sorted by ascending id so it
# lines up row-for-row with `ss`.
id_label_pairs = np.column_stack((ids_test, y_test))
test = id_label_pairs[np.argsort(id_label_pairs[:, 0])]

In [13]:
def test_pred(y_test, y_pred):
    sum_id = np.sum(np.abs(y_test[:, 0] - y_pred[:, 0]))
    acc = 1 - np.sum(np.abs(y_test[:, 1] - y_pred[:, 1])) / y_test.shape[0] * 0.5
    return sum_id, acc

In [14]:
# Compare the id-sorted reference table `test` with the id-sorted predictions `ss`.
sum_id, acc = test_pred(test, ss)

In [15]:
# Show the id mismatch (0.0 when both tables are aligned row-for-row) and the accuracy.
sum_id, acc

(0.0, 0.76867294117647056)

# Build the submission table from the id-sorted predictions `ss`
# (column 0 = id, column 1 = predicted label).
# The former `s = np.column_stack((ids_test, ss_))` line was removed: `ss_` is
# not defined anywhere in the notebook and its result was never used.
# `ss` presumably comes out as a float array (ids and labels were stacked with
# the feature matrix), so cast to int to write `Id,Prediction` as integers
# rather than e.g. `350000.0,-1.0`.
s_df = pd.DataFrame(ss.astype(int), columns=['Id', 'Prediction'])

len(s_df)

# NOTE(review): the file name mentions deg_3 and 10-9 while the model cells use
# degree = 2 and a penalty of 10**(-12) — confirm the name before submitting.
s_df.to_csv('Data/25_submit_prediction_rid_reg_deg_3_10-9_3_subsets_std_int.csv', index=False)