In [None]:
# Notebook setup: inline plotting, numerical imports and automatic module reloading.
# Render matplotlib figures directly below the cell that creates them.
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from helpers import *

# Reload imported modules before each cell runs, so edits to the project's
# helper .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Loading

In [None]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = "../Data/train.csv"
# Load labels and features; the third return value is not needed here and is discarded.
# NOTE(review): sub_sample=True presumably makes the loader return only a subset of the
# rows for faster experimentation — confirm in load_csv_data.
y_train, x_train, _ = load_csv_data(DATA_TRAIN_PATH, sub_sample=True)

print("Loaded training data with dimensions ", x_train.shape)

In [None]:
# Keep handles on the data exactly as loaded: `x_train` and `y_train` are rebound by
# later cells (standardization, train/test split).
# NOTE(review): these are plain name bindings, not copies. If a later helper modifies
# its input array in place, the "source" arrays change with it; use .copy() if that matters.
y_train_source = y_train
x_train_source = x_train

## Standardization

In [None]:
from helpers import standardize_outliers
# Standardize the data and replace undefined values with the mean, column by column.
# The helper returns three values; only the first one is kept as the new `x_train`.
# NOTE(review): the two discarded values are presumably the per-column statistics used
# for the transform — confirm in standardize_outliers. They would be needed to apply
# the identical transform to unseen data.
x_train, _, _ = standardize_outliers(x_train_source)

## Feature selection

In [None]:
from quadratic_array import *
# Choose which feature columns to keep, using the standardized features and the
# original labels.
# NOTE(review): 0.02 is a tuning value passed to select_features (presumably a
# selection threshold) — confirm its meaning in the helper's definition.
indices = select_features(x_train, y_train_source, 0.02)
# Display the selected indices together with how many were selected.
indices, len(indices)

In [None]:
from quadratic_array import *
# Keep only the columns chosen by the feature-selection cell.
restricted_x_train = x_train[:,indices]
# Expand the selected columns with build_quadratic_array.
# NOTE(review): presumably this adds second-order terms of the selected features —
# confirm in the helper's definition.
quadratic_restricted_x_train = build_quadratic_array(restricted_x_train)
# Display the shape of the expanded feature matrix.
quadratic_restricted_x_train.shape

## Implementing different models

#### Adding the offset term

In [None]:
# Number of samples (rows) in the standardized training features.
N = x_train.shape[0]

# Design matrix: a leading column of ones (the offset/intercept term) followed by the
# expanded feature columns. column_stack turns the 1-D vector of ones into a column.
tx_train = np.column_stack((np.ones(N), quadratic_restricted_x_train))

# Display the shape of the design matrix.
tx_train.shape

<b>Splitting into train and test sets</b>

In [None]:
from helpers import split_data

# Split the full design matrix and the original labels into a training part and a
# held-out test part; 0.66 is the split ratio passed to split_data.
#
# The full matrix is rebuilt here instead of being read from `tx_train`: this cell
# rebinds `tx_train` to the training part, so reading it as input would make the cell
# non-idempotent (a second run would split the already-split matrix against the
# full-length `y_train_source`).
tx_full = np.c_[
    np.ones((quadratic_restricted_x_train.shape[0], 1)),
    quadratic_restricted_x_train,
]
# NOTE(review): if split_data shuffles randomly, pass/fix its seed so the split is
# reproducible across runs — confirm in the helper's definition.
tx_train, y_train, tx_test, y_test = split_data(tx_full, y_train_source, 0.66)

Linear regression using gradient descent

In [None]:
from gradient_descent import least_squares_GD
from proj1_helpers import predict_labels
from helpers import *

# Define the parameters of the algorithm.
max_iters = 1000  # number of gradient-descent iterations
gamma = 0.01  # step size

# Initialization: one weight per column of the design matrix, all starting at zero.
D = tx_train.shape[1]
w_initial = np.zeros(D)

# Start gradient descent.
# Note the return order (loss first, then w), which differs from the other model
# helpers used in this notebook.
loss, w = least_squares_GD(y_train, tx_train, w_initial, gamma, max_iters, print_=False)

# Score
# Only the last entry of `w` is used for scoring.
# NOTE(review): presumably the helper returns the history of weight vectors, so w[-1]
# is the final iterate — confirm in least_squares_GD.
s_test = score(tx_test, y_test, w[-1])
print('Score well classified (test) : ', s_test)
s_train = score(tx_train, y_train, w[-1])
print('Score well classified (train) : ', s_train)

Least squares regression using normal equations

In [None]:
from least_squares import least_squares

# Fit the least-squares model on the training part of the split.
w, loss = least_squares(y_train,tx_train)

# Score: report the fitted weights, the loss, and the score returned by `score` on the
# held-out and training parts.
print('parameters w:', w)
print('min loss : ', loss)
s_test = score(tx_test, y_test, w)
print('Score well classified (test) : ', s_test)
s_train = score(tx_train, y_train, w)
print('Score well classified (train) : ', s_train)

Ridge regression using normal equations

In [None]:
from ridge_regression import ridge_regression

# Fit ridge regression on the training part of the split.
# NOTE(review): 0.01 is the third positional argument — presumably the regularization
# weight; confirm in ridge_regression.
w, loss = ridge_regression(y_train,tx_train, 0.01)

# Score: report the fitted weights, the loss, and the score returned by `score` on the
# held-out and training parts.
print('parameters w:', w)
print('min loss : ', loss)
s_test = score(tx_test, y_test, w)
print('Score well classified (test) : ', s_test)
s_train = score(tx_train, y_train, w)
print('Score well classified (train) : ', s_train)

Logistic regression using gradient descent

In [None]:
from logistic_regression import logistic_regression

# Define the parameters of the algorithm.
max_iters = 10000
gamma = 0.01      

# Initialization: one weight per column of the design matrix, all starting at zero.
D = tx_train.shape[1]
w_initial = np.zeros(D) 

# Fit on the training part of the split.
# NOTE(review): logistic regression is commonly formulated for labels in {0, 1};
# check that the encoding of `y_train` matches what logistic_regression expects.
w, loss = logistic_regression(y_train, tx_train, w_initial, max_iters, gamma)

# Score: report the fitted weights, the loss, and the score returned by `score` on the
# held-out and training parts.
print('parameters w:', w)
print('min loss : ', loss)
s_test = score(tx_test, y_test, w)
print('Score well classified (test) : ', s_test)
s_train = score(tx_train, y_train, w)
print('Score well classified (train) : ', s_train)

Regularized logistic regression using gradient descent

In [None]:
from logistic_regression import reg_logistic_regression

# Define the parameters of the algorithm.
max_iters = 10000
gamma = 0.01     
# Value passed as the `lambda_` argument (presumably the regularization strength —
# confirm in reg_logistic_regression).
lambda_ = 0.1

# Initialization: one weight per column of the design matrix, all starting at zero.
D = tx_train.shape[1]
w_initial = np.zeros(D) 

# Fit on the training part of the split.
# NOTE(review): as with the unregularized variant, check that the encoding of
# `y_train` matches what the helper expects.
w, loss = reg_logistic_regression(y_train, tx_train, lambda_, w_initial, max_iters, gamma)

# Score: report the fitted weights, the loss, and the score returned by `score` on the
# held-out and training parts.
print('parameters w:', w)
print('min loss : ', loss)
s_test = score(tx_test, y_test, w)
print('Score well classified (test) : ', s_test)
s_train = score(tx_train, y_train, w)
print('Score well classified (train) : ', s_train)

In [None]:
from proj1_helpers import *

# --- Build the submission file ---------------------------------------------------
# Predictions for the submission must be made on the test set, not on the training
# file. NOTE(review): assumes the test CSV sits next to train.csv as "test.csv" —
# adjust the file name if the data folder is laid out differently.
DATA_TEST_PATH = "../Data/test.csv"
OUTPUT_PATH = 'pred_1.csv'

# The label column of the test file is not used, so it is discarded instead of being
# bound to `y_test` (which holds the held-out labels from the train/test split).
_, x_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Apply the same preprocessing as for the training features: the model was fitted on
# standardized data, so raw test features would give meaningless predictions.
# NOTE(review): this standardizes with the test set's own statistics. If
# standardize_outliers can reuse the statistics computed on the training data, prefer
# that — confirm its signature.
x_test_std, _, _ = standardize_outliers(x_test)

# Same feature pipeline as for training: selected columns, then quadratic expansion.
restricted_x_test = x_test_std[:, indices]
quadratic_restricted_x_test = build_quadratic_array(restricted_x_test)

# Prepend the offset column. A dedicated name is used so the held-out `tx_test` from
# the split cell is not overwritten and the scoring cells stay re-runnable.
N_test = x_test.shape[0]
tx_submission = np.c_[np.ones((N_test, 1)), quadratic_restricted_x_test]

# Refit the least-squares model on the training part of the split.
w, loss = least_squares(y_train, tx_train)

# Predict labels for the test set and write them, with their ids, to OUTPUT_PATH.
y_pred = predict_labels(w, tx_submission)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Sanity check: display the shape of the predictions; it should match the shape of the ids.
y_pred.shape

In [None]:
# Sanity check: display the shape of the ids; it should match the shape of the predictions.
ids_test.shape