# PCML Project-1 ~ Team #60

## Initial Python Imports

In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import datetime

from costs import compute_loss

%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from helpers import *

# Path to the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = "../Data/train.csv" # TODO: download train data and supply path here
# load_csv_data is not imported by name; it is expected to come from one of the
# star imports in this cell. Its three results are bound to the class labels,
# the feature matrix and the event ids, in that order.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Print the shapes of the labels, the raw (not yet standardized) features and the ids.
print('y shape: ',y.shape)
print('original tX shape: ',tX.shape)
print('ids shape: ',ids.shape)

y shape:  (200,)
original tX shape:  (200, 30)
ids shape:  (200,)


## Data preprocessing

### Standardization

In [3]:
# Standardize the feature matrix. The plain `standardize` call is kept (disabled)
# for reference; `standardize_outliers` is the variant actually executed.
#tX, mean_x, std_x = standardize(tX, mean_x=None, std_x=None)
# NOTE(review): tX is overwritten, so this cell is not idempotent. The recorded
# output shows the column count going from 30 to 31 (presumably an added bias
# column - confirm in the helper), so re-running this cell without reloading the
# data would transform already-transformed features.
tX, mean_x, std_x = standardize_outliers(tX)
# mean_x / std_x are kept as notebook globals; the recorded output shows one
# entry per original feature (shape (30,)).
print('standardized tX shape: ',tX.shape)
print('tX mean shape: ',mean_x.shape)
print('tX std shape: ',std_x.shape)

200 30
(30,)
(30,)
standardized tX shape:  (200, 31)
tX mean shape:  (30,)
tX std shape:  (30,)


### Analysis of output y

In [None]:
# Distribution of the target values y, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(y, bins=10, align='mid')
ax.set_title("Histogram of output y")
plt.show()

### Analysis of y as a function of all its features (one by one)

In [None]:
# One scatter plot per column of tX, with y on the vertical axis.
number_of_params = tX.shape[1]
for feature in range(number_of_params):
    print('feature: ',feature)
    fig, ax = plt.subplots()
    ax.scatter(tX[:,feature], y)
    plt.show()

## Linear regression using gradient descent

In [None]:
from gradient_descent import least_squares_GD

# Define the parameters of the algorithm.
# Both values are passed straight through to least_squares_GD below
# (presumably the iteration budget and the step size - confirm against
# gradient_descent.py).
max_iters = 1000
gamma = 0.01

# Initialization: an all-zero weight vector with one entry per column of tX.
w_initial = np.zeros(tX.shape[1])

# Start gradient descent.
# The two results are kept as notebook globals; the plural names suggest
# per-iteration histories of losses and weights - TODO confirm.
# (Timing code is left disabled.)
# start_time = datetime.datetime.now()
gradient_losses, gradient_ws = least_squares_GD(y, tX, w_initial, gamma, max_iters)
# end_time = datetime.datetime.now()


## Linear regression using stochastic gradient descent

In [None]:
from stochastic_gradient_descent import least_squares_SGD

# Define the parameters of the algorithm.
# NOTE: max_iters, gamma and w_initial rebind the globals of the same name set
# in the gradient-descent cell.
max_iters = 1000
gamma = 0.01
batch_size = 50

# Initialization: an all-zero weight vector with one entry per column of tX.
w_initial = np.zeros(tX.shape[1])

# Start SGD.
# NOTE(review): no random seed is set in this notebook; if least_squares_SGD
# samples mini-batches randomly (not visible here), results will differ from
# run to run - confirm and seed if reproducibility matters.
# start_time = datetime.datetime.now()
stoch_gradient_losses, stoch_gradient_ws = least_squares_SGD(y, tX, w_initial, batch_size, gamma, max_iters)
# end_time = datetime.datetime.now()

In [None]:
def get_min_param_index(sgd_losses):
    """Locate the smallest loss in a sequence of losses.

    Args:
        sgd_losses: sequence of scalar losses (must support ``len`` and
            iteration), e.g. one loss per optimisation step.

    Returns:
        A ``(min_index, min_loss)`` tuple: the position and value of the
        smallest loss; ties resolve to the earliest position. If the sequence
        is empty, or no entry compares below ``+inf`` (e.g. every entry is
        NaN), the result is ``(len(sgd_losses) - 1, inf)``.
    """
    # Start from +inf rather than a fixed sentinel so that arbitrarily large
    # losses are still recognised as the minimum.
    min_loss = float('inf')
    min_index = len(sgd_losses) - 1
    # Walk the argument itself, not a notebook-level global, so the function
    # works for any loss history passed in.
    for index, loss in enumerate(sgd_losses):
        if loss < min_loss:
            min_loss = loss
            min_index = index

    return min_index, min_loss

# Look up the index and value that get_min_param_index reports for the SGD
# loss history, and print both.
min_i, min_loss = get_min_param_index(stoch_gradient_losses)
print('min index: ',min_i)
print('min loss: ',min_loss)

## Least squares regression using normal equations

In [None]:
from least_squares import least_squares

# Fit on the full (y, tX) via least_squares and keep the returned weights and
# loss as notebook globals. (Timing code is left disabled.)
# start_ls_time = datetime.datetime.now()
ls_wopt, ls_loss = least_squares(y,tX)
# end_ls_time = datetime.datetime.now()
print('loss=',ls_loss)
print('parameters w: ',ls_wopt)

## Ridge regression using normal equations

In [None]:
from ridge_regression import ridge_regression

# Disabled sketch of a sweep over several lambdas with polynomial features and
# a train/test split; only the single fit below is executed.
# lambdas = np.logspace(-3, 1, 10)
# φ_x = build_poly(x, degree)
# x_train, x_test, y_train, y_test = split_data(tX, y, ratio, seed)

#     for lamb in lambdas:

# 0.01 is the third positional argument of ridge_regression - presumably the
# regularisation strength (lambda); confirm against ridge_regression.py.
w_ridge = ridge_regression(y, tX, 0.01)
# The loss is evaluated on the same (y, tX) used for fitting, i.e. it is a
# training loss, not a held-out estimate.
err = compute_loss(y, tX, w_ridge)

print('loss: ',err)
print('parameters w: ',w_ridge)

# Disabled RMSE computations; note both lines use the same (y, tX), so they
# would yield identical values as written.
# rmse_tr = np.sqrt(2*compute_loss(y, tX, w_ridge))
# rmse_te = np.sqrt(2*compute_loss(y, tX, w_ridge))


## Logistic regression using gradient descent or SGD

In [6]:
from helpers import de_standardize
from logistic_regression import learning_by_gradient_descent, calculate_loss
from plots import visualization

def logistic_regression_gradient_descent_demo(y, x):
    """Run logistic regression by gradient descent and print its progress.

    Starting from an all-zero weight column, repeatedly calls
    ``learning_by_gradient_descent`` until either the iteration budget is
    used up or two consecutive losses differ by less than the threshold, then
    prints the loss reported by ``calculate_loss`` for the last weights.

    Args:
        y: labels, passed unchanged to the learning and loss helpers.
        x: feature matrix, used as-is as the design matrix.

    Returns:
        None. All results are printed.
    """
    # Hyper-parameters of the run.
    max_iter = 10000
    threshold = 1e-8
    gamma = 0.000000001

    # The design matrix is the input itself; weights start at zero, one row
    # per column of tx.
    tx = x
    w = np.zeros((tx.shape[1], 1))

    print('tx: ',tx.shape)
    print('w: ',w.shape)
    print('y: ',y.shape)

    previous_loss = None
    for step in range(max_iter):
        # One descent step: returns the loss and the updated weights.
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        # Progress log every 1000 steps.
        if step % 1000 == 0:
            print("Current iteration={i}, the loss={l}".format(i=step, l=loss))
        # Stop once the loss has effectively stopped changing.
        if previous_loss is not None and np.abs(loss - previous_loss) < threshold:
            break
        previous_loss = loss
    print("The loss={l}".format(l=calculate_loss(y, tx, w)))

# logistic_regression_gradient_descent_demo(y, tX)

NameError: name 'logistic_regression_gradient_descent_demo' is not defined

## Regularized logistic regression using gradient descent or SGD

## Full testing

In [12]:
from test import *

# Run the project's test helper for each method on the same (y, tX).
# `ratio=0.1` is forwarded to every helper - presumably a train/test split
# ratio; confirm in test.py.
test_GD(y, tX, ratio=0.1)
test_SGD(y, tX, ratio=0.1)
test_LS(y, tX, ratio=0.1)
test_RR(y, tX, ratio=0.1)
test_LR(y, tX, ratio=0.1)
# The regularised logistic regression test is disabled.
# test_RLR(y, tX, ratio=0.1)

GD test
parameters w:  [-0.34168196  0.02810282 -0.23113275 -0.2642573  -0.00170859  0.07783449
  0.15617678  0.21540708  0.19764853 -0.08054252  0.15725189 -0.19988539
  0.06524013  0.13730836  0.08361021 -0.1112728   0.0207851   0.07465711
 -0.04690335 -0.09979517  0.00044731 -0.17615801 -0.03917441 -0.02734724
  0.11530171  0.06025355 -0.08015996  0.038387   -0.08518585 -0.1174616
  0.14389595]
GD score:  0.75

SGD test
parameters w:  [-0.01464767 -0.00110574 -0.01745173 -0.00124804  0.00993847  0.00225016
  0.00262238 -0.0011861  -0.00202622  0.00167379  0.01274512 -0.01158411
  0.00876329  0.00338675  0.01321572 -0.0008785   0.00320333 -0.00306122
 -0.00161408 -0.00879369 -0.00307329 -0.00727359  0.01035104  0.00830974
  0.00955356  0.00289597  0.00241866  0.00290597 -0.00216775 -0.00140761
  0.01232947]
SGD score:  0.65

LS test
LS score:  0.85

RR test
Ridge Regression score:  0.8

LR test
Logistic Regression score:  12.0



## Generate predictions and save output in csv format for submission:

In [None]:
# Path to the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = "../Data/test.csv" # TODO: download test data and supply path here
# Same loader as for the training set; its first result (bound to the labels
# in the training call) is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned anywhere in this notebook (the fitted
# vectors are named gradient_ws, stoch_gradient_ws, ls_wopt and w_ridge), so
# this line raises NameError unless a star import happens to provide it -
# choose a model and bind its weights first.
# NOTE(review): tX_test is used exactly as loaded, whereas the training tX went
# through standardize_outliers (30 -> 31 columns in the recorded output); the
# test features presumably need the same transform using the training
# mean_x / std_x - confirm before submitting.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)