# PCML Project-1 ~ Team #60

## Initial Python Imports

In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import datetime

from costs import compute_loss

%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from helpers import *

# Location of the training CSV, relative to this notebook.
DATA_TRAIN_PATH = "../Data/train.csv"

# Unpack the three values returned by load_csv_data: class labels, feature
# matrix and event ids (per the section heading).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Report the dimensions of each loaded array.
print('y shape: ', y.shape)
print('original tX shape: ', tX.shape)
print('ids shape: ', ids.shape)

y shape:  (250000,)
original tX shape:  (250000, 30)
ids shape:  (250000,)


## Data preprocessing

### Count Outliers - Extreme values

In [3]:
# -999 is the value handed to count_outliers; the result is indexed by feature
# column below, printing one count per column of tX.
outliers = count_outliers(tX, -999)
for feature in range(tX.shape[1]):
    print('feature: ', feature, ' -> ', outliers[feature])

feature:  0  ->  38114.0
feature:  1  ->  0.0
feature:  2  ->  0.0
feature:  3  ->  0.0
feature:  4  ->  177457.0
feature:  5  ->  177457.0
feature:  6  ->  177457.0
feature:  7  ->  0.0
feature:  8  ->  0.0
feature:  9  ->  0.0
feature:  10  ->  0.0
feature:  11  ->  0.0
feature:  12  ->  177457.0
feature:  13  ->  0.0
feature:  14  ->  0.0
feature:  15  ->  0.0
feature:  16  ->  0.0
feature:  17  ->  0.0
feature:  18  ->  0.0
feature:  19  ->  0.0
feature:  20  ->  0.0
feature:  21  ->  0.0
feature:  22  ->  0.0
feature:  23  ->  99913.0
feature:  24  ->  99913.0
feature:  25  ->  99913.0
feature:  26  ->  177457.0
feature:  27  ->  177457.0
feature:  28  ->  177457.0
feature:  29  ->  0.0


### Standardization

In [4]:
# Standardize the training features with the outlier-aware helper.
# The plain variant is kept for reference:
#   tX, mean_x, std_x = standardize(tX, mean_x=None, std_x=None)
# NOTE: `tX` is rebound to the helper's output, so re-running this cell feeds
# already-standardized data back into standardize_outliers.
tX, mean_x, std_x = standardize_outliers(tX)

# Report the shapes of the standardized matrix and of the returned statistics.
print('standardized tX shape: ', tX.shape)
print('tX mean shape: ', mean_x.shape)
print('tX std shape: ', std_x.shape)

250000 30
(250000, 31)
standardized tX shape:  (250000, 31)
tX mean shape:  (30,)
tX std shape:  (30,)


### Analysis of output y

In [None]:
# Distribution of the output values y, bucketed into 10 bins.
plt.hist(x=y, bins=10, align='mid')
plt.title(label="Histogram of output y")
plt.show()

### Analysis of y as a function of all its features (one by one)

In [None]:
# Analyse y as a function of each feature, one feature at a time. All of the
# plotting is delegated to the plot_features_by_y helper.
plot_features_by_y(y,tX)

## Linear regression using gradient descent

In [None]:
from gradient_descent import least_squares_GD

# Parameters forwarded to least_squares_GD.
max_iters, gamma = 1000, 0.01

# Start from the all-zeros weight vector, one entry per column of tX.
w_initial = np.zeros(tX.shape[1])

# Run gradient descent and keep the returned losses and weights for later cells.
# (Timing with datetime.datetime.now() before/after the call is currently disabled.)
gradient_losses, gradient_ws = least_squares_GD(
    y, tX, w_initial, gamma, max_iters
)


## Linear regression using stochastic gradient descent

In [None]:
from stochastic_gradient_descent import least_squares_SGD

# Parameters forwarded to least_squares_SGD.
max_iters, gamma, batch_size = 1000, 0.01, 50

# Start from the all-zeros weight vector, one entry per column of tX.
w_initial = np.zeros(tX.shape[1])

# Run SGD and keep the returned losses and weights.
stoch_gradient_losses, stoch_gradient_ws = least_squares_SGD(
    y, tX, w_initial, batch_size, gamma, max_iters
)

# get_min_param_index yields an index and a loss value; the index is reused
# later to select the matching entry of stoch_gradient_ws.
min_stoch_i, min_stoch_loss = get_min_param_index(stoch_gradient_losses)
print('min index: ', min_stoch_i)
print('min loss: ', min_stoch_loss)

## Least squares regression using normal equations

In [26]:
from least_squares import least_squares

# Solve least squares in a single call; the result is unpacked as (weights, loss).
# (Timing with datetime.datetime.now() before/after the call is currently disabled.)
ls_wopt, ls_loss = least_squares(y, tX)
print('loss=', ls_loss)
print('parameters w: ', ls_wopt)

loss= 0.340409452162
parameters w:  [ -3.14664000e-01   1.04652995e-02  -2.54719228e-01  -2.63502968e-01
  -1.10181135e-03   4.05482642e-02   1.67176106e-01   8.97553275e-03
   2.82008766e-01  -2.81502578e-02  -3.29279652e+02  -1.88141151e-01
   1.18065030e-01   1.42232505e-01   6.39663558e+01  -7.79460474e-04
  -8.30656823e-04   6.30821834e+01  -8.61168797e-04   2.51791347e-03
   1.03659311e-01   9.33786062e-04  -4.70019028e-02   4.17575954e-02
  -6.14055754e-02   8.39840624e-04   2.43612108e-04  -6.79446901e-02
   2.89296903e-03  -3.23605722e-03   2.78944548e+02]


## Ridge regression using normal equations

In [None]:
from ridge_regression import ridge_regression

# Fit ridge regression, passing 0.01 as the third argument, then evaluate the
# resulting weights on the training data with compute_loss.
w_ridge = ridge_regression(y, tX, 0.01)
err = compute_loss(y, tX, w_ridge)

print('loss: ', err)
print('parameters w: ', w_ridge)

In [None]:
from ridge_regression import cross_validation_ridge_regression

# Run the project's ridge-regression cross-validation helper on the loaded
# training data. The return value is not captured; presumably the helper
# prints or plots its own results — TODO confirm.
cross_validation_ridge_regression(y,tX)

## Logistic regression using gradient descent or SGD

In [5]:
from helpers import de_standardize
from logistic_regression import logistic_regression_gradient_descent
from plots import visualization

# Fit logistic regression by gradient descent. No hyper-parameters are passed,
# so the helper's own defaults apply. The result is unpacked as (loss, weights);
# `lr_w` is reused later as the prediction weights.
lr_loss, lr_w = logistic_regression_gradient_descent(y, tX)

Current iteration=0, the loss=0.693147180559945
Current iteration=1000, the loss=0.4535788439094665
Current iteration=2000, the loss=0.2554613354813791
Current iteration=3000, the loss=0.08502833099738107


## Regularized logistic regression using gradient descent or SGD

## Testing

In [None]:
from test import *

In [None]:
# Run the gradient-descent check from the local `test` module.
# ratio=0.1 is presumably the data-split ratio — TODO confirm against test.py.
test_GD(y, tX, ratio=0.1)

In [None]:
# Run the stochastic-gradient-descent check from the local `test` module.
# ratio=0.1 is presumably the data-split ratio — TODO confirm against test.py.
test_SGD(y, tX, ratio=0.1)

In [None]:
# Run the least-squares check from the local `test` module.
# ratio=0.1 is presumably the data-split ratio — TODO confirm against test.py.
test_LS(y, tX, ratio=0.1)

In [None]:
# Run the ridge-regression check from the local `test` module.
# ratio=0.1 is presumably the data-split ratio — TODO confirm against test.py.
test_RR(y, tX, ratio=0.1)

In [None]:
# Run the logistic-regression check from the local `test` module.
# ratio=0.1 is presumably the data-split ratio — TODO confirm against test.py.
test_LR(y, tX, ratio=0.1)

In [None]:
# test_RLR(y, tX, ratio=0.1)

## Generate predictions and save output in csv format for submission:

In [None]:
# Gradient Descent: use the last entry of the weights returned by least_squares_GD.
# NOTE: every cell in this section rebinds `weights`; run only the one for the
# model you want to predict with.
weights = gradient_ws[-1]

In [None]:
# Stochastic Gradient Descent: use the weights at min_stoch_i, the index obtained
# from get_min_param_index(stoch_gradient_losses).
# NOTE: every cell in this section rebinds `weights`; run only the one for the
# model you want to predict with.
weights = stoch_gradient_ws[min_stoch_i]

In [28]:
# Least Squares: use the weights returned by least_squares(y, tX).
# NOTE: every cell in this section rebinds `weights`; run only the one for the
# model you want to predict with.
weights = ls_wopt

In [None]:
# Ridge Regression: use the weights returned by ridge_regression(y, tX, 0.01).
# NOTE: every cell in this section rebinds `weights`; run only the one for the
# model you want to predict with.
weights = w_ridge

In [37]:
# Logistic Regression: use the weights returned by
# logistic_regression_gradient_descent(y, tX).
# NOTE: every cell in this section rebinds `weights`; run only the one for the
# model you want to predict with.
weights = lr_w

In [None]:
# weights =

In [None]:
# Hand-built weight vector: start from all ones (31 entries) and zero out the
# selected indices. This rebinds `weights`, replacing any model weights chosen
# in an earlier cell.
weights = np.ones(31)
# Earlier index selection, kept for reference: [0,4,5,6,23,24,25,26,27,28]
weights[[0, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 26, 28, 29]] = 0
print(weights)
print(weights.shape)

In [11]:
# Location of the test CSV, relative to this notebook.
DATA_TEST_PATH = "../Data/test.csv"
# Same loader as for the training set; unpacked as (labels, features, ids).
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [12]:
# Standardize the test features with the same helper used for the training set.
# NOTE(review): the returned mean/std are discarded and the training statistics
# (mean_x, std_x) are not passed in, so the test set appears to be scaled with
# its own statistics rather than the training ones — confirm this is intended.
# NOTE: `tX_test` is rebound, so re-running this cell re-standardizes its own output.
tX_test, _, _ = standardize_outliers(tX_test)

568238 30
(568238, 31)


In [45]:
from logistic_regression import sigmoid
def predict_logistic_labels(weights, data):
    """Generate class predictions given weights and a test data matrix.

    Args:
        weights: logistic-regression weight vector; anything ``np.dot`` accepts
            as the right operand for ``data`` (e.g. shape (D,) or (D, 1)).
        data: feature matrix with one sample per row.

    Returns:
        Float array with the same shape as ``np.dot(data, weights)`` holding
        -1.0 for the negative class and 1.0 for the positive class, the same
        {-1, 1} encoding that ``predict_labels`` prints in this notebook.

    Previously this returned the raw sigmoid probabilities, so comparing the
    result with labels was meaningless, and thresholding those probabilities
    at 0 would have marked every sample as positive.
    """
    scores = np.dot(data, weights)

    # sigmoid(z) > 0.5 exactly when z > 0, so the decision is taken on the
    # linear score directly. This also avoids overflow in exp() for large |z|.
    return np.where(scores > 0, 1.0, -1.0)

In [53]:
# Predict on the standardized test matrix and display the first 100 entries.
p = predict_logistic_labels(weights, tX_test)
p[:100]

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
      

In [46]:
# Logistic-regression predictions for the test set.
OUTPUT_PATH = '../Data/results.csv'  # destination file for the submission
y_pred = predict_logistic_labels(weights, tX_test)

# Collapse the prediction array to one dimension.
y_pred = y_pred.reshape(1, len(y_pred)).flatten()

# Fraction of predictions that are exactly equal to y_test.
# NOTE(review): the recorded score of 1.0, with every displayed prediction
# equal to 1.0, suggests y_test may hold placeholder labels. Confirm that
# y_test contains real ground truth before trusting this number.
print('final score: \t\t', (y_pred == y_test).sum() / y_pred.size)

# Writing the submission file is currently disabled:
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

final score: 		 1.0


In [30]:
# Predictions from the currently selected `weights`, via predict_labels.
OUTPUT_PATH = '../Data/results.csv'  # destination file for the submission
y_pred = predict_labels(weights, tX_test)

# Fraction of predictions that are exactly equal to y_test.
print('final score: \t\t', (y_pred == y_test).sum() / y_pred.size)

# Writing the submission file is currently disabled:
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

[-1. -1. -1. ...,  1.  1. -1.]
final score: 		 0.276665763289


In [47]:
# Walk the predictions position by position and count those equal to y_test.
count_true = 0
for row in range(len(y_pred)):
    count_true += 1 if y_pred[row] == y_test[row] else 0

# Summary: total samples, hits, misses and the resulting accuracy.
total_pred = len(y_test)
print('total: \t\t\t', total_pred)
print('correct predictions: \t', count_true)
print('wrong predictions: \t', total_pred - count_true)
print('final score: \t\t', count_true / y_pred.size)

total: 			 568238
correct predictions: 	 568238
wrong predictions: 	 0
final score: 		 1.0
