# PCML Project-1 ~ Team #60

## Initial Python Imports

In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from helpers import *

%load_ext autoreload
%autoreload 2

## Training data

### Load data

In [2]:
DATA_TRAIN_PATH = "../Data/train.csv"
y_train, tx_train, _ = load_csv_data(DATA_TRAIN_PATH)

print("Loaded training data with dimensions ", tx_train.shape)

Loaded training data with dimensions  (250000, 30)


### Useful information on the data

#### Count outliers - Extreme values

In [None]:
from data_preparation import count_outliers
outliers = count_outliers(tx_train, -999)
for feature in range(tx_train.shape[1]):
    print('feature: ',feature,' -> ',outliers[feature])

#### Analysis of output y

In [None]:
plt.hist(y_train, bins=2, align='mid')
plt.title("Histogram of output y")
plt.show()

#### Analysis of y as a function of all its features (one by one)

In [None]:
from data_preparation import plot_features_by_y
plot_features_by_y(y_train,tx_train)

#### Rank Deficiency of the matrix

In [None]:
original_rank = np.linalg.matrix_rank(tx_train)
print('original tx_train rank: ',original_rank)
# indices = np.arange(tx_train.shape[1])
# for col in range(30):
#     indices = np.delete(indices,col)
#     rank = np.linalg.matrix_rank(tx_train[:,indices])
#     indices = np.insert(indices,col,col)

As we can see our matrix-rank is 30 which means that we do not have any ill-conditioning between our columns.

### Standardization

In [3]:
from data_preparation import standardize_outliers
# Standardize the data and replace undefined values with the mean, column by column
tx_train, _, _ = standardize_outliers(tx_train, -999)
print('tx standardized shape: ',tx_train.shape)

tx standardized shape:  (250000, 30)


### Feature selection

In [None]:
from quadratic_array import *
indices = select_features(tx_train, y_train, 0.05)
tx_train = tx_train[:,indices]
indices, len(indices)

In [6]:
# Only run if we don't run build_poly_by_feature (adds the ones column in front)
tx_train = np.c_[np.ones(tx_train.shape[0]), tx_train]

### Polynomial expansion

In [None]:
from feature_selection import best_feature_degrees
from implementations import least_squares

best_degrees = best_feature_degrees(y_train, tx_train, least_squares, max_degree=12)
print(best_degrees)

In [None]:
from feature_selection import build_poly_by_feature
tx_train = build_poly_by_feature(tx_train, best_degrees)

print("Created expanded data with shape ", tx_train.shape)

## Model selection

In [4]:
w = {} # Weight vector with the best score of each method
s = {} # Best score for each method

### Linear regression using gradient descent

In [10]:
from cross_validation import test_GD
from implementations import least_squares_GD

# Find the best gamma for gradient descent
gammas = np.logspace(-1, -0.4, num=5)
best_loss = 1000000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_GD(y_train, tx_train, gamma)
    #if loss < best_loss:
    #    best_loss = loss
    #    best_gamma = gamma
    #    best_score = score
    print(gamma, score)
    if score > best_score:
        best_score = score
        best_gamma = gamma
        best_loss = loss

print('best gamma: ', best_gamma)
print("Estimated leaderboard score: ", best_score)
w["gd"], _ = least_squares_GD(y_train, tx_train, np.zeros(tx_train.shape[1]), 100, best_gamma)
s["gd"] = best_score

0.1 0.74104
0.141253754462 0.7425
0.199526231497 0.743476
0.281838293126 0.74406
0.398107170553 0.373304
best gamma:  0.281838293126
Estimated leaderboard score:  0.74406


### Linear regression using stochastic gradient descent

In [11]:
from cross_validation import test_SGD
from implementations import least_squares_SGD

# Find the best gamma for stochastic gradient descent
gammas = np.logspace(-1, -0.4, num=5)
best_loss = 1000000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_SGD(y_train, tx_train, gamma)
    #if loss < best_loss:
    #    best_loss = loss
    #    best_gamma = gamma
    #    best_score = score
    print(gamma, score)
    if score > best_score:
        best_score = score
        best_gamma = gamma
        best_loss = loss

print('best gamma: ', best_gamma)
print("Estimated leaderboard score: ", best_score)
w["sgd"], _ = least_squares_SGD(y_train, tx_train, np.zeros(tx_train.shape[1]), 100, best_gamma)
s["sgd"] = best_score

0.1 0.74104
0.141253754462 0.7425
0.199526231497 0.743476
0.281838293126 0.74406
0.398107170553 0.71282
best gamma:  0.281838293126
Estimated leaderboard score:  0.74406


### Least squares regression using normal equations

In [12]:
from cross_validation import test_LS
from implementations import least_squares

# Compute the score we get with least squares
_, score = test_LS(y_train, tx_train)

print("Estimated leaderboard score: ", score)
w["ls"], _ = least_squares(y_train, tx_train)
s["ls"] = score

Estimated leaderboard score:  0.744388


### Ridge regression using normal equations

In [13]:
from cross_validation import test_RR
from implementations import ridge_regression

# Find the best lambda for ridge regression
lambdas = np.logspace(-15, -1, num=15)
best_loss = 1000000 # TODO change
best_score = 0
for lambda_ in lambdas:
    loss, score = test_RR(y_train, tx_train, lambda_)
    #if loss < best_loss:
    #    best_loss = loss
    #    best_lambda = lambda_
    #    best_score = score
    print(lambda_, score)
    if score > best_score:
        best_score = score
        best_lambda = lambda_
        best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
print(best_lambda)
w["rr"], _ = ridge_regression(y_train, tx_train, best_lambda)
s["rr"] = best_score

1e-15 0.744388
1e-14 0.744388
1e-13 0.74438
1e-12 0.74438
1e-11 0.744396
1e-10 0.744364
1e-09 0.744356
1e-08 0.744356
1e-07 0.74436
1e-06 0.74436
1e-05 0.744372
0.0001 0.744356
0.001 0.744304
0.01 0.743012
0.1 0.736068
Estimated leaderboard score:  0.744396
1e-11


### Logistic regression using gradient descent

In [14]:
from cross_validation import test_LR
from implementations import logistic_regression

# Find the best gamma for logistic regression
gammas = np.logspace(-5, 3, num=15)
best_loss = 1000000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_LR(y_train, tx_train, gamma)
    #if loss < best_loss:
    #    best_loss = loss
    #    best_gamma = gamma
    #    best_score = score
    if score > best_score:
        best_score = score
        best_gamma = gamma
        best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
print('best_gamma: ',best_gamma)
w["lr"], _ = logistic_regression(y_train, tx_train, np.zeros(tx_train.shape[1]), 1000, best_gamma)
s["lr"] = best_score

  return 1 / (1 + np.exp(-t))
  return np.sqrt(2*mse)
  return (1/N) * np.sum(np.log(1 + np.exp(tx.dot(w))) - y * tx.dot(w))


Estimated leaderboard score:  0.708524
best_gamma:  1e-05


### Regularized logistic regression using gradient descent

In [18]:
from cross_validation import test_RLR
from implementations import reg_logistic_regression

# Find the best lambda and gamma for regularized logistic regression
gammas = np.logspace(-2, -0.5, 5) #100
lambdas = np.logspace(-1, 1, 5)
best_loss = 10000 # TODO change
best_score = 0
ii = 0
for gamma in gammas:
    for lambda_ in lambdas:
        loss, score = test_RLR(y_train, tx_train, lambda_, gamma)
        print(gamma, lambda_, score)
        #if loss < best_loss:
        #    best_loss = loss
        #    best_gamma = gamma
        #    best_lambda = lambda_
        #    best_score = score
        if score > best_score:
            best_score = score
            best_gamma = gamma
            best_lambda = lambda_
            best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
print('best lambda: ',best_lambda)
print('best gamma: ',best_gamma)
w["rlr"], _ = reg_logistic_regression(y_train, tx_train, best_lambda, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["rlr"] = best_score

  return 1 / (1 + np.exp(-t))
  return (1/N) * np.sum(np.log(1 + np.exp(tx.dot(w))) - y * tx.dot(w))


0.01 0.1 0.708468
0.01 0.316227766017 0.70846
0.01 1.0 0.708536
0.01 3.16227766017 0.708644
0.01 10.0 0.708784
0.0237137370566 0.1 0.708448
0.0237137370566 0.316227766017 0.708516
0.0237137370566 1.0 0.70864
0.0237137370566 3.16227766017 0.708808
0.0237137370566 10.0 0.708724
0.056234132519 0.1 0.708512
0.056234132519 0.316227766017 0.708588
0.056234132519 1.0 0.708852
0.056234132519 3.16227766017 0.708824
0.056234132519 10.0 0.706976
0.133352143216 0.1 0.708564
0.133352143216 0.316227766017 0.708704
0.133352143216 1.0 0.708756
0.133352143216 3.16227766017 0.707668
0.133352143216 10.0 0.702956
0.316227766017 0.1 0.708644
0.316227766017 0.316227766017 0.708784
0.316227766017 1.0 0.7083
0.316227766017 3.16227766017 0.704704
0.316227766017 10.0 0.698572
Estimated leaderboard score:  0.708852
best lambda:  1.0
best gamma:  0.056234132519


## Test data

### Choice of method

In [None]:
# Choose the best method based on the score
best = 0
for method, score in s.items():
    if score > best:
        best = score
        best_method = method

print(best_method)
print(s[best_method])
weights = w[best_method] # Weight vector of the best method

### Loading and standardization

In [None]:
from data_preparation import standardize_outliers
from feature_selection import build_poly_by_feature

DATA_TEST_PATH = "../Data/test.csv"
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)
tx_test, _, _ = standardize_outliers(tx_test, -999)
# Build test data with the same shape as the training data
tx_test = build_poly_by_feature(tx_test, best_degrees)

### Predictions and submission

In [None]:
if best_method in [ "lr", "rlr" ]:
    y_pred = predict_logistic_labels(weights, tx_test, threshold = 0.5)
else:
    y_pred = predict_labels(weights, tx_test)

In [None]:
OUTPUT_PATH = "../Data/results.csv"
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)