# PCML Project-1 ~ Team #60

## Initial Python Imports

In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from helpers import *

%load_ext autoreload
%autoreload 2

## Training data

### Load data

In [2]:
# Relative path to the training CSV.
DATA_TRAIN_PATH = "../Data/train.csv"
# load_csv_data is presumably provided by `from helpers import *` (confirm).
# It returns three values; only the first two are kept here, and their names
# suggest they are the targets and the feature matrix.
y_train, tx_train, _ = load_csv_data(DATA_TRAIN_PATH)

# Sanity check: report the shape of the loaded feature matrix.
print("Loaded training data with dimensions ", tx_train.shape)

Loaded training data with dimensions  (250000, 30)


### Useful information on the data

#### Count outliers - Extreme values

In [None]:
from data_preparation import count_outliers

# count_outliers receives the feature matrix and the value -999
# (presumably the placeholder marking missing measurements -- TODO confirm).
outliers = count_outliers(tx_train, -999)

# Print one line per feature column with the matching entry of `outliers`.
n_features = tx_train.shape[1]
for feature in range(n_features):
    print('feature: ', feature, ' -> ', outliers[feature])

#### Analysis of output y

In [None]:
# Distribution of the labels, drawn in two bins on an explicitly created Axes.
fig, ax = plt.subplots()
ax.hist(y_train, bins=2, align='mid')
ax.set_title("Histogram of output y")
plt.show()

#### Analysis of y as a function of all its features (one by one)

In [None]:
from data_preparation import plot_features_by_y
# Project helper that receives the targets and the feature matrix; per this
# section's heading it visualises y against each feature individually.
plot_features_by_y(y_train,tx_train)

### Rank Deficiency of the matrix

In [None]:
# Rank of tx_train as it stands when this cell runs (the cell is placed before
# the standardization step in the notebook).
original_rank = np.linalg.matrix_rank(tx_train)
print('original tx_train rank: ',original_rank)
# Disabled, unfinished experiment: recompute the rank with each of the 30 columns
# left out in turn. The computed `rank` is never printed or stored.
# indices = np.arange(tx_train.shape[1])
# for col in range(30):
#     indices = np.delete(indices,col)
#     rank = np.linalg.matrix_rank(tx_train[:,indices])
#     indices = np.insert(indices,col,col)

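As we can see, the rank of our matrix is 30, i.e. it has full column rank: no feature column is an exact linear combination of the others, so the matrix is not rank-deficient. (Full rank only rules out exact collinearity; nearly collinear columns can still make the problem ill-conditioned.)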

### Standardization

In [3]:
from data_preparation import standardize_outliers
# Standardize the data and replace undefined values with the mean, column by column
# (-999 is passed as the value that marks an undefined entry).
# tx_train is overwritten, so re-running this cell standardizes already standardized data.
# NOTE(review): the second and third return values are discarded. If they are the
# per-column statistics, the test set should probably be transformed with them
# rather than with its own -- confirm against standardize_outliers.
tx_train, _, _ = standardize_outliers(tx_train, -999)
print('tx standardized shape: ',tx_train.shape)

tx standardized shape:  (250000, 30)


### Feature selection and polynomial expansion

In [None]:
# NOTE(review): star import; select_features is presumably provided by
# quadratic_array -- confirm and import it by name.
from quadratic_array import *
# 0.05 is the third argument of select_features; its meaning is not visible here
# (presumably a selection threshold -- TODO confirm).
indices = select_features(tx_train, y_train, 0.05)
# Keeps only the selected columns. tx_train is overwritten, so re-running this cell
# selects from the already reduced matrix.
# NOTE(review): the test-preparation cell does not apply `indices` to tx_test.
tx_train = tx_train[:,indices]
# Cell output: the selected indices and how many there are.
indices, len(indices)

In [None]:
# Prepend a column of ones (typically the intercept term) to the feature matrix.
# Not idempotent: every run of this cell adds one more column.
tx_train = np.column_stack((np.ones(tx_train.shape[0]), tx_train))

In [4]:
from feature_selection import best_feature_degrees
from implementations import least_squares

# Ask best_feature_degrees for the polynomial degrees (presumably one per feature
# column), passing least_squares as the fitting routine and max_degree=12 as the cap.
# How candidate degrees are scored is defined in feature_selection, not visible here.
best_degrees = best_feature_degrees(y_train, tx_train, least_squares, max_degree=12)
print(best_degrees)

[12 12 12  9  7 12 12 12  6 12 12  0  9 12  9  3 12 11  4  8  7 11  3  7  9
  0 10 10 12 12]


In [5]:
from feature_selection import build_poly_by_feature
# Expand the training features using best_degrees.
# tx_train is overwritten, so this cell is not idempotent: re-running it would
# expand the already expanded matrix.
tx_train = build_poly_by_feature(tx_train, best_degrees)

print("Created expanded data with shape ", tx_train.shape)

Created expanded data with shape  (250000, 268)


## Model selection

In [6]:
# Registries filled by the model cells, keyed by a short method name:
# w holds the fitted weights and s the estimated score of each method.
w, s = dict(), dict()

### Linear regression using gradient descent

In [None]:
from cross_validation import test_GD
from implementations import least_squares_GD

# Candidate step sizes: 5 values log-spaced between 1e-1 and 1e0.
gammas = np.logspace(-1, 0, num=5)

# Reset the selection state explicitly so that a best_gamma left over from another
# cell can never be reused silently when no candidate qualifies here.
best_gamma = None
best_score = -np.inf
best_loss = np.inf

for gamma in gammas:
    # test_GD returns a (loss, score) pair for this step size.
    loss, score = test_GD(y_train, tx_train, gamma)
    print(gamma, score)
    # Select on the score, the quantity later compared across methods.
    # A NaN score (e.g. from a diverged run) never compares greater and is skipped.
    if score > best_score:
        best_gamma, best_score, best_loss = gamma, score, loss

if best_gamma is None:
    raise ValueError("Gradient descent: no gamma produced a valid score")

print('best gamma: ', best_gamma)
print("Estimated leaderboard score: ", best_score)
# Refit on the full training set from zero initial weights with the selected gamma
# (1000 is presumably the iteration count -- confirm against least_squares_GD).
w["gd"], _ = least_squares_GD(y_train, tx_train, np.zeros(tx_train.shape[1]), 1000, best_gamma)
s["gd"] = best_score

### Linear regression using stochastic gradient descent

In [None]:
from cross_validation import test_SGD
from implementations import least_squares_SGD

# Candidate step sizes: 5 values log-spaced between 1e-2 and 10**-0.05.
gammas = np.logspace(-2, -0.05, num=5)

# Reset the selection state explicitly so that a best_gamma left over from another
# cell can never be reused silently when no candidate qualifies here.
best_gamma = None
best_score = -np.inf
best_loss = np.inf

for gamma in gammas:
    # test_SGD returns a (loss, score) pair for this step size.
    loss, score = test_SGD(y_train, tx_train, gamma)
    print(gamma, score)
    # Select on the score, the quantity later compared across methods.
    # A NaN score (e.g. from a diverged run) never compares greater and is skipped.
    if score > best_score:
        best_gamma, best_score, best_loss = gamma, score, loss

if best_gamma is None:
    raise ValueError("Stochastic gradient descent: no gamma produced a valid score")

print('best gamma: ', best_gamma)
print("Estimated leaderboard score: ", best_score)
# Refit on the full training set from zero initial weights with the selected gamma
# (10000 is presumably the iteration count -- confirm against least_squares_SGD).
w["sgd"], _ = least_squares_SGD(y_train, tx_train, np.zeros(tx_train.shape[1]), 10000, best_gamma)
s["sgd"] = best_score

### Least squares regression using normal equations

In [7]:
from cross_validation import test_LS
from implementations import least_squares

# test_LS returns two values; only the second (the score) is used.
_, score = test_LS(y_train, tx_train)

print("Estimated leaderboard score: ", score)
# Fit on the full training set; the second return value of least_squares is discarded.
# The score is registered only after the fit succeeds, so w and s stay in sync.
w["ls"], _ = least_squares(y_train, tx_train)
s["ls"] = score

Estimated leaderboard score:  0.817816


### Ridge regression using normal equations

In [8]:
from cross_validation import test_RR
from implementations import ridge_regression

# Candidate regularisation strengths: 15 values log-spaced between 1e-15 and 1e-1.
lambdas = np.logspace(-15, -1, num=15)

# Reset the selection state explicitly so that a best_lambda left over from another
# cell can never be reused silently when no candidate qualifies here.
best_lambda = None
best_score = -np.inf
best_loss = np.inf

for lambda_ in lambdas:
    # test_RR returns a (loss, score) pair for this regularisation strength.
    loss, score = test_RR(y_train, tx_train, lambda_)
    print(lambda_, score)
    # Select on the score, the quantity later compared across methods.
    # A NaN score never compares greater and is skipped.
    if score > best_score:
        best_lambda, best_score, best_loss = lambda_, score, loss

if best_lambda is None:
    raise ValueError("Ridge regression: no lambda produced a valid score")

print("Estimated leaderboard score: ", best_score)
print(best_lambda)
# Refit on the full training set with the selected lambda.
w["rr"], _ = ridge_regression(y_train, tx_train, best_lambda)
s["rr"] = best_score

1e-15 0.8178
1e-14 0.817828
1e-13 0.817808
1e-12 0.817848
1e-11 0.817904
1e-10 0.817888
1e-09 0.817852
1e-08 0.817852
1e-07 0.817884
1e-06 0.818028
1e-05 0.817648
0.0001 0.817832
0.001 0.81722
0.01 0.813432
0.1 0.795224
Estimated leaderboard score:  0.818028
1e-06


### Logistic regression using gradient descent

In [None]:
from cross_validation import test_LR
from implementations import logistic_regression

# Candidate step sizes: 15 values log-spaced between 1e-5 and 1e3.
gammas = np.logspace(-5, 3, num=15)

# Reset the selection state explicitly so that a best_gamma left over from another
# cell can never be reused silently when no candidate qualifies here.
best_gamma = None
best_score = -np.inf
best_loss = np.inf

for gamma in gammas:
    # test_LR returns a (loss, score) pair for this step size.
    loss, score = test_LR(y_train, tx_train, gamma)
    # Select on the score only. The previous version also switched to any candidate
    # with a lower loss, which could replace a better-scoring gamma and made the
    # result depend on the order of the grid. Score is the quantity compared
    # across methods later, and the criterion used by the other model cells.
    # A NaN score (e.g. from a diverged run) never compares greater and is skipped.
    if score > best_score:
        best_gamma, best_score, best_loss = gamma, score, loss

if best_gamma is None:
    raise ValueError("Logistic regression: no gamma produced a valid score")

print("Estimated leaderboard score: ", best_score)
print('best_gamma: ',best_gamma)
# Refit on the full training set from zero initial weights with the selected gamma
# (1000 is presumably the iteration count -- confirm against logistic_regression).
w["lr"], _ = logistic_regression(y_train, tx_train, np.zeros(tx_train.shape[1]), 1000, best_gamma)
s["lr"] = best_score

### Regularized logistic regression using gradient descent

In [None]:
from cross_validation import test_RLR
from implementations import reg_logistic_regression

# Search grid: 10 step sizes between 1e-20 and 1e-9 (the original "#100" note
# suggests a denser grid was considered) and 100 lambdas between 1e-5 and 1e5.
gammas = np.logspace(-20, -9, 10)
lambdas = np.logspace(-5, 5, 100)

# Reset the selection state explicitly so that values left over from another cell
# can never be reused silently when no candidate qualifies here.
best_gamma = None
best_lambda = None
best_score = -np.inf
best_loss = np.inf

for ii, gamma in enumerate(gammas):
    # Progress output: index of the current gamma, then its value.
    print('\n',ii,'\n')
    print('gamma: ',gamma)
    for lambda_ in lambdas:
        # test_RLR returns a (loss, score) pair for this (lambda, gamma) pair.
        loss, score = test_RLR(y_train, tx_train, lambda_, gamma)
        # Select on the score only. The previous version also switched to any
        # candidate with a lower loss, which could replace a better-scoring pair
        # and made the result depend on the order of the grid.
        # A NaN score never compares greater and is skipped.
        if score > best_score:
            best_gamma, best_lambda = gamma, lambda_
            best_score, best_loss = score, loss

if best_gamma is None:
    raise ValueError("Regularized logistic regression: no (lambda, gamma) pair produced a valid score")

print("Estimated leaderboard score: ", best_score)
print('best lambda: ',best_lambda)
print('best gamma: ',best_gamma)
# Refit on the full training set from zero initial weights with the selected pair
# (50 is presumably the iteration count -- confirm against reg_logistic_regression).
w["rlr"], _ = reg_logistic_regression(y_train, tx_train, best_lambda, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["rlr"] = best_score

## Test data

### Choice of method

In [9]:
# Pick the method with the highest estimated score among those registered in s.
# best_method starts as None so that an empty registry, or one holding only
# invalid (NaN) scores, fails loudly instead of reusing a stale best_method.
best_method = None
best = float("-inf")
for method, score in s.items():
    # Strict comparison: on ties the first registered method wins; NaN never wins.
    if score > best:
        best = score
        best_method = method

if best_method is None:
    raise ValueError("No method with a valid score has been run; execute at least one model cell first")

print(best_method)
print(s[best_method])
# Weights of the winning method, used for the test-set predictions.
weights = w[best_method]

rr
0.818028


### Loading and standardization

In [10]:
from data_preparation import standardize_outliers
from feature_selection import build_poly_by_feature

# Relative path to the test CSV.
DATA_TEST_PATH = "../Data/test.csv"
# The first return value is ignored for the test set; the ids are kept for the submission.
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)
# NOTE(review): the test set is standardized independently of the training set here.
# If standardize_outliers returns the training statistics, they should probably be
# reused for the test set -- confirm.
tx_test, _, _ = standardize_outliers(tx_test, -999)
# Same polynomial expansion as the training set.
tx_test = build_poly_by_feature(tx_test, best_degrees)

# Fail fast if the test features do not line up with the training features.
# This happens when the optional feature-selection or bias-column cells were run
# on tx_train, because neither is applied to tx_test in this cell.
assert tx_test.shape[1] == tx_train.shape[1], (
    "tx_test has {} columns but tx_train has {}; "
    "apply the same preprocessing steps to both".format(tx_test.shape[1], tx_train.shape[1])
)

### Predictions and submission

In [11]:
# The two logistic methods ("lr", "rlr") are labelled with predict_logistic_labels
# at threshold=0.5; every other method goes through predict_labels.
if best_method not in ("lr", "rlr"):
    y_pred = predict_labels(weights, tx_test)
else:
    y_pred = predict_logistic_labels(weights, tx_test, threshold=0.5)

In [12]:
# Destination of the submission file, relative to the notebook.
OUTPUT_PATH = "../Data/results.csv"
# Pass the test ids, the predicted labels and the destination path to the submission writer.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)