# PCML Project-1 ~ Team #60

## Initial Python Imports

In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from helpers import *

%load_ext autoreload
%autoreload 2

## Training data

### Load data

In [13]:
DATA_TRAIN_PATH = "../Data/train.csv"
y_train, tx_train, _ = load_csv_data(DATA_TRAIN_PATH)

print("Loaded training data with dimensions ", tx_train.shape)

Loaded training data with dimensions  (250000, 30)


### Useful information on the data

#### Count outliers - Extreme values

In [None]:
from data_preparation import count_outliers
outliers = count_outliers(tx_train, -999)
for feature in range(tx_train.shape[1]):
    print('feature: ',feature,' -> ',outliers[feature])

#### Analysis of output y

In [None]:
plt.hist(y_train, bins=2, align='mid')
plt.title("Histogram of output y")
plt.show()

#### Analysis of y as a function of all its features (one by one)

In [None]:
from data_preparation import plot_features_by_y
plot_features_by_y(y_train,tx_train)

#### Rank deficiency of the matrix

In [None]:
original_rank = np.linalg.matrix_rank(tx_train)
print('original tx_train rank: ',original_rank)
# indices = np.arange(tx_train.shape[1])
# for col in range(30):
#     indices = np.delete(indices,col)
#     rank = np.linalg.matrix_rank(tx_train[:,indices])
#     indices = np.insert(indices,col,col)

### Standardization

In [14]:
from data_preparation import standardize_outliers
# Standardize the data and replace undefined values with the mean, column by column
tx_train, _, _ = standardize_outliers(tx_train, -999)

### Feature selection and polynomial expansion

In [15]:
from feature_selection import best_feature_degrees
from cross_validation import test_LS

best_degrees = best_feature_degrees(y_train, tx_train, test_LS, max_degree=8)
print(best_degrees)

[8 8 8 8 8 7 8 7 5 4 6 0 8 6 8 0 7 8 5 8 2 7 3 7 8 0 6 8 7 8]


In [16]:
from feature_selection import build_poly_by_feature
tx_train = build_poly_by_feature(tx_train, best_degrees)

print("Created expanded data with shape ", tx_train.shape)

Created expanded data with shape  (250000, 184)


## Model selection

In [17]:
w = {}
s = {}

### Linear regression using gradient descent

In [12]:
from cross_validation import test_GD
from implementations import least_squares_GD

gammas = [0.01] #np.logspace(-3, 0, num=4) # TODO play with
best_loss = 10000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_GD(y_train, tx_train, gamma)
    if loss < best_loss:
        best_loss = loss
        best_gamma = gamma
        best_score = score
    # if score > best_score:
    #     best_score = score
    #     best_gamma = gamma
    #     best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
w["gd"], _ = least_squares_GD(y_train, tx_train, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["gd"] = best_score

  return 1/2*np.mean(e**2)
  y_pred[np.where(y_pred <= threshold)] = -1
  y_pred[np.where(y_pred > threshold)] = 1


Estimated leaderboard score:  0


NameError: name 'best_gamma' is not defined

### Linear regression using stochastic gradient descent

In [None]:
from cross_validation import test_SGD
from implementations import least_squares_SGD

gammas = np.logspace(-3, 0, num=4) # TODO play with
best_loss = 10000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_SGD(y_train, tx_train, gamma)
    if loss < best_loss:
        best_loss = loss
        best_gamma = gamma
        best_score = score
    # if score > best_score:
    #     best_score = score
    #     best_gamma = gamma
    #     best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
w["sgd"], _ = least_squares_SGD(y_train, tx_train, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["sgd"] = best_score

### Least squares regression using normal equations

In [18]:
from cross_validation import test_LS
from implementations import least_squares

_, score = test_LS(y_train, tx_train)

print("Estimated leaderboard score: ", score)
w["ls"], _ = least_squares(y_train, tx_train)
s["ls"] = score

Estimated leaderboard score:  0.8103


### Ridge regression using normal equations

In [None]:
from cross_validation import test_RR
from implementations import ridge_regression

lambdas = np.logspace(-15, 3, num=20)
best_loss = 10000 # TODO change
best_score = 0
for lambda_ in lambdas:
    loss, score = test_RR(y_train, tx_train, lambda_)
    if loss < best_loss:
        best_loss = loss
        best_lambda = lambda_
        best_score = score
    # if score > best_score:
    #     best_score = score
    #     best_lambda = lambda_
    #     best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
w, _ = ridge_regression(y_train, tx_train, best_lambda)
s["rr"] = best_score

### Logistic regression using gradient descent

In [None]:
from cross_validation import test_LR
from implementations import logistic_regression

gammas = np.logspace(-10, 3, 13)
best_loss = 10000 # TODO change
best_score = 0
for gamma in gammas:
    loss, score = test_LR(y_train, tx_train, gamma)
    if loss < best_loss:
        best_loss = loss
        best_gamma = gamma
        best_score = score
    # if score > best_score:
    #     best_score = score
    #     best_gamma = gamma
    #     best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
w["lr"], _ = logistic_regression(y_train, tx_train, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["lr"] = best_score

### Regularized logistic regression using gradient descent

In [None]:
from cross_validation import test_RLR
from implementations import reg_logistic_regression

gammas = np.logspace(-2, 2, 3)
lambdas = np.logspace(-2, 2, 3)
best_loss = 10000 # TODO change
best_score = 0
for gamma in gammas:
    for lambda_ in lambdas:
        loss, score = test_RLR(y_train, tx_train, lambda_, gamma)
        if loss < best_loss:
            best_loss = loss
            best_gamma = gamma
            best_lambda = lambda_
            best_score = score
        # if score > best_score:
        #     best_score = score
        #     best_gamma = gamma
        #     best_lambda = lambda_
        #     best_loss = loss
    
print("Estimated leaderboard score: ", best_score)
w["rlr"], _ = reg_logistic_regression(y_train, tx_train, best_lambda, np.zeros(tx_train.shape[1]), 50, best_gamma)
s["rlr"] = best_score

## Test data

### Choice of method

In [19]:
best = 0
for method, score in s.items():
    if score > best:
        best = score
        best_method = method

print(best_method)
print(s[best_method])
weights = w[best_method]

ls
0.8103


### Loading and standardization

In [20]:
from data_preparation import standardize_outliers
from feature_selection import build_poly_by_feature

DATA_TEST_PATH = "../Data/test.csv"
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)
tx_test, _, _ = standardize_outliers(tx_test, -999)
tx_test = build_poly_by_feature(tx_test, best_degrees)

### Predictions and submission

In [21]:
if best_method in [ "lr", "rlr" ]:
    y_pred = predict_logistic_labels(weights, tx_test, threshold = 0.5)
else:
    y_pred = predict_labels(weights, tx_test)

In [22]:
OUTPUT_PATH = "../Data/results.csv"
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)