In [1]:
from basics import *
from implementations import *
from processing import *
from cleaning import *
from metrics import *
from helper import *
from definitions import ROOT_DIR

import implementations as imp
import helper as hp

## Load the data

In [26]:
# Build the dataset directory path relative to the project root and load the
# train/test features, the training labels and the sample ids.
# NOTE(review): `os` is never imported explicitly in this notebook — presumably
# it is re-exported by one of the star imports above; confirm.
# NOTE(review): the meaning of the positional `False` flag is defined by
# load_csv_data and is not visible here (possibly a sub-sampling switch).
dataset_path = os.path.join(ROOT_DIR, 'dataset_to_release')
x_tr, x_te, y_tr, tr_ids, te_ids = load_csv_data(dataset_path, False)

In [27]:
# Sanity check: dimensions of the raw training matrix before any cleaning.
print(x_tr.shape)

(328135, 321)


## Define global variables

In [28]:
cv = True # if True, the tuning section re-runs cross-validation to pick hyperparameters; if False, the hard-coded best_params are used instead (saves time)
split_ratio = 0.8 # passed to split_data: 80% of the dataset is used to train the model, the rest to evaluate performance
nan_threshold = 0.8 # passed to clean(): filter out features with more NaN than this threshold
max_unique_values = 50 # passed to clean(): presumably the cap on distinct values when filtering out features with too many categories (TODO confirm)
remove_const = False # passed to clean(): presumably toggles removal of (near-)constant features (TODO confirm)
const_thresholds = [0.0000001, 0.000001] # passed to clean(): thresholds for filtering constant continuous features
n_components = False # passed to clean() together with the PCA settings; exact meaning of False is defined there
PCA = False # if set to True, clean() will run PCA on continuous features and keep principal components until pca_thresholds % of variance is explained
pca_thresholds = 99 # in % explained variance
correlation_threshold = 0.9 # correlation threshold passed to clean()

# Names of the models that the tuning, evaluation and submission loops iterate over.
models = ['gradient descent', 'stochastic gradient descent', 'least squares', 'ridge regression',  'logistic regression', 'reg logistic regression']
mapping_threshold = 0 # raw model outputs below this value are mapped to -1, all others to +1


## Preprocessing + cleaning

In [29]:
# Clean the whole training matrix first, then split features and labels into a
# training part and a held-out evaluation part (sizes driven by split_ratio).
# NOTE(review): clean() runs before the split, so whatever statistics it
# computes are presumably derived from the held-out rows too — confirm this is
# acceptable for the evaluation below.

clean_train = clean(x_tr, nan_threshold, 
                    remove_const, const_thresholds, 
                    PCA, n_components, pca_thresholds, 
                    max_unique_values, correlation_threshold)

clean_x_tr, clean_x_te, y_tr_set, y_te_set = split_data(clean_train, y_tr, split_ratio)

Finished cleaning - data is now (328135, 559)


In [30]:
# Sanity check: dimensions of the cleaned training matrix (before the split).
print(clean_train.shape)

(328135, 559)


## Tune hyperparameters

In [None]:
def mean_squared_error_gd(y, tx, initial_w, max_iters, gamma):
    """Fit linear weights by plain gradient descent on the unregularized squared-error loss.

    Args:
        y (np.ndarray): shape = (N,) targets to predict
        tx (np.ndarray): shape = (N,D) feature matrix used for the prediction
        initial_w (np.ndarray): shape = (D,) starting weights (not modified in place)
        max_iters (int): number of gradient steps to take
        gamma (float): learning rate

    Returns:
        np.ndarray : shape = (D,) weights after the last step
        float : loss of those weights, as computed by compute_mse on the residuals
    """
    def residuals(weights):
        # prediction error of the linear model for the given weights
        return y - np.dot(tx, weights)

    weights = initial_w
    for _ in range(max_iters):
        err = residuals(weights)
        grad = -tx.T.dot(err) / len(err)
        # w_{t+1} = w_{t} - gamma * grad L(w_{t})
        weights = weights - gamma * grad
    return weights, compute_mse(residuals(weights))

In [None]:
def compute_losses_for_hyperparameters(model, y, tx, k_fold, max_iters=0, lambdas = ['Nan'], gammas = ['Nan'], seed = 1):
    """Run k-fold cross-validation of one model over a (lambda, gamma) grid.

    For every combination of `lambdas` x `gammas`, the model is trained and
    scored on each of the `k_fold` folds (via cv_loss) and the per-fold train
    and test losses are averaged.

    Args:
        model (string): name of the regression technique chosen
        y (np.ndarray): shape = (N,) contains the data we want to predict
        tx (np.ndarray): shape = (N,D) contains the features used to predict
        k_fold (int): K in K-fold, i.e. the number of folds
        max_iters (int): maximum number of steps for the iterative models
        lambdas (iterable): regularization strengths to try
        gammas (iterable): learning rates to try
        seed (int): random seed forwarded to build_k_indices

    Returns:
        np.ndarray : shape (1 + len(lambdas) * len(gammas), 5), dtype=object.
            Row 0 is the header ["model", "lambda", "gamma", "train error",
            "test error"]; every following row holds one grid point with its
            mean train and test loss kept as numbers (not strings).
    """
    k_indices = build_k_indices(y, k_fold, seed)

    # Collect rows in a list and build the array once at the end. The original
    # np.append(results, res) call had no axis, which flattened everything into
    # a 1-D array and made the documented (N, 5) layout impossible to index.
    rows = [["model", "lambda", "gamma", "train error", "test error"]]

    for lambda_ in lambdas:
        for gamma in gammas:
            losses_tr = []
            losses_te = []
            for k in range(k_fold):
                loss_tr, loss_te = cv_loss(model, y, tx, k_indices, k, lambda_, max_iters, gamma)
                losses_tr.append(loss_tr)
                losses_te.append(loss_te)
            row = [model, lambda_, gamma, np.mean(losses_tr), np.mean(losses_te)]
            print(row)  # progress log: one line per grid point
            rows.append(row)

    # dtype=object keeps the header strings and the numeric losses side by side
    # without coercing the numbers to text.
    return np.array(rows, dtype=object)


def cv_loss(model, y, x, k_indices, k, lambda_, max_iters, gamma): 
    """Train `model` on every fold except the k-th and score it on both parts.

    Args:
        model:      str, one of ['gradient descent', 'stochastic gradient descent',
                    'least squares', 'ridge regression', 'logistic regression',
                    'reg logistic regression']
        y:          shape=(N,) targets
        x:          shape=(N,D) features
        k_indices:  2D array returned by build_k_indices(), one row of sample indices per fold
        k:          scalar, the k-th fold (N.B.: not to be confused with k_fold which is the number of folds)
        lambda_:    scalar, regularization strength (used by the regularized models only)
        max_iters:  scalar, number of iterations (used by the iterative models only)
        gamma:      scalar, learning rate (used by the iterative models only)

    Returns:
        (loss_tr, loss_te): train and test root mean square errors,
        rmse = sqrt(2 * mse), computed on the raw linear output x @ w.

    Raises:
        ValueError: if `model` is not one of the supported names.
    """
    k_indices = np.asarray(k_indices)
    y = np.asarray(y)
    x = np.asarray(x)

    # k-th fold is held out, every other fold is used for training
    held_out = k_indices[k]
    kept = np.delete(k_indices, k, axis=0).ravel()

    # integer-array indexing gathers the rows in one vectorized step
    y_te, x_te = y[held_out], x[held_out]
    y_tr, x_tr = y[kept], x[kept]

    initial_w = np.zeros(x.shape[1])
    if model == 'gradient descent':
        w, _ = mean_squared_error_gd(y_tr, x_tr, initial_w, max_iters, gamma)

    elif model == 'stochastic gradient descent':
        w, _ = imp.mean_squared_error_sgd(y_tr, x_tr, initial_w, max_iters, gamma)

    elif model == 'least squares':
        # 'least squares' is part of the notebook's `models` list; without this
        # branch `w` was never assigned and the function crashed further down.
        w, _ = least_squares(y_tr, x_tr)

    elif model == 'ridge regression':
        w, _ = imp.ridge_regression(y_tr, x_tr, lambda_)

    elif model == 'logistic regression':
        w, _ = imp.logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

    elif model == 'reg logistic regression':
        w, _ = imp.reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iters, gamma)

    else:
        raise ValueError(f'unknown model: {model!r}')

    # losses on the held-out fold and on the training folds
    te_err = y_te - np.dot(x_te, w)
    loss_te = np.sqrt(2*hp.compute_mse(te_err))
    tr_err = y_tr - np.dot(x_tr, w)
    loss_tr = np.sqrt(2*hp.compute_mse(tr_err))
    return loss_tr, loss_te

In [None]:
def find_best_hyperparameters(hyperparameter_losses):
    """Pick the grid point with the smallest test error.

    Args:
        hyperparameter_losses (np.ndarray): table whose first row is the header
            ["model", "lambda", "gamma", "train error", "test error"] and whose
            remaining rows are results. Either the 2-D (N, 5) layout or the
            same data flattened to 1-D (length a multiple of 5) is accepted.
            Values may be numbers or their string representation.

    Returns:
        np.ndarray : shape (5,), the result row [model, lambda, gamma,
        train error, test error] with the lowest test error.

    Raises:
        ValueError: if the table holds no result row, or if every test error
            is NaN (e.g. all runs diverged).
    """
    table = np.asarray(hyperparameter_losses)
    if table.ndim == 1:
        # flat layout: header and rows concatenated, 5 entries each
        table = table.reshape(-1, 5)

    # drop the header; from here on indices refer to `candidates`, so the
    # argmin below can be used directly (indexing the unsliced table with it
    # would be off by one and could even return the header row)
    candidates = table[1:]
    if len(candidates) == 0:
        raise ValueError('no hyperparameter results to choose from')

    # compare numerically: string-typed errors would otherwise be ordered
    # lexicographically ('10.0' < '9.0')
    test_errors = candidates[:, -1].astype(float)
    if np.all(np.isnan(test_errors)):
        raise ValueError('all test errors are NaN')

    return candidates[np.nanargmin(test_errors)]

In [None]:
# Fallback hyperparameters, stored per model as [lambda_, gamma].
# They are used as-is when cv is False and act as defaults for any model the
# cross-validation below does not reach.
best_params = {'gradient descent' : [0.01, 0.7], 
               'stochastic gradient descent': [0.01, 0.7],
               'least squares': [0.01, 0.7],
               'ridge regression': [0.01, 0.7],
               'logistic regression': [0.01, 0.7],
               'reg logistic regression': [0.01, 0.7]}#complete w our results as [lambda_, gamma]

k_fold = 4
max_iters = 500000
lambdas = np.linspace(0, 1, 10)
gammas = np.linspace(0.5,1.5,5)

if cv:
    for model in models:
        losses = compute_losses_for_hyperparameters(model, y_tr_set, clean_x_tr, k_fold, max_iters, lambdas, gammas)
        print(losses.shape)
        # best row is [model, lambda, gamma, train error, test error]
        best_row = find_best_hyperparameters(losses)
        # store under the model's own name (not the literal key 'model') and as a
        # flat [lambda_, gamma] pair, which is the layout every later cell reads
        # through best_params[model][0] / best_params[model][1]
        best_params[model] = [float(best_row[1]), float(best_row[2])]




In [None]:
# Show the hyperparameters currently stored for each model.
for key, value in best_params.items():
    print(f'{key} - {value}')

## Run and evaluate

In [7]:
# Default hyperparameters, stored per model as [lambda_, gamma].
default_params = {'gradient descent' : [0.01, 0.7], 
                  'stochastic gradient descent': [0.01, 0.7],
                  'least squares': [0.01, 0.7],
                  'ridge regression': [0.01, 0.7],
                  'logistic regression': [0.01, 0.7],
                  'reg logistic regression': [0.01, 0.7]}#complete w our results as [lambda_, gamma]

# Keep whatever the tuning section already stored and only fall back to the
# defaults for models it did not cover; assigning the literal dict here
# unconditionally would silently throw away cross-validated values.
try:
    tuned_params = dict(best_params)
except NameError:
    # tuning section was skipped in this session
    tuned_params = {}
best_params = {**default_params, **tuned_params}

In [31]:
# Reduced-feature experiment: keep only the leading columns of the cleaned
# matrices so the models below train quickly.
n_absurd_features = 15  # number of leading feature columns kept
absurd_tr = clean_x_tr[:, :n_absurd_features]
absurd_te = clean_x_te[:, :n_absurd_features]


In [32]:
# Evaluate closed-form least squares on the reduced feature subset.
model = 'least squares'
# kept for parity with the other evaluation cells; least squares itself uses neither
lambda_ = best_params[model][0]
gamma = best_params[model][1]
print(f'working on {model}')
w, loss = least_squares(y_tr_set, absurd_tr)
# map the raw linear output to the -1 / +1 labels
train_predictions = np.where(np.dot(absurd_tr,w) < mapping_threshold, -1, 1)
test_predictions = np.where(np.dot(absurd_te, w) < mapping_threshold, -1, 1)
# rmse = sqrt(2 * mse) on the raw (unmapped) output
tr_err = y_tr_set - np.dot(absurd_tr, w)
te_err = y_te_set - np.dot(absurd_te, w)
train_rmse = np.sqrt(2*compute_mse(tr_err))
test_rmse = np.sqrt(2*compute_mse(te_err))
print('')
# f-string so the banner shows the model name instead of literal braces,
# matching the banner printed by the per-model loop
print(f'---------- {model.upper()} ----------')
print(f'train rmse = {train_rmse} - test rmse = {test_rmse}')
train_metrics = calculate_metrics(y_tr_set, train_predictions)
test_metrics = calculate_metrics(y_te_set, test_predictions)
print('=========== TRAIN metrics ===========')
prettyprint(train_metrics)
print('=========== TEST metrics ===========')
prettyprint(test_metrics)
print('')

    

working on least squares

---------- {LEAST SQUARES} ----------
train rmse = 0.9494482341806864 - test rmse = 0.9497750431223022
Accuracy: 0.6336416413975955 - F1 score 0.7650566231983528
Specificity: 0.620833960434121 - Sensitivity: 0.7650566231983528
Precision: 0.7650566231983528
Accuracy: 0.6348301766041415 - F1 score 0.7647889811054212
Specificity: 0.6225568674538056 - Sensitivity: 0.7647889811054212
Precision: 0.7647889811054212



In [33]:
# Train and evaluate every model on the reduced feature subset.
# NOTE(review): initial_w is created once and shared by all models; this is
# only safe if train() never modifies it in place — confirm.
initial_w = np.zeros(absurd_tr.shape[1])
th = [0,0,0,0,0]  # NOTE(review): not read anywhere in this cell — candidate for removal

for model in models:
    # best_params entries are stored as [lambda_, gamma]
    lambda_ = best_params[model][0]
    gamma = best_params[model][1]
    print(f'working on {model}')
    # 500 iterations are hard-coded here (the tuning section uses max_iters instead)
    w, loss = train(model,y_tr_set, absurd_tr, initial_w, 500, gamma, lambda_)
    # map the raw linear output to the -1 / +1 labels
    train_predictions = np.where(np.dot(absurd_tr,w) < mapping_threshold, -1, 1)
    test_predictions = np.where(np.dot(absurd_te, w) < mapping_threshold, -1, 1)
    # rmse = sqrt(2 * mse) on the raw (unmapped) output
    tr_err = y_tr_set - np.dot(absurd_tr, w)
    te_err = y_te_set - np.dot(absurd_te, w)
    train_rmse = np.sqrt(2*compute_mse(tr_err))
    test_rmse = np.sqrt(2*compute_mse(te_err))
    print('')
    print(f'---------- {model.upper()} ----------')
    print(f'train rmse = {train_rmse} - test rmse = {test_rmse}')
    train_metrics = calculate_metrics(y_tr_set, train_predictions)
    test_metrics = calculate_metrics(y_te_set, test_predictions)
    print('=========== TRAIN metrics ===========')
    prettyprint(train_metrics)
    print('=========== TEST metrics ===========')
    prettyprint(test_metrics)
    print('')

working on gradient descent

---------- GRADIENT DESCENT ----------
train rmse = 0.9494482341806864 - test rmse = 0.9497750431223022
Accuracy: 0.6336416413975955 - F1 score 0.7650566231983528
Specificity: 0.620833960434121 - Sensitivity: 0.7650566231983528
Precision: 0.7650566231983528
Accuracy: 0.6348301766041415 - F1 score 0.7647889811054212
Specificity: 0.6225568674538056 - Sensitivity: 0.7647889811054212
Precision: 0.7647889811054212

working on stochastic gradient descent

---------- STOCHASTIC GRADIENT DESCENT ----------
train rmse = 2.225869390814902e+82 - test rmse = 2.2320636419937186e+82
Accuracy: 0.4141207125116187 - F1 score 0.24579615648593
Specificity: 0.4305255940734795 - Sensitivity: 0.24579615648593
Precision: 0.24579615648593
Accuracy: 0.41428070763557684 - F1 score 0.24633586438283595
Specificity: 0.4301414181842439 - Sensitivity: 0.24633586438283595
Precision: 0.24633586438283595

working on least squares

---------- LEAST SQUARES ----------
train rmse = 0.949448234

  return 1.0 / (1.0 + np.exp(-t))
  loss = - np.mean(y * np.log(s) + (1 - y) * np.log(1 - s))
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)



---------- REG LOGISTIC REGRESSION ----------
train rmse = 1532.8435654014916 - test rmse = 1533.771820052574
Accuracy: 0.8166303503131334 - F1 score 0.540108098833219
Specificity: 0.8435801602033479 - Sensitivity: 0.540108098833219
Precision: 0.540108098833219
Accuracy: 0.8153808645831746 - F1 score 0.5401730531520396
Specificity: 0.8413714895604029 - Sensitivity: 0.5401730531520396
Precision: 0.5401730531520396



In [None]:
# Train and evaluate every model on the full cleaned feature set.
# max_iters is the value defined in the tuning section.
for model in models:
    # best_params entries are stored as [lambda_, gamma]
    lambda_ = best_params[model][0]
    gamma = best_params[model][1]
    # fresh starting weights per model, so no model can inherit state from the previous one
    initial_w = np.zeros(clean_x_tr.shape[1])
    print(f'working on {model}')
    w, loss = train(model, y_tr_set, clean_x_tr, initial_w, max_iters, gamma, lambda_)
    # map the raw linear output to the -1 / +1 labels
    train_predictions = np.where(np.dot(clean_x_tr,w) < mapping_threshold, -1, 1)
    test_predictions = np.where(np.dot(clean_x_te, w) < mapping_threshold, -1, 1)
    # Report a scalar rmse = sqrt(2 * mse) on the raw output, exactly like the
    # reduced-feature evaluation; printing `y - predictions` dumped one value
    # per sample instead of an error figure.
    tr_err = y_tr_set - np.dot(clean_x_tr, w)
    te_err = y_te_set - np.dot(clean_x_te, w)
    train_rmse = np.sqrt(2*compute_mse(tr_err))
    test_rmse = np.sqrt(2*compute_mse(te_err))
    print('')
    print(f'=========== {model} ===========')
    print(f'train rmse = {train_rmse} - test rmse = {test_rmse}')
    train_metrics = calculate_metrics(y_tr_set, train_predictions)
    test_metrics = calculate_metrics(y_te_set, test_predictions)
    print('=========== TRAIN metrics ===========')
    prettyprint(train_metrics)
    print('=========== TEST metrics ===========')
    prettyprint(test_metrics)
    print('')

## Repeat all steps for total train and make submission

In [35]:
# Clean train and test stacked together, then cut the result back into the two
# parts by row count — presumably so both end up with the same columns.
# NOTE(review): some cleaning parameters might need to be taken from the tuned
# hyperparameters; the number of columns produced here also differs from the
# train-only cleaning above (see the printed shapes).
limit = x_tr.shape[0]  # number of training rows = boundary between train and test in the stacked matrix

total_data = np.concatenate((x_tr, x_te), axis = 0)

clean_total = clean(total_data, nan_threshold, 
                    remove_const, const_thresholds, 
                    PCA, n_components, pca_thresholds, 
                    max_unique_values, correlation_threshold)

# first `limit` rows are the training samples ...
tot_clean_train = clean_total[:limit]

# ... the remaining rows are the test samples
tot_clean_test = clean_total[limit:]

Finished cleaning - data is now (437514, 515)


In [36]:
# Train one model on the reduced feature subset of the whole training set,
# predict the test set and write the submission file.
abs_tot_tr = tot_clean_train[:, :15]
abs_tot_te = tot_clean_test[:,:15]

model = 'reg logistic regression'
# best_params entries are stored as [lambda_, gamma]
lambda_ = best_params[model][0]
gamma = best_params[model][1]
initial_w = np.zeros(abs_tot_tr.shape[1])
print(model)
print('about to train')
w, loss = train(model, y_tr, abs_tot_tr, initial_w, 50000, gamma, lambda_)
print('mapping predictions')
# map the raw linear output to the -1 / +1 labels
y_pred = np.where(np.dot(abs_tot_te,w) < mapping_threshold, -1, 1)
print('about to create submission')
# Use the ids that were loaded with the test set rather than rebuilding them
# from len(y_tr); the latter is only right if test ids happen to continue the
# training ids without gaps.
assert len(te_ids) == len(y_pred), 'one prediction per test id expected'
create_csv_submission(te_ids, y_pred, model.replace(' ' , '_') + '_ABSURD_submission.csv')


reg logistic regression
about to train


  return 1.0 / (1.0 + np.exp(-t))


In [21]:
# Sanity check: number of predictions produced for the submission.
print(len(y_pred))

109379


In [None]:
# Train every model on a reduced feature subset of the whole training set,
# predict the test set and write one submission file per model.
abs_tot_tr = tot_clean_train[:, :20]
abs_tot_te = tot_clean_test[:,:20]

for model in models:
    # best_params entries are stored as [lambda_, gamma]
    lambda_ = best_params[model][0]
    gamma = best_params[model][1]
    initial_w = np.zeros(abs_tot_tr.shape[1])
    print(model)
    print('about to train')
    w, loss = train(model, y_tr, abs_tot_tr, initial_w, max_iters, gamma, lambda_)
    print('mapping predictions')
    # map the raw linear output to the -1 / +1 labels
    y_pred = np.where(np.dot(abs_tot_te,w) < mapping_threshold, -1, 1)
    print('about to create submission')
    # Submit under the ids loaded with the test set: numbering from 0 disagrees
    # with the single-model submission cell, which starts at len(y_tr).
    # The '.csv' suffix matches that cell's file name as well.
    assert len(te_ids) == len(y_pred), 'one prediction per test id expected'
    create_csv_submission(te_ids, y_pred, model.replace(' ' , '_') + '_ABSURD_submission.csv')
    

gradient descent
about to train


KeyboardInterrupt: 

In [None]:
# Train every model on the full cleaned training set, predict the test set and
# write one submission file per model.
for model in models:
    # best_params entries are stored as [lambda_, gamma]
    lambda_ = best_params[model][0]
    gamma = best_params[model][1]
    # one weight per feature column (np.zeros(y_tr) passed the label vector as a shape)
    initial_w = np.zeros(tot_clean_train.shape[1])
    w, loss = train(model, y_tr, tot_clean_train, initial_w, max_iters, gamma, lambda_)
    # map the raw linear output to the -1 / +1 labels
    y_pred = np.where(np.dot(tot_clean_test,w) < mapping_threshold, -1, 1)
    # Submit under the ids loaded with the test set (numbering from 0 disagrees
    # with the single-model submission cell) and build the file name the same
    # way as that cell: spaces replaced by underscores, '.csv' suffix.
    assert len(te_ids) == len(y_pred), 'one prediction per test id expected'
    create_csv_submission(te_ids, y_pred, model.replace(' ', '_') + '_submission.csv')
    