In [5]:
from basics import *
from implementations import *
from processing import *
from cleaning import *
from metrics import *
from helper import *
from definitions import ROOT_DIR

## Load the data

In [3]:
# Dataset directory, resolved relative to ROOT_DIR (imported from definitions).
dataset_path = os.path.join(ROOT_DIR, 'dataset_to_release')

# The five values returned by load_csv_data are bound, in order, to
# x_tr, x_te, y_tr, tr_ids and te_ids (presumably train/test features,
# train labels and train/test sample ids -- confirm against load_csv_data).
# NOTE(review): the meaning of the second positional argument (True) is
# defined by load_csv_data and is not visible from this notebook.
(x_tr, x_te, y_tr, tr_ids, te_ids) = load_csv_data(
    dataset_path,
    True,
)

## Define global variables

In [4]:
# --- Hyperparameter search -------------------------------------------------
# Author's note: for the sake of time, setting cv to False uses a premade list
# of best parameters already computed; the same parameters can be found again
# by keeping cv at True.
cv = True

# --- Train / evaluation split ----------------------------------------------
# 80% of the dataset is used to train the model, the rest to evaluate it.
split_ratio = 0.8

# --- Cleaning options (all forwarded to clean()) ---------------------------
# Features with more NaN than this threshold are filtered out.
nan_threshold = 0.8
# Presumably the cut-off used when filtering out features with too many
# categories -- TODO confirm against clean().
max_unique_values = 50
# NOTE(review): presumably toggles the removal of (near-)constant features,
# with const_thresholds used to detect constant continuous ones -- confirm.
remove_const = False
const_thresholds = [1e-7, 1e-6]
n_components = False
# Author's note: if set to True, PCA is run on the continuous features and
# principal components are kept until 99% of the variance is explained.
PCA = False
# Explained variance target, in %.
pca_thresholds = 99
# Correlation threshold.
correlation_threshold = 0.9

# --- Models ----------------------------------------------------------------
# Names of the models that the later cells loop over.
models = [
    'gradient descent',
    'stochastic gradient descent',
    'least squares',
    'ridge regression',
    'logistic regression',
    'reg logistic regression',
]

# Decision threshold on the linear score x.w: scores below it are mapped to
# -1, all others to 1.
mapping_threshold = 0


## Preprocessing + cleaning

In [None]:
# Split the labelled data: a split_ratio share is used for training, the rest
# is held out to evaluate the models.
x_tr_set, x_te_set, y_tr_set, y_te_set = split_data(x_tr, y_tr, split_ratio)

# Clean both splits with the SAME configuration. The held-out split used to be
# cleaned with clean(x_te_set) only, i.e. without the notebook's settings, so
# the two feature matrices were not produced by the same pipeline.
clean_train = clean(x_tr_set, nan_threshold,
                    remove_const, const_thresholds,
                    PCA, n_components, pca_thresholds,
                    max_unique_values, correlation_threshold)
clean_test = clean(x_te_set, nan_threshold,
                   remove_const, const_thresholds,
                   PCA, n_components, pca_thresholds,
                   max_unique_values, correlation_threshold)
# NOTE(review): each split is cleaned independently; if clean() selects
# columns from the data it receives, clean_train and clean_test may end up
# with different feature sets -- verify that their column counts match.

## Tune hyperparameters

In [None]:
# Hyperparameter search.
# best_params maps each model name to its selected hyperparameters, stored as
# the [1:3] slice of find_best_hyperparameters' result (per the original
# author's note this is lambda_ followed by gamma -- TODO confirm).
# When cv is False the search is skipped and best_params stays empty: fill it
# in by hand with previously computed values before running the later cells.
best_params = {}
k_fold = 4
max_iters = 500
lambdas = np.linspace(0, 1, 10)
gammas = np.linspace(0, 2, 5)

if cv:
    for model in models:
        # Tune on the cleaned training split only, so the held-out rows never
        # influence the selection and the features match what train() is
        # given later.
        # NOTE(review): assumes the fifth positional argument is the initial
        # weight vector (it sits where train() takes initial_w), hence one
        # zero per feature column rather than one per sample.
        losses = compute_losses_for_hyperparameters(
            model, y_tr_set, clean_train, k_fold,
            np.zeros(clean_train.shape[1]), max_iters, lambdas, gammas)
        # Key by the model's name: the literal 'model' key made every
        # iteration overwrite the previous model's result.
        best_params[model] = find_best_hyperparameters(losses)[1:3]




In [None]:
# Show the hyperparameters stored for each entry of best_params, one per line.
for name in best_params:
    print(f'{name} - {best_params[name]}')

## Run and evaluate

In [None]:
# Train every model on the cleaned training split and report its performance
# on both the training split and the held-out split.
for model in models:
    # Look up the hyperparameters selected for this model.
    if model not in best_params:
        raise KeyError(f"no hyperparameters stored in best_params for '{model}'")
    # np.ravel accepts both a flat (lambda_, gamma) pair and a pair wrapped in
    # an extra list. The order follows the original author's note
    # "[lambda_, gamma]" -- TODO confirm.
    lambda_, gamma = np.ravel(best_params[model])[:2]

    # Fresh zero start for every model, one weight per feature column.
    initial_w = np.zeros(clean_train.shape[1])
    w, loss = train(model, y_tr_set, clean_train, initial_w, max_iters, gamma, lambda_)

    # Map the linear scores to class labels: below mapping_threshold -> -1,
    # otherwise 1.
    train_predictions = np.where(np.dot(clean_train, w) < mapping_threshold, -1, 1)
    test_predictions = np.where(np.dot(clean_test, w) < mapping_threshold, -1, 1)

    # Report the share of misclassified samples instead of dumping the whole
    # element-wise difference between labels and predictions.
    train_err = np.mean(train_predictions != y_tr_set)
    test_err = np.mean(test_predictions != y_te_set)
    print('')
    print(f'=========== {model} ===========')
    print(f'train error = {train_err} - test error = {test_err}')
    train_metrics = calculate_metrics(y_tr_set, train_predictions)
    test_metrics = calculate_metrics(y_te_set, test_predictions)
    print('=========== TRAIN metrics ===========')
    prettyprint(train_metrics)
    print('=========== TEST metrics ===========')
    prettyprint(test_metrics)
    print('')
    

## Repeat all steps on the full training set and make the submission

In [None]:
# Cleaning of the complete training set and of the test set, both with the
# same configuration.
# Some parameters might need to be taken from the hyperparameter search.

tot_clean_train = clean(x_tr, nan_threshold,
                        remove_const, const_thresholds,
                        PCA, n_components, pca_thresholds,
                        max_unique_values, correlation_threshold)

# Clean the TEST features here: this call used to receive x_tr, which made
# tot_clean_test a second copy of the cleaned training set.
tot_clean_test = clean(x_te, nan_threshold,
                       remove_const, const_thresholds,
                       PCA, n_components, pca_thresholds,
                       max_unique_values, correlation_threshold)
# NOTE(review): the two sets are cleaned independently; if clean() selects
# columns from the data it receives, the resulting feature sets may differ --
# verify that tot_clean_train and tot_clean_test have the same columns.

In [None]:
# Train every model on the complete cleaned training set, predict the test
# set and write one submission file per model.
for model in models:
    # Look up the hyperparameters selected for this model.
    if model not in best_params:
        raise KeyError(f"no hyperparameters stored in best_params for '{model}'")
    # np.ravel accepts both a flat (lambda_, gamma) pair and a pair wrapped in
    # an extra list. The order follows the original author's note
    # "[lambda_, gamma]" -- TODO confirm.
    lambda_, gamma = np.ravel(best_params[model])[:2]

    # Fit on ALL labelled data (y_tr / tot_clean_train), not on the hold-out
    # training split used for evaluation.
    initial_w = np.zeros(tot_clean_train.shape[1])
    w, loss = train(model, y_tr, tot_clean_train, initial_w, max_iters, gamma, lambda_)

    # Predict the test set: scores below mapping_threshold -> -1, otherwise 1.
    y_pred = np.where(np.dot(tot_clean_test, w) < mapping_threshold, -1, 1)

    # The submission pairs each prediction with its test id, so the two must
    # line up; a mismatch means tot_clean_test was not built from x_te.
    if len(y_pred) != len(te_ids):
        raise ValueError(
            f'{len(y_pred)} predictions for {len(te_ids)} test ids: '
            'tot_clean_test must be built from x_te')

    # Use the ids loaded with the test set instead of a synthetic 0..n-1
    # range; spaces in the model name become underscores in the file name.
    create_csv_submission(te_ids, y_pred, model.replace(' ', '_') + '.submission')
    