In [1]:
import numpy as np
import helpers as h
from implementations import *

In [2]:
# Load the training set. loadTrainingData is not defined in this notebook; presumably it comes
# from the star-import of `implementations` - verify. Only X, xHeader and Y are used in the
# cells shown here; yHeader and the indexed variants are not referenced again in this view.
X, xHeader, Y, yHeader, indexedX, indexedXheader, indexedY, indexedYheader = loadTrainingData()

Data successfully loaded, there are 321 features and 328135 samples, the shapes of the unindexed data is:
y: (328135, 1), x: (328135, 321)


In [4]:
# Clean the raw data (note the argument order: labels first, then features, then the feature header).
# No thresholds are passed here, so the values reported in the printed summary are whatever
# dataCleaning uses on its own.
yClean, xClean, xHeaderClean = dataCleaning(Y,X,xHeader)

For a threshold of 0.7, there are 144 good features, and 177 bad features
There remains in the data 289870 samples with at most 5 missing values
The number of invalid entries remaing in the dataset is 272938
That is 0.006538802834987332 parts of the whole dataset
Removed 21050 samples with outliers more than 10 standard deviations from the mean. There remains 268820 samples in the dataset.
Standardized data by subtracting the mean and dividing by the standard deviation


In [5]:
# Draw a class-balanced subset of the cleaned data, then build the matrix tx that every
# regression below is trained on (per the printed summary, makeTrainingData adds a dummy
# variable and replaces invalid entries with zeros).
yBalanced, xBalanced = balanceData(yClean,xClean)
tx = makeTrainingData(xBalanced)

Created a balanced subset of the data, with 46448 samples, 23224 each of positive and negative samples
Added dummy variable and replaced invalid entries with zeros
[1. 0. 0. ... 1. 1. 1.]


In [45]:
def k_fold_cross_validation_sets(y,x,K,seed=None):
    ''' Generator that shuffles the provided dataset and splits it into K disjoint subsets
    Args:
        y: (N,) array of the labels
        x: (N,d) array of the data with its features
        K: Integer number of separate subsets, has to be at least 1
        seed: Optional seed for the shuffling. With the default None the permutation is drawn from
            numpy's global random state (so np.random.seed still controls it). Any other value is
            passed to np.random.default_rng, which makes the split reproducible without touching
            the global state
    Yields:
        y_k: (N/K,) array of the chosen labels. N/K is N//K + 1 for the first N%K sets, and N//K for the rest of the sets
        x_k: (N/K,d) array of the data belonging to the chosen labels
    Raises:
        ValueError: If K is smaller than 1. As this is a generator, the error is raised when the first set is requested
    '''
    N = len(y)      # Number of samples

    # Randomly permuted indices of the provided dataset
    if seed is None:
        indices = np.random.permutation(N)
    else:
        indices = np.random.default_rng(seed).permutation(N)

    # np.array_split gives the first N % K chunks one extra index each, so every sample
    # lands in exactly one set. It raises ValueError when K < 1
    for indices_k in np.array_split(indices, K):
        yield y[indices_k], x[indices_k]
        

In [73]:
def k_fold_cross_validation(y,tx,K,initial_w,max_iters,gamma, regressionFunction=logistic_regression, lossFunction=logistic_loss):
    ''' Fits one model per subset produced by k_fold_cross_validation_sets and averages the fitted parameters
    Every model is both trained and evaluated on its own subset, and the progress is reported with print
    Args:
        y: (N,) array of the labels
        tx: (N,d) array of the data and its features
        K: integer number of subsets, and thereby the number of regression runs
        initial_w: (d,) array with some initialization of the parameters, passed to every run
        max_iters: integer of the maximum iterations per regression
        gamma: float of the step size
        regressionFunction: called as regressionFunction(y, tx, initial_w, max_iters, gamma), has to return the pair (w, loss)
        lossFunction: called as lossFunction(y, tx, w), only used for the printed losses
    Returns:
        w_avg: (d,) array of the resultant parameters averaged over the K runs
    '''
    n_features = tx.shape[1]
    weights = np.zeros((K, n_features))   # One row of fitted parameters per run
    run_losses = np.zeros(K)              # Loss returned by each regression run

    subsets = k_fold_cross_validation_sets(y,tx,K)

    # zip stops as soon as range(K) is exhausted, so exactly K subsets are drawn from the generator
    for run, (y_sub, tx_sub) in zip(range(K), subsets):
        weights[run], run_losses[run] = regressionFunction(y_sub, tx_sub, initial_w, max_iters, gamma)

        loss_before = lossFunction(y_sub, tx_sub, initial_w)
        loss_after = lossFunction(y_sub, tx_sub, weights[run])
        print(f'Run {run+1} yielded a loss improvement from {loss_before} to {loss_after}')

    # Plain average of the K parameter vectors
    w_avg = np.sum(weights, axis=0) / K

    print(f'''-----------------------------------------------------------------------------------------
Averaging the parameters, the loss improves from {lossFunction(y,tx,initial_w)} to {lossFunction(y,tx,w_avg)}''')
    return w_avg

# Start every regression from all-zero parameters, one per column of tx
initial_w = np.zeros(tx.shape[1])
K = 5

# K runs of 100 iterations with step size 0.01, using mse_gd_momentum as the regression
# and compute_loss for the reported losses instead of the logistic defaults
w = k_fold_cross_validation(
    yBalanced,
    tx,
    K,
    initial_w,
    max_iters=100,
    gamma=0.01,
    regressionFunction=mse_gd_momentum,
    lossFunction=compute_loss,
)


Run 1 yielded a loss improvement from 0.5015069967707212 to 0.17084571708603238
Run 2 yielded a loss improvement from 0.5005382131324004 to 0.16869899753041984
Run 3 yielded a loss improvement from 0.49730893433799783 to 0.17453104590318338
Run 4 yielded a loss improvement from 0.5090967811389816 to 0.17350784944791756
Run 5 yielded a loss improvement from 0.4915491441489934 to 0.1747786481539316
-----------------------------------------------------------------------------------------
Averaging the parameters, the loss improves from 0.5 to 0.17387707384674347


In [6]:

# Reference run on the whole balanced set: 1000 iterations with step size 0.001 (argument
# order y, tx, initial_w, max_iters, gamma, as in the call inside k_fold_cross_validation).
# NOTE(review): this cell's execution count (6) is lower than that of the cell defining
# initial_w in this view (73), so the stored result came from an earlier kernel state -
# restart and run all cells top to bottom to confirm it.
w_gd_m, loss_gd_m = mse_gd_momentum(yBalanced,tx,initial_w,1000,0.001)

In [11]:
# Loss at the all-zero starting point, followed by the loss returned by the mse_gd_momentum run
print(compute_loss(yBalanced,tx,initial_w))
print(loss_gd_m)


0.5
0.1738137728145803
0.1738137728145803


In [28]:
# Logistic regression on the whole balanced set: 100 iterations with step size 0.2
w_logistic, loss_logistic = logistic_regression(yBalanced,tx,initial_w,100,0.2)

# compute_loss before and after training.
# NOTE(review): compute_loss is the loss that was paired with mse_gd_momentum above, so it is
# presumably not the objective logistic_regression minimizes; an increase here would then not
# by itself indicate a failed fit - confirm against the definitions in `implementations`.
print(compute_loss(yBalanced,tx,initial_w))
print(compute_loss(yBalanced,tx,w_logistic))

# logistic_loss before and after training
print(logistic_loss(yBalanced,tx,initial_w))
print(logistic_loss(yBalanced,tx,w_logistic))

0.5
2.3686920114550984
0.6931471805600634
0.4673017607083231


In [33]:
# Threshold the model output at 0.5: sign(p - 0.5) is -1 or +1, which (s + 1) / 2 maps to 0 or 1.
# An output of exactly 0.5 gives sign 0 and therefore a "label" of 0.5.
# Assumes logistic() returns values in (0, 1) and yBalanced holds 0/1 labels - TODO confirm.
predictions = (np.sign(logistic(tx@w_logistic)-0.5)+1)/2
# |prediction - label| is 1 for a mismatch under the assumption above, so the sum counts the misclassified samples
errors = np.abs(predictions - yBalanced)
print(np.sum(errors))
#for e in errors:
#    print(e)

10105.0
