In [None]:
def opt_hyperparameters(TRAINING_DATA, k_fold):
    """
    Select a regularisation strength for every training file.

    Each entry of TRAINING_DATA is loaded with load_csv_data, its features
    are standardized, and cross_validation is run with k_fold folds over a
    grid of 10 log-spaced candidates between 1e-10 and 1.

    Returns a list holding the selected lambda for each entry of
    TRAINING_DATA, in the same order.
    """

    best_per_file = []

    for path in TRAINING_DATA:

        labels, features, _ = load_csv_data(path)

        # Centre and scale every feature column before fitting.
        features = standardize(features)

        print("Cross-validation for file %s ..." % path)

        # Candidate lambdas: 10 points, log-spaced from 1e-10 to 1.
        candidates = np.logspace(-10, 0, 10)

        chosen = cross_validation(labels, features, k_fold, candidates)

        print("opt lambda for %s is :" % path, chosen)

        best_per_file.append(chosen)

    return best_per_file

#-----------------

def cross_validation(y, tx, k_fold, lambdas):
    """
    Pick the lambda with the lowest mean validation loss over k_fold folds.

    The rows are shuffled and split into k_fold equally sized folds by
    build_k_indices. For every candidate lambda and every fold, a regularized
    logistic regression is trained on the remaining folds (100 iterations at
    most, step size 0.1, zero initial weights) and its loss is evaluated on
    the held-out fold with reg_calculate_loss.

    Parameters:
        y       -- label array, one entry per row of tx
        tx      -- 2-D feature matrix (rows are samples)
        k_fold  -- number of folds
        lambdas -- iterable of candidate regularisation strengths

    Returns the candidate from `lambdas` whose mean validation loss is the
    smallest (the first one wins on ties).

    Raises ValueError when no candidate could be selected, i.e. `lambdas` is
    empty or every mean validation loss is NaN.
    """

    print("  Start the %i-fold Cross Validation!..." % k_fold)

    # Shuffled row indices, one row of indices per fold.
    k_indices = build_k_indices(y, k_fold)

    max_iters = 100
    gamma = 0.1
    least_loss = np.inf
    lambda_star = None
    n_features = tx.shape[1]

    for lamb in lambdas:
        loss_folds = 0
        for i in range(k_fold):

            # Training rows: every fold except the i-th, flattened in fold
            # order. Indexing with the flattened index list avoids building
            # the full (k_fold, fold_size, n_features) copy on each pass.
            train_idx = np.delete(k_indices, i, axis=0).reshape(-1)
            test_idx = k_indices[i]

            # The solver works with column vectors for y and w.
            y_subtrain = y[train_idx].reshape((-1, 1))
            tx_subtrain = tx[train_idx].reshape((-1, n_features))

            y_subtest = y[test_idx].reshape((-1, 1))
            tx_subtest = tx[test_idx]

            initial_w = np.zeros((n_features, 1))

            # Fit on the training folds for this lambda.
            w, _ = reg_logistic_regression(
                y_subtrain, tx_subtrain, lamb, initial_w, max_iters, gamma)

            # Score the fitted weights on the held-out fold.
            # NOTE(review): this loss still contains the lambda * w.T.w
            # penalty term, which favours small lambdas when candidates are
            # compared -- confirm whether the unpenalized loss is intended.
            loss_validation = reg_calculate_loss(y_subtest, tx_subtest, w, lamb)

            loss_folds += loss_validation

        mean_loss = loss_folds / k_fold
        if mean_loss < least_loss:
            least_loss = mean_loss
            # Keep the candidate itself; the sign must not be flipped,
            # a negative lambda would reward large weights.
            lambda_star = lamb

    if lambda_star is None:
        raise ValueError(
            "cross_validation could not select a lambda "
            "(no candidates, or every validation loss was NaN)")

    return lambda_star
#--------------------------

####------- IMPORTANT: regularized logistic regression is used, so the labels must be 0 and 1.

def load_csv_data(data_path, sub_sample=False):
    """
    Load a labelled CSV file.

    The first line of the file is skipped as a header. Column 0 is read as
    the integer id, column 1 as the class label string, and every column from
    index 2 onwards as a numeric feature.

    Parameters:
        data_path  -- path of the comma-separated file to read
        sub_sample -- when True, keep only every 50th row

    Returns a tuple (yb, input_data, ids):
        yb         -- float array, 0 where the label is 'b' and 1 otherwise
        input_data -- 2-D float array with the feature columns
        ids        -- integer array with the row ids
    """

    # The file is parsed twice: once as text for the label column, once as
    # floats for everything else (the label column becomes NaN there and is
    # not used).
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)

    # The `np.int` alias was removed in NumPy 1.24; the builtin is equivalent.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # Convert class labels from strings to binary (0, 1): logistic regression
    # needs 0/1 targets.
    yb = np.ones(len(y))
    yb[y == 'b'] = 0

    # Optional sub-sampling: keep one row out of 50.
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

#-------------------------

def standardize(x):
    """
    Centre and scale the columns of x.

    Every column has its own mean subtracted and is divided by its own
    standard deviation (both computed along axis 0).

    A column with zero standard deviation (all values identical) is only
    centred, so it comes out as all zeros instead of NaN/inf from a division
    by zero.

    Returns a new array with the same shape as x.
    """

    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)

    # Dividing by 1 leaves the (already zero) centred constant columns as is.
    safe_std = np.where(std == 0, 1.0, std)

    return (x - mean) / safe_std

#-------------------------

def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """
    Gradient descent for regularized logistic regression.

    Starting from initial_w, penalized_logistic_regression is applied up to
    max_iters times with step size gamma and penalty lambda_. The loop stops
    early once two consecutive losses differ by less than 1e-8.

    Returns (w, loss): the weights after the last step and the loss reported
    by that step. If no step is run (max_iters == 0) the result is
    (initial_w, -1).
    """

    tolerance = 1e-8
    weights = initial_w
    current_loss = -1
    last_loss = -1  # -1 means "no previous loss recorded yet"

    for _ in range(max_iters):

        weights, current_loss = penalized_logistic_regression(
            y, tx, weights, gamma, lambda_)

        # Converged when the loss barely moved between two iterations.
        has_previous = last_loss != -1
        if has_previous and np.abs(current_loss - last_loss) < tolerance:
            break

        last_loss = current_loss

    return weights, current_loss

#------------------------

def penalized_logistic_regression(y, tx, w, gamma, lambda_):
    """
    Perform one gradient-descent step of regularized logistic regression.

    Returns (w_new, loss) where loss is the regularized loss evaluated at the
    incoming weights w (before the step) and w_new = w - gamma * gradient.
    """

    current_loss = reg_calculate_loss(y, tx, w, lambda_)
    step = gamma * reg_calculate_gradient(y, tx, w, lambda_)

    return w - step, current_loss

# ------------------------

def reg_calculate_loss(y, tx, w, lambda_):
    """
    Regularized negative log-likelihood of the logistic model.

    The negative log-likelihood is divided by len(y), and the penalty
    lambda_ * w.T.w is added on top.
    """

    eps = 1e-10

    # Keep the probabilities strictly inside (0, 1) so both logs stay finite.
    prob = np.clip(sigmoid(tx.dot(w)), eps, 1 - eps)

    log_likelihood = y.T.dot(np.log(prob)) + (1 - y).T.dot(np.log(1 - prob))

    data_term = np.squeeze(-log_likelihood) / len(y)  # normalized by sample count
    penalty = lambda_ * np.squeeze(w.T.dot(w))

    return data_term + penalty

# --------------------------

def reg_calculate_gradient(y, tx, w, lambda_):
    """
    Gradient of the regularized logistic loss with respect to w.

    The data term tx.T.(sigmoid(tx.w) - y) is divided by len(y); the
    penalty contributes 2 * lambda_ * w.
    """

    residual = sigmoid(tx.dot(w)) - y
    data_grad = tx.T.dot(residual) / len(y)  # normalized by sample count

    return data_grad + 2 * lambda_ * w

# --------------------------

def sigmoid(t):
    """
    Logistic function 1 / (1 + exp(-t)), evaluated without overflow.

    exp is only ever called on non-positive values, so large negative inputs
    no longer overflow (the direct formula computes exp(-t), which overflows
    to inf for t below roughly -709 and emits a RuntimeWarning).

    Accepts a scalar or an array; the result has the same shape as the input
    (a NumPy scalar for scalar input).
    """

    t = np.asarray(t)

    # z = exp(-|t|) always lies in (0, 1].
    z = np.exp(-np.abs(t))

    # t >= 0: 1 / (1 + exp(-t));  t < 0: exp(t) / (1 + exp(t)) -- same value.
    out = np.where(t >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return out[()]

#--------------------

def build_k_indices(y, k_fold):
    """
        Shuffle the row indices of y and cut them into k_fold folds.

        Every fold holds int(len / k_fold) indices; indices left over after
        the last fold are not used. Returns an array with one row per fold.
    """

    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        stop = (fold + 1) * fold_size
        folds.append(shuffled[start:stop])

    return np.array(folds)

In [1]:
# Run the 10-fold lambda search over every file listed in TRAINING_DATA.
lambda_star = opt_hyperparameters(TRAINING_DATA, 10)

NameError: name 'opt_hyperparameters' is not defined

In [2]:
def training_dataset(TRAINING_DATA, lambda_star):
    """
    Train one regularized logistic regression per training file.

    For the file at position i, the data is loaded (labels are 0/1, as
    produced by load_csv_data), standardized, and fitted with penalty
    lambda_star[i], zero initial weights, at most 100 iterations and step
    size 0.01. The accuracy on that same training data is printed.

    Returns the list of fitted weight vectors, in the order of TRAINING_DATA.
    """

    max_iters = 100
    gamma = 0.01
    weights = []

    for idx, data in enumerate(TRAINING_DATA):

        print(u'Training with file {0:s}'.format(data))

        y_train, tx_train, _ = load_csv_data(data)
        tx_train = standardize(tx_train)

        start_w = np.zeros(tx_train.shape[1])

        # Only the weights are needed; the final loss is discarded.
        w_star, _ = reg_logistic_regression(
            y_train, tx_train, lambda_star[idx], start_w, max_iters, gamma)

        # Fraction of training rows classified correctly.
        accuracy = percentage_of_correct_prediction(y_train, tx_train, w_star)

        print(u'  The accuracy of prediction is equal to {0:f}'.format(100. * accuracy))

        weights.append(w_star)

    return weights


def percentage_of_correct_prediction(y, tx, w_star):
    """
        Return the fraction of correct predictions (between 0 and 1).

        A row is predicted as class 1 when its linear score tx.w_star is
        positive and as class 0 otherwise. For a logistic model this is the
        same as thresholding the probability sigmoid(tx.w_star) at 0.5;
        comparing the raw score itself against 0.5 would misclassify every
        row whose probability lies between 0.5 and about 0.62.

        y is expected to hold 0/1 labels. y and w_star may be 1-D or column
        vectors; both are flattened before the comparison so that mixed
        shapes cannot broadcast into an N x N comparison.

        Raises ValueError when there are no rows to score.
    """

    scores = np.ravel(np.dot(tx, w_star))
    labels = np.ravel(y)

    if scores.size == 0:
        raise ValueError("cannot compute the accuracy of an empty data set")

    # score > 0  <=>  sigmoid(score) > 0.5
    predicted = np.where(scores > 0, 1, 0)

    right = np.sum(predicted == labels)

    return float(right) / float(scores.size)

In [3]:
# Fit one weight vector per training file, using the lambdas held in lambda_star.
weights = training_dataset(TRAINING_DATA, lambda_star)

NameError: name 'TRAINING_DATA' is not defined

In [4]:
def testing_dataset(TESTING_DATA, weights, output_path="Final_Prediction.csv"):
    """
    Predict every test file and write one submission file sorted by id.

    The file at position i of TESTING_DATA is loaded, standardized and
    labelled with predict_labels using weights[i]. The predictions of all
    files are then merged, ordered by ascending id, and written with
    create_csv_submission.

    Parameters:
        TESTING_DATA -- sequence of test file paths
        weights      -- sequence of weight vectors, one per test file
        output_path  -- name of the submission file to write
                        (defaults to "Final_Prediction.csv")

    Raises ValueError when TESTING_DATA is empty.

    The merge is a single stable sort on the ids. It gives the same result
    as walking the ids one by one when they are contiguous and every file is
    already sorted, and it also handles gaps in the ids and a smallest id
    that is not in the first file.
    """

    if len(TESTING_DATA) == 0:
        raise ValueError("TESTING_DATA must contain at least one file")

    y_pred = []
    ids_pred = []

    for idx, data in enumerate(TESTING_DATA):

        print("Testing with file %s" % data)

        _, tx_test, ids_test = load_csv_data(data)

        # NOTE(review): the test features are scaled with their own mean and
        # std, not with the statistics of the matching training file --
        # confirm this is intended.
        tx_test = standardize(tx_test)

        # Flatten so that column-vector weights and 1-D weights are handled
        # the same way when the per-file results are joined.
        y_pred.append(np.ravel(predict_labels(weights[idx], tx_test)))
        ids_pred.append(np.ravel(ids_test))

    print("Concatenate the predictions.")

    ids = np.concatenate(ids_pred)
    pred = np.concatenate(y_pred)

    # Order all rows by id; the stable sort keeps the file order for equal ids.
    order = np.argsort(ids, kind="stable")
    ids = ids[order]
    pred = pred[order]

    # Writing the submission file:
    create_csv_submission(ids, pred, output_path)

    print(u'Ready to be submitted!')

###-----------------------

# The evaluation expects labels in {-1, 1}, so the predictions are produced in that encoding.

def predict_labels(weights, data):

    """
    Generate class predictions in {-1, 1} from weights and a data matrix.

    A row gets label 1 when its linear score data.weights is positive and -1
    otherwise. For a logistic model this is the same as thresholding the
    probability sigmoid(score) at 0.5; comparing the raw score against 0.5
    would label rows with a probability between 0.5 and about 0.62 as -1.

    Returns a float array with the same shape as data.dot(weights).
    """

    scores = np.dot(data, weights)

    # score > 0  <=>  sigmoid(score) > 0.5
    return np.where(scores > 0, 1.0, -1.0)

#### --------------------
    
def create_csv_submission(ids, y_pred, name):
    """
    Write a submission file with the columns 'Id' and 'Prediction'.

    Parameters:
        ids    -- iterable of row ids
        y_pred -- iterable of predicted labels, aligned with ids
        name   -- path of the CSV file to create (overwritten if it exists)

    Both values of every row are converted with int() before being written.
    """

    # newline='' lets the csv module control the line endings itself;
    # without it every row ends in '\r\r\n' on platforms that translate '\n'.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [None]:
# Predict the files in TESTING_DATA with the trained weights and write the submission CSV.
testing_dataset(TESTING_DATA, weights)