In [1]:
from implementations import *
from helpers import *
import numpy as np
import matplotlib.pyplot as plt

## Load the data ###

In [2]:
data_path = './dataset_to_release/'
train_data_path = "./dataset_to_release/x_train.csv"
test_data_path = "./dataset_to_release/x_test.csv"

In [3]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data_all(data_path, sub_sample=False)

## Data Preprocessing

In [4]:
# Handling missing values

def replace_nan_by_mean(data):
    ''' function that handels the missing values by replacing them with the column means'''
    nan_indices = np.isnan(data)
    column_means = np.nanmean(data, axis=0)
    data[nan_indices] = np.take(column_means, np.where(nan_indices)[1])
    return data

data_train = replace_nan_by_mean(x_train)

In [5]:
print(x_train.shape)
print(data_train.shape)

(328135, 321)
(328135, 321)


In [6]:
# Data filtering: we only keep relevant features

def filtering(data, data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    columns_to_keep = []
    for c in columns:
        if c.startswith('_'):
            columns_to_keep.append(c)
    indices_to_keep = [columns.index(c) for c in columns_to_keep]
    data_f = data[:, indices_to_keep]
    return(data_f)

In [7]:
features_to_keep = ["_AGE80", "_AGE65YR", "_AGEG5YR", "_AGE_G", "_AIDTST3", "_ASTHMS1", "_BMI5", "_BMI5CAT",
                     "_CASTHM1", "_CHLDCNT", "_CHOLCHK", "_DRDXAR1", "_DRNKWEK", "_DUALCOR", "_DUALUSE"
                     , "_FLSHOT6", "_FRT16", "_FRTLT1", "_FRTRESP", "_FRUITEX", "_FRUTSUM", "_HCVU651",
                       "_LLCPWT", "_LMTACT1", "_LMTSCL1", "_LMTWRK1", "_LTASTH1", "_MICHD", "_MINAC11", "_MINAC21", 
                       "_MISFRTN", "_MISVEGN", "_MRACE1", "_PA30021", "_PA150R2", "_PA300R2", "_PACAT1", "_PAINDX1", 
                       "_PASTAE1", "_PASTRNG", "_PNEUMO2",
                       "_RFBING5", "_RFBMI5", "_RFCHOL", "_RFDRHV5", "_RFHLTH", "_RFHYPE5", "_RFSMOK3", 
                       "_SMOKER3", "_TOTINDA", "_VEG23", "_VEGESUM", "_VEGETEX", "_VEGLT1", "_VEGRESP"]

In [8]:
# Second version of data filtering, remove 9 more columns

def filtering_2(data,data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    filtered_columns = [col for col in columns if col in features_to_keep]
    indices_to_keep = [columns.index(c) for c in filtered_columns]
    print(len(indices_to_keep))
    data_f = data[:, indices_to_keep]
    return(data_f)

data_train_filtered = filtering_2(data_train, train_data_path)
data_train_filtered.shape

54


(328135, 54)

In [9]:
# standardization of the data
def standardize(data):
    small_value = 1*10**(-9)
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0) + small_value
    return((data - mean) / (std))

data_train_standard = standardize(data_train_filtered)

In [10]:
# feature augmentation
def feature_expansion(data, degree):
    augmented_features = []
    for i in range(data.shape[1]):
        feature = data[:,i]
        augmented_feature = build_poly(feature, degree)
        augmented_features.append(augmented_feature)

    # Stack the augmented features horizontally
    augmented_data = np.hstack(augmented_features)
    return(augmented_data)

augmented_data = feature_expansion(data_train_standard, 4)

## Cross-Validation

In [28]:
def compute_f1_score(true_labels, predicted_labels):
    """
    Computes the F1 score for a classification model using NumPy.

    Parameters:
    true_labels (numpy.ndarray): True labels for the data.
    predicted_labels (numpy.ndarray): Predicted labels from the model.

    Returns:
    f1 (float): The F1 score.
    """
    true_positive = np.sum(np.logical_and(true_labels == 1, predicted_labels == 1))
    false_positive = np.sum(np.logical_and(true_labels == 0, predicted_labels == 1))
    false_negative = np.sum(np.logical_and(true_labels == 1, predicted_labels == 0))
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1

In [29]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval : (k + 1) * interval] for k in range(k_fold)]
    
    return np.array(k_indices)

In [28]:
def apply_model(test, model):
    pred = (sigmoid(test.dot(model))>=0.35).astype(int)
    return(pred)

In [47]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression for a fold corresponding to k_indices

    Args:
        y:          shape=(N,)
        x:          shape=(N,)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold (N.B.: not to confused with k_fold which is the fold nums)
        lambda_:    scalar, cf. ridge_regression()
        degree:     scalar, cf. build_poly()

    Returns:
        train and test root mean square errors rmse = sqrt(2 mse)

    """
    
    # get k'th subgroup in test, others in train:
    train_idx = np.reshape(k_indices[[i for i in range(len(k_indices)) if i!=k]], -1)
    test_idx = k_indices[k]

    x_train = x[train_idx,:]
    y_train = y[train_idx]
    x_test = x[test_idx,:]
    y_test = y[test_idx]
    
    y_tr = np.expand_dims(y_train, 1)
    y_te = np.expand_dims(y_test, 1)

    y_tr = np.where(y_tr == -1, 0, y_tr)
    y_te = np.where(y_te == -1, 0, y_te)

    max_iters = 1000
    gamma = 0.5

    # form data with polynomial degree:
    train_data = feature_expansion(x_train, degree)
    test_data = feature_expansion(x_test, degree)
    train_data = standardize(train_data)
    test_data = standardize(test_data)
    
    # build tx
    tx_tr = np.c_[np.ones((y_train.shape[0], 1)), train_data]
    tx_te = np.c_[np.ones((test_data.shape[0], 1)), test_data]
    initial_w = np.zeros((tx_tr.shape[1], 1))

    # reg logistic regression: 
    w = reg_logistic_regression(y_tr,tx_tr,lambda_,initial_w, max_iters, gamma)[0]
    y_pred = apply_model(tx_te, w)
    
    # calculate f1 score on test:
    f1_te = compute_f1_score(y_te, y_pred)
  
    return f1_te

In [48]:
def cross_validation_demo(degree, k_fold, lambdas):
    """cross validation over regularisation parameter lambda.

    Args:
        degree: integer, degree of the polynomial expansion
        k_fold: integer, the number of folds
        lambdas: shape = (p, ) where p is the number of values of lambda to test
    Returns:
        best_lambda : scalar, value of the best lambda
        best_rmse : scalar, the associated root mean squared error for the best lambda
    """

    seed = 12
    #degree = degree
    k_fold = k_fold
    # split data in k fold
    k_indices = build_k_indices(y_train, k_fold, seed)
    # define lists to store the loss of training data and test data
    f1_score=np.zeros((len(degree), len(lambdas)))
    # cross validation over lambdas:
    for i in range(len(degree)):
        d=degree[i]
        for j in range(len(lambdas)):
            lambda_ = lambdas[j]
            cross_val = [cross_validation(y_train, data_train_standard, k_indices, k, lambda_, d) for k in range(k_fold)]
            f1 = np.mean(cross_val)
            f1_score[i,j] = f1
    best_degree = degree[np.unravel_index(np.argmax(f1_score, axis=None), f1_score.shape)[0]]
    best_lambda = lambdas[np.unravel_index(np.argmax(f1_score, axis=None), f1_score.shape)[1]]
    best_f1 = np.max(f1_score)
    
    return best_degree, best_f1 , best_lambda , f1_score

In [54]:
best_degree, best_f1 , best_lambda , f1_score = cross_validation_demo(np.array([4]).astype(int), 2, np.array([0]))

Current iteration=0, loss=0.6043003767871573
Current iteration=100, loss=0.24195432849718485
Current iteration=200, loss=0.23959155567457874
Current iteration=300, loss=0.2390008625142806


KeyboardInterrupt: 

In [45]:
print("The best_degree is {}, the best lambda is {} and the best f1 is {}:".format(best_degree, best_lambda, best_f1))

The best_degree is 4, the best lambda is 0.01 and the best f1 is 0.3240984911927453:


## Training

In [11]:
data_standardized = standardize(augmented_data)
data_standardized.shape

(328135, 270)

In [23]:
# Split the test set in two
def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.
    """
    N=int(ratio*len(x))
    
    # set seed
    np.random.seed(seed)
    
    # split the data based on the given ratio: 
    shuffled_data = np.random.permutation(x)
    shuffled_labels = np.random.permutation(y)
    np.random.seed(seed)
    x_tr = shuffled_data[:N] #train data
    x_te = shuffled_data[N:] #test data
    y_tr = shuffled_labels[:N] #train labels
    y_te = shuffled_labels[N:] # test labels

    return(x_tr,x_te, y_tr, y_te)

In [32]:
x_tr,x_te, y_tr, y_te = split_data(data_standardized, y_train, ratio=0.8)

In [35]:
# Binary classification using logistic regression

max_iters = 2000
gamma = 0.5

 # Build tx
tx_tr = np.c_[np.ones((y_train.shape[0], 1)), data_standardized]
initial_w = np.zeros((tx_tr.shape[1], 1))

In [15]:
y_train = np.expand_dims(y_train, 1)
y_train = np.where(y_train == -1, 0, y_train)

y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [58]:
# grad = calculate_log_likelihood_gradient(y_train,tx_tr,initial_w)
# loss = calculate_log_likelihood_loss(y_train,tx_tr,initial_w)
# print(loss)

In [17]:
w,loss = logistic_regression(y_train, tx_tr, initial_w, max_iters, gamma = 0.5)

Current iteration=0, loss=0.6043822619210014
Current iteration=100, loss=0.24285325687469855
Current iteration=200, loss=0.24048368896360986
Current iteration=300, loss=0.2398927053722558
Current iteration=400, loss=0.23961441616080237
Current iteration=500, loss=0.2394396533106269
Current iteration=600, loss=0.2393145463037291
Current iteration=700, loss=0.23921886153579436
Current iteration=800, loss=0.23914283264365963
Current iteration=900, loss=0.23908093361352936
Current iteration=1000, loss=0.23902968961132795
Current iteration=1100, loss=0.23898674532729466
Current iteration=1200, loss=0.23895041462607067
Current iteration=1300, loss=0.2389194415062791
Current iteration=1400, loss=0.2388928628747833
Current iteration=1500, loss=0.23886992419503963
Current iteration=1600, loss=0.2388500245489164
Current iteration=1700, loss=0.23883267914039452
Current iteration=1800, loss=0.23881749274294256
Current iteration=1900, loss=0.23880414034521819
Current iteration=2000, loss=0.23879235

In [18]:
# lambda_ = 10e-4
# w_reg,loss_reg = reg_logistic_regression(y_train, tx_tr, lambda_, initial_w, max_iters, gamma)

## Test and Accuracy

In [26]:
print(x_te.shape)
print(w.shape)

(65627, 270)
(271, 1)


In [36]:
y_te = np.where(y_te ==-1, 0, y_te)
y_pred = apply_model(tx_te, w)

NameError: name 'tx_te' is not defined

In [246]:
#Calculate accuracy
correct_predictions = np.sum(y_pred == y_te)
total_samples = len(y_te)
accuracy = correct_predictions / total_samples
print(accuracy)

In [247]:
compute_f1_score(y_te, y_pred)

0.3508144616607072

## Test

In [63]:
xt = replace_nan_by_mean(x_test)
print(x_test.shape)

In [65]:
xt_filtered = filtering_2(xt, test_data_path)
print(xt_filtered.shape)

65


In [4]:
augmented_data_test = feature_expansion(xt_filtered, 5)
print(augmented_data_test.shape)

NameError: name 'feature_expansion' is not defined

In [5]:
xt_standardized = standardize(augmented_data_test)
xtest = np.c_[np.ones((xt.shape[0], 1)), xt_standardized]
print(xtest.shape)

NameError: name 'standardize' is not defined

In [71]:
predictions = apply_model(xtest, w_reg)
predictions = np.where(predictions == 0,-1, predictions)
predictions

In [73]:
create_csv_submission(test_ids, predictions, 'predictions_LR_0.2_5.csv')