In [1]:
from implementations import *
from helpers import *
import numpy as np
import matplotlib.pyplot as plt

## Load the data ###

In [2]:
data_path='./dataset_to_release/'
train_data_path="./dataset_to_release/x_train.csv"
test_data_path="./dataset_to_release/x_test.csv"

In [3]:
x_train, x_test, y_train, train_ids, test_ids=load_csv_data_all(data_path, sub_sample=False)

## Data Preprocessing

In [9]:
# Handling the missing values

def replace_nan_by_mean(data):
    ''' function that handels the missing values by replacing them with the column means'''
    nan_indices = np.isnan(data)
    column_means = np.nanmean(data, axis=0)
    data[nan_indices] = np.take(column_means, np.where(nan_indices)[1])
    return data

data_train = replace_nan_by_mean(x_train)

In [11]:
x_train.shape

(328135, 321)

In [12]:
data_train.shape

(328135, 321)

In [13]:
# Data filtering: we only keep relevant features

def filtering(data, data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    columns_to_keep = []
    for c in columns:
        if c.startswith('_'):
            columns_to_keep.append(c)
    indices_to_keep = [columns.index(c) for c in columns_to_keep]
    data_f = data[:, indices_to_keep]
    return(data_f)

In [14]:
features_to_keep = ["_AGE80", "_AGE65YR", "_AGEG5YR", "_AGE_G", "_AIDTST3", "_ASTHMS1", "_BMI5", "_BMI5CAT", "_CASTHM1", "_CHLDCNT", "_CHOLCHK", "_DRDXAR1", "_DRNKWEK", "_DUALCOR", "_DUALUSE", "_EDUCAG", "_FLSHOT6", "_FRT16", "_FRTLT1", "_FRTRESP", "_FRUITEX", "_FRUTSUM", "_HCVU651", "_HISPANC", "_INCOMG", "_LLCPWT", "_LMTACT1", "_LMTSCL1", "_LMTWRK1", "_LTASTH1", "_MICHD", "_MINAC11", "_MINAC21", "_MISFRTN", "_MISVEGN", "_MRACE1", "_PA30021", "_PA150R2", "_PA300R2", "_PACAT1", "_PAINDX1", "_PAREC1", "_PASTAE1", "_PASTRNG", "_PNEUMO2", "_PRACE1", "_RACE", "_RACEG21", "_RACEGR3", "_RACE_G1", "_RFBING5", "_RFBMI5", "_RFCHOL", "_RFDRHV5", "_RFHLTH", "_RFHYPE5", "_RFSEAT2", "_RFSEAT3", "_RFSMOK3", "_SMOKER3", "_TOTINDA", "_VEG23", "_VEGESUM", "_VEGETEX", "_VEGLT1", "_VEGRESP"]

In [15]:
len(features_to_keep)

66

In [16]:
# Secode version of data filtering, remove 9 more columns
def filtering_2(data,data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    filtered_columns = [col for col in columns if col in features_to_keep]
    indices_to_keep = [columns.index(c) for c in filtered_columns]
    print(len(indices_to_keep))

    data_f = data[:, indices_to_keep]
    return(data_f)

In [17]:
data_train_filtered_2=filtering_2(data_train, train_data_path)

65


In [18]:
data_train_filtered_2.shape


(328135, 65)

In [19]:
# standardization of the data
def standardize(data):
    small_value=1*10**(-9)
    mean=np.mean(data, axis=0)
    std=np.std(data, axis=0)+small_value
    return((data - mean) / (std))

In [21]:
# feature augmentation
def feature_expansion(data, degree):
    augmented_features=[]
    for i in range(data.shape[1]):
        feature=data[:,i]
        augmented_feature=build_poly(feature, degree)
        augmented_features.append(augmented_feature)

    # Stack the augmented features horizontally
    augmented_data = np.hstack(augmented_features)
    return(augmented_data)

   

## Cross-Validation

In [22]:
def compute_f1_score(true_labels, predicted_labels):
    """
    Computes the F1 score for a classification model using NumPy.

    Parameters:
    true_labels (numpy.ndarray): True labels for the data.
    predicted_labels (numpy.ndarray): Predicted labels from the model.

    Returns:
    f1 (float): The F1 score.
    """
    true_positive = np.sum(np.logical_and(true_labels == 1, predicted_labels == 1))
    false_positive = np.sum(np.logical_and(true_labels == 0, predicted_labels == 1))
    false_negative = np.sum(np.logical_and(true_labels == 1, predicted_labels == 0))
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1

In [23]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval : (k + 1) * interval] for k in range(k_fold)]
    
    return np.array(k_indices)

In [24]:
def apply_model(test, model, thresh = 0.5):
    pred=(sigmoid(test.dot(model))>=thresh).astype(int)
    return(pred)

In [26]:
data_train_filtered_2.shape

(328135, 65)

In [32]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression for a fold corresponding to k_indices

    Args:
        y:          shape=(N,)
        x:          shape=(N,)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold (N.B.: not to confused with k_fold which is the fold nums)
        lambda_:    scalar, cf. ridge_regression()
        degree:     scalar, cf. build_poly()

    Returns:
        train and test root mean square errors rmse = sqrt(2 mse)

    """

    # ***************************************************
    # get k'th subgroup in test, others in train:
    train_idx=np.reshape(k_indices[[i for i in range(len(k_indices)) if i!=k]], -1)
    test_idx=k_indices[k]

    x_train=x[train_idx,:]
    print(x_train.shape)
    y_train=y[train_idx]
    x_test=x[test_idx,:]
    y_test=y[test_idx]
    
    y_tr=np.expand_dims(y_train, 1)
    y_te=np.expand_dims(y_test, 1)

    y_tr=np.where(y_tr == -1, 0, y_tr)
    print(y_tr, y_tr.shape)
    y_te=np.where(y_te == -1, 0, y_te)

    max_iters = 1000
    gamma=0.5

    # ***************************************************
    # form data with polynomial degree: 
    print('on va auggmenter le data')
    train_data=feature_expansion(x_train, degree)
    test_data=feature_expansion(x_test, degree)
    train_data=standardize(train_data)
    test_data=standardize(test_data)
    # ***************************************************
     # build tx
    tx_tr = np.c_[np.ones((y_train.shape[0], 1)), train_data]
    tx_te = np.c_[np.ones((test_data.shape[0], 1)), test_data]
    print(tx_tr.shape)
    print(tx_te.shape)
    initial_w=np.zeros((tx_tr.shape[1], 1))

    # reg logistic regression: 
    w=reg_logistic_regression(y_tr,tx_tr,lambda_,initial_w, max_iters, gamma)[0]
    print(w.shape)
    print(tx_te.shape)
    y_pred=apply_model(tx_te, w, 0.5)
    # calculate f1 score on test:
    f1_te=compute_f1_score(y_te, y_pred)
  
    return f1_te

In [33]:
def cross_validation_demo(degree, k_fold, lambda_):
    """cross validation over regularisation parameter lambda.

    Args:
        degree: integer, degree of the polynomial expansion
        k_fold: integer, the number of folds
        lambdas: shape = (p, ) where p is the number of values of lambda to test
    Returns:
        best_lambda : scalar, value of the best lambda
        best_rmse : scalar, the associated root mean squared error for the best lambda
    """

    seed = 12
    #degree = degree
    k_fold = k_fold
    lambda_ = lambda_
    # split data in k fold
    k_indices = build_k_indices(y_train, k_fold, seed)
    # define lists to store the loss of training data and test data
    f1_score = []
    # cross validation over lambdas:
    for d in degree:
        cross_val = [cross_validation(y_train, data_train_filtered_2, k_indices, k, lambda_, d) for k in range(k_fold)]
        f1 = np.mean(cross_val)
        f1_score.append(f1)
    print('on y est presque')
    best_degree = degree[np.argmax(f1_score)]
    best_f1 = np.min(f1_score)
    # print(
    #     "For polynomial expansion up to degree %.f, the choice of lambda which leads to the best test rmse is %.5f with a test rmse of %.3f"
    #     % (degree, best_lambda, f1_score)
    # )
    return best_degree, best_f1, f1_score

In [34]:
y_train.shape

(328135,)

In [35]:
best_lambda, best_f1, f1_score = cross_validation_demo(np.array([1,2,3,4,5]).astype(int), 4, 0)

(246099, 65)
[[0]
 [1]
 [0]
 ...
 [0]
 [1]
 [0]] (246099, 1)
on va auggmenter le data
(246099, 131)
(82033, 131)
Current iteration=0, loss=0.6031271020869896
Current iteration=100, loss=0.2507973231123299
Current iteration=200, loss=0.24924630680398083
Current iteration=300, loss=0.2489755077280358
Current iteration=400, loss=0.24889727970199313
Current iteration=500, loss=0.24886350174454194
Current iteration=600, loss=0.24884235012557887
Current iteration=700, loss=0.24882582642474602
Current iteration=800, loss=0.24881166149241313
Current iteration=900, loss=0.2487990871437696
loss=0.24878787054760435
(131, 1)
(82033, 131)
(246099, 65)
[[1]
 [0]
 [1]
 ...
 [0]
 [1]
 [0]] (246099, 1)
on va auggmenter le data
(246099, 131)
(82033, 131)
Current iteration=0, loss=0.6030464334324386
Current iteration=100, loss=0.25038515240587816
Current iteration=200, loss=0.24883294041135184
Current iteration=300, loss=0.2485629896646563
Current iteration=400, loss=0.24848672289618903
Current iteration

loss=0.2456469378717906
(326, 1)
(82033, 326)
(246099, 65)
[[1]
 [0]
 [1]
 ...
 [0]
 [1]
 [0]] (246099, 1)
on va auggmenter le data
(246099, 326)
(82033, 326)
Current iteration=0, loss=0.6160010015839307
Current iteration=100, loss=0.24331247924509683
Current iteration=200, loss=0.24108998922495803
Current iteration=300, loss=0.24058398369389122
Current iteration=400, loss=0.2403524811312662
Current iteration=500, loss=0.24020337602958217
Current iteration=600, loss=0.24009137784306844
Current iteration=700, loss=0.2400010685442561
Current iteration=800, loss=0.23992549796426157
Current iteration=900, loss=0.23986081684217456
loss=0.24486198897849082
(326, 1)
(82033, 326)
(246099, 65)
[[1]
 [0]
 [1]
 ...
 [0]
 [1]
 [0]] (246099, 1)
on va auggmenter le data
(246099, 326)
(82033, 326)
Current iteration=0, loss=0.6160508901420431
Current iteration=100, loss=0.2444284548423153
Current iteration=200, loss=0.2422145817336045
Current iteration=300, loss=0.24170659684254656
Current iteration=4

In [38]:
f1_score

[0.03133278875903502,
 0.0969554944715395,
 0.10127011100936746,
 0.13156706820136763,
 0.08083069575188771]

In [37]:
best_lambda

4

In [41]:
augmented_data=feature_expansion(data_train_filtered_2, 4)

## Training

In [42]:
# standardization of the data
def standardize(data):
    small_value=1*10**(-9)
    mean=np.mean(data, axis=0)
    std=np.std(data, axis=0)+small_value
    return((data - mean) / (std))

In [40]:
data_train_filtered

NameError: name 'data_train_filtered' is not defined

In [43]:
data_standardized = standardize(augmented_data)

In [44]:
data_standardized.shape

(328135, 325)

In [187]:
#split the test set in two


def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.
    """
    N=int(ratio*len(x))
    # set seed
    np.random.seed(seed)
    # split the data based on the given ratio: 
    shuffled_data=np.random.permutation(x)
    #print(shuffled_data)
    np.random.seed(seed)
    shuffled_labels=np.random.permutation(y)
    #print(shuffled_labels)
    x_tr=shuffled_data[:N] #train data
    x_te=shuffled_data[N:] #test data
    y_tr=shuffled_labels[:N]#train labels
    y_te=shuffled_labels[N:]# test labels

    return(x_tr,x_te, y_tr, y_te)

In [188]:
x_tr,x_te, y_tr, y_te=split_data(data_standardized, y_train, ratio=0.8)


In [189]:
print(y_tr.shape, x_tr.shape)

(262508, 1) (262508, 276)


In [55]:
y_train=np.expand_dims(y_train, 1)

In [56]:
#y_train.shape

In [57]:
# Binary classification using logistic regression

max_iters = 5000
gamma = 0.5

 # build tx
tx_tr = np.c_[np.ones((y_train.shape[0], 1)), data_standardized]
initial_w=np.zeros((tx_tr.shape[1], 1))


In [58]:
y_train = np.where(y_train == -1, 0, y_train)

In [59]:
y_train.shape

(328135, 1)

In [60]:
#grad=calculate_log_likelihood_gradient(y_train,tx_tr,initial_w)


In [61]:
# w=initial_w-gamma*grad
# print(np.min(w), np.max(w))

In [62]:
# loss=calculate_log_likelihood_loss(y_train,tx_tr,initial_w)
# print(loss)

In [64]:
w,loss= logistic_regression(y_train, tx_tr, initial_w, max_iters, gamma=0.5)

Current iteration=0, loss=0.6160716356978916
Current iteration=100, loss=0.24387140039725128
Current iteration=200, loss=0.24166921447641773
Current iteration=300, loss=0.2411635671984961
Current iteration=400, loss=0.24093252425720585
Current iteration=500, loss=0.24078481285726935
Current iteration=600, loss=0.24067477159377584
Current iteration=700, loss=0.24058669349151837
Current iteration=800, loss=0.24051346093505038
Current iteration=900, loss=0.24045112859956683
Current iteration=1000, loss=0.24039719621012348
Current iteration=1100, loss=0.2403499411441481
Current iteration=1200, loss=0.24030811272553176
Current iteration=1300, loss=0.2402707708620497
Current iteration=1400, loss=0.24023719117153403
Current iteration=1500, loss=0.24020680472120326
Current iteration=1600, loss=0.24017915761426312
Current iteration=1700, loss=0.24015388281664168
Current iteration=1800, loss=0.2401306799514313
Current iteration=1900, loss=0.2401093004926852
Current iteration=2000, loss=0.2400895

In [68]:
#w,loss= logistic_regression(y_train, tx_tr, initial_w, max_iters, gamma=0.5)

In [67]:
#w,loss= logistic_regression(y_train, tx_tr, initial_w, max_iters, gamma=0.5)

In [65]:
lambda_ = 0.0005
w_reg,loss_reg= reg_logistic_regression(y_tr, tx_tr, lambda_, initial_w, max_iters, gamma)

NameError: name 'y_tr' is not defined

In [66]:
w_reg.shape

NameError: name 'w_reg' is not defined

## Feature expansion

In [50]:
build_poly(x_train, 4)

NameError: name 'build_poly' is not defined

## Test and Accuracy

In [241]:

tx_te=np.c_[np.ones((y_te.shape[0], 1)), x_te]

In [242]:
tx_te.shape

(65627, 277)

In [243]:
w.shape

(277, 1)

In [244]:
y_te=np.where(y_te==-1, 0, y_te)

In [245]:
y_pred=apply_model(tx_te, w)

In [246]:
#Calculate accuracy
correct_predictions = np.sum(y_pred == y_te)
total_samples = len(y_te)
accuracy = correct_predictions / total_samples

In [247]:
compute_f1_score(y_te, y_pred)

0.3508144616607072

In [248]:
print(accuracy)

0.8506102671156689


## Test

In [69]:
def apply_model(test, model):
    pred=(sigmoid(test.dot(model))>=0.32).astype(int)
    return(pred)

In [70]:
xt=replace_nan_by_mean(x_test)

In [71]:
x_test.shape

(109379, 321)

In [72]:
xt_filtered = filtering_2(xt, test_data_path)

65


In [74]:
xt_filtered.shape

(109379, 65)

In [75]:
augmented_data_test = feature_expansion(xt_filtered, 4)

In [80]:
augmented_data_test.shape

(109379, 325)

In [84]:
xt_standardized = standardize(augmented_data_test)
xtest = np.c_[np.ones((xt.shape[0], 1)), xt_standardized]

In [85]:
xtest.shape

(109379, 326)

In [86]:
predictions = apply_model(xtest, w)
predictions = np.where(predictions==0,-1, predictions)

In [87]:
predictions


array([[-1],
       [-1],
       [-1],
       ...,
       [-1],
       [-1],
       [-1]])

In [88]:
create_csv_submission(test_ids, predictions, 'predictions_deg4.csv')