In [54]:
from implementations import *
from helpers import *
import numpy as np
import matplotlib.pyplot as plt

## Load the data ###

In [55]:
data_path='./dataset_to_release/'
train_data_path="./dataset_to_release/x_train.csv"
test_data_path="./dataset_to_release/x_test.csv"

In [56]:
x_train, x_test, y_train, train_ids, test_ids=load_csv_data_all(data_path, sub_sample=False)

In [4]:
# train_data_path="./dataset/x_train.csv"
# train_label_path="./dataset/y_train.csv"
# train_data=pd.read_csv(train_data_path)
# train_label=pd.read_csv(train_label_path)


In [13]:
# load the data points
# xb=load_csv_data(train_data_path,sub_sample=False)

In [14]:
# we load the labels 
# yb=load_csv_data_labels(train_label_path, sub_sample=False)

In [15]:

# y = np.expand_dims(yb, axis=1)

In [16]:
# Extract the column names from the x_train.csv and convert it into a list



## Data Preprocessing

In [109]:
# Handeling the missing values

def replace_nan_by_mean(data):
    ''' function that handels the missing values by replacing them with the column means'''
    nan_indices = np.isnan(data)
    column_means = np.nanmean(data, axis=0)
    data[nan_indices] = np.take(column_means, np.where(nan_indices)[1])
    return data


data_train = replace_nan_by_mean(x_train)


In [110]:
## Verifying that the nan values have been successfully removed
# matrix=np.isnan(data_train)
# num_true=np.count_nonzero(matrix)
# num_false = matrix.size - num_true
# print(num_true)

In [111]:
x_train.shape

(328135, 321)

In [112]:
data_train.shape

(328135, 321)

In [113]:
# Data filtering: we only keep relevent features

def filtering(data, data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    columns_to_keep = []
    for c in columns:
        if c.startswith('_'):
            columns_to_keep.append(c)
    indices_to_keep = [columns.index(c) for c in columns_to_keep]
    data_f = data[:, indices_to_keep]
    return(data_f)

In [249]:
features_to_keep = ["_AGE80", "_AGE65YR", "_AGEG5YR", "_AGE_G", "_AIDTST3", "_ASTHMS1", "_BMI5", "_BMI5CAT", "_CASTHM1", "_CHLDCNT", "_CHOLCHK", "_DRDXAR1", "_DRNKWEK", "_DUALCOR", "_DUALUSE", "_EDUCAG", "_FLSHOT6", "_FRT16", "_FRTLT1", "_FRTRESP", "_FRUITEX", "_FRUTSUM", "_HCVU651", "_HISPANC", "_INCOMG", "_LLCPWT", "_LMTACT1", "_LMTSCL1", "_LMTWRK1", "_LTASTH1", "_MICHD", "_MINAC11", "_MINAC21", "_MISFRTN", "_MISVEGN", "_MRACE1", "_PA30021", "_PA150R2", "_PA300R2", "_PACAT1", "_PAINDX1", "_PAREC1", "_PASTAE1", "_PASTRNG", "_PNEUMO2", "_PRACE1", "_RACE", "_RACEG21", "_RACEGR3", "_RACE_G1", "_RFBING5", "_RFBMI5", "_RFCHOL", "_RFDRHV5", "_RFHLTH", "_RFHYPE5", "_RFSEAT2", "_RFSEAT3", "_RFSMOK3", "_SMOKER3", "_TOTINDA", "_VEG23", "_VEGESUM", "_VEGETEX", "_VEGLT1", "_VEGRESP"]


In [252]:
len(features_to_keep)

66

In [262]:
# Secode version of data filtering, remove 9 more columns
def filtering_2(data,data_path):
    columns = extract_first_line(data_path).split(',')
    columns.pop(0)
    filtered_columns = [col for col in columns if col in features_to_keep]
    indices_to_keep = [columns.index(c) for c in filtered_columns]
    print(len(indices_to_keep))

    data_f = data[:, indices_to_keep]
    return(data_f)

In [264]:
data_train_filtered_2=filtering_2(data_train, train_data_path)

65


In [265]:
data_train_filtered_2.shape


(328135, 65)

In [120]:
# standardization of the data
def standardize(data):
    small_value=1*10**(-9)
    mean=np.mean(data, axis=0)
    std=np.std(data, axis=0)+small_value
    return((data - mean) / (std))

In [11]:
#data_train_standard=standardize(data_train_filtered)

In [63]:
# feature augmentation
def feature_expansion(data, degree):
    augmented_features=[]
    for i in range(data.shape[1]):
        feature=data[:,i]
        augmented_feature=build_poly(feature, degree)
        augmented_features.append(augmented_feature)

    # Stack the augmented features horizontally
    augmented_data = np.hstack(augmented_features)
    return(augmented_data)

   

## Cross-Validation

In [13]:
def compute_f1_score(true_labels, predicted_labels):
    """
    Computes the F1 score for a classification model using NumPy.

    Parameters:
    true_labels (numpy.ndarray): True labels for the data.
    predicted_labels (numpy.ndarray): Predicted labels from the model.

    Returns:
    f1 (float): The F1 score.
    """
    true_positive = np.sum(np.logical_and(true_labels == 1, predicted_labels == 1))
    false_positive = np.sum(np.logical_and(true_labels == 0, predicted_labels == 1))
    false_negative = np.sum(np.logical_and(true_labels == 1, predicted_labels == 0))
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1

In [14]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval : (k + 1) * interval] for k in range(k_fold)]
    
    return np.array(k_indices)

In [234]:
def apply_model(test, model):
    pred=(sigmoid(test.dot(model))>=0.2).astype(int)
    return(pred)
    

In [16]:
data_train_filtered.shape

(328135, 75)

In [21]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression for a fold corresponding to k_indices

    Args:
        y:          shape=(N,)
        x:          shape=(N,)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold (N.B.: not to confused with k_fold which is the fold nums)
        lambda_:    scalar, cf. ridge_regression()
        degree:     scalar, cf. build_poly()

    Returns:
        train and test root mean square errors rmse = sqrt(2 mse)

    """

    # ***************************************************
    # get k'th subgroup in test, others in train:
    train_idx=np.reshape(k_indices[[i for i in range(len(k_indices)) if i!=k]], -1)
    test_idx=k_indices[k]

    x_train=x[train_idx,:]
    print(x_train.shape)
    y_train=y[train_idx]
    x_test=x[test_idx,:]
    y_test=y[test_idx]
    
    y_tr=np.expand_dims(y_train, 1)
    y_te=np.expand_dims(y_test, 1)

    y_tr=np.where(y_tr == -1, 0, y_tr)
    print(y_tr, y_tr.shape)
    y_te=np.where(y_te == -1, 0, y_te)

    max_iters = 1000
    gamma=0.5

    # ***************************************************
    # form data with polynomial degree: 
    print('on va auggmenter le data')
    train_data=feature_expansion(x_train, degree)
    test_data=feature_expansion(x_test, degree)
    train_data=standardize(train_data)
    test_data=standardize(test_data)
    # ***************************************************
     # build tx
    tx_tr = np.c_[np.ones((y_train.shape[0], 1)), train_data]
    tx_te = np.c_[np.ones((test_data.shape[0], 1)), test_data]
    print(tx_tr.shape)
    print(tx_te.shape)
    initial_w=np.zeros((tx_tr.shape[1], 1))

    # reg logistic regression: 
    w=reg_logistic_regression(y_tr,tx_tr,lambda_,initial_w, max_iters, gamma)[0]
    print(w.shape)
    print(tx_te.shape)
    y_pred=apply_model(tx_te, w)
    # calculate f1 score on test:
    f1_te=compute_f1_score(y_te, y_pred)
  
    return f1_te

In [37]:
def cross_validation_demo(degree, k_fold, lambda_):
    """cross validation over regularisation parameter lambda.

    Args:
        degree: integer, degree of the polynomial expansion
        k_fold: integer, the number of folds
        lambdas: shape = (p, ) where p is the number of values of lambda to test
    Returns:
        best_lambda : scalar, value of the best lambda
        best_rmse : scalar, the associated root mean squared error for the best lambda
    """

    seed = 12
    #degree = degree
    k_fold = k_fold
    lambda_ = lambda_
    # split data in k fold
    k_indices = build_k_indices(y_train, k_fold, seed)
    # define lists to store the loss of training data and test data
    f1_score=[]
    # cross validation over lambdas:
    for d in degree:
        cross_val=[cross_validation(y_train, data_train_filtered, k_indices, k, lambda_, d) for k in range(k_fold)]
        f1=np.mean(cross_val)
        f1_score.append(f1)
    print('on y est presque')
    best_degree=degree[np.argmax(f1_score)]
    best_f1=np.min(f1_score)
    # print(
    #     "For polynomial expansion up to degree %.f, the choice of lambda which leads to the best test rmse is %.5f with a test rmse of %.3f"
    #     % (degree, best_lambda, f1_score)
    # )
    return best_degree, best_f1

In [38]:
y_train.shape

(328135,)

In [39]:
best_lambda, best_f1 = cross_validation_demo(np.array([1,2,3,4,5]).astype(int), 2, 0)

(164067, 75)
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] (164067, 1)
on va auggmenter le data
(164067, 151)
(164067, 151)
Current iteration=0, loss=0.602928317667096
Current iteration=100, loss=0.24995198845399821
Current iteration=200, loss=0.24839048464870833
Current iteration=300, loss=0.2481155255834338
Current iteration=400, loss=0.24803562715193608
Current iteration=500, loss=0.24800115758122054
Current iteration=600, loss=0.24797971970665422
Current iteration=700, loss=0.24796310425383883
Current iteration=800, loss=0.24794895128918823
Current iteration=900, loss=0.24793644759737749
loss=0.24792533450197074
(151, 1)
(164067, 151)
(164067, 75)
[[1]
 [0]
 [1]
 ...
 [1]
 [0]
 [0]] (164067, 1)
on va auggmenter le data
(164067, 151)
(164067, 151)
Current iteration=0, loss=0.6031193721524989
Current iteration=100, loss=0.25165567787123
Current iteration=200, loss=0.25006465583741655
Current iteration=300, loss=0.24978007723228135
Current iteration=400, loss=0.24969841484790592
Current iterati

In [40]:
best_f1

0.03153655475434211

In [41]:
best_lambda

5

In [183]:
augmented_data=feature_expansion(data_train_filtered_2, 5)

## Training

In [184]:
# standardization of the data
def standardize(data):
    small_value=1*10**(-9)
    mean=np.mean(data, axis=0)
    std=np.std(data, axis=0)+small_value
    return((data - mean) / (std))

In [67]:
data_train_filtered

array([[5.30000000e+01, 2.01501563e+09, 5.32049000e+05, ...,
        2.28990981e+00, 2.40679360e+00, 2.00000000e+00],
       [3.30000000e+01, 2.01500439e+09, 3.31071000e+05, ...,
        2.28990981e+00, 2.40679360e+00, 1.96667511e+00],
       [2.00000000e+01, 2.01500564e+09, 2.01091000e+05, ...,
        1.00000000e+00, 2.00000000e+00, 2.00000000e+00],
       ...,
       [3.90000000e+01, 2.01500490e+09, 3.91012000e+05, ...,
        2.00000000e+00, 2.00000000e+00, 2.00000000e+00],
       [3.30000000e+01, 2.01500445e+09, 3.31072000e+05, ...,
        2.28990981e+00, 2.40679360e+00, 2.00000000e+00],
       [3.20000000e+01, 2.01500118e+09, 3.21011000e+05, ...,
        2.28990981e+00, 2.40679360e+00, 2.00000000e+00]])

In [185]:
data_standardized = standardize(augmented_data)

In [186]:
data_standardized.shape

(328135, 276)

In [187]:
#split the test set in two


def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.
    """
    N=int(ratio*len(x))
    # set seed
    np.random.seed(seed)
    # split the data based on the given ratio: 
    shuffled_data=np.random.permutation(x)
    #print(shuffled_data)
    np.random.seed(seed)
    shuffled_labels=np.random.permutation(y)
    #print(shuffled_labels)
    x_tr=shuffled_data[:N] #train data
    x_te=shuffled_data[N:] #test data
    y_tr=shuffled_labels[:N]#train labels
    y_te=shuffled_labels[N:]# test labels

    return(x_tr,x_te, y_tr, y_te)

In [188]:
x_tr,x_te, y_tr, y_te=split_data(data_standardized, y_train, ratio=0.8)


In [189]:
print(y_tr.shape, x_tr.shape)

(262508, 1) (262508, 276)


In [190]:
#y_train=np.expand_dims(y_train, 1)

In [191]:
#y_train.shape

(328135, 1, 1)

In [192]:
# Binary classification using logistic regression

max_iters = 10000
gamma = 0.5

 # build tx
tx_tr = np.c_[np.ones((y_tr.shape[0], 1)), x_tr]
initial_w=np.zeros((tx_tr.shape[1], 1))


In [72]:
y_train = np.where(y_train == -1, 0, y_train)

In [73]:
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [74]:
#grad=calculate_log_likelihood_gradient(y_train,tx_tr,initial_w)


In [75]:
# w=initial_w-gamma*grad
# print(np.min(w), np.max(w))

In [76]:
# loss=calculate_log_likelihood_loss(y_train,tx_tr,initial_w)
# print(loss)

In [195]:
w,loss= logistic_regression(y_tr, tx_tr, initial_w, max_iters, gamma=0.5)

Current iteration=0, loss=0.6046881574088626
Current iteration=100, loss=0.2523966461566142
Current iteration=200, loss=0.24975666071294256
Current iteration=300, loss=0.24882370999293232
Current iteration=400, loss=0.24856549904230596
Current iteration=500, loss=0.24843893361640268
Current iteration=600, loss=0.24833554114888434
Current iteration=700, loss=0.24824505245815223
Current iteration=800, loss=0.2481647170626667
Current iteration=900, loss=0.248092918219218
Current iteration=1000, loss=0.2480284636159432
Current iteration=1100, loss=0.24797039679946647
Current iteration=1200, loss=0.24791792257985393
Current iteration=1300, loss=0.24787036835776782
Current iteration=1400, loss=0.24782715960604917
Current iteration=1500, loss=0.24778780230292832
Current iteration=1600, loss=0.24775186947921538
Current iteration=1700, loss=0.24771899053553295
Current iteration=1800, loss=0.24768884257904317
Current iteration=1900, loss=0.24766114330416086
Current iteration=2000, loss=0.2476356

In [239]:
lambda_ = 0.0005
w_reg,loss_reg= reg_logistic_regression(y_tr, tx_tr, lambda_, initial_w, max_iters, gamma)

Current iteration=0, loss=0.6046881574088626
Current iteration=100, loss=0.25339777445437145
Current iteration=200, loss=0.2511641976501825
Current iteration=300, loss=0.2505238156845519
Current iteration=400, loss=0.2502513434531619
Current iteration=500, loss=0.2501033920836258
Current iteration=600, loss=0.2500052502528166
Current iteration=700, loss=0.24993110482341782
Current iteration=800, loss=0.24987105980806934
Current iteration=900, loss=0.24982068041987632
Current iteration=1000, loss=0.2497775940874236
Current iteration=1100, loss=0.24974032006262192
Current iteration=1200, loss=0.24970782570367855
Current iteration=1300, loss=0.2496793372745955
Current iteration=1400, loss=0.24965424855203278
Current iteration=1500, loss=0.24963207100708112
Current iteration=1600, loss=0.24961240358171263
Current iteration=1700, loss=0.24959491269844467
Current iteration=1800, loss=0.2495793181601774
Current iteration=1900, loss=0.24956538274302884
Current iteration=2000, loss=0.2495529042

In [240]:
w_reg.shape

(277, 1)

## Feature expansion

In [50]:
build_poly(x_train, 4)

NameError: name 'build_poly' is not defined

## Test and Accuracy

In [241]:

tx_te=np.c_[np.ones((y_te.shape[0], 1)), x_te]

In [242]:
tx_te.shape

(65627, 277)

In [243]:
w.shape

(277, 1)

In [244]:
y_te=np.where(y_te==-1, 0, y_te)

In [245]:
y_pred=apply_model(tx_te, w)

In [246]:
#Calculate accuracy
correct_predictions = np.sum(y_pred == y_te)
total_samples = len(y_te)
accuracy = correct_predictions / total_samples

In [247]:
compute_f1_score(y_te, y_pred)

0.3508144616607072

In [248]:
print(accuracy)

0.8506102671156689


## Test

In [229]:
def apply_model(test, model):
    pred=(sigmoid(test.dot(model))>=0.25).astype(int)
    return(pred)
    

In [80]:
xt=replace_nan_by_mean(x_test)

In [81]:
x_test.shape

(109379, 321)

In [82]:
xt_filtered=filtering(xt, test_data_path)

In [83]:
xt_filtered.shape

(109379, 75)

In [84]:
#augmented_data_test=feature_expansion(xt_filtered, 2)

In [85]:
#augmented_data_test.shape

In [86]:

xt_standardized=standardize(xt_filtered)
xtest=np.c_[np.ones((xt.shape[0], 1)), xt_standardized]


In [87]:
xtest.shape

(109379, 76)

In [106]:
predictions=apply_model(xtest, w)
predictions=np.where(predictions==0,-1, predictions)

In [107]:
predictions


array([[-1],
       [-1],
       [-1],
       ...,
       [-1],
       [-1],
       [-1]])

In [108]:
create_csv_submission(test_ids, predictions, 'predictions_LR_2.csv')