In [95]:
import numpy as np

In [102]:
def load_csv_data(data_path, sub_sample=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y == 'b')] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids
y,tx,_=load_csv_data('train.csv')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ids = x[:, 0].astype(np.int)


(250000, 30)

In [107]:
def missing_values_elimination(X):
    """
    Deletion of features with more than 70% missing values and imposition of the median in the remaining features
    """
    N, D = X.shape
    missing_data = np.zeros(D)
    cols_to_delete = []
    for i in range(D):
        missing_data[i] = np.count_nonzero(X[:,i]==-999)/N

        if missing_data[i]>0.7:
            cols_to_delete.append(i)

        elif missing_data[i]>0:
            X_feature = X[:,i]
            median = np.median(X_feature[X_feature != -999])
            X[:,i] = np.where(X[:,i]==-999, median, X[:,i])

    X[:,cols_to_delete]=0

    return X
tx=missing_values_elimination(tx)

In [108]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x
tx=standardize(tx)

In [109]:
tx=np.array(tx)
type(tx)

numpy.ndarray

In [116]:
def compute_loss_MSE(y, tx, w):

    """
    Computes the loss function using the Mean Squared Error as Cost
    INPUTS: y = target, tx = sample matrix, w = weights vector
    OUTPUT: evaluation of the MSE given the inputs
    """

    e = y - tx @ w
    N = len(y)
    return (e**2).sum()/(2*N)
def ridge_regression(y, tx, lambda_):

    """
    Computation of the weights vector by solving the L2-regularized normal equations for linear regression
    INPUTS: y = target, tx = sample matrix, lambda_ = regularization parameter
    OUTPUTS: w = weights vector, loss = corresponding MSE evaluation
    """

    N = len(y)
    D = tx.shape[1]
    I = np.eye(D)
    w = np.linalg.solve(tx.T @ tx + 2*N*lambda_*I, tx.T @ y)
    loss = compute_loss_MSE(y, tx, w)
    return w,loss


w,loss=ridge_regression(y,tx, 0.1)


In [132]:
_,data_test,_=load_csv_data('test.csv')
data_test=missing_values_elimination(data_test)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ids = x[:, 0].astype(np.int)


In [133]:
y_pred=data_test@w
y_pred

array([-25.42392119, -14.54822374, -13.14886464, ...,   0.11004155,
        -0.23295398, -24.96113589])