In [40]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

In [41]:
def fit(X, y, lam):
    model = Ridge(alpha=lam, fit_intercept=False, tol = 1e-7)
    model.fit(X,y)
    weights = model.coef_
    assert weights.shape == (13,)
    return weights
    
def calculate_RMSE(w, X, y):
    RMSE = np.sqrt(np.mean(np.square(y-X.dot(w))))
    assert np.isscalar(RMSE)
    return RMSE


def average_LR_RMSE(X, y, lambdas, n_folds):
    RMSE_mat = np.zeros((n_folds, len(lambdas)))
    
    kf = KFold(n_folds)
    for lam_idx, lam in enumerate(lambdas):
        for fold, (train,test) in enumerate(kf.split(X)):
            print(f"fold: {fold}, train: {train}, test: {test}")
            weights = fit(X[train],y[train], lam)
            gen_loss = calculate_RMSE(weights,X[test],y[test])
            RMSE_mat[fold,lam_idx] = gen_loss


    avg_RMSE = np.mean(RMSE_mat, axis=0)
    assert avg_RMSE.shape == (len(lambdas),)
    return avg_RMSE



In [42]:
data = pd.read_csv("train.csv")
y = data["y"].to_numpy()
data = data.drop(columns="y")
# print a few data samples
print(data.head())

X = data.to_numpy()
# The function calculating the average RMSE
lambdas = [0.1, 1, 10, 100, 200]
n_folds = 10
avg_RMSE = average_LR_RMSE(X, y, lambdas, n_folds)
# Save results in the required format
print(avg_RMSE)
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  
fold: 0, train: [ 15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32
  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50
  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86
  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104
 105 106 107 108 109 110

In [43]:
# This serves as a template which will guide you through the implementation of this task. It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold


def fit2(X, y, lam):
    """
    This function receives training data points, then fits the ridge regression on this data
    with regularization hyperparameter lambda. The weights w of the fitted ridge regression
    are returned. 

    Parameters
    ----------
    X: matrix of floats, dim = (135,13), inputs with 13 features
    y: array of floats, dim = (135,), input labels)
    lam: float. lambda parameter, used in regularization term

    Returns
    ----------
    w: array of floats: dim = (13,), optimal parameters of ridge regression
    """
    model = Ridge(alpha=lam, fit_intercept=False, tol = 1e-7)
    model.fit(X,y)
    weights = model.coef_
    assert weights.shape == (13,)
    return weights


def calculate_RMSE2(w, X, y):
    """This function takes test data points (X and y), and computes the empirical RMSE of 
    predicting y from X using a linear model with weights w. 

    Parameters
    ----------
    w: array of floats: dim = (13,), optimal parameters of ridge regression 
    X: matrix of floats, dim = (15,13), inputs with 13 features
    y: array of floats, dim = (15,), input labels

    Returns
    ----------
    RMSE: float: dim = 1, RMSE value
    """
    RMSE = np.sqrt(np.mean(np.square(y-X.dot(w))))
    assert np.isscalar(RMSE)
    return RMSE


def average_LR_RMSE2(X, y, lambdas, n_folds):
    """
    Main cross-validation loop, implementing 10-fold CV. In every iteration (for every train-test split), the RMSE for every lambda is calculated, 
    and then averaged over iterations.
    
    Parameters
    ---------- 
    X: matrix of floats, dim = (150, 13), inputs with 13 features
    y: array of floats, dim = (150, ), input labels
    lambdas: list of floats, len = 5, values of lambda for which ridge regression is fitted and RMSE estimated
    n_folds: int, number of folds (pieces in which we split the dataset), parameter K in KFold CV
    
    Returns
    ----------
    avg_RMSE: array of floats: dim = (5,), average RMSE value for every lambda
    """
    RMSE_mat = np.zeros((n_folds, len(lambdas)))
    #this KFold object splits the data into n_folds pieces
    kf = KFold(n_folds)
    #this loop calculates RMSE for every lambda and every fold
    #explanation: for every lambda, we fit the model on the training set and calculate RMSE on the test set
    for lam_idx, lam in enumerate(lambdas):
        for fold, (train,test) in enumerate(kf.split(X)):
            #print(f"fold: {fold}, train: {train}, test: {test}")
            weights = fit2(X[train],y[train], lam)
            gen_loss = calculate_RMSE2(weights,X[test],y[test])
            RMSE_mat[fold,lam_idx] = gen_loss

    #a shorter version of the above loop
    #for lam_idx, lam in enumerate(lambdas):
    #    for fold, (train,test) in enumerate(kf.split(X)):
    #        RMSE_mat[fold,lam_idx] = calculate_RMSE2(fit2(X[train],y[train], lam),X[test],y[test])

    #same as above, but using cross_val_score
    for lam_idx, lam in enumerate(lambdas):
        RMSE_mat[:,lam_idx] = -cross_val_score(Ridge(alpha=lam, fit_intercept=False, tol = 1e-7), X, y, cv=kf, scoring='neg_root_mean_squared_error')

    avg_RMSE = np.mean(RMSE_mat, axis=0)
    assert avg_RMSE.shape == (len(lambdas),)
    return avg_RMSE


# Main function. You don't have to change this
if __name__ == "__main__":
    # Data loading
    data = pd.read_csv("train.csv")
    y = data["y"].to_numpy()
    data = data.drop(columns="y")
    # print a few data samples
    print(data.head())

    X = data.to_numpy()
    # The function calculating the average RMSE
    lambdas = [0.1, 1, 10, 100, 200]
    n_folds = 10
    avg_RMSE = average_LR_RMSE2(X, y, lambdas, n_folds)
    # Save results in the required format
    np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")


         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  


In [44]:
data = pd.read_csv("train.csv")
y = data["y"].to_numpy()
data = data.drop(columns="y")
# print a few data samples
print(data.head())

X = data.to_numpy()
# The function calculating the average RMSE
lambdas = [0.1, 1, 10, 100, 200]
n_folds = 10
avg_RMSE = average_LR_RMSE2(X, y, lambdas, n_folds)
# Save results in the required format
print(avg_RMSE)
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  
[5.5036383  5.48040028 5.46988555 5.93193113 6.2433465 ]
