In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

In [26]:
def fit(X, y, lam):
    """Fit a ridge regression and return its weight vector.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features)
        Design matrix.
    y : 1-D array of shape (n_samples,)
        Regression targets.
    lam : float
        Regularization strength, passed to ``Ridge`` as ``alpha``.

    Returns
    -------
    weights : ndarray of shape (n_features,)
        The fitted coefficients.
    """
    # The intercept is disabled on purpose: this function returns only the
    # coefficient vector, and predictions elsewhere in this file are formed
    # as X.dot(w). A fitted intercept would be silently dropped and bias
    # every prediction.
    model = Ridge(alpha=lam, fit_intercept=False, solver='sparse_cg')
    model.fit(X, y)
    weights = model.coef_
    # One weight per feature column, whatever the width of X is.
    n_features = np.shape(X)[1]
    assert weights.shape == (n_features,)
    return weights
    
def calculate_RMSE(w, X, y):
    """Return the root-mean-square error of the linear predictor ``X.dot(w)``.

    Parameters
    ----------
    w : 1-D array of shape (n_features,)
        Weight vector.
    X : 2-D array of shape (n_samples, n_features)
        Inputs to evaluate on.
    y : 1-D array of shape (n_samples,)
        Ground-truth targets.

    Returns
    -------
    RMSE : float
        ``sqrt(mean((y - X.dot(w)) ** 2))``.

    Raises
    ------
    ValueError
        If ``y`` is empty (the mean would be undefined).
    """
    if len(y) == 0:
        raise ValueError("cannot compute RMSE on an empty sample")
    residuals = y - X.dot(w)
    # The mean must be taken *inside* the square root; dividing the root of
    # the summed squares by n under-reports the error by a factor of sqrt(n).
    RMSE = np.sqrt(np.mean(np.square(residuals)))
    assert np.isscalar(RMSE)
    return RMSE


def average_LR_RMSE(X, y, lambdas, n_folds):
    """Cross-validate ridge regression and average the RMSE per lambda.

    For every regularization strength in ``lambdas`` the model is fitted on
    each K-fold training split and scored on the matching held-out split;
    the fold scores are then averaged.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features)
        Design matrix; indexed with the integer index arrays from ``KFold``.
    y : 1-D array of shape (n_samples,)
        Regression targets, indexed the same way as ``X``.
    lambdas : sequence of float
        Regularization strengths to evaluate.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    avg_RMSE : ndarray of shape (len(lambdas),)
        Mean held-out RMSE over the folds, one entry per lambda.
    """
    RMSE_mat = np.zeros((n_folds, len(lambdas)))
    kf = KFold(n_folds)
    # KFold is constructed without shuffling, so the splits are identical for
    # every lambda; compute them once instead of once per lambda.
    splits = list(kf.split(X))
    for lam_idx, lam in enumerate(lambdas):
        for fold, (train, test) in enumerate(splits):
            weights = fit(X[train], y[train], lam)
            RMSE_mat[fold, lam_idx] = calculate_RMSE(weights, X[test], y[test])

    # Average over folds (axis 0), leaving one value per lambda.
    avg_RMSE = np.mean(RMSE_mat, axis=0)
    assert avg_RMSE.shape == (len(lambdas),)
    return avg_RMSE



In [27]:
# Load the training table, then separate the target column "y" from the
# feature columns.
data = pd.read_csv("train.csv")
y = data["y"].to_numpy()
data = data.drop("y", axis=1)

# Preview the first rows of the feature table.
print(data.head())

X = data.to_numpy()

# Cross-validation setup: number of folds and the regularization strengths
# to compare.
n_folds = 10
lambdas = [0.1, 1, 10, 100, 200]
avg_RMSE = average_LR_RMSE(X, y, lambdas=lambdas, n_folds=n_folds)

# Persist the averaged scores, one per line, with 12 decimal places.
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  
