In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn import metrics

In [2]:
# Load the training table; column 0 holds the target, the remaining columns the features
train_data = pd.read_csv('datasets/train.csv', delimiter=',')
y = train_data.iloc[:, 0]
X = train_data.iloc[:, 1:]
X_cols = X.columns

# Sanity check: preview the first rows of the full table, the features and the target
print("\n===== CHECK DATA =====")
previews = (("TRAIN DATA", train_data), ("FEATURES", X), ("TARGET", y))
for position, (title, table) in enumerate(previews):
    if position > 0:
        # Blank separator between consecutive previews
        print("\n")
    print(title)
    print(table.head())

# Switch to plain numpy arrays so the index arrays produced by KFold
# can be used for direct row selection later on
X, y = X.to_numpy(), y.to_numpy()


===== CHECK DATA =====
TRAIN DATA
      y        x1   x2     x3   x4     x5     x6     x7      x8    x9    x10  \
0  22.6   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0   
1  50.0   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0   
2  23.0   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0   
3   8.3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0   
4  21.2   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0   

    x11     x12    x13  
0  16.9  375.21   7.34  
1  20.2  366.15   9.53  
2  16.4  393.74  10.50  
3  20.2  396.90  19.77  
4  18.7  386.40  12.34  


FEATURES
         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  

In [3]:
# Cross-validation setup: the samples are partitioned into N_SPLITS folds.
# No shuffle/random_state is passed, so KFold's default fold assignment applies.
N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS)

In [4]:
# Candidate regularisation strengths (passed to Ridge as `alpha`)
reg_params = [0.1, 1, 10, 100, 200]

# Maps each regularisation strength to its per-fold RMSE values ("list")
# together with their mean ("avg") and sample standard deviation ("std")
RMSE = {}

for reg_param in reg_params:
    fold_scores = []

    # kf.split(X) is consumed while iterating, so request a fresh one per parameter
    for train_indices, validation_indices in kf.split(X):
        # Fit a new Ridge model on the training part of this fold
        ridge_regressor = Ridge(alpha=reg_param).fit(X[train_indices], y[train_indices])
        # Predict on the held-out part of the fold
        y_validation_predictions = ridge_regressor.predict(X[validation_indices])
        # RMSE is the square root of the mean squared error on the held-out part
        fold_mse = metrics.mean_squared_error(y[validation_indices], y_validation_predictions)
        fold_scores.append(np.sqrt(fold_mse))

    # Summarise the folds; ddof=1 gives the sample (not population) standard deviation
    RMSE[reg_param] = {
        "list": fold_scores,
        "avg": np.mean(fold_scores),
        "std": np.std(fold_scores, ddof=1),
    }

In [5]:
# Report the cross-validation summary for every regularisation strength and
# write the average RMSE values, one per line, to the submission file.
# The `with` block guarantees the file is flushed and closed even if
# formatting or writing raises part-way through.
with open("to_submit.txt", "w") as file:
    for param in RMSE.keys():
        print("lambda = %.1e" %param)
        print("\tavg = %.3f   std = %.3f" %(RMSE[param]["avg"], RMSE[param]["std"]))
        # str() writes the unrounded average, unlike the 3-decimal console report
        file.write(str(RMSE[param]["avg"])+"\n")

lambda = 1.0e-01
	avg = 5.502   std = 1.340
lambda = 1.0e+00
	avg = 5.500   std = 1.401
lambda = 1.0e+01
	avg = 5.484   std = 1.522
lambda = 1.0e+02
	avg = 5.637   std = 1.792
lambda = 2.0e+02
	avg = 5.721   std = 1.875


In [6]:
# Show the submission file to verify what was written. Reading it in Python
# instead of shelling out to `cat` keeps the cell working on systems without that command.
with open("to_submit.txt") as submission:
    # end="" avoids appending an extra newline after the file's own trailing one
    print(submission.read(), end="")

5.501809445057857
5.499838741278097
5.483631486072287
5.636642135414034
5.721233719861127
