In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import Ridge
from tqdm import tqdm

# Seed NumPy's global RNG. The row shuffle below calls `DataFrame.sample`
# without a `random_state`, and pandas then draws from this global RNG, so
# fixing the seed here keeps the shuffled order the same on every fresh run.
np.random.seed(1337)

# Data

In [2]:
# Read the training set from disk and preview the first rows
data_filepath = 'data/train.csv'
raw_df = pd.read_csv(data_filepath)
display(raw_df.head())

# Randomly permute the rows (frac=1 keeps every row)
shuffled_df = raw_df.sample(frac=1)

# Column 0 is Id, column 1 is the target y, columns 2..14 are x1..x13
y_df = shuffled_df.iloc[:, 1]
X_df = shuffled_df.iloc[:, 2:15]

# Preview the feature frame first, then the label series
display(X_df.head(), y_df.head())

Unnamed: 0,Id,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0,22.6,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34
1,1,50.0,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53
2,2,23.0,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5
3,3,8.3,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77
4,4,21.2,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
291,0.75026,0.0,8.14,0.0,0.538,5.924,94.1,4.3996,4.0,307.0,21.0,394.33,16.3
119,0.01501,90.0,1.21,1.0,0.401,7.923,24.8,5.885,1.0,198.0,13.6,395.52,3.16
464,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68
363,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45
379,4.89822,0.0,18.1,0.0,0.631,4.97,100.0,1.3325,24.0,666.0,20.2,375.52,3.26


291    15.6
119    50.0
464     8.1
363    19.7
379    50.0
Name: y, dtype: float64

In [3]:
# Convert the feature frame and the label series to plain NumPy arrays
X, y = X_df.to_numpy(), y_df.to_numpy()

print(X.shape, y.shape)

(506, 13) (506,)


# Ridge Regression via 10-fold Cross Validation

In [4]:
# Number of folds used for the k-fold cross-validation (k = 10)
k = 10

# Candidate regularization strengths; each one is passed to Ridge as `alpha`
lambdas = [0.01, 0.1, 1, 10, 100]

class RidgeTraining:
    """Namespace for cross-validated ridge regression evaluation."""

    @staticmethod
    def start(X, y, reg_alpha, k, random_state):
        """Cross-validate a ridge model on (X, y) and return the per-fold RMSE.

        Parameters
        ----------
        X, y : features and targets handed to `cross_validate`.
        reg_alpha : regularization strength, used as `Ridge(alpha=reg_alpha)`.
        k : number of folds for the `KFold` splitter.
        random_state : seed of the shuffled `KFold` split.

        Returns
        -------
        The square root of the negated `test_score` entries, i.e. one RMSE
        value per validation fold.
        """
        splitter = KFold(n_splits=k, shuffle=True, random_state=random_state)
        fold_scores = cross_validate(
            Ridge(alpha=reg_alpha),
            X,
            y,
            cv=splitter,
            scoring='neg_mean_squared_error',
        )['test_score']
        # The scorer reports negated MSE, so flip the sign before the root
        return np.sqrt(-fold_scores)

In [5]:
# Cross-validate once per regularization value and average the fold RMSEs

def get_scores(i=0):
    """Return the mean k-fold RMSE for every value in `lambdas`.

    `i` is forwarded to `RidgeTraining.start` as the random state of the
    fold split. The function reads the notebook-level `X`, `y`, `k` and
    `lambdas`, and returns a list with one averaged RMSE per entry of
    `lambdas`, in the same order.
    """
    return [
        # Sum of the k per-fold RMSEs divided by k, i.e. their mean
        np.sum(RidgeTraining.start(X, y, reg_lambda, k, i)) / k
        for reg_lambda in lambdas
    ]

In [6]:
# Mean cross-validation RMSE per regularization value, using the default split seed (i=0)
get_scores()

[4.804908624967248,
 4.802214486786927,
 4.80416409901698,
 4.851784345683722,
 4.964394876805956]

In [7]:
# Average the per-lambda RMSE over n_iter differently seeded fold splits
n_iter = 10000

# One accumulator slot per regularization value. Sizing it from `lambdas`
# (instead of a literal 5) keeps it in step with the list returned by
# get_scores, so changing `lambdas` cannot cause a shape error or a silent
# broadcast.
res = np.zeros(len(lambdas))

# Split seeds run from 1 to n_iter
for i in tqdm(range(n_iter)):
    res += get_scores(i + 1)

print(res / n_iter)

100%|██████████| 10000/10000 [08:21<00:00, 19.96it/s]

[4.80531341 4.80480721 4.81752789 4.87170078 4.97422208]





In [8]:
# Write the averaged RMSE values to the submission file in fixed-point format
np.savetxt(fname='submission.csv', X=res / n_iter, fmt='%f')