In [37]:
import math
import statistics

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold

In [25]:
# Load the training data from 'train.csv' into a dataframe and preview
# its first five rows.

df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,22.6,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34
1,50.0,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53
2,23.0,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5
3,8.3,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77
4,21.2,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34


In [35]:
# Transform dataframe into a feature matrix x and a label vector y.
#
# The features are selected by column name (everything except 'y') rather
# than by position, so the label column can never end up inside x if the
# column order of the CSV changes.

y = np.array(df['y'])
x = df.drop(columns='y').to_numpy()
x, y

(array([[6.72400e-02, 0.00000e+00, 3.24000e+00, ..., 1.69000e+01,
         3.75210e+02, 7.34000e+00],
        [9.23230e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
         3.66150e+02, 9.53000e+00],
        [1.14250e-01, 0.00000e+00, 1.38900e+01, ..., 1.64000e+01,
         3.93740e+02, 1.05000e+01],
        ...,
        [6.90500e-02, 0.00000e+00, 2.18000e+00, ..., 1.87000e+01,
         3.96900e+02, 5.33000e+00],
        [7.36711e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
         9.67300e+01, 2.15200e+01],
        [1.68118e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
         3.96900e+02, 3.08100e+01]]),
 array([22.6, 50. , 23. ,  8.3, 21.2, 19.9, 20.6, 18.7, 16.1, 18.6,  8.8,
        17.2, 14.9, 10.5, 50. , 29. , 23. , 33.3, 29.4, 21. , 23.8, 19.1,
        20.4, 29.1, 19.3, 23.1, 19.6, 19.4, 38.7, 18.7, 14.6, 20. , 20.5,
        20.1, 23.6, 16.8,  5.6, 50. , 14.5, 13.3, 23.9, 20. , 19.8, 13.8,
        16.5, 21.6, 20.3, 17. , 11.8, 27.5, 15.6, 23.1, 24.3, 42.8, 15.6,
   

In [26]:
# K-fold splitter for cross validation: 10 folds, taken in row order
# (no shuffling, hence no random state).

kf = KFold(n_splits=10, shuffle=False, random_state=None)
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [32]:
# Verify proper splitting of the data set into 10 train/test folds by
# printing the row indices of each fold.

for fold in kf.split(x):
    fold_train, fold_test = fold
    print("TRAIN:", fold_train, "TEST:", fold_test)

TRAIN: [ 15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32
  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50
  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86
  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104
 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
 141 142 143 144 145 146 147 148 149] TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  30  31  32
  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50
  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86
  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104
 105 106 107 108 10

In [40]:
# Build one ridge regressor per regularization strength (lambda);
# models[i] uses lambdas[i] as its alpha.

lambdas = [0.1, 1, 10, 100, 200]
models = [linear_model.Ridge(alpha=strength) for strength in lambdas]
models

[Ridge(alpha=0.1),
 Ridge(alpha=1),
 Ridge(alpha=10),
 Ridge(alpha=100),
 Ridge(alpha=200)]

In [47]:
# Do k-fold ridge regression with the splits and models previously defined.
# For every model, compute the RMSE on each held-out fold and store the mean
# of those per-fold RMSEs in `results` (one entry per model, in the order of
# `models`).
#
# `math` and `statistics` are imported in the notebook's import cell.

# The splits do not depend on the model, so compute them once and reuse the
# exact same folds for every model instead of re-splitting inside the loop.
folds = list(kf.split(x))

results = []
for clf in models:
    fold_rmses = []
    for train_index, test_index in folds:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Refit the same estimator object on this fold's training part,
        # then score it on the held-out part.
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        mse = metrics.mean_squared_error(y_test, pred)
        fold_rmses.append(math.sqrt(mse))
    results.append(statistics.mean(fold_rmses))
results

[5.501809445057858,
 5.499838741278097,
 5.483631486072287,
 5.636642135414034,
 5.721233719861127]

In [46]:
# Persist the mean RMSE of each model to "submission.csv": one value per
# line, without a header row or an index column.

out = pd.DataFrame(data=results)
out.to_csv(path_or_buf="submission.csv", index=False, header=False)

Unnamed: 0,0
0,5.501809
1,5.499839
2,5.483631
3,5.636642
4,5.721234
