In [11]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import math

In [12]:
# Read the fish dataset (first CSV row is the header) and discard the leading
# column, which only holds the exported row index.
df = pd.read_csv('x06Simple.csv', header=0)
df = df.drop(columns=df.columns[0])
print(df)

    Age  Temp of Water  Length of Fish
0    14             25             620
1    28             25            1315
2    41             25            2120
3    55             25            2600
4    69             25            3110
5    83             25            3535
6    97             25            3935
7   111             25            4465
8   125             25            4530
9   139             25            4570
10  153             25            4600
11   14             27             625
12   28             27            1215
13   41             27            2110
14   55             27            2805
15   69             27            3255
16   83             27            4015
17   97             27            4315
18  111             27            4495
19  125             27            4535
20  139             27            4600
21  153             27            4600
22   14             29             590
23   28             29            1305
24   41             29   

In [13]:
# Convert the DataFrame into a NumPy ndarray (rows = samples, last column = target)
# so the cross-validation helpers can select rows by integer index arrays.
data = df.to_numpy()
print(data)

[[  14   25  620]
 [  28   25 1315]
 [  41   25 2120]
 [  55   25 2600]
 [  69   25 3110]
 [  83   25 3535]
 [  97   25 3935]
 [ 111   25 4465]
 [ 125   25 4530]
 [ 139   25 4570]
 [ 153   25 4600]
 [  14   27  625]
 [  28   27 1215]
 [  41   27 2110]
 [  55   27 2805]
 [  69   27 3255]
 [  83   27 4015]
 [  97   27 4315]
 [ 111   27 4495]
 [ 125   27 4535]
 [ 139   27 4600]
 [ 153   27 4600]
 [  14   29  590]
 [  28   29 1305]
 [  41   29 2140]
 [  55   29 2890]
 [  69   29 3920]
 [  83   29 3920]
 [  97   29 4515]
 [ 111   29 4520]
 [ 125   29 4525]
 [ 139   29 4565]
 [ 153   29 4566]
 [  14   31  590]
 [  28   31 1205]
 [  41   31 1915]
 [  55   31 2140]
 [  69   31 2710]
 [  83   31 3020]
 [  97   31 3030]
 [ 111   31 3040]
 [ 125   31 3180]
 [ 139   31 3257]
 [ 153   31 3214]]


In [15]:
def linear_regression(train, test):
    """Fit closed-form least squares on ``train`` and return the RMSE on ``test``.

    Both inputs are 2-D arrays with the same number of columns. Every column
    except the last is a feature; the last column is the target. Features are
    standardized with the mean and sample standard deviation (``ddof=1``) of
    the *training* rows only, and a bias column of ones is prepended before
    solving the normal equations.

    Parameters
    ----------
    train : array-like, shape (n_train, n_columns)
        Rows used to estimate the standardization statistics and the weights.
    test : array-like, shape (n_test, n_columns)
        Rows used to score the fitted model.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``test``.

    Raises
    ------
    ValueError
        If ``train`` and ``test`` have different column counts, or ``test``
        has no rows.
    numpy.linalg.LinAlgError
        If the normal-equation matrix is singular.
    """
    train = np.asarray(train)
    test = np.asarray(test)

    if train.shape[1] != test.shape[1]:
        raise ValueError("train and test must have the same number of columns")
    if test.shape[0] == 0:
        raise ValueError("test must contain at least one row")

    # Slicing with a range end is exclusive, so [:, :target_col] is every
    # feature column and [:, target_col] is the single target column.
    target_col = train.shape[1] - 1
    train_x, train_y = train[:, :target_col], train[:, target_col]
    test_x, test_y = test[:, :target_col], test[:, target_col]

    # Standardization statistics come from the training rows only, so no
    # information from the test rows leaks into the fit.
    train_x_mean = np.mean(train_x, axis=0)
    train_x_std = np.std(train_x, axis=0, ddof=1)

    def design_matrix(features):
        # Standardize with the training statistics, then prepend the bias column.
        scaled = (features - train_x_mean) / train_x_std
        return np.column_stack((np.ones(scaled.shape[0]), scaled))

    # Normal equations (X^T X) theta = X^T y, solved directly rather than by
    # forming an explicit inverse.
    train_design = design_matrix(train_x)
    thetas = np.linalg.solve(train_design.T @ train_design, train_design.T @ train_y)

    # Score every test row in one matrix product.
    residuals = test_y - design_matrix(test_x) @ thetas

    # Reduce to a plain Python float before math.sqrt; handing it a 1-element
    # array would rely on NumPy's deprecated array-to-scalar conversion.
    mse = float(np.mean(residuals ** 2))
    return math.sqrt(mse)

In [16]:
def sfolds(s, ds, out=False, random_state=None):
    """Run one shuffled S-fold cross-validation and return the mean RMSE.

    ``ds`` is split into ``s`` folds with ``KFold(shuffle=True)``. For each
    fold, ``linear_regression`` is fitted on the remaining rows and scored on
    the held-out rows.

    Parameters
    ----------
    s : int
        Number of folds.
    ds : numpy.ndarray
        Dataset indexed by the integer row arrays that ``KFold.split`` yields;
        passed row-wise to ``linear_regression``.
    out : bool, optional
        When True, print ``s`` and the mean RMSE.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KFold`` to make the shuffle reproducible. The default
        ``None`` keeps the previous behaviour (a different shuffle per call).

    Returns
    -------
    float
        Mean of the per-fold RMSE values. Only the mean is returned; no
        standard deviation is computed here.
    """
    kf = KFold(n_splits=s, shuffle=True, random_state=random_state)

    # One RMSE per fold: fit on the training rows, score on the held-out rows.
    rmse_vec = [
        linear_regression(ds[train_index], ds[test_index])
        for train_index, test_index in kf.split(ds)
    ]

    mean = np.mean(np.array(rmse_vec))

    if out:
        print("\nS: " + str(s))
        print("Mean: " + str(mean))

    return mean

In [17]:
def run_n_sfolds(s, ds, n=20):
    """Repeat S-fold cross-validation ``n`` times and summarize the mean RMSEs.

    Calls ``sfolds(s, ds)`` ``n`` times, then prints and returns the average
    of the per-run mean RMSEs together with their sample standard deviation
    (``ddof=1``).

    Parameters
    ----------
    s : int
        Number of folds passed to ``sfolds``.
    ds : numpy.ndarray
        Dataset passed to ``sfolds``.
    n : int, optional
        Number of repetitions (default 20).

    Returns
    -------
    tuple
        ``(average_mean_rmse, std_dev_of_mean_rmse)``. The standard deviation
        is NaN when fewer than two repetitions were run, because the sample
        standard deviation is undefined there.
    """
    # Collect the mean RMSE of each independent cross-validation run.
    means = np.array([sfolds(s, ds) for _ in range(n)])

    avg_mean = np.mean(means)
    # ddof=1 needs at least two samples; report NaN directly instead of
    # triggering NumPy's degrees-of-freedom warning.
    std_dev = np.std(means, ddof=1) if len(means) > 1 else float("nan")
    avg_sf_tuple = avg_mean, std_dev

    print("\nS: " + str(s))
    print("Mean: " + str(avg_sf_tuple[0]))
    print("Std Dev: " + str(avg_sf_tuple[1]))

    return avg_sf_tuple

In [22]:
# Seed NumPy's global RNG so the shuffled folds (KFold is created without a
# random_state, so it draws from np.random's global state) and therefore the
# reported numbers are reproducible on Restart & Run All.
np.random.seed(0)

# Compare fold counts from 3 up to leave-one-out (one fold per row).
for fold_count in (3, 5, 10, 20, len(data)):
    run_n_sfolds(fold_count, data)


S: 3
Mean: 628.1942701793475
Std Dev: 43.375948585004636

S: 5
Mean: 614.4233435971977
Std Dev: 23.52840465428092

S: 10
Mean: 592.2226575532882
Std Dev: 21.187269290607762

S: 20
Mean: 558.614218899307
Std Dev: 11.686914031251684

S: 44
Mean: 493.4732598766561
Std Dev: 1.7398563456515857e-13
