In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Read the raw CSV (first row is the header), then discard its first column,
# which the original author treated as a stored index rather than a feature.
raw = pd.read_csv('x06Simple.csv', header=0)
df = raw.drop(columns=raw.columns[0])
print(df)

    Age  Temp of Water  Length of Fish
0    14             25             620
1    28             25            1315
2    41             25            2120
3    55             25            2600
4    69             25            3110
5    83             25            3535
6    97             25            3935
7   111             25            4465
8   125             25            4530
9   139             25            4570
10  153             25            4600
11   14             27             625
12   28             27            1215
13   41             27            2110
14   55             27            2805
15   69             27            3255
16   83             27            4015
17   97             27            4315
18  111             27            4495
19  125             27            4535
20  139             27            4600
21  153             27            4600
22   14             29             590
23   28             29            1305
24   41             29   

In [3]:
# Randomly assign ~2/3 of the rows to training (fixed seed so the split is
# reproducible); every row that was not sampled becomes testing data.
train_df = df.sample(frac=0.667, random_state=0)
test_df = df.drop(index=train_df.index)

# Downstream cells work on plain numpy arrays, not DataFrames
train, test = train_df.to_numpy(), test_df.to_numpy()

# One print call, newline-separated, emits the same text as four separate prints
print("TRAINING DATA:", train, "\nTESTING DATA:", test, sep="\n")

TRAINING DATA:
[[ 125   29 4525]
 [  69   31 2710]
 [  83   29 3920]
 [  69   25 3110]
 [ 153   25 4600]
 [  55   29 2890]
 [  97   29 4515]
 [  14   27  625]
 [  83   31 3020]
 [ 139   29 4565]
 [ 111   29 4520]
 [ 139   27 4600]
 [ 125   31 3180]
 [  41   25 2120]
 [ 153   31 3214]
 [ 111   27 4495]
 [  69   27 3255]
 [  14   29  590]
 [  83   27 4015]
 [ 111   31 3040]
 [ 125   25 4530]
 [  41   27 2110]
 [  83   25 3535]
 [  97   27 4315]
 [ 153   29 4566]
 [  55   27 2805]
 [  41   31 1915]
 [ 111   25 4465]
 [  28   31 1205]]

TESTING DATA:
[[  14   25  620]
 [  28   25 1315]
 [  55   25 2600]
 [  97   25 3935]
 [ 139   25 4570]
 [  28   27 1215]
 [ 125   27 4535]
 [ 153   27 4600]
 [  28   29 1305]
 [  41   29 2140]
 [  69   29 3920]
 [  14   31  590]
 [  55   31 2140]
 [  97   31 3030]
 [ 139   31 3257]]


In [4]:
# Split the training array into features (X) and target (y), then standardize X
n_train, rowlen = train.shape

# Features: every column except the last one
train_x = train[:, :-1]

# Target: the last column, reshaped into an (n, 1) column matrix
train_y = train[:, -1].reshape(-1, 1)

# Standardize each feature column with the sample statistics (ddof=1) of the
# training set; the same mean/std are reused later to scale the test rows
train_x_mean = train_x.mean(axis=0)
train_x_std = train_x.std(axis=0, ddof=1)
train_x_scaled = (train_x - train_x_mean) / train_x_std

# Prepend a column of ones so the first coefficient acts as the bias (intercept)
train_x_norm = np.hstack((np.ones((n_train, 1)), train_x_scaled))

print("Normalized training X data:")
print(train_x_norm)
print("\nTraining Y data:")
print(train_y)

Normalized training X data:
[[ 1.          0.86499308  0.41184693]
 [ 1.         -0.47669628  1.33058239]
 [ 1.         -0.14127394  0.41184693]
 [ 1.         -0.47669628 -1.42562399]
 [ 1.          1.53583777 -1.42562399]
 [ 1.         -0.81211863  0.41184693]
 [ 1.          0.1941484   0.41184693]
 [ 1.         -1.79442691 -0.50688853]
 [ 1.         -0.14127394  1.33058239]
 [ 1.          1.20041542  0.41184693]
 [ 1.          0.52957074  0.41184693]
 [ 1.          1.20041542 -0.50688853]
 [ 1.          0.86499308  1.33058239]
 [ 1.         -1.14754097 -1.42562399]
 [ 1.          1.53583777  1.33058239]
 [ 1.          0.52957074 -0.50688853]
 [ 1.         -0.47669628 -0.50688853]
 [ 1.         -1.79442691  0.41184693]
 [ 1.         -0.14127394 -0.50688853]
 [ 1.          0.52957074  1.33058239]
 [ 1.          0.86499308 -1.42562399]
 [ 1.         -1.14754097 -0.50688853]
 [ 1.         -0.14127394 -1.42562399]
 [ 1.          0.1941484  -0.50688853]
 [ 1.          1.53583777  0.4118469

In [5]:
# Closed-form least-squares fit via the normal equations: (X^T X) theta = X^T y.
# Solve the linear system directly instead of forming the explicit inverse of
# X^T X; np.linalg.solve is cheaper and numerically better behaved than
# inv(...) @ ..., and it yields thetas with the same shape.
gram = train_x_norm.T @ train_x_norm
moment = train_x_norm.T @ train_y
thetas = np.linalg.solve(gram, moment)

# Print out final model: thetas[0] is the bias term, thetas[i] multiplies X_i.
# The translation table renders the feature index as a unicode subscript.
subscr = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
print("FINAL PREDICTION MODEL:")
print("y = " + str(thetas[0][0]), end='')
for i in range(1, len(thetas)):
    print(" + " + str(thetas[i][0]) + "*X" + str(i).translate(subscr), end='')
print()

FINAL PREDICTION MODEL:
y = 3343.2758620689656 + 1036.6301652311076*X₁ + -295.66859638525415*X₂


In [6]:
# Now we can use our test data and make some predictions in order to calculate
# root mean square error. All test rows are handled at once (vectorized)
# instead of looping row by row.

# Normalize our test data based on training data (same mean/std the model was fitted with)
test_x = test[:, 0:rowlen-1]
test_y = test[:, rowlen-1]
test_x_norm = (test_x - train_x_mean) / train_x_std

# Dont forget to add the bias!
test_x_norm = np.append(np.ones((test.shape[0], 1)), test_x_norm, 1)

# The matrix product can come back as an (n, 1) column when thetas is a column
# vector; flatten it so it lines up element-wise with the 1-D test_y
predicted = (test_x_norm @ thetas).ravel()
error = test_y - predicted

# Keep the aggregates as plain Python floats: the previous per-row loop left
# `mse` as a 1-element ndarray, and math.sqrt() on such an array relies on an
# implicit array-to-scalar conversion that NumPy has deprecated.
total_squared_error = float(np.sum(error**2))
mse = total_squared_error / test.shape[0]
rmse = math.sqrt(mse)
print("ROOT MEAN SQUARE ERROR:")
print(rmse)

ROOT MEAN SQUARE ERROR:
653.7601025967209
