In [68]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import datasets
import matplotlib.pyplot as plt

# best to split the data into a train & test batch (75:25 or similar)

In [69]:
# using sklean diabetes dataset
data_X, data_y = datasets.load_diabetes(return_X_y=True)
n = 10
m = len(data_y)
# peaking into the dataset
print(data_X.shape, data_y.shape)
print(data_X[0:5], data_y[0:5])
print(n, m)
print(np.min(data_y), np.max(data_y), np.mean(data_y))

(442, 10) (442,)
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567061 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286377 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665645  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02269202 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187235  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03199144 -0.04664087]] [151.  75. 141. 206. 135.]
10 442
25.0 346.0 152.13348416289594


In [70]:
# function to predict y
def predict(weights, Xs):
    return weights[0] + weights[1:].T.dot(Xs)

# evaluate total cost (predictions of entire dataset)
def cost_function(weights, X):
    losses = []
    for i in range(m):
        losses.append((predict(weights, X[i])-data_y[i])**2)
    return (1/m)*np.sum(losses)

# get derivative with respect to each individual weight
def derive_cost(weights, X):
    H = 0.0000001
    derivatives = np.zeros(n+1)
    for i in range(n+1):
        temp = np.copy(weights)
        temp[i] = temp[i]+H
        a = cost_function(temp, X) - cost_function(weights, X)
        derivatives[i] = a/H
    return derivatives

In [71]:
def absolute_loss(weights, X):
    losses = np.zeros(m)
    for i in range(m):
        losses[i] = abs(predict(weights, X[i])-data_y[i])
    return losses

In [None]:
lr = 0.005
n_epochs = 1337

my_weights = np.ones(n + 1)
print(my_weights)

los = []
abslos = []

print(len(derive_cost(my_weights, data_X)))
for epoch in tqdm(range(n_epochs)):
    
    temp_weights = np.copy(my_weights)
    for i in range(len(temp_weights)):
        temp_weights[i] = temp_weights[i]-derive_cost(my_weights, data_X)[i]*lr
        
    los.append(cost_function(my_weights, data_X))
    abslos.append(absolute_loss(my_weights, data_X))
        
    my_weights = np.copy(temp_weights)

plt.plot(los)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
11


  5%|██                                       | 66/1337 [00:11<03:36,  5.88it/s]

In [73]:
from random import randint


def display_loss_stats():
    losses = absolute_loss(my_weights, data_X)
    print(np.min(losses), np.max(losses), np.mean(losses))

# evaluate results and compare with labels
print(my_weights)
evals = []
for i in range(m):
    evals.append(predict(my_weights, data_X[i]))
# quickly display n random comparisons using updated weights
for i in range(5):
    r = randint(0, m)
    print("prediction {} [{}] :".format(i, r), predict(my_weights, data_X[r]), data_y[r], end='')
    print("(absolute loss = {})".format(np.abs(predict(my_weights, data_X[r])-data_y[r])))

print(np.sum(absolute_loss(my_weights, data_X)))
display_loss_stats()

[57.43322858  1.31778729  1.07096571  1.99929907  1.75112059  1.35745655
  1.29281725  0.32702127  1.73133066  1.96315041  1.64972671]
prediction 0 [412] : 57.95657061565583 261.0(absolute loss = 203.04342938434417)
prediction 1 [286] : 56.799694995662044 60.0(absolute loss = 3.200305004337956)
prediction 2 [258] : 57.68275373964687 89.0(absolute loss = 31.31724626035313)
prediction 3 [262] : 58.062283676210775 308.0(absolute loss = 249.93771632378923)
prediction 4 [69] : 56.98093088871058 178.0(absolute loss = 121.01906911128941)
42580.19595967147
0.1717831297005361 288.38731482329064 96.3352849766323
