# Classification with linear models

In [133]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import sklearn as sk
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression

In [134]:
diabetes = sk.datasets.load_diabetes()
#both numpy arrays
#feature matrix
data = diabetes.data
#one dimensional numpy array
target = diabetes.target

In [135]:
print(f"Number of data points: {data.shape[0]}\nNumber of features: {data.shape[1]}")

Number of data points: 442
Number of features: 10


In [136]:
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [137]:
#Lets make train test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [138]:
def initialize_parameters(X):
    #create column vector of ones
    weights = np.zeros((X_train.shape[1], 1))
    bias = 0.0
    return weights, bias

In [139]:
def compute_predictions(X, weights, bias):
    #linear regression is as follows, y = Xw + b
    y = X @ weights + bias
    return y

In [140]:
w, b = initialize_parameters(X_train)
y_pred = compute_predictions(X_train, w, b)

print(f"Sanity checking y dimensions, should be ({X_train.shape[0]}, 1): {y_pred.shape}")
print(f"Y prediction vector: {y_pred}")


Sanity checking y dimensions, should be (353, 1): (353, 1)
Y prediction vector: [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [

In [141]:
def compute_loss(y_pred, y_true):
    #our loss function is the Mean Squared error
    n = y_true.shape[0]
    loss = (1 / n) * np.sum((y_pred - y_true) ** 2)
    return float(loss)

In [142]:
def derivative_w_wrt_loss(y_pred, y_true, X):
    '''
    The derivative of the loss function with respect to the weights
    L = 1/n * SIGMA((y_pred - y_true)**2)
    Plug in y_pred = x_i*w + b and y_true = y_i
    L = 1/n * SIGMA(x_i*w + b - y_i)**2
    dL/dw = 1/n * SIGMA(2*x_i*(x_i*w + b - y_i))
    dL/dw = 2/n * SIGMA((x_i*w + b - y_i) *x_i) #move the 2 out of the sum
    '''
    n = y_pred.shape[0]
    error = y_pred - y_true
    #X.T has shape (10, 353) and error has shape (353, 1) and dw should have shape (10, 1)
    gradient = 2/n * np.dot(X.T, (y_pred - y_true))
    return gradient

In [143]:
def derivative_b_wrt_loss(y_pred, y_true,X):
    '''
    The derivative of the loss function with respect to the bias
    L = 1/n * SIGMA((y_pred - y_true)**2)
    Plug in y_pred = x_i*w + b and y_true = y_i
    L = 1/n * SIGMA(x_i*w + b - y_i)**2
    dL/dw = 1/n * SIGMA(2(x_i*w + b - y_i))
    dL/dw = 2/n * SIGMA((x_i*w + b - y_i)) #move the 2 out of the sum
    '''
    n = y_pred.shape[0]
    error = y_pred - y_true
    gradient = 2/n * (np.sum(error)) #take the sum since b is a scalar
    return gradient

In [144]:
def train_via_gradient_descent(X_train, y_train, w, b, learning_rate=0.01, num_iterations=1000):
    y_pred = compute_predictions(X_train, w, b)
    y_prev = y_pred
    for i in range(num_iterations):
        print(f"Iteration {i+1} loss: {compute_loss(y_pred, y_train)}")
        dw = derivative_w_wrt_loss(y_pred, y_train, X_train)
        db = derivative_b_wrt_loss(y_pred, y_train, X_train)
        w = w - learning_rate * dw
        b =  b - learning_rate *db
        y_pred = compute_predictions(X_train, w, b)

    return w, b, y_pred


In [145]:
w_best , b_best, y_pred = train_via_gradient_descent(X_train, y_train, w, b, learning_rate=0.005, num_iterations=1000)

Iteration 1 loss: 10488097.0
Iteration 2 loss: 55548499.72330002
Iteration 3 loss: 343975631.5148714
Iteration 4 loss: 2190168859.3995433
Iteration 5 loss: 14007467091.766544
Iteration 6 loss: 89648811347.32436
Iteration 7 loss: 573821491792.7247
Iteration 8 loss: 3672962402055.6816
Iteration 9 loss: 23510253454557.844
Iteration 10 loss: 150486769752518.84
Iteration 11 loss: 963250752924138.2
Iteration 12 loss: 6165671732807364.0
Iteration 13 loss: 3.946584818294191e+16
Iteration 14 loss: 2.5261694762260822e+17
Iteration 15 loss: 1.6169758200259674e+18
Iteration 16 loss: 1.0350100526392609e+19
Iteration 17 loss: 6.624995845937491e+19
Iteration 18 loss: 4.24059359102602e+20
Iteration 19 loss: 2.714361551679834e+21
Iteration 20 loss: 1.737435685614744e+22
Iteration 21 loss: 1.1121152080051409e+23
Iteration 22 loss: 7.118538234920117e+23
Iteration 23 loss: 4.5565051387900187e+24
Iteration 24 loss: 2.916573374288103e+25
Iteration 25 loss: 1.8668694511480744e+26
Iteration 26 loss: 1.1949644

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  loss = (1 / n) * np.sum((y_pred - y_true) ** 2)


Iteration 389 loss: inf
Iteration 390 loss: inf
Iteration 391 loss: inf
Iteration 392 loss: inf
Iteration 393 loss: inf
Iteration 394 loss: inf
Iteration 395 loss: inf
Iteration 396 loss: inf
Iteration 397 loss: inf
Iteration 398 loss: inf
Iteration 399 loss: inf
Iteration 400 loss: inf
Iteration 401 loss: inf
Iteration 402 loss: inf
Iteration 403 loss: inf
Iteration 404 loss: inf
Iteration 405 loss: inf
Iteration 406 loss: inf
Iteration 407 loss: inf
Iteration 408 loss: inf
Iteration 409 loss: inf
Iteration 410 loss: inf
Iteration 411 loss: inf
Iteration 412 loss: inf
Iteration 413 loss: inf
Iteration 414 loss: inf
Iteration 415 loss: inf
Iteration 416 loss: inf
Iteration 417 loss: inf
Iteration 418 loss: inf
Iteration 419 loss: inf
Iteration 420 loss: inf
Iteration 421 loss: inf
Iteration 422 loss: inf
Iteration 423 loss: inf
Iteration 424 loss: inf
Iteration 425 loss: inf
Iteration 426 loss: inf
Iteration 427 loss: inf
Iteration 428 loss: inf
Iteration 429 loss: inf
Iteration 430 lo

  gradient = 2/n *(X.T @ error)
  b =  b - learning_rate *db


Iteration 761 loss: nan
Iteration 762 loss: nan
Iteration 763 loss: nan
Iteration 764 loss: nan
Iteration 765 loss: nan
Iteration 766 loss: nan
Iteration 767 loss: nan
Iteration 768 loss: nan
Iteration 769 loss: nan
Iteration 770 loss: nan
Iteration 771 loss: nan
Iteration 772 loss: nan
Iteration 773 loss: nan
Iteration 774 loss: nan
Iteration 775 loss: nan
Iteration 776 loss: nan
Iteration 777 loss: nan
Iteration 778 loss: nan
Iteration 779 loss: nan
Iteration 780 loss: nan
Iteration 781 loss: nan
Iteration 782 loss: nan
Iteration 783 loss: nan
Iteration 784 loss: nan
Iteration 785 loss: nan
Iteration 786 loss: nan
Iteration 787 loss: nan
Iteration 788 loss: nan
Iteration 789 loss: nan
Iteration 790 loss: nan
Iteration 791 loss: nan
Iteration 792 loss: nan
Iteration 793 loss: nan
Iteration 794 loss: nan
Iteration 795 loss: nan
Iteration 796 loss: nan
Iteration 797 loss: nan
Iteration 798 loss: nan
Iteration 799 loss: nan
Iteration 800 loss: nan
Iteration 801 loss: nan
Iteration 802 lo