# Classification with linear models

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import sklearn as sk
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression

In [28]:
diabetes = sk.datasets.load_diabetes()
#both numpy arrays
#feature matrix
data = diabetes.data
#one dimensional numpy array
target = diabetes.target

In [29]:
print(f"Number of data points: {data.shape[0]}\nNumber of features: {data.shape[1]}")

Number of data points: 442
Number of features: 10


In [30]:
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [31]:
#Lets make train test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [32]:
def initialize_parameters(X):
    #create column vector of ones
    weights = np.random.randn(X.shape[1], 1) * 0.01
    bias = 0.0
    return weights, bias

In [33]:
def compute_predictions(X, weights, bias):
    #linear regression is as follows, y = Xw + b
    y = X @ weights + bias
    return y

In [34]:
w, b = initialize_parameters(X_train)
y_pred = compute_predictions(X_train, w, b)

print(f"Sanity checking y dimensions, should be ({X_train.shape[0]}, 1): {y_pred.shape}")
print(f"Y prediction vector: {y_pred}")


Sanity checking y dimensions, should be (353, 1): (353, 1)
Y prediction vector: [[ 6.30309906e-03]
 [ 2.11224862e-02]
 [ 8.93824155e-03]
 [-1.95940722e-02]
 [-2.05418860e-02]
 [-2.73292924e-02]
 [ 1.41996336e-02]
 [ 4.85991649e-02]
 [ 3.10024675e-02]
 [-4.87705811e-02]
 [-2.75634614e-02]
 [ 2.23129566e-02]
 [ 3.46133849e-02]
 [ 5.02373281e-02]
 [ 2.54292831e-03]
 [ 7.19576292e-03]
 [-1.52670819e-02]
 [-8.83790498e-03]
 [-1.00613303e-02]
 [-2.76178334e-02]
 [ 1.74347399e-02]
 [ 2.42842212e-02]
 [-1.43125213e-02]
 [-1.10558314e-02]
 [ 1.08689063e-02]
 [ 3.00860365e-02]
 [-3.02887254e-03]
 [ 1.22056914e-02]
 [ 3.60742478e-03]
 [-3.95520964e-03]
 [-5.65136148e-03]
 [ 5.62426981e-03]
 [-2.29043364e-02]
 [-9.25794358e-03]
 [-9.85210513e-03]
 [ 1.47803392e-02]
 [ 2.34759360e-02]
 [ 4.08583013e-04]
 [ 3.87791092e-02]
 [ 2.21860064e-02]
 [-2.74275439e-03]
 [-4.79768553e-02]
 [-1.43262036e-02]
 [ 6.46158530e-02]
 [ 5.13409671e-02]
 [ 1.53465486e-02]
 [ 2.32892140e-03]
 [ 4.39919693e-02]
 [ 2.504

In [35]:
def compute_loss(y_pred, y_true):
    #our loss function is the Mean Squared error
    y_true = y_true.reshape(-1, 1)
    n = y_true.shape[0]
    loss = (1 / n) * np.sum((y_pred - y_true) ** 2)
    return float(loss)

In [36]:
def derivative_w_wrt_loss(y_pred, y_true, X):
    '''
    The derivative of the loss function with respect to the weights
    L = 1/n * SIGMA((y_pred - y_true)**2)
    Plug in y_pred = x_i*w + b and y_true = y_i
    L = 1/n * SIGMA(x_i*w + b - y_i)**2
    dL/dw = 1/n * SIGMA(2*x_i*(x_i*w + b - y_i))
    dL/dw = 2/n * SIGMA((x_i*w + b - y_i) *x_i) #move the 2 out of the sum
    '''
    y_true = y_true.reshape(-1, 1)
    n = y_pred.shape[0]

    error = y_pred - y_true

    #X.T has shape (10, 353) and error has shape (353, 1) and dw should have shape (10, 1)
    gradient = 2/n * np.dot(X.T, (y_pred - y_true))
    print(f"Gradient shape: {gradient.shape}")
    return gradient

In [37]:
def derivative_b_wrt_loss(y_pred, y_true,X):
    '''
    The derivative of the loss function with respect to the bias
    L = 1/n * SIGMA((y_pred - y_true)**2)
    Plug in y_pred = x_i*w + b and y_true = y_i
    L = 1/n * SIGMA(x_i*w + b - y_i)**2
    dL/dw = 1/n * SIGMA(2(x_i*w + b - y_i))
    dL/dw = 2/n * SIGMA((x_i*w + b - y_i)) #move the 2 out of the sum
    '''
    y_true = y_true.reshape(-1, 1)
    n = y_pred.shape[0]
    error = y_pred - y_true
    gradient = 2/n * np.sum(error) #take the sum since b is a scalar
    return gradient

In [38]:
def train_via_gradient_descent(X_train, y_train, w, b, learning_rate=0.01, num_iterations=1000):

    # y_prev = y_pred
    for i in range(num_iterations):
        y_pred = compute_predictions(X_train, w, b)
        print(f"Iteration {i+1} loss: {compute_loss(y_pred, y_train)}")

        dw = derivative_w_wrt_loss(y_pred, y_train, X_train)
        db = derivative_b_wrt_loss(y_pred, y_train, X_train)
        w = w - learning_rate * dw
        b =  b - learning_rate *db
        y_pred = compute_predictions(X_train, w, b)

    return w, b, y_pred


In [39]:

w,b = initialize_parameters(X_train)
w_best , b_best, y_pred = train_via_gradient_descent(X_train, y_train, w, b, learning_rate=0.001, num_iterations=5000)

Iteration 1 loss: 29713.414188212773
Gradient shape: (10, 1)
Iteration 2 loss: 29583.799513182068
Gradient shape: (10, 1)
Iteration 3 loss: 29455.052009860006
Gradient shape: (10, 1)
Iteration 4 loss: 29327.162730501015
Gradient shape: (10, 1)
Iteration 5 loss: 29200.12284957375
Gradient shape: (10, 1)
Iteration 6 loss: 29073.923661906843
Gradient shape: (10, 1)
Iteration 7 loss: 28948.556580863788
Gradient shape: (10, 1)
Iteration 8 loss: 28824.013136546306
Gradient shape: (10, 1)
Iteration 9 loss: 28700.28497402576
Gradient shape: (10, 1)
Iteration 10 loss: 28577.36385160232
Gradient shape: (10, 1)
Iteration 11 loss: 28455.241639091237
Gradient shape: (10, 1)
Iteration 12 loss: 28333.910316135985
Gradient shape: (10, 1)
Iteration 13 loss: 28213.36197054771
Gradient shape: (10, 1)
Iteration 14 loss: 28093.588796670625
Gradient shape: (10, 1)
Iteration 15 loss: 27974.583093773006
Gradient shape: (10, 1)
Iteration 16 loss: 27856.337264463207
Gradient shape: (10, 1)
Iteration 17 loss: 27

In [None]:
print(w_best)
print(b_best)
print(y_pred)