**This notebook works through overfitting in linear and logistic regression models and shows how regularization reduces high variance so that the models generalize better.**

In [11]:
# let's start by importing the required libraries

import numpy as np
import matplotlib.pyplot as plt
import copy, math

In [21]:
# let's start by defining the linear regression prediction function

def lin_pred_y(x, w, b):
    """Return the linear-model output ``np.dot(x, w) + b``.

    Args:
        x: Feature values; anything ``np.dot`` accepts as its first operand
           (a single example or a matrix of examples).
        w: Weights, combined with ``x`` through ``np.dot``.
        b: Bias term added to the dot product.

    Returns:
        The dot product of ``x`` and ``w`` shifted by ``b``.
    """
    return np.dot(x, w) + b

In [4]:
# now let's define the sigmoid function 

def sig_pred_y(z):
    """Apply the logistic sigmoid ``1 / (1 + e^(-z))`` to ``z``.

    Args:
        z: A scalar or NumPy array; ``np.exp`` is applied element-wise.

    Returns:
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

In [5]:
# now let's define the cost function for linear regression

def cost_lin_reg(x, y, w, b):
    """Compute the halved mean squared-error cost for linear regression.

    For every example ``i`` the prediction ``lin_pred_y(x[i], w, b)`` is
    compared with ``y[i]``; the squared errors are added up in example order
    and divided by ``2 * m`` where ``m = x.shape[0]``.

    Args:
        x: Examples, indexed along the first axis (``x.shape[0]`` examples).
        y: Targets, indexed the same way as ``x``.
        w: Weights passed to ``lin_pred_y``.
        b: Bias passed to ``lin_pred_y``.

    Returns:
        The cost ``sum_i (pred_i - y_i)^2 / (2 * m)``.
    """
    n_examples = x.shape[0]

    # Built-in sum adds left to right from 0, matching a manual accumulator.
    squared_error_total = sum(
        np.square(lin_pred_y(x[i], w, b) - y[i]) for i in range(n_examples)
    )

    return squared_error_total / (2 * n_examples)

In [6]:
# now let's define the cost function for logistic regression

def cost_log_reg(x, y, w, b):
    """Compute the mean binary cross-entropy cost for logistic regression.

    Args:
        x: Examples, indexed along the first axis (``x.shape[0]`` examples).
        y: Labels, indexed the same way as ``x``.
        w: Weights passed to ``lin_pred_y``.
        b: Bias passed to ``lin_pred_y``.

    Returns:
        The average over all examples of
        ``-y_i * log(p_i) - (1 - y_i) * log(1 - p_i)``,
        where ``p_i = sig_pred_y(lin_pred_y(x[i], w, b))``.
    """
    n_examples = x.shape[0]

    def example_loss(i):
        # Predicted probability for example i, then its cross-entropy loss.
        prob = sig_pred_y(lin_pred_y(x[i], w, b))
        return -y[i] * np.log(prob) - (1 - y[i]) * np.log(1 - prob)

    # Built-in sum adds left to right from 0, same as a manual accumulator.
    total_loss = sum(example_loss(i) for i in range(n_examples))

    return total_loss / n_examples

In [9]:
# now let's define the compute gradient function for linear regression

def compute_lin_gradient_fn(x, y, w, b):
    """Compute the gradient of the linear-regression cost w.r.t. ``w`` and ``b``.

    Args:
        x: 2-D array of examples; rows are examples, columns are features
           (the code indexes it as ``x[i, j]``).
        y: Targets, one per row of ``x``.
        w: Weights passed to ``lin_pred_y``.
        b: Bias passed to ``lin_pred_y``.

    Returns:
        ``(dj_dw, dj_db)``: the per-feature gradient array of length
        ``x.shape[1]`` and the bias gradient, both averaged over the examples.
    """
    n_examples, n_features = x.shape[0], x.shape[1]
    grad_w = np.zeros(n_features)
    grad_b = 0

    for row in range(n_examples):
        # Signed prediction error for this example.
        residual = lin_pred_y(x[row], w, b) - y[row]
        grad_b += residual
        for col in range(n_features):
            grad_w[col] += residual * x[row, col]

    # Average the accumulated sums over the number of examples.
    return grad_w / n_examples, grad_b / n_examples

In [53]:
# now let's define the function to compute the gradient for logistic regression

def compute_log_gradient_fn(x, y, w, b):
    """Compute the gradient of the logistic-regression cost w.r.t. ``w`` and ``b``.

    Args:
        x: 2-D array of examples; rows are examples, columns are features
           (the code indexes it as ``x[i, j]``).
        y: Labels, one per row of ``x``.
        w: Weights passed to ``lin_pred_y``.
        b: Bias passed to ``lin_pred_y``.

    Returns:
        ``(dj_dw, dj_db)``: the per-feature gradient array of length
        ``x.shape[1]`` and the bias gradient, both averaged over the examples.
    """
    n_examples, n_features = x.shape[0], x.shape[1]
    grad_w = np.zeros(n_features)
    grad_b = 0

    for row in range(n_examples):
        # Predicted probability minus the label for this example.
        residual = sig_pred_y(lin_pred_y(x[row], w, b)) - y[row]
        grad_b += residual
        for col in range(n_features):
            grad_w[col] += residual * x[row, col]

    # Average the accumulated sums over the number of examples.
    return grad_w / n_examples, grad_b / n_examples

In [14]:
# now let's define the function to run gradient descent for linear regression

def run_gradient_descent_lin_reg(x, y, w, b, n, a):
    """Run batch gradient descent for linear regression.

    Args:
        x: Examples, forwarded to ``compute_lin_gradient_fn`` and ``cost_lin_reg``.
        y: Targets, forwarded the same way.
        w: Initial weights; copied before use.
        b: Initial bias.
        n: Number of gradient-descent iterations.
        a: Learning rate that scales each gradient step.

    Returns:
        ``(w, b, cost_hist)``: the final weights, the final bias, and a list
        holding the cost after each of the ``n`` updates.
    """
    w_tmp = copy.deepcopy(w)  # work on a copy of the caller's weights
    b_tmp = b
    cost_hist = []

    for _ in range(n):
        dj_dw, dj_db = compute_lin_gradient_fn(x, y, w_tmp, b_tmp)
        w_tmp = w_tmp - a * dj_dw
        b_tmp = b_tmp - a * dj_db

        # Track the cost at the *updated* parameters. Evaluating it at the
        # initial w, b would record the same value on every iteration.
        cost_hist.append(cost_lin_reg(x, y, w_tmp, b_tmp))

    return w_tmp, b_tmp, cost_hist

In [15]:
# now let's define the function to run gradient descent for logistic regression

def run_gradient_descent_log_reg(x, y, w, b, n, a):
    """Run batch gradient descent for logistic regression.

    Args:
        x: Examples, forwarded to ``compute_log_gradient_fn`` and ``cost_log_reg``.
        y: Labels, forwarded the same way.
        w: Initial weights; copied before use.
        b: Initial bias.
        n: Number of gradient-descent iterations.
        a: Learning rate that scales each gradient step.

    Returns:
        ``(w, b, cost_hist)``: the final weights, the final bias, and a list
        holding the logistic cost after each of the ``n`` updates.
    """
    w_tmp = copy.deepcopy(w)  # work on a copy of the caller's weights
    b_tmp = b
    cost_hist = []

    for _ in range(n):
        dj_dw, dj_db = compute_log_gradient_fn(x, y, w_tmp, b_tmp)
        w_tmp = w_tmp - a * dj_dw
        b_tmp = b_tmp - a * dj_db

        # Track the logistic (cross-entropy) cost at the *updated* parameters.
        # The squared-error cost at the initial w, b would neither match the
        # objective being minimised nor change between iterations.
        cost_hist.append(cost_log_reg(x, y, w_tmp, b_tmp))

    return w_tmp, b_tmp, cost_hist

In [28]:
# now let's define the regularized cost function for linear regression

# to regularize the cost function we add (Y / (2*m)) * sum(w_j ** 2) over all weights w_j, where m is the number of examples
# Y here is lambda, the regularization strength

def reg_cost_lin_reg(x, y, w, b, Y=1.0):
    """Compute the L2-regularized squared-error cost for linear regression.

    Args:
        x: 2-D array of examples; ``x.shape[0]`` examples, ``x.shape[1]`` features.
        y: Targets, one per row of ``x``.
        w: Weights; the first ``x.shape[1]`` entries are penalized.
        b: Bias (not penalized).
        Y: Regularization strength (lambda). Defaults to 1.0.

    Returns:
        ``sum_i (pred_i - y_i)^2 / (2*m) + (Y / (2*m)) * sum_j w_j^2``
        with ``m = x.shape[0]``.
    """
    n_examples, n_features = x.shape[0], x.shape[1]

    # Data-fit term: halved mean of the squared prediction errors.
    squared_error_total = sum(
        np.square(lin_pred_y(x[i], w, b) - y[i]) for i in range(n_examples)
    )
    fit_cost = squared_error_total / (2 * n_examples)

    # Penalty term: scaled sum of squared weights (the bias is excluded).
    weight_sq_total = sum(np.square(w[j]) for j in range(n_features))
    penalty = (Y / (2 * n_examples)) * weight_sq_total

    return fit_cost + penalty

In [29]:
# now let's define the regularized cost function for logistic regression 
# to regularize the cost function we add (Y / (2*m)) * sum(w_j ** 2) over all weights w_j, where m is the number of examples
# Y here is lambda, the regularization strength

def reg_cost_log_reg(x, y, w, b, Y=1):
    """Compute the L2-regularized cross-entropy cost for logistic regression.

    Args:
        x: 2-D array of examples; ``x.shape[0]`` examples, ``x.shape[1]`` features.
        y: Labels, one per row of ``x``.
        w: Weights; the first ``x.shape[1]`` entries are penalized.
        b: Bias (not penalized).
        Y: Regularization strength (lambda). Defaults to 1.

    Returns:
        The mean cross-entropy loss plus ``(Y / (2*m)) * sum_j w_j^2``
        with ``m = x.shape[0]``.
    """
    n_examples, n_features = x.shape[0], x.shape[1]

    def example_loss(i):
        # Predicted probability for example i, then its cross-entropy loss.
        prob = sig_pred_y(lin_pred_y(x[i], w, b))
        return -y[i] * np.log(prob) - (1 - y[i]) * np.log(1 - prob)

    # Data-fit term: average loss over the examples.
    fit_cost = sum(example_loss(i) for i in range(n_examples)) / n_examples

    # Penalty term: scaled sum of squared weights (the bias is excluded).
    weight_sq_total = sum(np.square(w[j]) for j in range(n_features))
    penalty = (Y / (2 * n_examples)) * weight_sq_total

    return fit_cost + penalty

In [30]:
# let's test our regularized cost function for linear regression
# on a small, seeded random problem (5 examples, 6 features)

np.random.seed(1)
X_tmp = np.random.rand(5, 6)
y_tmp = np.array([0, 1, 0, 1, 0])
w_tmp = np.random.rand(X_tmp.shape[1]) - 0.5  # shift the weights into [-0.5, 0.5)
b_tmp = 0.5
lambda_tmp = 0.7
cost_tmp = reg_cost_lin_reg(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp)

print("Regularized cost:", cost_tmp)

Regularized cost: 0.07917239320214277


In [31]:
# now let's test the regularized cost function for logistic regression
# on the same seeded random problem (5 examples, 6 features)

np.random.seed(1)
X_tmp = np.random.rand(5, 6)
y_tmp = np.array([0, 1, 0, 1, 0])
w_tmp = np.random.rand(X_tmp.shape[1]) - 0.5  # shift the weights into [-0.5, 0.5)
b_tmp = 0.5
lambda_tmp = 0.7
cost_tmp = reg_cost_log_reg(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp)

print("Regularized cost:", cost_tmp)

Regularized cost: 0.6850849138741673


In [46]:
# now let's define the compute gradient function for regularized linear regression
# differentiating the regularization term adds (Y/m) * w[j] to dj_dw[j]; the bias b is not regularized, so dj_db is unchanged
def compute_lin_gradient_reg_fn(x, y, w, b, Y=1.0):
    """Compute the gradient of the L2-regularized linear-regression cost.

    Args:
        x: 2-D array of examples; rows are examples, columns are features
           (the code indexes it as ``x[i, j]``).
        y: Targets, one per row of ``x``.
        w: Weights; the first ``x.shape[1]`` entries feed the penalty term.
        b: Bias passed to ``lin_pred_y``.
        Y: Regularization strength (lambda). Defaults to 1.0.

    Returns:
        ``(dj_db, dj_dw)``. The bias gradient comes FIRST here, which is the
        reverse of ``compute_lin_gradient_fn``'s ``(dj_dw, dj_db)`` order.
    """
    n_examples, n_features = x.shape[0], x.shape[1]
    grad_w = np.zeros(n_features)
    grad_b = 0

    for row in range(n_examples):
        # Signed prediction error for this example.
        residual = lin_pred_y(x[row], w, b) - y[row]
        grad_b += residual
        for col in range(n_features):
            grad_w[col] += residual * x[row, col]

    # Average the accumulated sums over the number of examples.
    grad_w = grad_w / n_examples
    grad_b = grad_b / n_examples

    # Add the derivative of the L2 penalty; only the weights are penalized.
    penalty_scale = Y / n_examples
    for col in range(n_features):
        grad_w[col] += penalty_scale * w[col]

    return grad_b, grad_w

In [47]:
# let's put the regularized linear-regression gradient function to the test
# on a small, seeded random problem (5 examples, 3 features)

np.random.seed(1)
X_tmp = np.random.rand(5, 3)
y_tmp = np.array([0, 1, 0, 1, 0])
w_tmp = np.random.rand(X_tmp.shape[1])
b_tmp = 0.5
lambda_tmp = 0.7
# note the return order: bias gradient first, then the weight gradients
dj_db_tmp, dj_dw_tmp = compute_lin_gradient_reg_fn(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp)

print(f"dj_db: {dj_db_tmp}")
print(f"Regularized dj_dw:\n {dj_dw_tmp.tolist()}")

dj_db: 0.6648774569425726
Regularized dj_dw:
 [0.29653214748822276, 0.4911679625918033, 0.21645877535865857]


In [51]:
# now let's define the regularized gradient function for logistic regression

def compute_log_gradient_reg_fn(x, y, w, b, Y=1.):
    """Compute the gradient of the L2-regularized logistic-regression cost.

    Args:
        x: 2-D array of examples; rows are examples, columns are features
           (the code indexes it as ``x[i, j]``).
        y: Labels, one per row of ``x``.
        w: Weights; the first ``x.shape[1]`` entries feed the penalty term.
        b: Bias passed to ``lin_pred_y``.
        Y: Regularization strength (lambda). Defaults to 1.0.

    Returns:
        ``(dj_db, dj_dw)``. The bias gradient comes FIRST here, which is the
        reverse of ``compute_log_gradient_fn``'s ``(dj_dw, dj_db)`` order.
    """
    n_examples, n_features = x.shape[0], x.shape[1]
    grad_w = np.zeros(n_features)
    grad_b = 0

    for row in range(n_examples):
        # Predicted probability minus the label for this example.
        residual = sig_pred_y(lin_pred_y(x[row], w, b)) - y[row]
        grad_b += residual
        for col in range(n_features):
            grad_w[col] += residual * x[row, col]

    # Average the accumulated sums over the number of examples.
    grad_w = grad_w / n_examples
    grad_b = grad_b / n_examples

    # Add the derivative of the L2 penalty; only the weights are penalized.
    penalty_scale = Y / n_examples
    for col in range(n_features):
        grad_w[col] += penalty_scale * w[col]

    return grad_b, grad_w

In [52]:
# check the regularized logistic-regression gradient on a small, seeded random problem
np.random.seed(1)
X_tmp = np.random.rand(5, 3)
y_tmp = np.array([0, 1, 0, 1, 0])
w_tmp = np.random.rand(X_tmp.shape[1])
b_tmp = 0.5
lambda_tmp = 0.7
# note the return order: bias gradient first, then the weight gradients
dj_db_tmp, dj_dw_tmp = compute_log_gradient_reg_fn(X_tmp, y_tmp, w_tmp, b_tmp, lambda_tmp)

print(f"dj_db: {dj_db_tmp}")
print(f"Regularized dj_dw:\n {dj_dw_tmp.tolist()}")

dj_db: 0.341798994972791
Regularized dj_dw:
 [0.17380012933994293, 0.32007507881566943, 0.10776313396851499]
