In [1]:
import numpy as np
import copy
import math

## Logistic Model Prediction (+Sigmoid or Logistic Function)
- We would like the predictions of our classification model to be between 0 and 1 since our output variable $y$ is either 0 or 1. 
- This can be accomplished by using a "sigmoid function" which maps all input values to values between 0 and 1. 

$$g(z) = \frac{1}{1+e^{-z}}$$

 A logistic regression model applies the sigmoid to the familiar linear regression model as shown below:

$$ f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = g(\mathbf{w} \cdot \mathbf{x}^{(i)} + b ) = \frac{1}{1+e^{-[\mathbf{w} \cdot \mathbf{x}^{(i)} + b]}}\tag{3} $$

## Logistic Loss Function

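It is natural to consider linear regression's squared-error cost function for logistic regression as well. However, $f_{\mathbf{w},b}(\mathbf{x})$ now has a non-linear component, the sigmoid function, and the squared error doesn't handle predictions confined between 0 and 1 well: the resulting cost surface is non-convex, so gradient descent can get stuck in local minima.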

>**Definition Note:**   In this course, these definitions are used:  
**Loss** is a measure of the difference of a single example to its target value while the  
**Cost** is a measure of the losses over the training set


Logistic Regression uses a loss function more suited to the task of categorization where the target is 0 or 1 rather than any number. 

This is defined: 
* $loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)})$ is the loss for a single data point, which is:

\begin{equation}
  loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)}) = \begin{cases}
    - \log\left(f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) & \text{if $y^{(i)}=1$}\\
    - \log \left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) & \text{if $y^{(i)}=0$}
  \end{cases}
\end{equation}


*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target value.

*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = g(\mathbf{w} \cdot\mathbf{x}^{(i)}+b)$ where function $g$ is the sigmoid function.

The defining feature of this loss function is the fact that it uses two separate curves: one for the case when the target is zero ($y=0$) and another for when the target is one ($y=1$). Combined, these curves provide the behavior useful for a loss function, namely, being zero when the prediction matches the target and rapidly increasing in value as the prediction differs from the target.

The loss function above can be rewritten as a single expression that is easier to implement, although the equation looks rather formidable:
    $$loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)}) = -y^{(i)} \log\left(f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) - \left( 1 - y^{(i)}\right) \log \left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right)$$

## Gradient Descent for Logistic Regression
 - The gradient functions for linear and logistic regression are very similar. They differ only in the implementation of $f_{wb}$.

In [2]:
def sigmoid(z):
    """Apply the logistic (sigmoid) function element-wise.

    Args:
        z (scalar or ndarray): Input value(s); an array may have any shape.

    Returns:
        g (scalar or ndarray): 1 / (1 + exp(-z)), with the same shape as z
    """
    # exp(-z) is the only non-trivial term; name it so the formula reads
    # exactly like g(z) = 1 / (1 + e^{-z}).
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

In [3]:
def compute_cost_logistic(X, y, w, b):
    """Computes the logistic-regression cost (mean of the per-example losses)

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values (0 or 1)
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter

    Returns:
      cost (scalar): average over the m examples of
                     -y*log(f) - (1-y)*log(1-f), where f = sigmoid(X.w + b)
    """

    m = X.shape[0]
    # Flatten the targets so a column vector (m,1) cannot silently broadcast
    # against the (m,) predictions into an (m,m) matrix.
    y = np.asarray(y).reshape(-1)

    f_wb = sigmoid(np.dot(X, w) + b)                          # (m,) predictions
    losses = -y * np.log(f_wb) - (1 - y) * np.log(1 - f_wb)   # (m,) per-example loss

    # The cost is the accumulated loss divided by the number of examples.
    total_cost = np.sum(losses) / m
    return total_cost

In [4]:
def compute_gradient_logistic(X, y, w, b): 
    """Computes the gradient of the cost for logistic regression
 
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
    Returns (in this order)
      dj_db (scalar)      : The gradient of the cost w.r.t. the parameter b. 
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. 
    """

    m = X.shape[0]
    # Flatten the targets so a column vector (m,1) cannot silently broadcast
    # against the (m,) predictions into an (m,m) matrix.
    y = np.asarray(y).reshape(-1)

    err = sigmoid(np.dot(X, w) + b) - y                 # (m,) prediction error
    dj_dw = np.dot(err, X) / m                          # (n,) sum_i err_i * X[i, :]
    dj_db = np.sum(err) / m                             # scalar

    return dj_db, dj_dw

# # UNQ_C3
# # GRADED FUNCTION: compute_gradient
# def compute_gradient(X, y, w, b, *argv): 
#     """
#     Computes the gradient for logistic regression 
 
#     Args:
#       X : (ndarray Shape (m,n)) data, m examples by n features
#       y : (ndarray Shape (m,))  target value 
#       w : (ndarray Shape (n,))  values of parameters of the model      
#       b : (scalar)              value of bias parameter of the model
#       *argv : unused, for compatibility with regularized version below
#     Returns
#       dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. 
#       dj_db : (scalar)             The gradient of the cost w.r.t. the parameter b. 
#     """
#     m, n = X.shape
#     dj_dw = np.zeros(w.shape)
#     dj_db = 0.

#     ### START CODE HERE ### 
#     for i in range(m):
#         z_wb = 0
#         for j in range(n):
#             z_wb_ij = X[i, j] * w[j]
#             z_wb += z_wb_ij
#         z_wb += b
#         f_wb = sigmoid(z_wb)
        
#         dj_db_i = f_wb - y[i]
#         dj_db += dj_db_i
        
#         for j in range(n):
#             dj_dw[j] += dj_db_i * X[i, j]
            
#     dj_dw = dj_dw / m
#     dj_db = dj_db / m
#     ### END CODE HERE ###
        
#     return dj_db, dj_dw

In [None]:
def gradient_descent(X, y, w_in, b_in, alpha, num_iters): 
    """Performs batch gradient descent
    
    Args:
      X (ndarray (m,n)   : Data, m examples with n features
      y (ndarray (m,))   : target values
      w_in (ndarray (n,)): Initial values of model parameters  
      b_in (scalar)      : Initial values of model parameter
      alpha (float)      : Learning rate
      num_iters (scalar) : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,))   : Updated values of parameters
      b (scalar)         : Updated value of parameter 
      J_history (list)   : Cost after each update, recorded for at most the
                           first 100000 iterations (primarily for graphing)
    """

    max_history = 100000                      # cap on stored costs; prevents resource exhaustion
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in

    # Report roughly 10 times over the run (every iteration if num_iters < 10).
    # Loop-invariant, so computed once; unused when num_iters == 0.
    print_every = math.ceil(num_iters / 10)
    
    for i in range(num_iters):
        # Calculate the gradient and update the parameters
        dj_db, dj_dw = compute_gradient_logistic(X, y, w, b)   

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               

        save_cost = i < max_history
        report = i % print_every == 0

        # Evaluate the cost only when it is stored or printed. The printed
        # value is always the current cost, even after the history cap has
        # been reached (previously a stale J_history[-1] was shown).
        if save_cost or report:
            cost = compute_cost_logistic(X, y, w, b)
            if save_cost:
                J_history.append(cost)
            if report:
                print(f"Iteration {i:4d}: Cost {cost}   ")
        
    return w, b, J_history         #return final w,b and J history for graphing

# UNQ_C4
# GRADED FUNCTION: predict

def predict(X, w, b): 
    """
    Classify every example as 0 or 1 with a logistic regression model
    defined by the parameters w and b
    
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model

    Returns:
      p : (ndarray (m,)) 1.0 where sigmoid(w . x + b) >= 0.5, otherwise 0.0
    """
    num_examples, num_features = X.shape
    p = np.zeros(num_examples)

    ### START CODE HERE ### 
    for row in range(num_examples):
        # Build the linear term w . x one feature at a time, then add the bias
        logit = 0
        for col in range(num_features):
            logit += X[row, col] * w[col]
        logit += b

        # A probability at or above the 0.5 threshold is labelled 1
        if sigmoid(logit) >= 0.5:
            p[row] = 1
        else:
            p[row] = 0

    ### END CODE HERE ### 
    return p
