In [1]:
%matplotlib notebook
from typing import Callable, Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

from matplotlib import cm

# Global seaborn theme for every figure in this notebook:
# scale fonts by 1.5, then switch to the white-grid style with dashed grid lines.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'grid.linestyle':'--'})

## Linear regression example

In [2]:
# Read the auto-mpg table and keep only the rows that have no missing values.
auto = pd.read_csv("../lecture_3/auto_mpg.csv").dropna()
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [3]:
# simple linear regression with the `statsmodels` library
# (ordinary least squares of `mpg` on `horsepower`, using the formula interface)
auto_model = smf.ols(formula='mpg ~ horsepower', data=auto)
auto_result = auto_model.fit()
# print the text summary of the fit
print(auto_result.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Sun, 10 Oct 2021   Prob (F-statistic):           7.03e-81
Time:                        23:05:53   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     39.9359      0.717     55.660      0.0

In [4]:
def linear_regression_loss(
    X: np.ndarray,
    y: np.ndarray,
    betas: np.ndarray,
) -> float:
    """Sum of squared residuals of the linear model ``X @ betas`` against ``y``.

    ``betas`` may be any array-like (e.g. a plain list); it is converted to an
    array before use.
    """
    coefficients = np.asanyarray(betas)
    residuals = y - X @ coefficients
    return np.sum(residuals ** 2)

In [5]:
# Design matrix: a column of ones (intercept term) next to the horsepower column.
X_auto = np.vstack([np.ones(len(auto)), auto["horsepower"].to_numpy()]).T
# Response vector: the `mpg` column.
y_auto = auto["mpg"].to_numpy()

In [6]:
# test run
# evaluate the loss once at a hand-picked [intercept, slope] pair
auto_betas = [50, -0.2]
linear_regression_loss(X=X_auto, y=y_auto, betas=auto_betas)

22974.22

In [7]:
# Hold the slope (beta_1) constant and sweep the intercept (beta_0) over a grid,
# recording the loss at every grid point.
auto_slope = -0.2
auto_beta_0s = np.linspace(start=30, stop=60, num=100)
auto_losses = [
    linear_regression_loss(X=X_auto, y=y_auto, betas=[intercept, auto_slope])
    for intercept in auto_beta_0s
]

# Draw the resulting one-dimensional loss curve.
plt.figure()
sns.lineplot(x=auto_beta_0s, y=auto_losses)
plt.xlabel("intercept")
plt.ylabel("Loss")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [8]:
def linear_regression_loss_gradient(
    X: np.ndarray,
    y: np.ndarray,
    betas: np.ndarray
) -> np.ndarray:
    """Gradient of the sum-of-squared-errors loss with respect to ``betas``.

    For the loss ``sum((y - X @ betas) ** 2)`` the gradient is
    ``-2 * X.T @ (y - X @ betas)``. This form works for any number of
    columns in ``X``; with a design matrix whose first column is all ones and
    whose second column is the single feature, it yields the usual
    ``[d/d_intercept, d/d_slope]`` pair.

    Parameters
    ----------
    X : design matrix, one row per observation and one column per coefficient.
    y : response vector, one entry per observation.
    betas : coefficient vector (any array-like), one entry per column of ``X``.

    Returns
    -------
    np.ndarray with one partial derivative per coefficient.
    """
    betas = np.asarray(betas)
    # Compute the residuals once and reuse them for every partial derivative.
    residuals = y - X @ betas
    return -2 * (X.T @ residuals)

In [9]:
def gradient_descent(
    X: np.ndarray,
    y: np.ndarray,
    initial_guess: Union[List, np.ndarray], 
    learning_rate: float,
    loss_function: Callable,
    gradient_function: Callable,
    verbose: bool = False,
    threshold: float = 1e-6,
    fix_guess: Union[Dict, None] = None,
    max_iterations: Union[int, None] = None,
) -> Tuple[List, List]:
    """Minimise a loss by fixed-step gradient descent.

    Parameters
    ----------
    X, y
        Data forwarded unchanged (as keywords ``X`` and ``y``) to
        ``loss_function`` and ``gradient_function``.
    initial_guess
        Starting parameter vector. It is copied; the caller's object is never
        modified.
    learning_rate
        Step size multiplying the gradient in every update.
    loss_function, gradient_function
        Callables invoked as ``f(X=X, y=y, betas=guess)``; they return the
        loss value and the gradient vector respectively.
    verbose
        If true, print the current guess and loss every 1000 iterations.
    threshold
        Iteration stops once the absolute change in loss between two
        consecutive steps is no larger than this value.
    fix_guess
        Optional ``{index: value}`` mapping. The listed parameters are reset to
        the given values after every update, i.e. they are held constant.
    max_iterations
        Optional cap on the number of updates. ``None`` (the default) means no
        cap, so only ``threshold`` ends the loop.

    Returns
    -------
    (guess_iter, losses_iter)
        The parameter vector (as a list) and the loss at every step, starting
        with the initial guess; both lists have the same length.
    """
    # Work on a float copy: the caller's `initial_guess` stays untouched and an
    # integer-typed guess cannot truncate pinned or updated values.
    guess_current = np.array(initial_guess, dtype=float)
    pinned = dict(fix_guess) if fix_guess else {}
    for k, v in pinned.items():
        guess_current[k] = v

    guess_iter = [list(guess_current)]
    losses_iter = [loss_function(X=X, y=y, betas=guess_current)]

    difference = float("inf")
    iteration_count = 0
    # Note: a NaN loss change also ends the loop, because the comparison with
    # `threshold` is then false.
    while abs(difference) > threshold:
        if max_iterations is not None and iteration_count >= max_iterations:
            break
        iteration_count += 1

        guess_next = guess_current - learning_rate * gradient_function(
            X=X, y=y, betas=guess_current)
        # Re-apply the pinned coordinates so they never drift.
        for k, v in pinned.items():
            guess_next[k] = v
        guess_iter.append(list(guess_next))

        losses_next = loss_function(X=X, y=y, betas=guess_next)
        difference = losses_next - losses_iter[-1]
        losses_iter.append(losses_next)

        # update guess
        guess_current = guess_next
        # to print out intermediate results
        if verbose and iteration_count % 1000 == 0:
            print(guess_next, losses_next)

    return guess_iter, losses_iter

In [10]:
# Gradient descent on the intercept only: `fix_guess={1: -0.2}` makes
# `gradient_descent` reset parameter 1 (the slope) to -0.2 after every update.
# With `verbose=True` the current guess and loss are printed every 1000 iterations.
auto_guess_iter, auto_losses_iter = gradient_descent(
    X=X_auto,
    y=y_auto,
    initial_guess=[60, -0.2], 
    learning_rate=1e-4,
    loss_function=linear_regression_loss,
    gradient_function=linear_regression_loss_gradient,
    verbose=True,
    threshold=1e-3,
    fix_guess={1: -0.2}
)

In [11]:
# NOTE(review): no plt.figure() is called here, so these calls draw on whatever
# figure is current — presumably the intercept/loss figure created earlier;
# confirm that overlaying on it is intended.
# plot the loss function
sns.lineplot(x=auto_beta_0s, y=auto_losses)

# plot the iterative updates
# (the [1:] slices skip the first entry, i.e. the initial guess and its loss)
sns.scatterplot(
    x=np.array(auto_guess_iter)[1:, 0], y=auto_losses_iter[1:], color="red", alpha=0.3)

plt.xlabel("intercept")
plt.ylabel("Loss")
plt.tight_layout()

In [12]:
# Loss surface over both parameters: evaluate the loss at every point of an
# (intercept, slope) grid.
auto_beta_1s = np.linspace(start=-0.3, stop=-0.1, num=100)
auto_X, auto_Y = np.meshgrid(auto_beta_0s, auto_beta_1s)
auto_losses_2d = np.zeros(shape=auto_X.shape)  # filled in point by point below
for cell in np.ndindex(auto_X.shape):
    auto_losses_2d[cell] = linear_regression_loss(
        X=X_auto, y=y_auto, betas=[auto_X[cell], auto_Y[cell]])

# 3D figure: reference point, translucent loss surface and contour lines.
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# Hard-coded reference minimum (presumably the statsmodels OLS estimates — verify).
auto_betas_min = [39.9359, -0.1578]
ax.scatter(
    xs=[auto_betas_min[0]],
    ys=[auto_betas_min[1]],
    zs=[linear_regression_loss(X=X_auto, y=y_auto, betas=auto_betas_min)],
    s=100,
    color="black",
)
# Loss surface and its contours.
surf = ax.plot_surface(auto_X, auto_Y, auto_losses_2d, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False, alpha=0.2)
plt.contour(auto_X, auto_Y, auto_losses_2d, levels=30, cmap=cm.coolwarm)

plt.tight_layout()

<IPython.core.display.Javascript object>

  return array(a, dtype, copy=False, order=order, subok=True)


In [13]:
%%time
# gradient descent with both intercept and slope
# with the following settings, it can take about 15 seconds to converge
auto_guess_iter, auto_losses_iter = gradient_descent(
    X=X_auto,
    y=y_auto,
    initial_guess=[50, -0.1], 
    learning_rate=2e-7,
    loss_function=linear_regression_loss,
    gradient_function=linear_regression_loss_gradient,
    threshold=1e-5,
)

CPU times: user 15.6 s, sys: 405 ms, total: 16 s
Wall time: 16.5 s


In [14]:
# make the 3d plot with the gradient descent path
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# plot the loss function
surf = ax.plot_surface(auto_X, auto_Y, auto_losses_2d, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False, alpha=0.2)
plt.contour(auto_X, auto_Y, auto_losses_2d, levels=30, cmap=cm.coolwarm)

# plot the path
# Only every `sample_rate`-th iterate is drawn to keep the scatter light.
sample_rate = 10
# Convert the (long) list of iterates to an array once instead of once per axis.
auto_path = np.array(auto_guess_iter)[::sample_rate]
# No `cmap` here: without `c=` values matplotlib ignores it, so passing it had
# no effect on the figure.
ax.scatter3D(
    xs=auto_path[:, 0],
    ys=auto_path[:, 1],
    zs=auto_losses_iter[::sample_rate],
    s=10,
)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [15]:
# Top-down view of the same descent path: loss contours with every
# `sample_rate`-th iterate drawn on top.
auto_path_sampled = np.array(auto_guess_iter)[::sample_rate]
plt.figure()
plt.contour(auto_X, auto_Y, auto_losses_2d, levels=30, cmap=cm.coolwarm)
plt.scatter(
    x=auto_path_sampled[:, 0],
    y=auto_path_sampled[:, 1],
    color="green",
)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x133073550>

## Logistic regression example

In [16]:
# Read the breast-cancer table and insert a binary `label` column at position 2:
# 0 where `diagnosis` is "B", 1 otherwise.
cancer = pd.read_csv("../lecture_4/breast_cancer_data.csv")
cancer_label = cancer["diagnosis"].ne("B").astype("int64")
cancer.insert(2, "label", cancer_label)
cancer.head()

Unnamed: 0,id,diagnosis,label,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,...,radius_extreme,texture_extreme,perimeter_extreme,area_extreme,smoothness_extreme,compactness_extreme,concavity_extreme,concave_extreme,symmetry_extreme,fractal_extreme
0,842302,M,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [17]:
# scatter the binary label against the mean radius on a fresh figure
plt.figure()
sns.scatterplot(x="radius_mean", y="label", data=cancer)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [18]:
# Design matrix: a column of ones (intercept term) next to the `radius_mean` column.
X_cancer = np.vstack([np.ones(len(cancer)), cancer["radius_mean"].to_numpy()]).T
# Binary target vector taken from the `label` column.
y_cancer = cancer["label"].to_numpy()

In [19]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise.

    Computed as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    large negative inputs do not overflow ``exp``; the result saturates to 0
    or 1 without raising an overflow warning.

    Parameters
    ----------
    x : scalar or np.ndarray of logits.

    Returns
    -------
    Value(s) in ``[0, 1]`` with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))

def logistic_regression_loss(
    X: np.ndarray,
    y: np.ndarray,
    betas: np.ndarray,
) -> float:
    """Cross-entropy loss of a logistic regression model.

    With logits ``z = X @ betas`` the loss
    ``-sum(y * log(s(z)) + (1 - y) * log(1 - s(z)))`` (``s`` being the logistic
    function) is algebraically equal to ``sum(log(1 + exp(z)) - y * z)``. The
    second form is used here because it never evaluates ``log(0)``: the naive
    form yields NaN (``0 * -inf``) or inf once the logistic function saturates
    to exactly 0 or 1.

    Parameters
    ----------
    X : design matrix, one row per observation and one column per coefficient.
    y : vector of 0/1 targets, one entry per observation.
    betas : coefficient vector (any array-like), one entry per column of ``X``.

    Returns
    -------
    The summed cross entropy over all observations.
    """
    betas = np.asarray(betas)
    logits = X @ betas
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    return np.sum(np.logaddexp(0, logits) - y * logits)

In [20]:
# Loss surface for the logistic model: evaluate the cross-entropy loss at every
# point of an (intercept, slope) grid.
cancer_beta_0s = np.linspace(start=-20, stop=-10, num=100)
cancer_beta_1s = np.linspace(start=0.5, stop=1.5, num=100)
cancer_X, cancer_Y = np.meshgrid(cancer_beta_0s, cancer_beta_1s)
cancer_losses_2d = np.zeros(shape=cancer_X.shape)  # filled in point by point below
for cell in np.ndindex(cancer_X.shape):
    cancer_losses_2d[cell] = logistic_regression_loss(
        X=X_cancer, y=y_cancer, betas=[cancer_X[cell], cancer_Y[cell]])

# 3D figure: reference point plus translucent loss surface.
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# Hard-coded reference minimum (presumably from a library logistic fit — verify).
cancer_betas_min = [-15.2459, 1.0336]
ax.scatter(
    xs=[cancer_betas_min[0]],
    ys=[cancer_betas_min[1]],
    zs=[logistic_regression_loss(X=X_cancer, y=y_cancer, betas=cancer_betas_min)],
    s=100,
    color="black",
)
# Loss surface.
surf = ax.plot_surface(cancer_X, cancer_Y, cancer_losses_2d, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False, alpha=0.2)

plt.tight_layout()

<IPython.core.display.Javascript object>

In [21]:
def logistic_regression_loss_gradient(
    X: np.ndarray,
    y: np.ndarray,
    betas: np.ndarray,
) -> np.ndarray:
    """Gradient of the logistic-regression cross-entropy loss w.r.t. ``betas``.

    With predicted probabilities ``p = sigmoid(X @ betas)`` the gradient of the
    summed cross entropy is ``X.T @ (p - y)``. This form works for any number
    of columns in ``X``; with a design matrix whose first column is all ones
    and whose second column is the single feature, it yields the
    ``[d/d_intercept, d/d_slope]`` pair.

    Parameters
    ----------
    X : design matrix, one row per observation and one column per coefficient.
    y : vector of 0/1 targets, one entry per observation.
    betas : coefficient vector (any array-like), one entry per column of ``X``.

    Returns
    -------
    np.ndarray with one partial derivative per coefficient.
    """
    betas = np.asarray(betas)
    # Evaluate the probabilities once and reuse them for every partial derivative.
    errors = sigmoid(X @ betas) - y
    return X.T @ errors

In [22]:
%%time
# gradient descent for logistic regression
# both parameters (intercept and slope) are updated; no `fix_guess` is passed
cancer_guess_iter, cancer_losses_iter = gradient_descent(
    X=X_cancer,
    y=y_cancer,
    initial_guess=[-20, 0.6], 
    learning_rate=1e-4,
    loss_function=logistic_regression_loss,
    gradient_function=logistic_regression_loss_gradient,
    threshold=1e-7,
)

CPU times: user 13.9 s, sys: 323 ms, total: 14.2 s
Wall time: 15.4 s


In [23]:
# 3D view of the logistic loss surface with the gradient-descent iterates on top.
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# Translucent loss surface.
surf = ax.plot_surface(cancer_X, cancer_Y, cancer_losses_2d, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False, alpha=0.2)

# Descent path; a sample rate of 1 keeps every iterate.
sample_rate = 1
cancer_path = np.array(cancer_guess_iter)[::sample_rate]
ax.scatter3D(
    xs=cancer_path[:, 0],
    ys=cancer_path[:, 1],
    zs=cancer_losses_iter[::sample_rate],
    s=10,
)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [24]:
# plot the fitted curve
# the last gradient-descent iterate is used as the fitted [intercept, slope]
cancer_betas_fitted = np.array(cancer_guess_iter[-1])
print(cancer_betas_fitted)

plt.figure()
sns.scatterplot(x="radius_mean", y="label", data=cancer)
# evenly spaced radius values at which the fitted curve is evaluated
x = np.linspace(start=5, stop=30, num=100)
# design matrix for that grid: a column of ones next to the radius values
X = np.vstack((np.ones(shape=len(x)), x.T)).T
# predicted probability at each grid point
y = sigmoid(X @ cancer_betas_fitted)
sns.lineplot(x=x, y=y, color="red")
plt.tight_layout()

[-15.3017522    1.03749494]


<IPython.core.display.Javascript object>