**Chapter 4 – Training Linear Models**

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)


## Random function 정리

In [2]:
# Standard normal distribution
np.random.randn(3)

array([ 0.49671415, -0.1382643 ,  0.64768854])

In [3]:
# Uniform distribution
np.random.rand(3)

array([ 0.15601864,  0.15599452,  0.05808361])

In [4]:
#random integers from low to high
np.random.randint(4)

3

In [5]:
#Randomly shuffle a sequence
np.random.permutation(3)

array([2, 1, 0])

# Linear regression using the Normal Equation

$y = 4+3x$ + Gaussian_noise

In [None]:
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

In [None]:
plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])
plt.show()

$y = \theta_01 +\theta_1x_1$

$\begin{equation*}
X=\begin{pmatrix}
1 & x[1] \\
1 & x[2] \\
1 & x[3]
\end{pmatrix}
\end{equation*}$

In [8]:
np.ones?

In [9]:
X_b = np.c_([np.ones(100,1),X]) # add x0 = 1 to each instance

TypeError: data type not understood

$\theta=(X^T X)^{-1} X^T y$

Use inv() function from Numpy's Linear Alegbra Module (np.linalg)
and dot() method for matrix multiplication

In [None]:
theta_best = ??

In [None]:
theta_best

In [None]:
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new]  # add x0 = 1 to each instance
y_predict = ??
y_predict

In [None]:
plt.plot(X_new, y_predict, "r-", linewidth=2, label="Predictions")
plt.plot(X, y, "b.")

plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.legend(loc="upper left", fontsize=14)
plt.axis([0, 2, 0, 15])
plt.show()

Use LinearRegression from sklearn.linear_model

In [None]:
from sklearn.?? import ??
lin_reg = ??()
lin_reg.??(X, y)
lin_reg.intercept_, lin_reg.coef_

predict X_new

In [None]:
lin_reg.??(X_new)

# Linear regression using batch gradient descent

\begin{align}
\nabla_{\theta}MSE(\theta) = \frac2mX^T(X\theta-y)\\
\theta^{n+1}=\theta^{n}-\eta\nabla_{\theta}MSE(\theta)
\end{align}

In [None]:
eta = 0.1
n_iterations = 1000
m = 100
theta = np.random.randn(2,1)

for iteration in range(n_iterations):
    gradients =??
    theta = ??

In [None]:
theta

In [None]:
X_new_b.dot(theta)

In [None]:
X_new_b

## 한줄 조건부
'Yes' if fruit == 'Apple' else 'No'

In [18]:
fruit = 'Apple'
if fruit == 'Apple':
    ans = True
else:
    ans = False

print(ans)

True


In [19]:
??
print(ans)

True


In [None]:
theta_path_bgd = []

def plot_gradient_descent(theta, eta, theta_path=None):
    m = len(X_b)
    plt.plot(X, y, "b.")
    n_iterations = 1000
    for iteration in range(n_iterations):
        if iteration < 10:
            y_predict = X_new_b.dot(theta)
            
            #set style as "b-" if iteration > 0 else set style as "r--"
            
            style = ??
            
            plt.plot(X_new, y_predict, style)
        elif iteration == n_iterations-1:
            y_predict = X_new_b.dot(theta)
            plt.plot(X_new, y_predict, 'k-')
            
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
        
        if theta_path is not None:
            theta_path.append(theta)
            
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 2, 0, 15])
    plt.title(r"$\eta = {}$".format(eta), fontsize=16)


In [None]:
np.random.seed(42)
theta = np.random.randn(2,1)  # random initialization

plt.figure(figsize=(10,4))
plt.subplot(131); plot_gradient_descent(theta, eta=0.02)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)
plt.subplot(133); plot_gradient_descent(theta, eta=0.5)

plt.show()

# Stochastic Gradient Descent

In [None]:
theta_path_sgd = []
m = len(X_b)
np.random.seed(42)

In [None]:
# eta_path=[]
# for epoch in range(n_epochs):
#     for i in range(m):
#         eta = learning_schedule(epoch * m + i)
#         eta_path.append(eta)
# plt.plot(eta_path)
# plt.show()

In [None]:
random_index = np.random.randint(m)
X_b[random_index].reshape(-1,2)

In [None]:
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters

def learning_schedule(t):
    return t0 / (t + t1)

theta = np.random.randn(2,1)  # random initialization

for epoch in range(n_epochs):
    for i in range(m):
        if epoch == 0 and i < 20:                    # not shown in the book
            y_predict = X_new_b.dot(theta)           # not shown
            style = "b-" if i > 0 else "r--"         # not shown
            plt.plot(X_new, y_predict, style)        # not shown
        
        # select a random index from 0 ~ m
        random_index = ??
        
        xi = X_b[random_index:random_index+1]
        yi = y[random_index:random_index+1]
        
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)                 # not shown

plt.plot(X, y, "b.")                                 # not shown
plt.xlabel("$x_1$", fontsize=18)                     # not shown
plt.ylabel("$y$", rotation=0, fontsize=18)           # not shown
plt.axis([0, 2, 0, 15])                              # not shown

plt.show()                                           # not shown

In [None]:
theta

Use SGDRegressor for stochastic gd

In [None]:
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter =50, penalty=None, eta0=0.1, random_state=42)
sgd_reg.fit(X, y.ravel())

In [None]:
sgd_reg.intercept_, sgd_reg.coef_

# Mini-batch gradient descent

In [None]:
theta_path_mgd = []

n_iterations = 50
minibatch_size = 20

np.random.seed(42)
theta = np.random.randn(2,1)  # random initialization

t0, t1 = 10, 1000
def learning_schedule(t):
    return t0 / (t + t1)

t = 0
for epoch in range(n_iterations):
    # shuffle indices
    shuffled_indices = ??(m)
    X_b_shuffled = X_b??
    y_shuffled = y??
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i+minibatch_size]
        yi = y_shuffled[i:i+minibatch_size]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)

In [None]:
theta

In [None]:
theta_path_bgd = np.array(theta_path_bgd)
theta_path_sgd = np.array(theta_path_sgd)
theta_path_mgd = np.array(theta_path_mgd)

In [None]:
plt.figure(figsize=(7,4))
plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], "r-s", linewidth=1, label="Stochastic")
plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], "g-+", linewidth=2, label="Mini-batch")
plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], "b-o", linewidth=3, label="Batch")
plt.legend(loc="upper left", fontsize=16)
plt.xlabel(r"$\theta_0$", fontsize=20)
plt.ylabel(r"$\theta_1$   ", fontsize=20, rotation=0)
plt.axis([2.5, 4.5, 2.3, 3.9])
plt.show()

# Polynomial regression

In [None]:
import numpy as np
import numpy.random as rnd

np.random.seed(42)

$y = 0.5X^2 + X + 2 +$Gaussian Noise

In [None]:
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + ??

In [None]:
plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])


feature extension with PolynomialFeatures

In [None]:
from sklearn.preprocessing import ??
poly_features = ??(degree=20, include_bias=False)
X_poly = poly_features.??(X)

Apply StandardScaler

In [None]:
from sklearn.preprocessing import ??
std_scaler = ??()
X_poly_scaled = std_scaler.??(X_poly)

sklearn Linear Regression

In [None]:
lin_reg = ??()
lin_reg.??(X_poly_scaled, y)

In [None]:
X_new=np.linspace(-3, 3, 100).reshape(100, 1)
X_new_poly = poly_features.??(X_new)
X_new_poly_scaled = std_scaler.??(X_new_poly)
y_new = lin_reg.??(X_new_poly_scaled)

plt.plot(X, y, "b.")
plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.legend(loc="upper left", fontsize=14)
plt.axis([-3, 3, 0, 10])
plt.show()

Make a Pipeline for PolynomialFeatures, StandardScaler, LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

degree = 20
polynomial_regression = Pipeline([
            ("poly_features",  ??(degree=degree, include_bias=False)),
            ("std_scaler", ??()),
            ("lin_reg", ??()),
        ])
polynomial_regression.??(X, y)
y_newbig = polynomial_regression.??(X_new)

plt.plot(X_new, y_newbig, "r-", label=str(degree), linewidth=2)
plt.plot(X, y, "b.")


plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])

# Regularized models

Ridge: $J(\theta)=MSE(\theta)+\alpha\frac12\sum\theta_i^2 $ 

Lasso: $J(\theta)=MSE(\theta)+\alpha\sum\lvert\theta_i\rvert $

Elastic Net: $J(\theta)=MSE(\theta)+r\alpha\sum\lvert\theta_i\rvert+(1-r)\alpha\frac12\sum\theta_i^2$

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

from sklearn.pipeline import Pipeline

degree = 20
alpha=1e-1
# model = Ridge(max_iter=10000, alpha=alpha)
# model = Lasso(alpha=alpha)
# model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
model = SGDRegressor(max_iter=10000, penalty='l2', eta0=0.001, l1_ratio=0.5, random_state=42)

polynomial_regression = Pipeline([
            ("poly_features",  PolynomialFeatures(degree=degree, include_bias=False)),
            ("std_scaler", StandardScaler()),
            ("model", model),
        ])
??.??(X, y)  #Train
y_newbig = ??.predict(X_new) # Predict

plt.plot(X_new, y_newbig, "r-", label=str(degree), linewidth=2)
plt.plot(X, y, "b.")


plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])

In [None]:
from sklearn.model_selection import ??          #split test 
from sklearn.metrics import mean_squared_error

np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)

In [None]:
X_train, X_val, y_train, y_val = ??(X[:50], y[:50].reshape(-1), test_size=0.5, random_state=10)
plt.plot(X_train, y_train,'k.')
plt.plot(X_val, y_val,'r>')
plt.legend(['Train', 'Test'])

In [None]:
poly_scaler = Pipeline([
        ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
        ("std_scaler", StandardScaler()),
    ])
X_train_poly_scaled = poly_scaler.??(X_train)
X_val_poly_scaled = poly_scaler.??(X_val)

In [None]:
sgd = SGDRegressor(max_iter =1,
                       penalty=None,                       
                       eta0=0.0005,
                       warm_start=True,
                       learning_rate="constant",
                       random_state=42)



In [None]:
from copy import deepcopy

n_epochs = 1000
train_errors, val_errors = [], []
# minimum_val_error = ??

for epoch in range(n_epochs):
    sgd.fit(X_train_poly_scaled, y_train)
    
    
    y_train_predict = sgd.predict(X_train_poly_scaled)
    
    
    y_val_predict = sgd.predict(X_val_poly_scaled)
    
    
    train_errors.append(mean_squared_error(y_train_predict, y_train))
    
    val_errors.append(mean_squared_error(y_val_predict, y_val))
    
    
#     val_error = mean_squared_error(y_val_predict_reg, y_val)
#     if ??
#         ??
#         best_model = ??(sgd_reg)
        
    

best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])


In [None]:
plt.annotate('Best model',
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
            )

plt.plot(np.sqrt(val_errors), "k-", linewidth=3, label="Validation set")
plt.plot(np.sqrt(train_errors), "k--", linewidth=2, label="Training set")

plt.legend(loc="upper right", fontsize=14)
plt.xlabel("Epoch", fontsize=14)
plt.ylabel("RMSE", fontsize=14)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].reshape(-1), test_size=0.5, random_state=10)
plt.plot(X_train, y_train,'k.')
plt.plot(X_val, y_val,'r>')

X_new=np.linspace(-3, 3, 100).reshape(100, 1)
X_new_scaled = ??(X_new)
y_new = ??(X_new_scaled)
plt.plot(X_new, y_new)
plt.axis([-3, 3, 0, 15])


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [None]:
t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5

# ignoring bias term
t1s = np.linspace(t1a, t1b, 500)
t2s = np.linspace(t2a, t2b, 500)
t1, t2 = np.meshgrid(t1s, t2s)
T = np.c_[t1.ravel(), t2.ravel()]
Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])
yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]

J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)

N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)
N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)

t_min_idx = np.unravel_index(np.argmin(J), J.shape)
t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]

t_init = np.array([[0.25], [-1]])

levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)

plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)
plt.plot(t1_min,t2_min,'r*')

In [None]:
plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)
plt.plot(t1_min,t2_min,'r*')

t_init = np.array([[0.25], [-1]])

def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):
    path = [theta]
    for iteration in range(n_iterations):
        gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta

        theta = theta - eta * gradients
        path.append(theta)
    return np.array(path)


path_J = bgd_path(t_init, Xr, yr, l1=0.1, l2=0)
plt.plot(path_J[:, 0], path_J[:, 1], "y-^")
