In [None]:
# LINEAR REGRESSION USING NORMAL EQUATION
# Fit Scikit-Learn's LinearRegression on (X, y), look at the learned parameters, then
# predict for X_new. X, y and X_new must already be defined; they are not created here.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
lin_reg = LinearRegression().fit(X, y)
# NOTE: Jupyter only displays the value of a cell's last expression, so this tuple is
# evaluated but not shown.
lin_reg.intercept_, lin_reg.coef_
lin_reg.predict(X_new)

In [None]:
#BATCH GRADIENT DESCENT
# Repeatedly move theta against the gradient of the MSE cost, computed on the whole
# training set at every iteration. X_b and y must already be defined by an earlier cell
# (presumably X_b is X with a bias column prepended -- TODO confirm).
eta = 0.1  # learning rate
n_iterations = 1000
m = len(X_b)  # number of training instances, taken from the data instead of a literal 100
theta = np.random.randn(2, 1)  # random initialization; assumes X_b has two columns
for iteration in range(n_iterations):
    # Both statements belong to the loop body and therefore must be indented.
    gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients

In [None]:
# When the cost function is very irregular, the randomness of Stochastic Gradient Descent can actually
# help the algorithm jump out of local minima, so Stochastic Gradient Descent has a better chance of
# finding the global minimum than Batch Gradient Descent does.
# Therefore randomness is good to escape from local optima, but bad because it means that the algorithm
# can never settle at the minimum. One solution to this dilemma is to gradually reduce the learning rate. The
# steps start out large (which helps make quick progress and escape local minima), then get smaller and
# smaller, allowing the algorithm to settle at the global minimum. This process is called simulated
# annealing, because it resembles the process of annealing in metallurgy where molten metal is slowly
# cooled down. The function that determines the learning rate at each iteration is called the learning
# schedule. If the learning rate is reduced too quickly, you may get stuck in a local minimum, or even end up
# frozen halfway to the minimum. If the learning rate is reduced too slowly, you may jump around the
# minimum for a long time and end up with a suboptimal solution if you halt training too early.

# This code implements Stochastic Gradient Descent using a simple learning schedule:
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters


def learning_schedule(t):
    """Return the learning rate for step ``t``, computed as ``t0 / (t + t1)``.

    With the module-level hyperparameters above, the rate is ``t0 / t1`` at
    ``t = 0`` and gets smaller as ``t`` increases.
    """
    decay = t + t1
    return t0 / decay
# Stochastic Gradient Descent: every step uses a single, randomly drawn training instance.
# X_b and y must already be defined by an earlier cell.
m = len(X_b)  # number of training instances; computed here so the cell does not rely on another cell's `m`
theta = np.random.randn(2, 1)  # random initialization; assumes X_b has two columns
for epoch in range(n_epochs):
    for i in range(m):
        random_index = np.random.randint(m)
        # Slicing (instead of integer indexing) keeps the number of array dimensions.
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        # The schedule is driven by the global step count, so the rate keeps shrinking across epochs.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients

In [None]:
# SGDRegressor using Scikit-Learn
# Linear regression trained with Stochastic Gradient Descent, without regularization
# (penalty=None) and with an initial learning rate of 0.1 (eta0).

from sklearn.linear_model import SGDRegressor

# `n_iter` was removed from SGDRegressor in scikit-learn 0.21; `max_iter` replaces it.
# tol=None disables the tolerance-based stopping criterion so that exactly 50 epochs
# are run, which is what `n_iter=50` used to mean.
sgd_reg = SGDRegressor(max_iter=50, tol=None, penalty=None, eta0=0.1)
sgd_reg.fit(X, y.ravel())  # ravel() flattens y; presumably y is a column vector -- TODO confirm
sgd_reg.intercept_, sgd_reg.coef_

In [None]:
# POLYNOMIAL REGRESSION
# Build degree-2 polynomial features from X (which must already be defined).
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(
    degree=2,
    include_bias=False,  # do not generate the constant (bias) feature
)
X_poly = poly_features.fit_transform(X)

In [None]:
# LEARNING CURVES:


# Learning curves are plots of the model’s performance on the training
# set and the validation set as a function of the training set size. To generate the plots, simply train the model
# several times on different sized subsets of the training set. The following code defines a function that plots
# the learning curves of a model given some training data:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE against the training-set size.

    The data is split 80/20 into a training and a validation set. ``model`` is
    then refit on the first ``size`` training instances for every ``size`` from
    1 up to and including the whole training set, and both errors are recorded
    after each fit.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is refit in place.
    X, y : features and targets; they must support ``len()`` and slicing.

    Returns
    -------
    None. The two curves are drawn with ``plt`` on the current figure.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    # +1 so that the last point is trained on the complete training set.
    sizes = list(range(1, len(X_train) + 1))
    train_errors, val_errors = [], []
    for size in sizes:
        model.fit(X_train[:size], y_train[:size])
        y_train_predict = model.predict(X_train[:size])
        y_val_predict = model.predict(X_val)
        # scikit-learn's signature is mean_squared_error(y_true, y_pred).
        train_errors.append(mean_squared_error(y_train[:size], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    # Passing `sizes` as x values makes the axis show the real training-set size.
    plt.plot(sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.xlabel("Training set size")
    plt.ylabel("RMSE")
    plt.legend()  # the labels above are only visible once a legend is drawn
# Let's look at the learning curves of the plain Linear Regression model (a straight line; Figure 4-15).
# NOTE(review): this relies on LinearRegression having been imported by an earlier cell and on
# X and y already being defined -- confirm the cell order on a fresh kernel.
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)

In [None]:
# RIDGE REGRESSION == REGULARIZATION
# Fit a Ridge model (alpha=1, "cholesky" solver) on X, y and predict for the input 1.5.

from sklearn.linear_model import Ridge

# fit() returns the estimator itself, so it can be chained onto the constructor.
ridge_reg = Ridge(alpha=1, solver="cholesky").fit(X, y)
ridge_reg.predict([[1.5]])  # NOTE: not displayed, only a cell's last expression is shown


# REGULARIZATION IN STOCHASTIC GRADIENT DESCENT
# The same kind of regularization with Stochastic Gradient Descent, via penalty="l2".
# NOTE(review): SGDRegressor must have been imported by an earlier cell.
sgd_reg = SGDRegressor(penalty="l2").fit(X, y.ravel())
sgd_reg.predict([[1.5]])


In [None]:
# LASSO REGRESSION
# Fit a Lasso model with alpha=0.1 on X, y and predict for the input 1.5.
from sklearn.linear_model import Lasso

# fit() returns the estimator itself, so it can be chained onto the constructor.
lasso_reg = Lasso(alpha=0.1).fit(X, y)
lasso_reg.predict([[1.5]])  # NOTE: not displayed, only a cell's last expression is shown

# ELASTIC NET
# Same workflow with ElasticNet; l1_ratio=0.5 is passed as the mix ratio.
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
elastic_net.predict([[1.5]])

In [None]:
# EARLY STOPPING
# Train one epoch at a time and remember the model with the lowest validation error.
# X_train_poly_scaled, y_train, X_val_poly_scaled and y_val must already be defined, and
# SGDRegressor / mean_squared_error must already be imported by earlier cells.

from copy import deepcopy

from sklearn.base import clone  # no longer used below; kept in case other cells rely on it

# max_iter=1 together with warm_start=True: every fit() call runs a single epoch and
# continues from the current weights instead of restarting from scratch.
# `max_iter` replaces `n_iter`, which was removed in scikit-learn 0.21; tol=None turns off
# the tolerance-based stopping check.
sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty=None,
                       learning_rate="constant", eta0=0.0005)
minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    # scikit-learn's signature is mean_squared_error(y_true, y_pred).
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        # clone() would return an *unfitted* estimator with the same hyperparameters;
        # deepcopy keeps the trained weights, so best_model can actually predict.
        best_model = deepcopy(sgd_reg)

In [None]:
# Let's use the iris dataset to illustrate Logistic Regression. This is a famous dataset that contains the
# sepal and petal length and width of 150 iris flowers of three different species: Iris-Setosa,
# Iris-Versicolor, and Iris-Virginica.
# Let's try to build a classifier to detect the Iris-Virginica type based only on the petal width feature.
# First let's load the data:
from sklearn import datasets
iris = datasets.load_iris()
list(iris.keys())
# Example output: ['data', 'target_names', 'feature_names', 'target', 'DESCR']
X = iris["data"][:, 3:]  # petal width (last column, kept two-dimensional by the slice)
# `np.int` was removed in NumPy 1.24; the builtin `int` is the equivalent dtype.
y = (iris["target"] == 2).astype(int)  # 1 if Iris-Virginica, else 0
# Now let's train a Logistic Regression model:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Estimated probabilities for petal widths from 0 to 3, one curve per class.
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
plt.plot(X_new, y_proba[:, 1], "g-", label="Iris-Virginica")
plt.plot(X_new, y_proba[:, 0], "b--", label="Not Iris-Virginica")
# + more Matplotlib code to make the image look pretty

# Logistic Regression models can be regularized using l1 or l2 penalties.
# Scikit-Learn actually adds an l2 penalty by default.

In [None]:
# Softmax Regression


# The Logistic Regression model can be generalized to support multiple classes directly, without having to
# train and combine multiple binary classifiers (as discussed in Chapter 3). This is called Softmax
# Regression, or Multinomial Logistic Regression.
# Let's use Softmax Regression to classify the iris flowers into all three classes. Scikit-Learn's
# LogisticRegression uses one-versus-all by default when you train it on more than two classes, but you
# can set the multi_class hyperparameter to "multinomial" to switch it to Softmax Regression instead.
# You must also specify a solver that supports Softmax Regression, such as the "lbfgs" solver (see
# Scikit-Learn's documentation for more details). It also applies l2 regularization by default, which you
# can control using the hyperparameter C.
# NOTE(review): the default described above applies to older Scikit-Learn releases, and `multi_class` is
# reported as deprecated since scikit-learn 1.5 -- confirm against the installed version.
# `iris` and LogisticRegression must already be available from an earlier cell.
X = iris["data"][:, [2, 3]]  # petal length, petal width
y = iris["target"]
softmax_reg = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    C=10,
)
softmax_reg.fit(X, y)
# So the next time you find an iris with 5 cm long and 2 cm wide petals, you can ask your model to tell you
# what type of iris it is, and it will answer Iris-Virginica (class 2) with 94.2% probability (or
# Iris-Versicolor with 5.8% probability):
softmax_reg.predict([[5, 2]])
# array([2])
softmax_reg.predict_proba([[5, 2]])
# array([[ 6.33134078e-07,
# 5.75276067e-02,
# 9.42471760e-01]])