In [1]:
import copy, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#sklearn
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
np.set_printoptions(precision=2)  # reduced display precision on numpy arrays

In [2]:
#sample data set
X_train = np.array([[2104, 5, 1, 45], [1416, 3, 2, 40], [852, 2, 1, 35]])
y_train = np.array([460, 232, 178])
print(f"X Shape: {X_train.shape}, X Type:{type(X_train)})")
print(X_train)
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")
print(y_train)

X Shape: (3, 4), X Type:<class 'numpy.ndarray'>)
[[2104    5    1   45]
 [1416    3    2   40]
 [ 852    2    1   35]]
y Shape: (3,), y Type:<class 'numpy.ndarray'>)
[460 232 178]


In [3]:
b_init = 785.1811367994083
w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618])
print(f"w_init shape: {w_init.shape}, b_init type: {type(b_init)}")

w_init shape: (4,), b_init type: <class 'float'>


The model's prediction with multiple variables is given by the linear model:

$$ f_{\mathbf{w},b}(\mathbf{x}) =  w_0x_0 + w_1x_1 +... + w_{n-1}x_{n-1} + b \tag{1}$$
or in vector notation:
$$ f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b  \tag{2} $$ 
where $\cdot$ is a vector `dot product`

In [4]:
# using for loop
def predict_single_loop(x, w, b): 
    """
    single predict using linear regression
    
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters    
      b (scalar):  model parameter     
      
    Returns:
      p (scalar):  prediction
    """
    n = x.shape[0]
    p = 0
    for i in range(n):
        p_i = x[i] * w[i]  
        p = p + p_i         
    p = p + b                
    return p

In [5]:
#using dot product
def predict(x, w, b): 
    """
    single predict using linear regression
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters   
      b (scalar):             model parameter 
      
    Returns:
      p (scalar):  prediction
    """
    p = np.dot(x, w) + b     
    return p    

In [6]:
# get a row from our training data
x_vec = X_train[0,:]
print(f"x_vec shape {x_vec.shape}, x_vec value: {x_vec}")

# make a prediction
f_wb = predict_single_loop(x_vec, w_init, b_init)
f_wb_d = predict(x_vec,w_init,b_init)
print(f"f_wb shape {f_wb.shape}, prediction: {f_wb}")
print(f"f_wb shape {f_wb_d.shape}, prediction: {f_wb_d}, using dot product")

x_vec shape (4,), x_vec value: [2104    5    1   45]
f_wb shape (), prediction: 459.9999976194083
f_wb shape (), prediction: 459.9999976194083, using dot product


# Compute Cost With Multiple Variables
The equation for the cost function with multiple variables $J(\mathbf{w},b)$ is:
$$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2 \tag{3}$$ 
where:
$$ f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x}^{(i)} + b  \tag{4} $$ 

In [7]:
# calc cost using for loop
def compute_cost(X, y, w, b): 
    """
    compute cost
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      cost (scalar): cost
    """
    m = X.shape[0]
    cost = 0.0
    for i in range(m):                                
        f_wb_i = np.dot(X[i], w) + b           #(n,)(n,) = scalar (see np.dot)
        cost = cost + (f_wb_i - y[i])**2       #scalar
    cost = cost / (2 * m)                      #scalar    
    return cost

In [8]:
# Compute and display cost in a location
cost = compute_cost(X_train, y_train, w_init, b_init)
print(f'Cost at optimal w : {cost}')

Cost at optimal w : 1.5578904428966628e-12


# 5 Gradient Descent With Multiple Variables
Gradient descent for multiple variables:

$$\begin{align*} \text{repeat}&\text{ until convergence:} \; \lbrace \newline\;
& w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \tag{5}  \; & \text{for j = 0..n-1}\newline
&b\ \ = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b}  \newline \rbrace
\end{align*}$$

where, n is the number of features, parameters $w_j$,  $b$, are updated simultaneously and where  

$$
\begin{align}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \tag{6}  \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) \tag{7}
\end{align}
$$
* m is the number of training examples in the data set

    
*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target value


- outer loop over all m examples. 
    - $\frac{\partial J(\mathbf{w},b)}{\partial b}$ for the example can be computed directly and accumulated
    - in a second loop over all n features:
        - $\frac{\partial J(\mathbf{w},b)}{\partial w_j}$ is computed for each $w_j$.

In [9]:
def compute_gradient(X, y, w, b): 
    """
    Computes the gradient for linear regression 
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. 
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b. 
    """
    m,n = X.shape           #(number of examples, number of features)
    dj_dw = np.zeros((n,))
    dj_db = 0.

    for i in range(m):                             
        err = (np.dot(X[i], w) + b) - y[i]   
        for j in range(n):                         
            dj_dw[j] = dj_dw[j] + err * X[i, j]    
        dj_db = dj_db + err                        
    dj_dw = dj_dw / m                                
    dj_db = dj_db / m                                
        
    return dj_db, dj_dw

In [10]:
#Compute and display gradient 
tmp_dj_db, tmp_dj_dw = compute_gradient(X_train, y_train, w_init, b_init)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

dj_db at initial w,b: -1.6739251501955248e-06
dj_dw at initial w,b: 
 [-2.73e-03 -6.27e-06 -2.22e-06 -6.92e-05]


In [11]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.2f}   ")
        
    return w, b, J_history #return final w,b and J history for graphing

In [12]:
# initialize parameters
initial_w = np.zeros_like(w_init)
initial_b = 0.
# some gradient descent settings
iterations = 1000
alpha = 5.0e-7
# run gradient descent 
w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b,
                                                    compute_cost, compute_gradient, 
                                                    alpha, iterations)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
m,_ = X_train.shape
for i in range(m):
    print(f"prediction: {np.dot(X_train[i], w_final) + b_final:0.2f}, target value: {y_train[i]}")

Iteration    0: Cost  2529.46   
Iteration  100: Cost   695.99   
Iteration  200: Cost   694.92   
Iteration  300: Cost   693.86   
Iteration  400: Cost   692.81   
Iteration  500: Cost   691.77   
Iteration  600: Cost   690.73   
Iteration  700: Cost   689.71   
Iteration  800: Cost   688.70   
Iteration  900: Cost   687.69   
b,w found by gradient descent: -0.00,[ 0.2   0.   -0.01 -0.07] 
prediction: 426.19, target value: 460
prediction: 286.17, target value: 232
prediction: 171.47, target value: 178


In [13]:
# plot cost versus iteration  
# fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=(12, 4))
# ax1.plot(J_hist)
# ax2.plot(100 + np.arange(len(J_hist[100:])), J_hist[100:])
# ax1.set_title("Cost vs. iteration");  ax2.set_title("Cost vs. iteration (tail)")
# ax1.set_ylabel('Cost')             ;  ax2.set_ylabel('Cost') 
# ax1.set_xlabel('iteration step')   ;  ax2.set_xlabel('iteration step') 
# plt.show()

# Applying all this on wine data

In [14]:
wine = pd.read_csv(r"C:\Users\zhang\Downloads\wine+quality\winequality-red.csv",sep=';')
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [15]:
def zscore_normalize_features(X):
    """
    computes  X, zcore normalized by column
    
    Args:
      X (ndarray (m,n))     : input data, m examples, n features
      
    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature
    """
    # find the mean of each column/feature
    mu     = np.mean(X, axis=0)                 # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma  = np.std(X, axis=0)                  # sigma will have shape (n,)
    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma      

    return (X_norm, mu, sigma)

In [16]:
#convert to numpy and setting up data
wine_np = wine.to_numpy()
wine_feature = wine.columns
wine_X = wine_np[:,:11]
wine_y = wine_np[:,-1]
wine_X_norm, mu, sigma = zscore_normalize_features(wine_X)
#wine_X_norm
wine_feature

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [17]:
# initialize parameters
initial_wine_w = np.zeros_like(wine_X[:][0])
initial_wine_b = 0.
# some gradient descent settings
iterations = 10000
alpha = 0.1
# run gradient descent 
wine_w_final, wine_b_final, wine_J_hist = gradient_descent(wine_X_norm, wine_y, initial_wine_w, initial_wine_b,
                                                    compute_cost, compute_gradient, 
                                                    alpha, iterations)
print(f"b,w found by gradient descent: {wine_b_final:0.2f},{wine_w_final} ")
m,_ = wine_X.shape
for i in range(m):
    print(f"prediction: {np.dot(wine_X_norm[i], wine_w_final) + wine_b_final:0.2f}, target value: {wine_y[i]}")

Iteration    0: Cost    13.16   
Iteration 1000: Cost     0.21   
Iteration 2000: Cost     0.21   
Iteration 3000: Cost     0.21   
Iteration 4000: Cost     0.21   
Iteration 5000: Cost     0.21   
Iteration 6000: Cost     0.21   
Iteration 7000: Cost     0.21   
Iteration 8000: Cost     0.21   
Iteration 9000: Cost     0.21   
b,w found by gradient descent: 5.64,[ 0.04 -0.19 -0.04  0.02 -0.09  0.05 -0.11 -0.03 -0.06  0.16  0.29] 
prediction: 5.03, target value: 5.0
prediction: 5.14, target value: 5.0
prediction: 5.21, target value: 5.0
prediction: 5.69, target value: 6.0
prediction: 5.03, target value: 5.0
prediction: 5.07, target value: 5.0
prediction: 5.11, target value: 5.0
prediction: 5.34, target value: 7.0
prediction: 5.34, target value: 7.0
prediction: 5.66, target value: 5.0
prediction: 5.06, target value: 5.0
prediction: 5.66, target value: 5.0
prediction: 5.13, target value: 5.0
prediction: 5.96, target value: 5.0
prediction: 5.15, target value: 5.0
prediction: 5.19, target 

In [18]:
predict = [7,	0.60,	0.01	,1.3,	0.067,	12.0	,38.0	,0.9878	,3.59,	0.46,	9.0	]

print(f"prediction: {np.dot(predict, wine_w_final) + wine_b_final:0.2f}")

prediction: 4.77


##Regression using SciKit-learn

In [19]:
#data where loaded into wine_x , wine_y , and wine_feature
scaler = StandardScaler()
sk_X_norm = scaler.fit_transform(wine_X)
print(f"Peak to Peak range by column in Raw        X:{np.ptp(wine_X,axis=0)}")   
print(f"Peak to Peak range by column in Normalized X:{np.ptp(sk_X_norm,axis=0)}")

Peak to Peak range by column in Raw        X:[1.13e+01 1.46e+00 1.00e+00 1.46e+01 5.99e-01 7.10e+01 2.83e+02 1.36e-02
 1.27e+00 1.67e+00 6.50e+00]
Peak to Peak range by column in Normalized X:[ 6.49  8.16  5.14 10.36 12.73  6.79  8.61  7.22  8.23  9.86  6.1 ]


In [20]:
sgdr = SGDRegressor(max_iter=10000)
sgdr.fit(sk_X_norm, wine_y)
print(sgdr)
print(f"number of iterations completed: {sgdr.n_iter_}, number of weight updates: {sgdr.t_}")

SGDRegressor(max_iter=10000)
number of iterations completed: 8, number of weight updates: 12793.0


In [23]:
sk_b_norm = sgdr.intercept_
sk_w_norm = sgdr.coef_
print(f"model parameters:                   w: {sk_w_norm}, b:{sk_b_norm}")
print(f"model parameters from previous lab: w: {wine_w_final}, b: {wine_b_final:0.2f}")

model parameters:                   w: [ 0.05 -0.2  -0.03  0.03 -0.09  0.05 -0.1  -0.06 -0.05  0.16  0.3 ], b:[5.65]
model parameters from previous lab: w: [ 0.04 -0.19 -0.04  0.02 -0.09  0.05 -0.11 -0.03 -0.06  0.16  0.29], b: 5.64


In [24]:
# make a prediction using sgdr.predict()
y_pred_sgd = sgdr.predict(sk_X_norm)
# make a prediction using w,b. 
y_pred = np.dot(sk_X_norm, sk_w_norm) + sk_b_norm  
print(f"prediction using np.dot() and sgdr.predict match: {(y_pred == y_pred_sgd).all()}")

print(f"Prediction on training set:\n{y_pred_sgd[:4]}" )
print(f"Prediction on training set (dot):\n{y_pred[:4]}" )
print(f"Target values \n{wine_y[:4]}")

prediction using np.dot() and sgdr.predict match: True
Prediction on training set:
[5.02 5.12 5.19 5.7 ]
Prediction on training set (dot):
[5.02 5.12 5.19 5.7 ]
Target values 
[5. 5. 5. 6.]
