In [None]:
# Gradient descent is an iterative optimization method: it repeatedly adjusts the parameters of our
# prediction line in the direction that reduces the error, so that the line fits the data points well
# and gives accurate predictions in linear regression.

In [1]:
# It is used on large data sets (millions of rows), where it requires much less memory and compute power
# than solving for the coefficients in closed form.

In [None]:
# The goal is to find the optimal value for bias and weights

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
# Load the advertising data set from a CSV file in the working directory and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column -- presumably a row index saved into the CSV; confirm.
df = pd.read_csv("Advertising.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
# List the column names to see which fields are available as features and target.
df.columns

Index(['Unnamed: 0', 'TV', 'radio', 'newspaper', 'sales'], dtype='object')

In [6]:
# Feature matrix: the TV, radio and newspaper columns are used as predictors.
X = df[['TV', 'radio', 'newspaper']]

In [7]:
# Target vector: the sales column is the value to predict.
y = df['sales']

In [10]:
# Feature engineering -- standardization: put every variable on the same scale by
# subtracting its mean and dividing by its standard deviation (z-score).
#
# Both results are derived from the untouched `df` columns rather than from the current
# X / y, so re-running this cell always yields the same values. (Standardizing an
# already-standardized y would rescale it again, because y is then a NumPy array whose
# .std() defaults to ddof=0, while pandas' .std() uses ddof=1.)

y = np.array((df['sales'] - df['sales'].mean()) / df['sales'].std())
# axis=0 applies the function to each column, so the lambda argument is a column, not a row.
X = df[['TV', 'radio', 'newspaper']].apply(lambda col: (col - col.mean()) / col.std(), axis=0)

In [11]:
# Implementing the gradient descent algorithm requires four building-block functions:

# M1 --> Initialise the values of parameters
# M2 --> Calculate y_prediction values
# M3 --> Calculate the loss (error)
# M4 --> Calculating the gradient, adjusting the weights by updating parameters (intercept and coefficients)

In [12]:
# M1

import random

def initialize(dim, seed=42):
    """Return reproducible random starting values for the intercept and coefficients.

    Parameters
    ----------
    dim : int
        Number of coefficients to create (one per feature column).
    seed : int, optional
        Seed applied to both Python's ``random`` module and NumPy's legacy
        global generator. The default of 42 reproduces the previously
        hard-coded behaviour.

    Returns
    -------
    b : float
        Intercept, drawn from ``random.random()`` (uniform on [0, 1)).
    w : numpy.ndarray
        Array of shape ``(dim,)`` drawn from ``np.random.rand`` (uniform on [0, 1)).

    Notes
    -----
    Every call re-seeds the *global* ``random`` and ``np.random`` generators,
    so random draws made elsewhere after this call are affected as well.
    """
    random.seed(seed)
    np.random.seed(seed)
    b = random.random()      # intercept
    w = np.random.rand(dim)  # coefficients, one per input dimension
    return b, w

In [13]:
# Draw the starting parameters: one weight for each of the three feature columns.
b, w = initialize(3)  # 3 dimensions for w

# Show the initial intercept and coefficients.
print(f"bias/intercept is {b}, and weights/coefficients are {w}")

bias/intercept is 0.6394267984578837, and weights/coefficients are [0.37454012 0.95071431 0.73199394]


In [15]:
# M2
# inputs --> b,w and x

def predict(b, w, X):
    """Compute linear-model predictions for every row of ``X``.

    Parameters
    ----------
    b : float
        Intercept added to every prediction.
    w : array-like
        Coefficients, one per column of ``X``.
    X : array-like
        Feature matrix with one row per observation.

    Returns
    -------
    The matrix product of ``X`` and ``w`` shifted by ``b``, one value per row.
    """
    weighted_sum = np.matmul(X, w)
    return b + weighted_sum

In [17]:
# Check the dimensions of the feature matrix: (number of rows, number of feature columns).
X.shape # 3 features/weights

(200, 3)

In [16]:
# Start again from the initial parameters so the result does not depend on earlier cells.
b, w = initialize(3)

# Predictions of the untrained model; display only the first five values.
y_predict = predict(b,w,X)
y_predict[:5]

0    3.231496
1    1.707849
2    2.824761
3    2.753090
4    0.924486
dtype: float64

In [20]:
# M3 calculate the loss

import math

def cal_cost(y, y_predict):
    """Return the mean squared error (MSE) between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_predict : array-like
        Predicted values.

    Returns
    -------
    numpy.float64
        Mean of the squared residuals ``y - y_predict``.

    Notes
    -----
    Inputs are converted to plain arrays and, when they hold the same number
    of elements, flattened before subtracting. This keeps the result a true
    MSE for column vectors of shape ``(n, 1)`` and for mixed ``(n,)`` /
    ``(n, 1)`` inputs, which would otherwise be turned into an ``(n, n)``
    matrix by broadcasting or by the outer product ``residual @ residual.T``.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_predict, dtype=float)
    if y_true.size == y_hat.size:
        # Compare element by element regardless of row/column orientation.
        y_true, y_hat = y_true.ravel(), y_hat.ravel()
    residual = y_true - y_hat
    return np.mean(np.square(residual))   # MSE

In [21]:
# Loss of the untrained model: reset the parameters, predict, then score the predictions.
b, w = initialize(3)
y_predict = predict(b,w,X)
cal_cost(y,y_predict)      # MSE

1.5302767801991894

In [23]:
# M4 Adaptation/Update of bias and weights

def update_beta(x, y, y_predict, b_0, w_0, lr):
    """Take one gradient-descent step on the intercept and the coefficients.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per observation.
    y : array-like
        Observed target values.
    y_predict : array-like
        Predictions made with the current parameters.
    b_0 : float
        Current intercept.
    w_0 : array-like
        Current coefficients.
    lr : float
        Learning rate (step size).

    Returns
    -------
    tuple
        ``(b_1, w_1)``: the intercept and coefficients after the step.
    """
    n_samples = len(y)
    error = y_predict - y

    # Gradients of the mean squared error with respect to b and w.
    grad_b = (np.sum(error) * 2) / n_samples
    grad_w = (np.dot(error, x) * 2) / n_samples

    # Move against the gradient, scaled by the learning rate.
    return b_0 - lr * grad_b, w_0 - lr * grad_w
    

In [25]:
# Demonstrate a single update step, starting from the initial parameters.
b, w = initialize(3)  # 3 dimensions for w
print(f"Initial values are: bias/intercept is {b}, and weights/coefficients are {w}")

# Predict with the initial parameters, then take one step with a learning rate of 0.01.
y_predict = predict(b,w,X)
b,w = update_beta(X,y,y_predict,b,w,0.01)
print(f"After update, the values are: bias: {b}, and weights are: {w}")

# cal_cost(y,y_predict)      # MSE

Initial values are: bias/intercept is 0.6394267984578837, and weights/coefficients are [0.37454012 0.95071431 0.73199394]
After update, the values are: bias: 0.6266382624887261, and weights are: [0.38082999 0.93772407 0.71486023]


In [28]:
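# M5 --> Run the full gradient descent loop: start from M1, then repeat M2-M4 for a fixed number of iterations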

def run_gradient_descent(X, y, alpha=0.01, num_iter = 100):
    """Fit the intercept and coefficients with batch gradient descent.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array
        Feature matrix; ``X.shape[1]`` sets the number of coefficients.
    y : array-like
        Target values.
    alpha : float, optional
        Learning rate passed to ``update_beta``.
    num_iter : int, optional
        Number of update steps to perform.

    Returns
    -------
    gd_iterations_df : pandas.DataFrame
        Columns ``iteration`` and ``loss``: the loss measured at every 10th
        iteration (0, 10, 20, ...), computed *before* that iteration's update.
    b : float
        Final intercept.
    w : numpy.ndarray
        Final coefficients.

    Notes
    -----
    Prints the final parameter estimates as a side effect.
    """
    b, w = initialize(X.shape[1])
    # Collect the history in a list and build the DataFrame once at the end:
    # growing a frame row by row is slow and upcasts the iteration numbers to float.
    loss_history = []

    for iteration in range(num_iter):
        y_predict = predict(b, w, X)

        if iteration % 10 == 0:
            # The loss is only needed for the iterations that are recorded.
            loss_history.append((iteration, cal_cost(y, y_predict)))

        b, w = update_beta(X, y, y_predict, b, w, alpha)

    print(f"Final estimate of b and w are: {b}, {w}")
    gd_iterations_df = pd.DataFrame(loss_history, columns=['iteration', 'loss'])
    return gd_iterations_df, b, w

In [31]:
# Train with a learning rate of 0.001 for 400 iterations; keep the loss history and the final parameters.
gd_iterations_df, b, w = run_gradient_descent(X,y,alpha=0.001, num_iter=400)

Final estimate of b and w are: 0.28708291582872, [0.56296358 0.64333271 0.29483568]


In [30]:
# Inspect the recorded loss history (one row per 10th iteration) to check that the loss keeps decreasing.
gd_iterations_df

Unnamed: 0,iteration,loss
0,0.0,1.530277
1,10.0,1.46523
2,20.0,1.403233
3,30.0,1.344138
4,40.0,1.287807
5,50.0,1.234106
6,60.0,1.18291
7,70.0,1.1341
8,80.0,1.08756
9,90.0,1.043183
