In [113]:
import pandas
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression

#- Normal Equation
def normal_equation(data_x, data_y):
    """Fit linear-regression coefficients with the closed-form normal equation.

    Solves (X^T X) theta = X^T y, where X is ``data_x`` with a column of ones
    prepended so that the first entry of theta is the intercept.

    Parameters
    ----------
    data_x : array-like with m rows
        Feature values, one sample per row.
    data_y : array-like with m rows
        Target values aligned row-for-row with ``data_x``.

    Returns
    -------
    numpy.ndarray
        The intercept followed by one coefficient per feature column. The
        trailing dimension mirrors ``data_y`` (an (m, 1) target yields an
        (n + 1, 1) result, an (m,) target yields an (n + 1,) result).

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X is singular (e.g. perfectly collinear feature columns).
    """
    #- Calculate the number of rows of data_x, labeled as m
    m = len(data_x)
    #- Prepend a column of ones so the first coefficient acts as the intercept
    design = np.c_[np.ones((m, 1)), data_x]

    #- Theta = (xTx)^(-1)xTy, computed by solving the linear system rather
    #- than forming the inverse explicitly.
    gram = design.transpose().dot(design)
    #- Use the data_y argument here, not a notebook-level global named y.
    moment = design.transpose().dot(data_y)
    theta = np.linalg.solve(gram, moment)
    return theta


#- Preprocess data: build the feature matrix x and the target column vector y.
#- NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
#- 1.2, so this cell presumably needs an older scikit-learn -- confirm the
#- environment before a Restart & Run All.
rawData = load_boston()
bostonDataset = pandas.DataFrame(
    rawData.data,
    columns=rawData.feature_names,
)
#- Copy the features out before the target column is appended to the frame,
#- so x holds only the feature columns.
x = np.array(bostonDataset)
bostonDataset['MEDV'] = rawData.target
#- Turn the target series into a column vector with one row per sample.
y = np.array(bostonDataset['MEDV']).reshape(-1, 1)
print(bostonDataset.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [116]:
#- Solve for theta in closed form with the normal_equation helper,
#- passing the feature matrix x and the target column y.
theta_ne = normal_equation(data_x=x, data_y=y)
print('theta_ne ', theta_ne)

theta_ne  [[ 3.64594884e+01]
 [-1.08011358e-01]
 [ 4.64204584e-02]
 [ 2.05586264e-02]
 [ 2.68673382e+00]
 [-1.77666112e+01]
 [ 3.80986521e+00]
 [ 6.92224640e-04]
 [-1.47556685e+00]
 [ 3.06049479e-01]
 [-1.23345939e-02]
 [-9.52747232e-01]
 [ 9.31168327e-03]
 [-5.24758378e-01]]


In [115]:
# - Fit the same linear model with scikit-learn for comparison
reg = LinearRegression()
reg.fit(x, y)
# - Place the intercept ahead of the coefficients; the result is transposed
# - when printed so it is shown as a column.
theta_scikit = np.c_[reg.intercept_, reg.coef_]
print('theta_scikit ', theta_scikit.T)

theta_scikit  [[ 3.64594884e+01]
 [-1.08011358e-01]
 [ 4.64204584e-02]
 [ 2.05586264e-02]
 [ 2.68673382e+00]
 [-1.77666112e+01]
 [ 3.80986521e+00]
 [ 6.92224640e-04]
 [-1.47556685e+00]
 [ 3.06049479e-01]
 [-1.23345939e-02]
 [-9.52747232e-01]
 [ 9.31168327e-03]
 [-5.24758378e-01]]
