In [1]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler
from numpy.linalg import inv, pinv, LinAlgError
from sklearn.model_selection import train_test_split

In [2]:
# California housing data: 20640 instances, 8 numeric features, no missing data.
# The target is the median house value for California districts,
# expressed in hundreds of thousands of dollars.
Xtemp, y = datasets.fetch_california_housing(return_X_y=True)

# Design matrix: a leading column of ones followed by the raw feature columns.
X = np.hstack((np.ones((Xtemp.shape[0], 1)), Xtemp))
print(type(X), X.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data:")
print(
    "X:", X.shape,
    "y:", y.shape,
    "X_train:", X_train.shape,
    "X_test:", X_test.shape,
    "y_train:", y_train.shape,
    "y_test:", y_test.shape,
)

# Standardise the feature columns with statistics learned from the training
# split only; column 0 (the ones) is deliberately left untouched.
ss = StandardScaler().fit(X_train[:, 1:])
for split in (X_train, X_test):
    # `split` aliases the original array, so the slice assignment is in place.
    split[:, 1:] = ss.transform(split[:, 1:])

#Linear Regression using Batch Gradient Descent using numpy
print("#Linear Regression using Batch Gradient Descent using numpy#")
niterations = 1000  # number of full passes over the training set
m = X_train.shape[0]  # number of training rows
n = X_train.shape[1]  # number of parameters (columns of X_train)
lr = 0.01  # learning rate (step size)

# Start from all-zero parameters so the run is deterministic; a random
# initialisation such as np.random.uniform(0, 1, size=n) is an alternative.
theta = np.zeros(n)
for i in range(niterations):
    ypred = X_train @ theta
    # Order matters: (prediction - target) gives the sign used in the update below.
    error = ypred - y_train
    # Vectorised gradient: entry j equals sum(error * X_train[:, j]).
    # One matrix-vector product replaces a Python loop over the columns.
    update = X_train.T @ error
    theta = theta - (lr) * (1 / m) * update
print("Thetas:", theta)
print("Thetas Shape:", theta.shape)

# Evaluate the fitted parameters on the held-out split.
pred = X_test @ theta
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", metrics.mean_squared_error(y_test, pred))


#Linear Regression using Normal Equation Method using numpy
print("#Linear Regression using Normal Equation Method using numpy#")
# Normal equation: (X^T X) theta = X^T y
XTX = np.dot(X_train.T, X_train)
XTy = np.dot(X_train.T, y_train)
try:
    # Solve the linear system directly instead of forming inv(X^T X) and
    # multiplying: same solution, but cheaper and numerically more accurate.
    theta = np.linalg.solve(XTX, XTy)
except LinAlgError:
    # X^T X is singular, so fall back to the pseudo-inverse.
    theta = np.dot(pinv(XTX), XTy)
print("Thetas:", theta)
print("Thetas Shape:", theta.shape)

# Evaluate on the held-out split (same X @ theta form as the gradient-descent section).
predictions = np.dot(X_test, theta)
print("MAE:", metrics.mean_absolute_error(y_true=y_test, y_pred=predictions))
print("MSE:", metrics.mean_squared_error(y_true=y_test, y_pred=predictions))

<class 'numpy.ndarray'> (20640, 9)
Data:
X: (20640, 9) y: (20640,) X_train: (16512, 9) X_test: (4128, 9) y_train: (16512,) y_test: (4128,)
#Linear Regression using Batch Gradient Descent using numpy#
Thetas: [ 2.07185749  0.82894365  0.17853146 -0.13794939  0.15669182  0.01681517
 -0.04522857 -0.48705563 -0.45147126]
Thetas Shape: (9,)
MAE: 0.5476758462432642
MSE: 0.5671852986082033
#Linear Regression using Normal Equation Method using numpy#
Thetas: [ 2.07194694  0.85438303  0.12254624 -0.29441013  0.33925949 -0.00230772
 -0.0408291  -0.89692888 -0.86984178]
Thetas Shape: (9,)
MAE: 0.5332001304956557
MSE: 0.5558915986952441
