In [1]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import fetch_california_housing
from numpy.linalg import inv, pinv, LinAlgError
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression;
from sklearn.linear_model import SGDRegressor;
#from matplotlib import pyplot as plt
#import seaborn as sns
#from seaborn import heatmap;
#from statsmodels.stats.outliers_influence import variance_inflation_factor

# California housing data: 20640 districts, 8 numeric features, no missing values.
# The target is the median house value of each district, in units of $100,000.
Xtemp, y = datasets.fetch_california_housing(return_X_y=True)

# Prepend a constant column of ones (the dummy feature) so the first
# parameter of the hand-written models plays the role of the intercept.
X = np.hstack((np.ones((Xtemp.shape[0], 1)), Xtemp))
print(type(X), X.shape)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# This gives 16512 training rows and 4128 test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data:")
# Report the shape of every array on a single line, as "label shape label shape ...".
shape_report = (
    ("X:", X), ("y:", y),
    ("X_train:", X_train), ("X_test:", X_test),
    ("y_train:", y_train), ("y_test:", y_test),
)
print(*(field for label, arr in shape_report for field in (label, arr.shape)))

# Standardise the real features with statistics learned from the training rows
# only, then apply that same transform to both splits in place.
# Column 0 (the constant dummy feature) is deliberately left untouched.
ss = StandardScaler().fit(X_train[:, 1:])
for design_matrix in (X_train, X_test):
    design_matrix[:, 1:] = ss.transform(design_matrix[:, 1:])

# Linear Regression using Batch Gradient Descent using numpy
#
# Minimises the mean squared error of X_train @ theta against y_train by taking
# `niterations` full-batch steps of size `lr` along the negative gradient.
print("#Linear Regression using Batch Gradient Descent using numpy#")
niterations = 1000        # number of full passes over the training set
m = X_train.shape[0]      # number of training rows
n = X_train.shape[1]      # number of parameters (dummy column included)
lr = 0.01                 # learning rate (step size)
# Alternative start point: theta = np.random.uniform(0, 1, size=(n))
theta = np.zeros(n)
for _ in range(niterations):
    ypred = X_train @ theta
    error = ypred - y_train  # be mindful of this order in the difference
    # Sum over all rows of error * feature, for every feature at once.
    # One matrix-vector product replaces the per-column Python loop.
    update = X_train.T @ error
    theta = theta - lr * (1 / m) * update
# NOTE(review): in the recorded output these thetas still differ noticeably from
# the closed-form solution, so 1000 steps at lr=0.01 has probably not fully
# converged -- confirm before comparing the two methods.
print("Thetas:", theta)
print("Thetas Shape:", theta.shape)
pred = X_test @ theta
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", metrics.mean_squared_error(y_test, pred))


# Linear Regression using Normal Equation Method using numpy
#
# Closed-form least squares: theta is the solution of (X^T X) theta = X^T y.
print("#Linear Regression using Normal Equation Method using numpy#")
XTX = np.dot(X_train.T, X_train)
XTy = np.dot(X_train.T, y_train)
try:
    # Solve the linear system directly instead of forming the explicit inverse
    # and multiplying: it is cheaper and loses less precision.
    theta = np.linalg.solve(XTX, XTy)
except LinAlgError:
    # X^T X is singular, so fall back to the pseudo-inverse.
    theta = np.dot(pinv(XTX), XTy)
print("Thetas:", theta)
print("Thetas Shape:", theta.shape)
predictions = np.dot(X_test, theta)
print("MAE:", metrics.mean_absolute_error(y_true=y_test, y_pred=predictions))
print("MSE:", metrics.mean_squared_error(y_true=y_test, y_pred=predictions))


#### From here on the models come from sklearn ####

# The sklearn estimators used below add the dummy variable (intercept) on their
# own, so the hand-made column of ones (column 0) is removed from both splits.
X_train, X_test = X_train[:, 1:], X_test[:, 1:]

# Closed-form (non-iterative) linear regression via sklearn's LinearRegression.
print("#Linear Regression using Normal Equation Method using sklearn")
# NOTE: `lr` held the gradient-descent learning rate earlier in this cell; from
# here on it refers to the fitted LinearRegression estimator.
lr = LinearRegression().fit(X_train, y_train)
print("X_train's shape:", X_train.shape)
print("coef shape", lr.coef_.shape)
print("Theta1...Thetan:", lr.coef_)
print("Intercept:", lr.intercept_)
predictions = lr.predict(X_test)
# Score the held-out predictions with both error metrics.
for label, metric in (
    ("MAE:", metrics.mean_absolute_error),
    ("MSE:", metrics.mean_squared_error),
):
    print(label, metric(y_true=y_test, y_pred=predictions))

# Linear Regression using Stochastic Gradient Descent using sklearn
print("#Linear Regression using Stochastic Gradient Descent using sklearn")
# SGD shuffles the training rows between epochs, so without a seed the printed
# coefficients and metrics change on every run. The seed is fixed to the same
# value used for the train/test split to keep the output reproducible.
sgd = SGDRegressor(eta0=0.01, max_iter=1000, random_state=42)
sgd.fit(X_train, y_train)
print("Theta1...Thetan:", sgd.coef_)
print("Intercept:", sgd.intercept_)
predictions = sgd.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_true=y_test, y_pred=predictions))
print("MSE:", metrics.mean_squared_error(y_true=y_test, y_pred=predictions))


# Linear Regression using Stochastic Gradient Descent and GridSearchCV using sklearn
#
# 5-fold cross-validated search over the initial learning rate and the epoch
# budget; gs.predict then uses the best estimator refitted on all of X_train.
print("#Linear Regression using Stochastic Gradient Descent and GridSearchCV using sklearn")
# Seed the base estimator so every grid candidate sees the same shuffling and
# the selected hyper-parameters are reproducible between runs.
sgd1 = SGDRegressor(random_state=42)
# NOTE(review): SGDRegressor can stop early once its tolerance criterion is met,
# so these max_iter budgets may never be reached -- confirm this axis matters.
param_grid = {
    'eta0': [0.01, 0.001, 0.0001],
    'max_iter': [5000, 10000, 20000, 30000],
}
gs = GridSearchCV(estimator=sgd1, param_grid=param_grid, cv=5)
gs.fit(X_train, y_train)
print("Best Hyper-Parameters:", gs.best_params_)

predictions = gs.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_true=y_test, y_pred=predictions))
print("MSE:", metrics.mean_squared_error(y_true=y_test, y_pred=predictions))

<class 'numpy.ndarray'> (20640, 9)
Data:
X: (20640, 9) y: (20640,) X_train: (16512, 9) X_test: (4128, 9) y_train: (16512,) y_test: (4128,)
#Linear Regression using Batch Gradient Descent using numpy#
Thetas: [ 2.07185749  0.82894365  0.17853146 -0.13794939  0.15669182  0.01681517
 -0.04522857 -0.48705563 -0.45147126]
Thetas Shape: (9,)
MAE: 0.5476758462432642
MSE: 0.5671852986082033
#Linear Regression using Normal Equation Method using numpy#
Thetas: [ 2.07194694  0.85438303  0.12254624 -0.29441013  0.33925949 -0.00230772
 -0.0408291  -0.89692888 -0.86984178]
Thetas Shape: (9,)
MAE: 0.5332001304956557
MSE: 0.5558915986952441
#Linear Regression using Normal Equation Method using sklearn
X_train's shape: (16512, 8)
coef shape (8,)
Theta1...Thetan: [ 0.85438303  0.12254624 -0.29441013  0.33925949 -0.00230772 -0.0408291
 -0.89692888 -0.86984178]
Intercept: 2.071946937378619
MAE: 0.5332001304956566
MSE: 0.5558915986952442
#Linear Regression using Stochastic Gradient Descent using sklearn
Th