In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Load the house-price data and take a first look at it.
# NOTE(review): the path looks like a Colab / Google Drive mount location;
# adjust it when running in a different environment.
dataset = pd.read_csv("/content/drive/MyDrive/Datasets/HousePrices.csv")

# Lift the column-display limit so wide frames are printed without elision.
pd.set_option('display.max_columns', None)

print(dataset.head())   # first rows of the frame
print(dataset.shape)    # (rows, columns)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
dataset.info()
print(dataset.describe())   # summary statistics

# Converting Categorical features into Numerical features
# Both columns are recoded with the same Y -> 1 / N -> 0 mapping.
# Series.map() turns every value that is not a key of the mapping into NaN, so
# running this step a second time on the already-converted columns would blank
# them out; reload the dataset before re-running.
yes_no_codes = {'Y': 1, 'N': 0}
for column in ('CentralAir', 'PavedDrive'):
    dataset[column] = dataset[column].map(yes_no_codes)

# DataFrame.info() prints on its own and returns None; calling it directly
# avoids the extra "None" line that print(dataset.info()) would produce.
dataset.info()

# Correlation heatmap: pairwise correlations between the columns of `dataset`,
# with each cell annotated by its coefficient rounded to two decimals.
corrs = dataset.corr()

heatmap_options = {
    'z': corrs.values,
    'x': [*corrs.columns],                        # column labels along the x axis
    'y': [*corrs.index],                          # row labels along the y axis
    'annotation_text': corrs.round(2).values,
    'showscale': True,                            # draw the colour bar
}
figure = ff.create_annotated_heatmap(**heatmap_options)
figure.show()

# Split the frame into the target and the predictors.
# 'TotRmsAbvGrd' and 'GarageCars' are left out of the predictors together
# with the target column 'SalePrice'.
Y = dataset['SalePrice']  # Labels
X = dataset.drop(columns=['TotRmsAbvGrd', 'GarageCars', 'SalePrice'])  # Features

# Report the container types first, then the shapes (X before Y in both cases).
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

# Standardise the features: after this step every column of X_scaled has
# mean 0 and variance 1.
# NOTE(review): the scaler is fitted on all rows of X before the
# cross-validated grid searches below, so the held-out folds contribute to the
# scaling statistics -- confirm this is acceptable for the reported scores.
feature_scaler = StandardScaler()
feature_scaler.fit(X)                    # learn per-column mean and std
X_scaled = feature_scaler.transform(X)   # apply them to the same data

# Linear Regression without Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter' using Grid Search
sgdr = SGDRegressor(random_state = 1, penalty = None)
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000]}

# Every grid combination is scored by R^2 with 5-fold cross-validation.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# n and p are derived from the fitted search and the feature matrix instead of
# being hard-coded, so the figure stays correct if the data or `cv` changes.
# n is the size of the training part of one CV split (floor division: splits
# may differ by one row when the row count is not a multiple of the folds).
n_samples, n_features = X_scaled.shape
n_train = n_samples * (gd_sr.n_splits_ - 1) // gd_sr.n_splits_
Adj_r2 = 1-(1-best_result)*(n_train-1)/(n_train-n_features-1)
print("Adjusted r2: ", Adj_r2)

# The best estimator found by the search (available because GridSearchCV
# refits by default).
best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

# Linear Regression with Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter',
# along with the regularization parameters 'alpha' and 'l1_ratio', using Grid Search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000],'alpha': [.001, .01, .1, 1,10, 100], 'l1_ratio': [0,0.25,0.5,0.75,1]}

# Every grid combination is scored by R^2 with 5-fold cross-validation.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# n and p are derived from the fitted search and the feature matrix instead of
# being hard-coded, so the figure stays correct if the data or `cv` changes.
# n is the size of the training part of one CV split (floor division: splits
# may differ by one row when the row count is not a multiple of the folds).
n_samples, n_features = X_scaled.shape
n_train = n_samples * (gd_sr.n_splits_ - 1) // gd_sr.n_splits_
Adj_r2 = 1-(1-best_result)*(n_train-1)/(n_train-n_features-1)
print("Adjusted r2: ", Adj_r2)

# The best estimator found by the search (available because GridSearchCV
# refits by default).
best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

   LotArea  YearBuilt  YearRemodelled  TotalBsmtSF CentralAir  1stFlrSF  \
0     8450       2003            2003          856          Y       856   
1     9600       1976            1976         1262          Y      1262   
2    11250       2001            2002          920          Y       920   
3     9550       1915            1970          756          Y       961   
4    14260       2000            2000         1145          Y      1145   

   2ndFlrSF  GrLivArea  BsmtFullBath  BsmtHalfBath  FullBath  HalfBath  \
0       854       1710             1             0         2         1   
1         0       1262             0             1         2         0   
2       866       1786             1             0         2         1   
3       756       1717             1             0         1         0   
4      1053       2198             1             0         2         1   

   BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd  Fireplaces  GarageCars  \
0             3             1    

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(1460, 19)
(1460,)
Best parameters:  {'eta0': 0.01, 'max_iter': 10000}
r2:  0.7437240385799859
Adjusted r2:  0.7394825374763446
Intercept:  [180580.34711775]
          Features  Coefficients
7        GrLivArea  18692.622369
6         2ndFlrSF  12888.195009
1        YearBuilt  11997.309482
5         1stFlrSF  10825.003291
15      GarageArea  10424.875641
2   YearRemodelled   9582.931919
3      TotalBsmtSF   9383.038394
14      Fireplaces   5931.729659
8     BsmtFullBath   4123.005784
10        FullBath   4019.164375
17      WoodDeckSF   2430.688451
9     BsmtHalfBath    703.995451
11        HalfBath     47.333067
0          LotArea   -427.434406
4       CentralAir   -761.184942
16      PavedDrive   -989.748035
18        PoolArea  -5932.080585
12    BedroomAbvGr  -6208.138233
13    KitchenAbvGr  -7524.207862
Best parameters:  {'alpha': 0.1, 'eta0': 0.01, 'l1_ratio': 0, 'max_iter': 10000}
r2:  0.7461282904941251
Adj