In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import PolynomialFeatures

In [16]:
df = pd.read_csv("BostonHousing.csv")

In [17]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [18]:
#X = df.drop(["medv"],axis=1)
#y = df["medv"]

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

### Standardization

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
for col in X:
    ss = StandardScaler()
    X[col] = ss.fit_transform(X[[col]])

In [21]:
X.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


### Baseline model

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)
model = LinearRegression()
model.fit(X_train,y_train)

print("intercept: ",model.intercept_)
c = -1
for col in X:
    c = c + 1
    print(f"Coef of {col}:",model.coef_[c])    

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test,y_pred)

print("\nmse: {},\nrmse: {},\nr2: {}".format(mse,rmse,r2))

intercept:  22.589670302295588
Coef of crim: -0.8467795340326705
Coef of zn: 1.4162331203706096
Coef of indus: 0.40553632636772985
Coef of chas: 0.6190210827332898
Coef of nox: -2.485430031811334
Coef of rm: 1.9624446903872248
Coef of age: 0.10052138048272596
Coef of dis: -3.189673162703297
Coef of rad: 2.67519834795224
Coef of tax: -1.8992219849542487
Coef of ptratio: -2.174627955316392
Coef of b: 0.5882865417488481
Coef of lstat: -4.058066528881199

mse: 19.831323672063156,
rmse: 4.453237437198151,
r2: 0.783629538507629


### Gradient Descent

In [23]:
from sklearn import linear_model

In [29]:
gdm = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
gdm.fit(X_train,y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [30]:
gdm.intercept_

array([22.60452632])

In [31]:
c = -1
for col in X:
    c = c + 1
    print(f"Coef of {col}:",gdm.coef_[c])

Coef of crim: -0.7471923688514134
Coef of zn: 1.1981781290451814
Coef of indus: 0.1125253719042599
Coef of chas: 0.6738794171138166
Coef of nox: -2.3170892028739063
Coef of rm: 2.062273148566758
Coef of age: 0.04469053219651615
Coef of dis: -2.9827410508466987
Coef of rad: 1.834117868524956
Coef of tax: -1.0343465248867307
Coef of ptratio: -2.175260605633628
Coef of b: 0.603211005662462
Coef of lstat: -4.028587505367298


In [32]:
y_pred = gdm.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test,y_pred)

print("\nmse: {},\nrmse: {},\nr2: {}".format(mse,rmse,r2))


mse: 19.710376908289707,
rmse: 4.439637024384956,
r2: 0.7849491330806596
