In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

In [72]:
df = pd.read_csv("BostonHousing.csv")

In [73]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [31]:
#X = df.drop(["medv"],axis=1)
#y = df["medv"]

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

### Standardization

In [74]:
from sklearn.preprocessing import StandardScaler

In [75]:
for col in X:
    ss = StandardScaler()
    X[col] = ss.fit_transform(X[[col]])

In [76]:
X.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


### Baseline model

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)
model = LinearRegression()
model.fit(X_train,y_train)

print("intercept: ",model.intercept_)
c = -1
for col in X:
    c = c + 1
    print(f"Coef of {col}:",model.coef_[c])    

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test,y_pred)

print("\nmse: {},\nrmse: {},\nr2: {}".format(mse,rmse,r2))

intercept:  22.589670302295588
Coef of crim: -0.8467795340326705
Coef of zn: 1.4162331203706096
Coef of indus: 0.40553632636772985
Coef of chas: 0.6190210827332898
Coef of nox: -2.485430031811334
Coef of rm: 1.9624446903872248
Coef of age: 0.10052138048272596
Coef of dis: -3.189673162703297
Coef of rad: 2.67519834795224
Coef of tax: -1.8992219849542487
Coef of ptratio: -2.174627955316392
Coef of b: 0.5882865417488481
Coef of lstat: -4.058066528881199

mse: 19.831323672063156,
rmse: 4.453237437198151,
r2: 0.783629538507629


In [78]:
# Train 
model.score(X_train,y_train)

0.7103879080674731

In [79]:
# Test
model.score(X_test,y_test)

0.783629538507629

In [25]:
# this train and test scores gives hint of overfitting

### Regularization

In [80]:
from sklearn.linear_model import Lasso # lambda*sum(abs(coef))
from sklearn.linear_model import Ridge # lambda*sum(square(coef))

In [81]:
# Ridge: Reduces values of less important features coef
l2 = Ridge(10)  # random lambda value = 10
l2.fit(X_train,y_train)
l2.coef_

array([-7.37431526e-01,  1.15370364e+00,  1.30752903e-01,  6.59367477e-01,
       -2.10309974e+00,  2.08649367e+00, -7.07818866e-04, -2.76918002e+00,
        1.86541312e+00, -1.16120975e+00, -2.08031239e+00,  6.01854686e-01,
       -3.85603637e+00])

In [87]:
# Lasso helps in feature selection : it completely reduce coef values to zero
l1 = Lasso(100) 
l1.fit(X_train,y_train)
l1.coef_

array([-0.,  0., -0.,  0., -0.,  0., -0.,  0., -0., -0., -0.,  0., -0.])

In [83]:
l1.coef_.round()

array([-0.,  0., -0.,  0., -0.,  0., -0.,  0., -0., -0., -0.,  0., -0.])

<p>Finding right lambda value</p>

<p>Ridge</p>

In [84]:
for i in range(50):
    l2 = Ridge(i)
    l2.fit(X_train,y_train)
    print(l2.score(X_test,y_test))

0.7836295385076292
0.7838387187540423
0.7839620554972144
0.7840142325806498
0.7840067969561306
0.7839489331676968
0.7838480250175017
0.7837100682170127
0.783539977221919
0.7833418159451742
0.7831189730475356
0.7828742964227626
0.7826101973245045
0.7823287316856521
0.7820316641450696
0.7817205188494297
0.7813966200582486
0.781061124825982
0.7807150494827411
0.7803592912271947
0.7799946458413467
0.7796218223087347
0.7792414549450688
0.7788541135189018
0.7784603117391521
0.7780605144085264
0.7776551434814901
0.7772445832182698
0.7768291845893088
0.7764092690553417
0.775985131825017
0.7755570446734668
0.7751252583903597
0.7746900049140116
0.7742514991984392
0.7738099408523818
0.7733655155828939
0.7729183964708508
0.7724687451013872
0.7720167125687165
0.7715624403718154
0.7711060612149998
0.7706476997253663
0.7701874730973415
0.7697254916731515
0.7692618594667894
0.7687966746380397
0.7683300299222359
0.7678620130206896
0.7673927069560891


<p>Lasso</p>

In [89]:
for i in range(0,50,1):
    l1 = Lasso(i)
    l1.fit(X_train,y_train)
    print(l1.score(X_test,y_test))

0.7836295385076297
0.6734933956327489
0.5867517835508683
0.4655610145837574
0.33676752243360863
0.22068575797736978
0.10205025878591267
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262274585616
-0.0045026262

### Final Model Regularization

In [90]:
l2 = Ridge(alpha=3)
l2.fit(X_train,y_train)
print(l2.score(X_test,y_test))

0.7840142325806498


In [91]:
l2.coef_

array([-0.80596183,  1.31952331,  0.29860739,  0.63499052, -2.35504719,
        2.00761986,  0.06504653, -3.04756794,  2.3627285 , -1.60354086,
       -2.1443643 ,  0.59311307, -3.99232021])

In [92]:
l1 = Lasso(alpha=0)
l1.fit(X_train,y_train)
print(l1.score(X_test,y_test))

0.7836295385076297


In [93]:
l1.coef_

array([-0.84677953,  1.41623312,  0.40553633,  0.61902108, -2.48543003,
        1.96244469,  0.10052138, -3.18967316,  2.67519835, -1.89922198,
       -2.17462796,  0.58828654, -4.05806653])

In [94]:
l1.coef_.round(3)
# Feature selection

array([-0.847,  1.416,  0.406,  0.619, -2.485,  1.962,  0.101, -3.19 ,
        2.675, -1.899, -2.175,  0.588, -4.058])

### Cross Validation
<p>cross validation let us know which model is working perfectly on overall data</p>
<p>Ridge and Lasso are working on training data not on entire data</p>

In [95]:
from sklearn.model_selection import cross_val_score

In [96]:
l2_cross = cross_val_score(l2,X,y,cv=4)

In [97]:
l2_cross

array([ 0.61236514,  0.60829726,  0.36016978, -0.96438488])

In [98]:
l2_cross.mean()

0.15411182725352557

In [99]:
l1_cross = cross_val_score(l1,X,y,cv=4)

In [100]:
l1_cross

array([ 0.60217169,  0.60398145,  0.35873597, -1.10867706])

In [101]:
l1_cross.mean()

0.11405301290099479

### Gradient Descent

In [3]:
df = pd.read_csv("BostonHousing.csv")

In [4]:
X = df[['crim', 'zn', 'chas', 'nox', 'rm', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat']]
y = df["medv"]

In [5]:
from sklearn import linear_model

In [6]:
gdm = linear_model.SGDRegressor(max_iter=100, tol=1e-3)
gdm.fit(X,y)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=100,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [7]:
gdm.intercept_

array([1.37106503e+10])

In [8]:
gdm.coef_.round(3)

array([ 2.32217379e+11, -3.45112199e+10,  6.63887932e+10,  5.70056363e+09,
        1.19567122e+11,  1.33561005e+11, -1.36984182e+11, -3.06180851e+11,
        2.31574569e+09, -5.07948079e+10, -5.69230858e+10])

In [9]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [10]:
y_pred = gdm.predict(X.iloc[:5,:])
y_pred.round()

array([-1.10445560e+14, -9.35553729e+13, -9.29664164e+13, -8.68820633e+13,
       -8.71070805e+13])

In [12]:
def gradient_descent(x,y):
    
    
    n = len(x)
    learning_rate = 0.08
    intercept = slope = 0
    iterations = 50
    
    for i in range(iterations):
        y_pred = intercept + x*slope
        
        mse = mean_squared_error(y_pred,y)
        
        md = -(2/n)*sum(x*(y-y_pred))
        bd = -(2/n)*sum(y-y_pred)

        slope = slope - (learning_rate * md)
        intercept = intercept - (learning_rate * bd)
        
        print("slope: ",slope,"\nintercept: ",intercept,"\nmse: ",mse,"\niteration: ",i,"\n-------------------")

In [13]:
gradient_descent(X,y)

ValueError: y_true and y_pred have different number of output (11!=1)