# Boston Housing Data

## Import modules and Data set

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn import grid_search
%matplotlib inline

boston = load_boston()
housing = pd.DataFrame.from_records(boston.data)
housing.columns = boston.feature_names
housing['cost'] = boston.target
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,cost
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Split into training and test set

In [70]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(boston.data, boston.target, test_size=0.2, random_state=0)

## Fit Linear Regression

In [71]:
lr_rgr = linear_model.LinearRegression()
lr_rgr.fit(X_train,y_train)
print("coefficient {}".format(lr_rgr.coef_))
print("intercept {}".format(lr_rgr.intercept_))

LR_MSE = mean_squared_error(y_test, lr_rgr.predict(X_test))
print('Mean squared error for liner regression: {}'.format(LR_MSE))

coefficient [ -1.18410318e-01   4.47550643e-02   5.85674689e-03   2.34230117e+00
  -1.61634024e+01   3.70135143e+00  -3.04553661e-03  -1.38664542e+00
   2.43784171e-01  -1.09856157e-02  -1.04699133e+00   8.22014729e-03
  -4.93642452e-01]
intercept 38.1386927134
Mean squared error for liner regression: 33.4507089677


## Scale and fit Ridge and Lasso Regression

In [72]:
from sklearn import preprocessing
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = cross_validation.train_test_split(preprocessing.scale(boston.data), boston.target, test_size=0.2, random_state=0)

In [73]:
param_grid = [{'alpha':np.linspace(1e-8, 1, 3000)}]

#Ridge
ridge_rgr = linear_model.Ridge(normalize = True)
ridge_cv = grid_search.GridSearchCV(ridge_rgr, param_grid,cv=5)
ridge_cv.fit(X_train_scaled, y_train_scaled)

#Lasso
lasso_rgr = linear_model.Lasso(normalize = True)
lasso_cv = grid_search.GridSearchCV(lasso_rgr, param_grid, cv=5)
lasso_cv.fit(X_train_scaled, y_train_scaled)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-08,   3.33454e-04, ...,   9.99667e-01,   1.00000e+00])}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [74]:
Ridge_MSE = mean_squared_error(y_test_scaled, ridge_cv.best_estimator_.predict(X_test_scaled))
Lasso_MSE = mean_squared_error(y_test_scaled, lasso_cv.best_estimator_.predict(X_test_scaled))

print('Mean squared error for Ridge regression: {}'.format(Ridge_MSE))
print('Mean squared error for Lasso regression: {}'.format(Lasso_MSE))

Mean squared error for Ridge regression: 34.2466856137
Mean squared error for Lasso regression: 33.9447762006


## Try using a different scoring function

In [77]:
from sklearn.metrics import make_scorer, mean_squared_error
mse = make_scorer(mean_squared_error, greater_is_better = False)

param_grid = [{'alpha':np.linspace(1e-8, 1, 3000)}]

#Ridge
ridge_rgr = linear_model.Ridge()
ridge_cv = grid_search.GridSearchCV(ridge_rgr, param_grid,cv=5, scoring=mse)
ridge_cv.fit(X_train_scaled, y_train_scaled)

#Lasso
lasso_rgr = linear_model.Lasso()
lasso_cv = grid_search.GridSearchCV(lasso_rgr, param_grid, cv=5, scoring=mse)
lasso_cv.fit(X_train_scaled, y_train_scaled)


Ridge_MSE = mean_squared_error(y_test_scaled, ridge_cv.best_estimator_.predict(X_test_scaled))
Lasso_MSE = mean_squared_error(y_test_scaled, lasso_cv.best_estimator_.predict(X_test_scaled))

print('Mean squared error for Ridge regression: {}'.format(Ridge_MSE))
print('Mean squared error for Lasso regression: {}'.format(Lasso_MSE))

Mean squared error for Ridge regression: 33.5384524938
Mean squared error for Lasso regression: 33.734886772


In [84]:
print(boston.feature_names)
lasso_cv.best_estimator_.fit(X_train_scaled, y_train_scaled).coef_

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


array([-0.94869572,  0.97451365, -0.        ,  0.5934874 , -1.77404117,
        2.62402232, -0.0470927 , -2.76052172,  1.82096803, -1.60545356,
       -2.24039543,  0.72262279, -3.53482256])

## Add in polynomial features

In [86]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import make_scorer, mean_squared_error

mse = make_scorer(mean_squared_error, greater_is_better = False)
poly = PolynomialFeatures(2)


#Polynomial
poly_rgr = linear_model.LinearRegression()
poly_rgr.fit(poly.fit_transform(X_train),y_train)
prediction = poly_rgr.predict(poly.fit_transform(X_test))
Poly_MSE = mean_squared_error(y_test, prediction )

#Polynomial Ridge 
ridge_poly_rgr = linear_model.Ridge()
ridge_poly_cv = grid_search.GridSearchCV(ridge_poly_rgr, param_grid,cv=5, scoring=mse)
ridge_poly_cv.fit(poly.fit_transform(X_train_scaled), y_train_scaled)
Ridge_Poly_MSE = mean_squared_error(y_test_scaled, ridge_poly_cv.best_estimator_.predict(poly.fit_transform(X_test_scaled)))

#Polynomial Lasso
lasso_poly_rgr = linear_model.Lasso()
lasso_poly_cv = grid_search.GridSearchCV(lasso_poly_rgr, param_grid, cv=5, scoring=mse)
lasso_poly_cv.fit(poly.fit_transform(X_train_scaled), y_train_scaled)
Lasso_Poly_MSE = mean_squared_error(y_test_scaled, lasso_poly_cv.best_estimator_.predict(poly.fit_transform(X_test_scaled)))


print('Mean squared error for Polynomial regression: {}'.format(Poly_MSE))
print('Mean squared error for Polynomial Ridge regression: {}'.format(Ridge_Poly_MSE))
print('Mean squared error for Polynomial Lasso regression: {}'.format(Lasso_Poly_MSE))

Mean squared error for Polynomial regression: 31.4494981089
Mean squared error for Polynomial Ridge regression: 26.3538789603
Mean squared error for Polynomial Lasso regression: 18.5255307863


In [91]:
abs(lasso_poly_cv.best_estimator_.fit(poly.fit_transform(X_train_scaled), y_train_scaled).coef_) > .5

array([False, False, False, False, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False,  True,  True, False, False,
       False, False, False, False,  True, False,  True,  True, False,
       False, False,  True, False, False, False, False, False, False,
        True,  True, False, False, False, False, False,  True, False,
        True,  True,  True, False, False, False, False,  True, False,
       False, False,  True,  True, False, False, False, False, False,
       False,  True, False, False,  True, False,  True, False,  True,
       False, False, False, False, False,  True], dtype=bool)

## Add in higher polynomial features

In [88]:
poly3 = PolynomialFeatures(3)

#Polynomial
poly3_rgr = linear_model.LinearRegression()
poly3_rgr.fit(poly3.fit_transform(X_train),y_train)
Poly3_MSE = mean_squared_error(y_test, poly3_rgr.predict(poly3.fit_transform(X_test)))

#Polynomial Ridge 
ridge_poly3_rgr = linear_model.Ridge()
ridge_poly3_cv = grid_search.GridSearchCV(ridge_poly3_rgr, param_grid,cv=5, scoring=mse)
ridge_poly3_cv.fit(poly3.fit_transform(X_train_scaled), y_train_scaled)
Ridge_Poly3_MSE = mean_squared_error(y_test_scaled, ridge_poly3_cv.best_estimator_.predict(poly3.fit_transform(X_test_scaled)))

#Polynomial Lasso
lasso_poly3_rgr = linear_model.Lasso()
lasso_poly3_cv = grid_search.GridSearchCV(lasso_poly3_rgr, param_grid, cv=5, scoring=mse)
lasso_poly3_cv.fit(poly3.fit_transform(X_train_scaled), y_train_scaled)
Lasso_Poly3_MSE = mean_squared_error(y_test_scaled, lasso_poly3_cv.best_estimator_.predict(poly3.fit_transform(X_test_scaled)))


print('Mean squared error for Polynomial 3 regression: {}'.format(Poly3_MSE))
print('Mean squared error for Polynomial 3 Ridge regression: {}'.format(Ridge_Poly3_MSE))
print('Mean squared error for Polynomial 3 Lasso regression: {}'.format(Lasso_Poly3_MSE))

Mean squared error for Polynomial 3 regression: 1038786.70557
Mean squared error for Polynomial 3 Ridge regression: 26.7484195055
Mean squared error for Polynomial 3 Lasso regression: 21.5094113017


In [90]:
lasso_poly_cv.best_estimator_.fit(poly.fit_transform(X_train_scaled), y_train_scaled).coef_

array([ 0.        , -0.        , -0.        , -0.03248798,  0.        ,
       -0.71405843,  3.16009786, -0.94274203, -1.19646111,  0.00352687,
       -0.70316459, -0.89208375,  1.08569341, -3.46547951,  0.06343218,
        0.        ,  0.        ,  1.33448253, -0.1292301 ,  0.1759247 ,
       -0.        ,  0.        , -0.56331031, -0.        , -0.        ,
        0.        ,  0.        ,  0.09863326,  0.        ,  0.        ,
        0.        ,  0.18661152,  0.        , -0.        ,  0.        ,
        0.4548113 ,  0.10680522, -0.        , -0.        ,  0.57356129,
        0.        ,  1.18890317,  0.56496678,  0.18514058,  0.37949528,
        0.        ,  0.        , -0.40215992,  0.        , -0.83078275,
        0.28971801, -0.85744057, -0.65934531,  0.22864212, -0.        ,
        0.        ,  0.68837561,  0.        ,  0.43212307, -0.3437424 ,
       -0.        , -0.05668273, -0.        ,  1.43171363, -1.34457322,
       -0.        , -0.03741514,  0.        ,  0.13433339,  0.24

In [92]:
boston.data[]

array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
          1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
       [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
       [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
       ..., 
       [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   5.64000000e+00],
       [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
       [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   7.88000000e+00]])

In [102]:
tmp_train = poly.fit_transform(X_train)[:,abs(lasso_poly_cv.best_estimator_.fit(poly.fit_transform(X_train_scaled), y_train_scaled).coef_) > .5]
tmp_test = poly.fit_transform(X_test)[:,abs(lasso_poly_cv.best_estimator_.fit(poly.fit_transform(X_train_scaled), y_train_scaled).coef_) > .5]

In [103]:
tmp_rgr = linear_model.LinearRegression()
tmp_rgr.fit(tmp_train,y_train)
tmp_MSE = mean_squared_error(y_test, tmp_rgr.predict(tmp_test))

In [104]:
tmp_MSE

19.635923923565741

In [106]:
print('Mean squared error for Polynomial 3 regression: {}'.format(Poly3_MSE))
print('Mean squared error for Polynomial 3 Ridge regression: {}'.format(Ridge_Poly3_MSE))
print('Mean squared error for Polynomial 3 Lasso regression: {}'.format(Lasso_Poly3_MSE))

print(mean_squared_error(y_train, poly3_rgr.predict(poly3.fit_transform(X_train))))
print(mean_squared_error(y_train_scaled, ridge_poly3_cv.best_estimator_.predict(poly3.fit_transform(X_train_scaled))))
print(mean_squared_error(y_train_scaled, lasso_poly3_cv.best_estimator_.predict(poly3.fit_transform(X_train_scaled))))

Mean squared error for Polynomial 3 regression: 1038786.70557
Mean squared error for Polynomial 3 Ridge regression: 26.7484195055
Mean squared error for Polynomial 3 Lasso regression: 21.5094113017
1.14026884237e-11
0.920643329882
5.85804144377
