# Boston dataset - Regression

## Initial imports

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [120]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [121]:
boston = load_boston()

In [122]:
print("boston keys: \n{}".format(boston.keys()))

boston keys: 
dict_keys(['data', 'target', 'feature_names', 'DESCR'])


In [123]:
boston.data.shape

(506, 13)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                   test_size=0.3, random_state=0)

# OLS

In [125]:
from sklearn.linear_model import LinearRegression

In [126]:
%%timeit
lm = LinearRegression()
lm.fit(X_train, y_train)

475 µs ± 8.95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [127]:
#Coeffs df
coefficients = lm.coef_.reshape(1,13)
pd.DataFrame(coefficients, columns=boston.feature_names)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.119859,0.044423,0.011861,2.512951,-16.271037,3.849099,-0.009855,-1.500027,0.241508,-0.011067,-1.018977,0.006953,-0.488111


In [128]:
lm.intercept_

37.992592770342782

In [129]:
print("training set score: {:.2f}".format(lm.score(X_train, y_train)))
print('test set score: {:.2f}'.format(lm.score(X_test, y_test)))

training set score: 0.76
test set score: 0.67


# Ridge

In [130]:
from sklearn.linear_model import Ridge

In [131]:
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [132]:
print("train set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("test set score: {:.2f}".format(ridge.score(X_test, y_test)))

train set score: 0.76
test set score: 0.67


In [133]:
ridge_coeffs = ridge.coef_.reshape(1,13)
pd.DataFrame(ridge_coeffs, columns=boston.feature_names)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.116678,0.046081,-0.020415,2.460735,-8.278639,3.888008,-0.017804,-1.396752,0.217637,-0.01163,-0.932674,0.007406,-0.495456


# Lasso

In [134]:
from sklearn.linear_model import Lasso

In [135]:
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [136]:
print("train set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("number of features used: {}".format(np.sum(lasso.coef_ != 0)))

train set score: 0.71
test set score: 0.61
number of features used: 10


In [137]:
lasso01 = Lasso(alpha=0.1)
lasso01.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [138]:
print("train set score: {:.2f}".format(lasso01.score(X_train, y_train)))
print("test set score: {:.2f}".format(lasso01.score(X_test, y_test)))
print("number of features used: {}".format(np.sum(lasso01.coef_ != 0)))

train set score: 0.75
test set score: 0.65
number of features used: 12


# Feature Engineering

In [139]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [140]:
import warnings
warnings.filterwarnings('ignore')

In [141]:
pipe = Pipeline([('preprocessing', StandardScaler()), 
                 ('polynomialfeatures', PolynomialFeatures()),
                 ('regressor', LinearRegression())])

In [142]:
param_grid = [{'regressor': [LinearRegression()], 'preprocessing': [StandardScaler(), MinMaxScaler(), None],
               'polynomialfeatures__degree': [1,2,3]},
              
              {'regressor': [Ridge()], 'preprocessing': [StandardScaler(), MinMaxScaler(), None],
               'regressor__alpha': [0.001,0.01,0.1,1,10,100], 'polynomialfeatures__degree': [1,2,3]},
              
              {'regressor': [Lasso()], 'preprocessing': [StandardScaler(), MinMaxScaler(), None],
               'regressor__alpha': [0.001,0.01,0.1,1,10,100], 'polynomialfeatures__degree': [1,2,3]}]

In [143]:
%%timeit
grid = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1)
grid.fit(X_train, y_train)

16.3 s ± 300 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [144]:
print('Best parameters: {}'.format(grid.best_params_))
print('\ntrain set score: {:.2f}'.format(grid.score(X_train, y_train)))
print('test set score: {:.2f}'.format(grid.score(X_test, y_test)))

Best parameters: {'polynomialfeatures__degree': 3, 'preprocessing': MinMaxScaler(copy=True, feature_range=(0, 1)), 'regressor': Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), 'regressor__alpha': 0.1}

train set score: 0.95
test set score: 0.82


In [147]:
pipe_ridge = make_pipeline(MinMaxScaler(), PolynomialFeatures(), Ridge())

param_grid_ridge = {'polynomialfeatures__degree': [1,2,3],
                    'ridge__alpha': [0.001,0.01,0.1,1,10,100],
                    'ridge__max_iter': [100,1000,10000,100000]}

gridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=10)
gridge.fit(X_train, y_train)

print('Best parameters: {}'.format(gridge.best_params_))
print('\ntrain set score: {:.2f}'.format(gridge.score(X_train, y_train)))
print('test set score: {:.2f}'.format(gridge.score(X_test, y_test)))

Best parameters: {'polynomialfeatures__degree': 3, 'ridge__alpha': 0.1, 'ridge__max_iter': 1000}

train set score: 0.95
test set score: 0.82
