In [24]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import warnings
warnings.filterwarnings("ignore")

In [2]:
boston = load_boston()

In [3]:
boston.data.shape

(506, 13)

In [4]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [5]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Training testing split

In [6]:
boston.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [7]:
#transform target in 2-D array
boston.target = np.reshape(boston.target, (-1,1))
boston.target

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2],
       [28.7],
       [22.9],
       [27.1],
       [16.5],
       [18.9],
       [15. ],
       [18.9],
       [21.7],
       [20.4],
       [18.2],
       [19.9],
       [23.1],
       [17.5],
       [20.2],
       [18.2],
       [13.6],
       [19.6],
       [15.2],
       [14.5],
       [15.6],
       [13.9],
       [16.6],
       [14.8],
       [18.4],
       [21. ],
       [12.7],
       [14.5],
       [13.2],
       [13.1],
       [13.5],
       [18.9],
       [20. ],
       [21. ],
       [24.7],
       [30.8],
       [34.9],
       [26.6],
       [25.3],
       [24.7],
       [21.2],
       [19.3],
       [20. ],
       [16.6],
       [14.4],
       [19.4],
       [19.7],
       [20.5],
       [25. ],
       [23.4],
       [18.9],
       [35.4],
       [24.7],
       [31.6],
       [23.3],
       [19.6],
       [18.7],
       [16. ],
       [22.2],
       [25. ],
       [33. ],
       [23.5],
       [19

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.25, random_state = 33)

In [9]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train)
X_train = scalerX.transform(X_train)
y_train = scalery.transform(y_train)
X_test = scalerX.transform(X_test)
y_test = scalery.transform(y_test)

### Function to train an evaluate

In [10]:
from sklearn.model_selection import *

In [11]:
def train_and_evaluate(clf, X_train, y_train):
    '''
    function to training and evaluate model
    params:
    clf: model object
    X_train: features train dataset
    y_train: target train
    '''
    clf.fit(X_train, y_train)
    print("Training Score: ", clf.score(X_train, y_train))
    #create K-fold with 5 splits
    cv = KFold(shuffle = True, random_state = 33)
    scores = cross_val_score(clf, X_train, y_train, cv = cv)
    print("Average coefficient of determination using 5-fold crossvalidation: ",np.mean(scores))

### First Model

In [25]:
from sklearn import linear_model
clf_sgd = linear_model.SGDRegressor(loss = 'squared_loss', penalty = None, random_state = 42)
train_and_evaluate(clf_sgd, X_train, y_train)

Training Score:  0.7504778181232462
Average coefficient of determination using 5-fold crossvalidation:  0.7100151093068392


In [26]:
clf_sgd = linear_model.SGDRegressor(loss = 'squared_loss', penalty = 'l2', random_state = 42)
train_and_evaluate(clf_sgd, X_train, y_train)

Training Score:  0.7504743060147638
Average coefficient of determination using 5-fold crossvalidation:  0.7100210879140052


### Second Model

In [27]:
from sklearn import svm

In [28]:
clf_svr = svm.SVR(kernel = 'linear')
train_and_evaluate(clf_svr, X_train, y_train)

Training Score:  0.7179419427730855
Average coefficient of determination using 5-fold crossvalidation:  0.7089015083945411


In [30]:
#polynomial
clf_svr_pol = svm.SVR(kernel = 'poly')
train_and_evaluate(clf_svr, X_train, y_train)

Training Score:  0.7179419427730855
Average coefficient of determination using 5-fold crossvalidation:  0.7089015083945411


In [32]:
#radial basis function
clf_svr_rbf = svm.SVR(kernel = 'rbf')
train_and_evaluate(clf_svr_rbf, X_train, y_train)

Training Score:  0.900198918588965
Average coefficient of determination using 5-fold crossvalidation:  0.8336907691882237


### Third Model

In [33]:
from sklearn import ensemble

In [39]:
clf_et = ensemble.ExtraTreesRegressor(n_estimators = 10, random_state = 42)
train_and_evaluate(clf_et, X_train, y_train)

Training Score:  1.0
Average coefficient of determination using 5-fold crossvalidation:  0.8421583150350583


### Measure Performance

In [42]:
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True,
show_r2_score=False):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)))
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred))
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred))
    if show_r2_score:
        print( "Coefficient of determination:{0:.3f}".format(metrics.r2_score(y, y_pred)))

In [47]:
measure_performance(X_test, y_test, clf_et,
show_accuracy=False, show_classification_report=False,
show_confusion_matrix=False, show_r2_score=True)

Coefficient of determination:0.790
