In [1]:
# 12.1 Algorithm Overview

In [2]:
# - Linear algorithms:
    # - Linear Regression
    # - Ridge Regression
    # - Lasso Linear Regression
    # - Elastic Net Regression
# - Nonlinear algorithms:
    # - k-Nearest Neighbors
    # - Classification and Regression Trees
    # - Support Vector Machines

In [3]:
# - The mean squared error values are inverted (negative). 
# - This is a quirk of the cross_val_score() function, which requires every scoring metric to follow
# the convention that a larger value is better, so error metrics such as MSE are reported negated.

In [4]:
from pandas import read_csv

In [5]:
import numpy

In [6]:
import sys

In [7]:
def print_data(_data, _rows=5, _fmt='%5.3f'):
    """Write the leading rows of a numeric array to standard output.

    Parameters
    ----------
    _data : array_like
        One- or two-dimensional numeric data.
    _rows : int, optional
        Number of leading rows (first-axis entries) to print. Defaults to 5.
    _fmt : str, optional
        Per-value format handed to ``numpy.savetxt``. Defaults to ``'%5.3f'``.

    Returns
    -------
    None
        The text is written to ``sys.stdout``; nothing is returned.
    """
    # Slice along the first axis only, so a 1-D vector is accepted as well as
    # a 2-D table (a `[:n, :]` index would raise IndexError on 1-D input).
    _head = numpy.asarray(_data)[:_rows]
    numpy.savetxt(sys.stdout, _head, _fmt)

In [8]:
# Remote location of the whitespace-delimited housing data file (UCI ML repository).
# NOTE(review): network resource — confirm the URL is still served before relying on Run All.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

In [9]:
# Column labels supplied to read_csv via names=; the first 13 are used as features
# and the last one (MEDV) as the regression target in the cells below.
_col_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']

In [10]:
# Parse the whitespace-delimited file, labelling the columns with _col_names.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2.
_dataframe = read_csv(_uri, sep=r'\s+', names=_col_names)

In [11]:
# Preview the first five parsed rows.
_dataframe.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [12]:
# Pull the frame's contents out as a NumPy array so columns can be sliced by position.
_array = _dataframe.values

In [13]:
# Show the first five rows of the full array (13 feature columns plus the target).
print_data(_array)

0.006 18.000 2.310 0.000 0.538 6.575 65.200 4.090 1.000 296.000 15.300 396.900 4.980 24.000
0.027 0.000 7.070 0.000 0.469 6.421 78.900 4.967 2.000 242.000 17.800 396.900 9.140 21.600
0.027 0.000 7.070 0.000 0.469 7.185 61.100 4.967 2.000 242.000 17.800 392.830 4.030 34.700
0.032 0.000 2.180 0.000 0.458 6.998 45.800 6.062 3.000 222.000 18.700 394.630 2.940 33.400
0.069 0.000 2.180 0.000 0.458 7.147 54.200 6.062 3.000 222.000 18.700 396.900 5.330 36.200


In [14]:
# Feature matrix: columns 0-12 (CRIM through LSTAT).
_X = _array[:,0:13]

In [15]:
# Show the first five rows of the feature matrix.
print_data(_X)

0.006 18.000 2.310 0.000 0.538 6.575 65.200 4.090 1.000 296.000 15.300 396.900 4.980
0.027 0.000 7.070 0.000 0.469 6.421 78.900 4.967 2.000 242.000 17.800 396.900 9.140
0.027 0.000 7.070 0.000 0.469 7.185 61.100 4.967 2.000 242.000 17.800 392.830 4.030
0.032 0.000 2.180 0.000 0.458 6.998 45.800 6.062 3.000 222.000 18.700 394.630 2.940
0.069 0.000 2.180 0.000 0.458 7.147 54.200 6.062 3.000 222.000 18.700 396.900 5.330


In [16]:
# Target: column 13 (MEDV). The `13:` slice keeps it two-dimensional;
# it is flattened to 1-D in a later cell.
_Y = _array[:,13:]

In [17]:
# Show the first five target values (still a 2-D single-column array here).
print_data(_Y)

24.000
21.600
34.700
33.400
36.200


In [18]:
# Flatten the single-column target into a 1-D vector.
_Y = numpy.ravel(_Y)

In [19]:
# Check the flattened target: first five values.
_Y[:5]

array([ 24. ,  21.6,  34.7,  33.4,  36.2])

In [20]:
from sklearn.model_selection import KFold

In [21]:
# Ten contiguous, unshuffled folds shared by every model evaluated below.
# random_state is deliberately omitted: without shuffle=True it has no effect on
# the splits, and scikit-learn >= 0.24 raises ValueError when it is set while
# shuffle is False. For randomised folds use
# KFold(n_splits=10, shuffle=True, random_state=7) and re-run the cells below.
_kfold = KFold(n_splits=10)

In [22]:
from sklearn.model_selection import cross_val_score

In [23]:
# Scorer name passed to cross_val_score: mean squared error, negated so that larger is better.
_scoring = 'neg_mean_squared_error'

In [24]:
# 12.2 Linear Machine Learning Algorithms

In [25]:
# 12.2.1 Linear Regression

In [26]:
# - Linear regression assumes that the input variables have a Gaussian distribution. 

# - It is also assumed that input variables are relevant to the output variable and 
# that they are not highly correlated with each other (a problem called collinearity).

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
# Linear regression with default settings (no arguments passed).
_model = LinearRegression()

In [29]:
# Cross-validate linear regression with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [30]:
# Mean of the negated-MSE fold scores for linear regression, to three decimals.
'{:.3f}'.format(_score.mean())

'-34.705'

In [31]:
# Standard deviation of the fold scores for linear regression, to three decimals.
'{:.3f}'.format(_score.std())

'45.574'

In [32]:
# 12.2.2 Ridge Regression

In [33]:
# - Ridge regression is an extension of linear regression where the loss function is modified
# to minimize the complexity of the model, measured as the sum of the squared coefficient
# values (the squared L2-norm).

In [34]:
from sklearn.linear_model import Ridge

In [35]:
# Ridge (L2-penalised) regression with default settings (no arguments passed).
_model = Ridge()

In [36]:
# Cross-validate ridge regression with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [37]:
# Mean of the negated-MSE fold scores for ridge regression, to three decimals.
'{:.3f}'.format(_score.mean())

'-34.078'

In [38]:
# Standard deviation of the fold scores for ridge regression, to three decimals.
'{:.3f}'.format(_score.std())

'45.900'

In [39]:
# 12.2.3 LASSO Regression

In [40]:
# - The Least Absolute Shrinkage and Selection Operator (or LASSO for short) is a modification
# of linear regression, like ridge regression, where the loss function is modified to minimize
# the complexity of the model, measured as the sum of the absolute coefficient values
# (also called the L1-norm).

In [41]:
from sklearn.linear_model import Lasso

In [42]:
# LASSO (L1-penalised) regression with default settings (no arguments passed).
_model = Lasso()

In [43]:
# Cross-validate LASSO regression with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [44]:
# Mean of the negated-MSE fold scores for LASSO regression, to three decimals.
'{:.3f}'.format(_score.mean())

'-34.464'

In [45]:
# Standard deviation of the fold scores for LASSO regression, to three decimals.
'{:.3f}'.format(_score.std())

'27.889'

In [46]:
# 12.2.4 ElasticNet Regression

In [47]:
# - ElasticNet is a form of regularization regression that combines the properties of both 
# Ridge Regression and LASSO regression. 

# - It seeks to minimize the complexity of the regression model (magnitude and number of
# regression coefficients) by penalizing the model using both the squared L2-norm (sum of squared
# coefficient values) and the L1-norm (sum of absolute coefficient values).

In [48]:
from sklearn.linear_model import ElasticNet

In [49]:
# ElasticNet (combined L1 and L2 penalty) with default settings (no arguments passed).
_model = ElasticNet()

In [50]:
# Cross-validate ElasticNet with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [51]:
# Mean of the negated-MSE fold scores for ElasticNet, to three decimals.
'{:.3f}'.format(_score.mean())

'-31.165'

In [52]:
# Standard deviation of the fold scores for ElasticNet, to three decimals.
'{:.3f}'.format(_score.std())

'22.709'

In [53]:
# 12.3 Nonlinear Machine Learning Algorithms

In [54]:
# 12.3.1 k-Nearest Neighbors

In [55]:
# - The k-Nearest Neighbors algorithm (or KNN) locates the k most similar instances in the training 
# dataset for a new data instance. 

# - From the k neighbors, a mean or median output variable is taken as the prediction. 

# - Of note is the distance metric used (the metric argument). The Minkowski distance is used by default, 
# which is a generalization of both the Euclidean distance (used when all inputs have the same scale) 
# and Manhattan distance (for when the scales of the input variables differ).

In [56]:
from sklearn.neighbors import KNeighborsRegressor

In [57]:
# k-nearest-neighbours regressor; k and the distance metric are left at the library defaults.
_model = KNeighborsRegressor()

In [58]:
# Cross-validate the KNN regressor with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [59]:
# Mean of the negated-MSE fold scores for the KNN regressor, to three decimals.
'{:.3f}'.format(_score.mean())

'-107.287'

In [60]:
# Standard deviation of the fold scores for the KNN regressor, to three decimals.
'{:.3f}'.format(_score.std())

'79.840'

In [61]:
# 12.3.2 Classification and Regression Trees

In [62]:
# - Decision trees or the Classification and Regression Trees (CART as they are known) use the training 
# data to select the best points to split the data in order to minimize a cost metric. 

# - The default cost metric for regression decision trees is the mean squared error, specified in the 
# criterion parameter.

In [63]:
from sklearn.tree import DecisionTreeRegressor

In [64]:
# CART regression tree with default settings (no arguments passed).
# NOTE(review): no random_state is given — confirm the recorded score is reproducible across runs.
_model = DecisionTreeRegressor()

In [65]:
# Cross-validate the regression tree with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [66]:
# Mean of the negated-MSE fold scores for the regression tree, to three decimals.
'{:.3f}'.format(_score.mean())

'-39.243'

In [67]:
# Standard deviation of the fold scores for the regression tree, to three decimals.
'{:.3f}'.format(_score.std())

'29.878'

In [68]:
# 12.3.3 Support Vector Machines

In [69]:
# - Support Vector Machines (SVM) were developed for binary classification. 

# - The technique has been extended to the prediction of real-valued targets, where it is called Support Vector Regression (SVR).

# - Like the classification example, SVR is built upon the LIBSVM library. 

In [70]:
from sklearn.svm import SVR

In [71]:
# Support vector regressor with default settings (no arguments passed).
# NOTE(review): _X is passed unscaled and library defaults may differ between
# scikit-learn versions — confirm the recorded score on the installed version.
_model = SVR()

In [72]:
# Cross-validate the support vector regressor with the shared _kfold splitter and _scoring metric.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [73]:
# Mean of the negated-MSE fold scores for the support vector regressor, to three decimals.
'{:.3f}'.format(_score.mean())

'-91.048'

In [74]:
# Standard deviation of the fold scores for the support vector regressor, to three decimals.
'{:.3f}'.format(_score.std())

'71.102'