In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data

In [2]:
filename = "housing.csv"
# Column labels are supplied explicitly via `names=`; MEDV (last) is the target
# that the later cells slice off as Y.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# Fields are separated by runs of whitespace. `sep=r'\s+'` is the documented
# replacement for the deprecated `delim_whitespace=True` flag.
data = pd.read_csv(filename, names=names, sep=r'\s+')

data.head(20)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1,18.9


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(506, 14)

In [4]:
# The dtype pandas assigned to each column while parsing the file.
data.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

### Separate dataset

In [5]:
# Convert the DataFrame to a plain NumPy array for positional slicing.
# np.asarray yields the same array as `.values` but is a no-op when `data` is
# already an ndarray, so re-running this cell no longer raises AttributeError.
# NOTE(review): `data` still changes type here (DataFrame -> ndarray); earlier
# cells such as `data.dtypes` cannot be re-run after this one.
data = np.asarray(data)

In [6]:
# First 13 columns are the predictors; column 13 (MEDV in `names`) is the target.
X, Y = data[:, :13], data[:, 13]

### Split dataset

In [7]:
from sklearn.model_selection import KFold

# 10 contiguous, unshuffled folds shared by every model below.
# `random_state` is only meaningful together with shuffle=True; scikit-learn
# >= 0.24 raises ValueError when it is set while shuffle is False. Unshuffled
# splitting is already deterministic, so the seed is simply omitted.
kfold = KFold(n_splits=10)

## Linear Algorithms

### Linear Regression
-  assumes that the input variables have a Gaussian distribution

In [8]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares; the intercept is fitted (the library default, spelled out).
model = LinearRegression(fit_intercept=True)

In [9]:
from sklearn.model_selection import cross_val_score

# One score per fold for the current `model`. The scorer is the *negated* MSE
# (scikit-learn scorers are "higher is better"), hence the negative mean below.
results = cross_val_score(
    model, X, Y,
    scoring='neg_mean_squared_error',
    cv=kfold,
)
print(results.mean())

-34.705255944523735


### Ridge Regression
- an extension of linear regression where the loss function is modified to penalize model complexity, measured as the sum of the squared coefficient values (the squared L2-norm)

In [10]:
from sklearn.linear_model import Ridge

# L2-penalized linear model; alpha=1.0 is the library default, made explicit.
model = Ridge(alpha=1.0)

In [11]:
# Cross-validated negated MSE for the Ridge model. `cross_val_score` is already
# imported in the Linear Regression section, so it is not imported again here.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-34.07824620925929


### LASSO Regression
- an extension of linear regression where the loss function is modified to penalize model complexity, measured as the sum of the absolute coefficient values (the L1-norm)

In [12]:
from sklearn.linear_model import Lasso

# L1-penalized linear model; alpha=1.0 is the library default, made explicit.
model = Lasso(alpha=1.0)

In [13]:
# Cross-validated negated MSE for the Lasso model. `cross_val_score` is already
# imported in the Linear Regression section, so it is not imported again here.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-34.46408458830232


### ElasticNet Regression
-  a form of regularization regression that combines the properties of both Ridge Regression and LASSO regression
-  minimizes the complexity of the regression model (the magnitude and number of regression coefficients) by penalizing it with both the L2-norm (sum of squared coefficient values) and the L1-norm (sum of absolute coefficient values)

In [14]:
from sklearn.linear_model import ElasticNet

# Combined L1/L2 penalty; both values below are the library defaults
# (l1_ratio=0.5 weights the two penalties equally), written out explicitly.
model = ElasticNet(alpha=1.0, l1_ratio=0.5)

In [15]:
# Cross-validated negated MSE for the ElasticNet model. `cross_val_score` is
# already imported in the Linear Regression section, so it is not imported again.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-31.164573714249762


## Nonlinear Algorithms

### K-Nearest Neighbors
-  locates the k most similar instances in the training dataset for a new data instance
- From the k neighbors, a mean or median output variable is taken as the prediction
-  Minkowski distance is used by default, which is a generalization of both the Euclidean distance (used when all inputs have the same scale) and Manhattan distance (used when the scales of the input variables differ)

In [16]:
from sklearn.neighbors import KNeighborsRegressor

# k=5 neighbours is the library default, made explicit.
# NOTE(review): X is passed in unscaled; distance-based models are usually
# sensitive to feature scale, so consider standardizing the inputs.
model = KNeighborsRegressor(n_neighbors=5)

In [17]:
# Cross-validated negated MSE for the KNN model. `cross_val_score` is already
# imported in the Linear Regression section, so it is not imported again here.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-107.28683898039215


### Classification and Regression Trees
- use the training data to select the best points to split the data in order to minimize a cost metric
-  the default cost metric for regression decision trees is the mean squared error (set via the `criterion` parameter)

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed: the tree builder permutes features at each split, so without
# random_state the fitted tree (and the score below) can change between runs.
# NOTE(review): the score recorded under the next cell came from an unseeded run.
model = DecisionTreeRegressor(random_state=7)

In [19]:
# Cross-validated negated MSE for the decision tree. `cross_val_score` is already
# imported in the Linear Regression section, so it is not imported again here.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-39.550734509803924


### Support Vector Machines
-  Support Vector Machines extended to predict real-valued outputs are called Support Vector Regression (SVR)

In [20]:
from sklearn.svm import SVR

# RBF-kernel SVR; the three values below are the library defaults, spelled out.
# NOTE(review): X is passed in unscaled; kernel methods are usually sensitive
# to feature scale, so consider standardizing the inputs.
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

In [21]:
# Cross-validated negated MSE for the SVR model. `cross_val_score` is already
# imported in the Linear Regression section, so it is not imported again here.
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())

-91.04782433324428
