In [1]:
from pandas import read_csv
import numpy

In [2]:
# The Boston house price dataset describes properties of houses in Boston suburbs and is concerned with 
# modeling the price of houses in those suburbs in thousands of dollars. 
# As such, this is a regression predictive modeling problem. 
# There are 13 input variables that describe the properties of a given Boston suburb, 
# plus one output variable (MEDV, the last attribute listed below).
# The full list of attributes in this dataset is as follows:
# 1. CRIM: per capita crime rate by town.
# 2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
# 3. INDUS: proportion of non-retail business acres per town.
# 4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). 
# 5. NOX: nitric oxides concentration (parts per 10 million).
# 6. RM: average number of rooms per dwelling.
# 7. AGE: proportion of owner-occupied units built prior to 1940. 
# 8. DIS: weighted distances to five Boston employment centers. 
# 9. RAD: index of accessibility to radial highways.
# 10. TAX: full-value property-tax rate per $10,000.
# 11. PTRATIO: pupil-teacher ratio by town.
# 12. B: 1000(Bk − 0.63)^2 where Bk is the proportion of Black residents by town. 
# 13. LSTAT: % lower status of the population.
# 14. MEDV: Median value of owner-occupied homes in $1000s.

In [3]:
# Fix NumPy's global random generator to a known seed so that any later
# random draws are repeatable from one run to the next.
_seed = 7
numpy.random.seed(seed=_seed)

In [4]:
# Location of the raw housing data file on the UCI Machine Learning
# Repository archive server; it is fetched over the network when read.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

In [5]:
# Load the data file into a DataFrame. The file has no header row
# (header=None) and its fields are separated by runs of whitespace.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True flag and parses the file the same way, while
# continuing to work on pandas releases that no longer accept that flag.
_dataframe = read_csv(_uri, sep=r'\s+', header=None)

In [6]:
# Report the (rows, columns) shape of the loaded frame, framed above and
# below by a 100-character '=' rule.
print(
    '{}\nShape:\n{}\n{}'.format(
        '=' * 100,
        _dataframe.shape,
        '=' * 100,
    )
)

Shape:
(506, 14)


In [7]:
# Pull the frame's contents out as a plain NumPy array so the columns can
# be split by integer position in the cells that follow.
_array = _dataframe.values

In [8]:
# Preview the first five rows of the raw array, followed by a '=' rule.
print(
    '_array:\n{}\n{}'.format(
        _array[:5],
        '=' * 100,
    )
)

_array:
[[  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
    5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
    1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
    4.98000000e+00   2.40000000e+01]
 [  2.73100000e-02   0.00000000e+00   7.07000000e+00   0.00000000e+00
    4.69000000e-01   6.42100000e+00   7.89000000e+01   4.96710000e+00
    2.00000000e+00   2.42000000e+02   1.78000000e+01   3.96900000e+02
    9.14000000e+00   2.16000000e+01]
 [  2.72900000e-02   0.00000000e+00   7.07000000e+00   0.00000000e+00
    4.69000000e-01   7.18500000e+00   6.11000000e+01   4.96710000e+00
    2.00000000e+00   2.42000000e+02   1.78000000e+01   3.92830000e+02
    4.03000000e+00   3.47000000e+01]
 [  3.23700000e-02   0.00000000e+00   2.18000000e+00   0.00000000e+00
    4.58000000e-01   6.99800000e+00   4.58000000e+01   6.06220000e+00
    3.00000000e+00   2.22000000e+02   1.87000000e+01   3.94630000e+02
    2.94000000e+00   3.34000000e+01]
 [  

In [9]:
# Feature matrix: every row, the first 13 columns (indices 0 through 12).
_X = _array[:, :13]

In [10]:
# Preview the first five rows of the feature matrix, followed by a '=' rule.
print(
    '_X\n{}\n{}'.format(
        _X[:5],
        '=' * 100,
    )
)

_X
[[  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
    5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
    1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
    4.98000000e+00]
 [  2.73100000e-02   0.00000000e+00   7.07000000e+00   0.00000000e+00
    4.69000000e-01   6.42100000e+00   7.89000000e+01   4.96710000e+00
    2.00000000e+00   2.42000000e+02   1.78000000e+01   3.96900000e+02
    9.14000000e+00]
 [  2.72900000e-02   0.00000000e+00   7.07000000e+00   0.00000000e+00
    4.69000000e-01   7.18500000e+00   6.11000000e+01   4.96710000e+00
    2.00000000e+00   2.42000000e+02   1.78000000e+01   3.92830000e+02
    4.03000000e+00]
 [  3.23700000e-02   0.00000000e+00   2.18000000e+00   0.00000000e+00
    4.58000000e-01   6.99800000e+00   4.58000000e+01   6.06220000e+00
    3.00000000e+00   2.22000000e+02   1.87000000e+01   3.94630000e+02
    2.94000000e+00]
 [  6.90500000e-02   0.00000000e+00   2.18000000e+00   0.00000000e+00
    4.5

In [11]:
# Target values: every row, from column index 13 onward (the 14th
# attribute, MEDV, in the list above). Slicing with `13:` rather than
# indexing with `13` keeps the result two-dimensional (one column wide);
# it is flattened to 1-D in a later cell.
_Y = _array[:,13:]

In [12]:
# Preview the first five target rows (still column-shaped), then a '=' rule.
print(
    '_Y\n{}\n{}'.format(
        _Y[:5],
        '=' * 100,
    )
)

_Y
[[ 24. ]
 [ 21.6]
 [ 34.7]
 [ 33.4]
 [ 36.2]]


In [13]:
# Collapse the one-column target array into a flat 1-D vector.
# Re-running this cell is harmless: flattening a flat array is a no-op.
_Y = _Y.ravel()

In [14]:
# Preview the first five entries of the flattened target, then a '=' rule.
print(
    '_Y raveled\n{}\n{}'.format(
        _Y[:5],
        '=' * 100,
    )
)

_Y raveled
[ 24.   21.6  34.7  33.4  36.2]
