## Boston Housing Dataset

In [1]:
import numpy as np
import pandas as pd
import sklearn
import statsmodels.api as sm

from sklearn.datasets import load_boston
boston = load_boston()

In [2]:
# Check the keys
print('Keys')
print(boston.keys())

Keys
dict_keys(['data', 'target', 'feature_names', 'DESCR'])


In [3]:
# Print the feature names and the description
print('Feature Names')
print(boston.feature_names)
print()
print('Description')
print(boston.DESCR)

Feature Names
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']

Description
Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways

In [4]:
# Print the shape of the data and the target
print('Data Shape')
print(boston.data.shape)
print()
print('Target Shape')
print(boston.target.shape)

Data Shape
(506, 13)

Target Shape
(506,)


In [5]:
# Create a dataframe of the data and add the Median Value target
boston_dataframe = pd.DataFrame(boston.data)
boston_dataframe.columns = boston.feature_names
boston_dataframe['MEDV'] = boston.target
print('Dataframe Head')
print(boston_dataframe.head())

Dataframe Head
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [6]:
# Print descriptives
print(boston_dataframe.describe())

             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.593761   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.596783   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.647423   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

In [7]:
# Ordinary Least Squares Model
ols = sm.OLS(endog = boston.target, exog = boston.data).fit()
ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,891.1
Date:,"Wed, 25 Jul 2018",Prob (F-statistic):,0.0
Time:,19:43:54,Log-Likelihood:,-1523.8
No. Observations:,506,AIC:,3074.0
Df Residuals:,493,BIC:,3129.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0916,0.034,-2.675,0.008,-0.159,-0.024
x2,0.0487,0.014,3.379,0.001,0.020,0.077
x3,-0.0038,0.064,-0.059,0.953,-0.130,0.123
x4,2.8564,0.904,3.160,0.002,1.080,4.633
x5,-2.8808,3.359,-0.858,0.392,-9.481,3.720
x6,5.9252,0.309,19.168,0.000,5.318,6.533
x7,-0.0072,0.014,-0.523,0.601,-0.034,0.020
x8,-0.9680,0.196,-4.947,0.000,-1.352,-0.584
x9,0.1704,0.067,2.554,0.011,0.039,0.302

0,1,2,3
Omnibus:,204.05,Durbin-Watson:,0.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1372.527
Skew:,1.609,Prob(JB):,9.11e-299
Kurtosis:,10.399,Cond. No.,8500.0
