In [4]:
# load_boston returns a Bunch object (a dictionary-like container, explored in the cells below).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this cell
# presumably requires an older scikit-learn release — confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston() 

In [8]:
# The Bunch object's `data` member stores the features as a 2-dimensional NumPy ndarray of shape (m, n),
# where m is the number of training examples and n is the number of features.
# It can be converted to a pandas DataFrame simply by passing it to the pd.DataFrame constructor.
type(boston.data)

numpy.ndarray

In [10]:
# shape is (m, n): 506 training examples and 13 features
boston.data.shape

(506, 13)

In [13]:
# A Bunch object is a dictionary-like object that stores the data together with metadata about it.
# We can list all of its keys by calling keys().
boston.keys()

dict_keys(['data', 'DESCR', 'target', 'feature_names'])

In [14]:
# Since it is a dictionary-like object, boston['data'] and boston.data are equivalent
boston['data'].shape

(506, 13)

In [16]:
# DESCR holds a brief textual description of the dataset; print() renders it with its line breaks
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [17]:
# Let's now convert it to a pandas DataFrame for better visualisation and so that we can perform some preprocessing.
# The columns are labelled 0..12 for now because the raw ndarray carries no feature names.
import pandas as pd
housing = pd.DataFrame(boston.data)
housing.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [18]:
# Let's rename the columns to the feature names shipped with the dataset.
# Note: this modifies `housing` in place, so any head() output displayed before this cell is stale.
housing.columns = boston.feature_names

In [19]:
# confirm that the columns now carry the feature names
housing.head(1)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98


In [20]:
# Let's create another DataFrame object to store the target values (labels)
X = housing # following ML convention; X is another name for the same DataFrame, not a copy
y = pd.DataFrame(boston.target)
y.head()

Unnamed: 0,0
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [25]:
# rename the single target column (labelled 0 by default) to PRICE
y.columns = ["PRICE"]
y.head()

Unnamed: 0,PRICE
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [30]:
# sklearn.linear_model.LinearRegression
# Important methods:

# fit(X, y[, sample_weight])    Fit linear model.
# get_params([deep])            Get parameters for this estimator.
# predict(X)                    Predict using the linear model.
# score(X, y[, sample_weight])  Returns the coefficient of determination R^2 of the prediction.
# set_params(**params)          Set the parameters of this estimator.

# NOTE(review): the `normalize` parameter was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this constructor call presumably requires an older scikit-learn release — confirm the pinned version.
from sklearn.linear_model import LinearRegression
linearRegression = LinearRegression(fit_intercept = True, normalize = True)

In [31]:
# Fit the linear regression on the full dataset (every row of X and y)
linearRegression.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [33]:
# print the learned coefficients, one per feature (the intercept is printed separately)
print(linearRegression.coef_)

[[ -1.07170557e-01   4.63952195e-02   2.08602395e-02   2.68856140e+00
   -1.77957587e+01   3.80475246e+00   7.51061703e-04  -1.47575880e+00
    3.05655038e-01  -1.23293463e-02  -9.53463555e-01   9.39251272e-03
   -5.25466633e-01]]


In [34]:
# print the intercept (bias term) of the fitted model
print(linearRegression.intercept_)

[ 36.49110328]


In [52]:
# We will now make predictions — note that these are for the same X the model was fitted on
y_predicted = linearRegression.predict(X)

In [65]:
# Let's create a DataFrame to compare actual against predicted prices.
# predict() returned a column vector here (y was a one-column DataFrame), so ravel() flattens it
# to 1-D without hard-coding the number of rows; .values keeps the pairing positional, exactly
# as the previous zip()-based construction did.
df_compare = pd.DataFrame(
    {'Actual': y['PRICE'].values, 'Predicted': y_predicted.ravel()},
    columns = ['Actual', 'Predicted'],
)

In [66]:
# let's check out the first 10 rows of the comparison DataFrame
df_compare.head(10)

Unnamed: 0,Actual,Predicted
0,24.0,30.008213
1,21.6,25.029861
2,34.7,30.570232
3,33.4,28.608141
4,36.2,27.942882
5,28.7,25.2594
6,22.9,23.00434
7,27.1,19.534756
8,16.5,11.516965
9,18.9,18.919815


In [68]:
# But this is not good practice, right? Training and testing on the same dataset. Let's randomly split the data
# into a training set (70%) and a test set (30%) using the helper provided by the scikit-learn library.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle — and therefore every number computed from the split —
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [69]:
# We are good to go now... let's first print the shape of each split
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(354, 13)
(152, 13)
(354, 1)
(152, 1)


In [75]:
# Okay.... so let's build our model again :D
linearRegression = LinearRegression(normalize = True, fit_intercept = True)
# fit on the training split only; the test split stays unseen until prediction time
linearRegression.fit(X_train, y_train)
y_predicted = linearRegression.predict(X_test)
# display the 2-D predictions flattened to 1-D; y_predicted itself is left unchanged
y_predicted.reshape(y_predicted.size)

array([  5.51144747,  26.21494568,   5.01552482,   8.16618605,
        30.83809148,  29.35471021,  15.15095316,   2.34498545,
        29.11847882,  23.00303343,  41.2234041 ,  17.06493404,
        30.52439615,  24.88251104,  10.45896366,  24.73880324,
        33.0778955 ,  32.94019901,  31.37957232,  35.77419655,
        18.76218944,  32.07297022,  22.47649114,  26.10280707,
        18.63061477,  19.03352733,  24.2300146 ,  20.20580238,
        15.46010222,  23.15002819,  29.40468827,  20.29996264,
        27.03073604,  20.26158591,  21.38757929,  27.54859631,
        11.97322436,  15.42258568,  29.81748689,  28.41582054,
         5.48625661,  18.19045179,  28.394476  ,  22.07611996,
        27.70873745,  21.60072749,  19.81560365,  12.46154601,
        19.0489386 ,  15.53599719,  10.81641532,  19.53283454,
        19.2782111 ,  25.29075556,   8.23404527,  30.5669914 ,
        20.92641678,  20.38679546,  28.51007467,  29.82943857,
        29.86561498,  20.2042052 ,  15.80743083,  13.14

In [86]:
# Let's calculate the mean squared error on the test set.
import numpy as np

# ndarray.reshape returns a new array instead of reshaping in place, so the flattened arrays
# must be bound to names to have any effect. Flattening both sides also guarantees an
# element-wise subtraction: an (n, 1) array minus an (n,) array would silently broadcast to (n, n).
y_test = np.array(y_test)  # y_test stays an ndarray after this cell, as before
y_test_flat = y_test.ravel()
y_predicted_flat = np.asarray(y_predicted).ravel()

np.mean(np.square(y_predicted_flat - y_test_flat))

24.218297619019538

In [None]:
# So pat yourself on the back if you have reached here..... ;)