## Author : Gaurav Khandave
## Title : Boston Housing Assignment
## Version : 1.0.0
## Date : 09/26/2016

In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
bean

{'DESCR': "Boston House Prices dataset\n\nNotes\n------\nData Set Characteristics:  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive\n    \n    :Median Value (attribute 14) is usually the target\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  

In [4]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [5]:
X_train, X_test, y_train, y_test = load_boston()

In [6]:
X_train.shape

(379, 13)

In [7]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
zip (y_test, clf.predict(X_test))

[(19.600000000000001, 21.154439127303295),
 (10.199999999999999, 17.054652171654951),
 (21.0, 21.271598461359069),
 (19.300000000000001, 22.054596109505539),
 (18.399999999999999, 19.435218406835325),
 (18.300000000000001, 20.973768194369658),
 (43.5, 38.594035910977439),
 (41.299999999999997, 33.080260486776666),
 (19.399999999999999, 19.92802908960342),
 (32.399999999999999, 34.656771334568944),
 (27.899999999999999, 20.692113860659781),
 (23.699999999999999, 27.285080561492453),
 (36.100000000000001, 32.187148537836428),
 (11.0, 14.330751568643306),
 (17.399999999999999, 22.873264203261215),
 (48.799999999999997, 39.765367944772855),
 (16.199999999999999, 20.853267650983625),
 (36.5, 35.253874658773285),
 (14.5, 18.20458215571184),
 (17.199999999999999, 17.011933788956359),
 (23.300000000000001, 28.328857564501238),
 (20.100000000000001, 21.652332449814622),
 (20.800000000000001, 18.944502259209504),
 (22.800000000000001, 28.495390364843814),
 (21.399999999999999, 21.568378475973642

### We will try to measure the performance of the linear regressor

In [9]:
# We will try to find r2 score 
r2_score(y_test,clf.predict(X_test))

0.78623764315448441

In [10]:
# Mean squared error of linear regressor
mean_squared_error(y_test,clf.predict(X_test))

22.535571433436711

### Implimentation of sklearn.linear_model.Ridge

In [11]:
from sklearn.linear_model import Ridge

In [12]:
# Try changing alpha value for optimize result
clfRidge = Ridge(alpha=0,normalize=True)

In [13]:
clfRidge.fit(X_train, y_train)

Ridge(alpha=0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
   random_state=None, solver='auto', tol=0.001)

In [14]:
clfRidge.predict(X_test)

array([ 21.15443913,  17.05465217,  21.27159846,  22.05459611,
        19.43521841,  20.97376819,  38.59403591,  33.08026049,
        19.92802909,  34.65677133,  20.69211386,  27.28508056,
        32.18714854,  14.33075157,  22.8732642 ,  39.76536794,
        20.85326765,  35.25387466,  18.20458216,  17.01193379,
        28.32885756,  21.65233245,  18.94450226,  28.49539036,
        21.56837848,  14.51903008,  32.86660549,  23.07424522,
        22.84084765,  23.75103274,  14.35147206,  24.74278446,
        27.53481645,  27.40685168,  13.67468898,  23.30986977,
        18.55771535,  18.70252658,  27.40307744,  22.0848263 ,
        19.79472523,  30.64192104,  29.37090008,  18.83859464,
        30.92420989,  20.27072201,  17.15004225,  39.64444856,
        11.48365832,  11.65315059,  19.62665187,  16.35456075,
        32.38524471,  34.69434518,  23.63348477,   8.84539463,
        31.87217872,  22.2638637 ,  25.15641324,  12.38534672,
        19.94983848,  33.35632456,  32.13649112,  19.32

In [15]:
zip (y_test, clfRidge.predict(X_test))

[(19.600000000000001, 21.154439127303281),
 (10.199999999999999, 17.054652171654951),
 (21.0, 21.27159846135908),
 (19.300000000000001, 22.054596109505567),
 (18.399999999999999, 19.435218406835329),
 (18.300000000000001, 20.973768194369669),
 (43.5, 38.594035910977453),
 (41.299999999999997, 33.080260486776581),
 (19.399999999999999, 19.928029089603459),
 (32.399999999999999, 34.656771334568973),
 (27.899999999999999, 20.692113860659752),
 (23.699999999999999, 27.285080561492443),
 (36.100000000000001, 32.187148537836471),
 (11.0, 14.330751568643331),
 (17.399999999999999, 22.873264203261151),
 (48.799999999999997, 39.765367944772876),
 (16.199999999999999, 20.853267650983611),
 (36.5, 35.253874658773299),
 (14.5, 18.20458215571184),
 (17.199999999999999, 17.011933788956341),
 (23.300000000000001, 28.328857564501213),
 (20.100000000000001, 21.652332449814651),
 (20.800000000000001, 18.9445022592095),
 (22.800000000000001, 28.495390364843793),
 (21.399999999999999, 21.568378475973581),

In [16]:
# We will try to find r2 score 
r2_score(y_test,clfRidge.predict(X_test))

0.78623764315448452

In [17]:
# Mean squared error of linear regressor
mean_squared_error(y_test,clfRidge.predict(X_test))

22.535571433436697