## Boston Housing Assignment

Graham Bullard CS 570

In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from math import sqrt
from sklearn.linear_model import Lasso

In [2]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [3]:
X_train, X_test, y_train, y_test = load_boston()

In [4]:
X_train.shape

(379L, 13L)

#Initial Model

In [5]:

clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

MRSE:

In [6]:
sqrt(mean_squared_error(y_test, clf.predict(X_test)))

4.037964379313236

Coefficient of Determination:

In [7]:
r2_score(y_test, clf.predict(X_test))

0.74902106616233721

#Implementation of linear_model.Lasso

In [8]:

clf2 = Lasso()
clf2.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

MRSE:

In [9]:
sqrt(mean_squared_error(y_test, clf2.predict(X_test)))

4.551574584414494

Coefficient of Determination:

In [10]:
r2_score(y_test, clf2.predict(X_test))

0.68111386964628995

#Optimization of linear_model.Lasso

MRSE:

In [11]:
clf3 = Lasso(alpha=0.1)
clf3.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

MRSE:

In [12]:
sqrt(mean_squared_error(y_test, clf3.predict(X_test)))

4.098556463281469

Coefficient of Determination:

In [13]:
r2_score(y_test, clf3.predict(X_test))

0.74143237403428053

#Predictions

Initial Model:

In [14]:
zip (y_test, clf.predict(X_test))

[(21.399999999999999, 24.25745807360617),
 (11.699999999999999, 15.908297820272725),
 (18.0, 19.212880262896377),
 (21.699999999999999, 21.89877840292009),
 (18.5, 25.472199123902076),
 (15.0, 19.131298430619957),
 (38.700000000000003, 36.5860159811171),
 (18.5, 19.44274924867106),
 (21.199999999999999, 23.618868388880411),
 (35.200000000000003, 35.600423733324696),
 (26.5, 26.035546125424691),
 (13.1, 20.668958140145108),
 (28.100000000000001, 25.488655399333631),
 (18.100000000000001, 17.487507301520118),
 (24.5, 20.723169065669506),
 (13.1, 16.401009016464055),
 (25.0, 24.767792050439105),
 (21.0, 23.296039283302179),
 (24.100000000000001, 25.793291310381495),
 (22.0, 27.602299458035468),
 (37.0, 31.118992164078701),
 (17.100000000000001, 17.589879341407638),
 (16.699999999999999, 20.822610208270699),
 (24.399999999999999, 23.957452822427285),
 (24.699999999999999, 24.849853997092929),
 (50.0, 40.905961814717088),
 (22.800000000000001, 26.927523851162864),
 (21.899999999999999, 15.3

Lasso Implementation

In [15]:
zip (y_test, clf2.predict(X_test))

[(21.399999999999999, 24.550145863437926),
 (11.699999999999999, 18.959242971751134),
 (18.0, 20.500362730737855),
 (21.699999999999999, 22.416443216717202),
 (18.5, 24.237844772723431),
 (15.0, 21.371462312148424),
 (38.700000000000003, 34.716732310860948),
 (18.5, 21.284505646881037),
 (21.199999999999999, 23.148940140396434),
 (35.200000000000003, 31.596120702803514),
 (26.5, 25.944244767122036),
 (13.1, 20.388546284299611),
 (28.100000000000001, 24.770160959787145),
 (18.100000000000001, 19.316683914268101),
 (24.5, 20.522571274653551),
 (13.1, 16.778057256851053),
 (25.0, 23.966657195704858),
 (21.0, 22.600219784994202),
 (24.100000000000001, 25.620586544255794),
 (22.0, 27.693960096498632),
 (37.0, 31.278861811202141),
 (17.100000000000001, 18.903709334023677),
 (16.699999999999999, 19.687471810863055),
 (24.399999999999999, 23.820881685151896),
 (24.699999999999999, 25.519078606180042),
 (50.0, 36.713231787640815),
 (22.800000000000001, 26.309454518343586),
 (21.899999999999999,

Optimized Lasso Implementation

In [49]:
zip (y_test, clf3.predict(X_test))

[(32.399999999999999, 34.108524768061748),
 (17.600000000000001, 16.393515936874831),
 (20.300000000000001, 21.653851030499936),
 (19.600000000000001, 19.316868141873165),
 (19.300000000000001, 21.021085889253342),
 (14.9, 17.488567183834675),
 (28.399999999999999, 29.263011216337198),
 (21.199999999999999, 22.897441543737934),
 (21.699999999999999, 21.734102302463764),
 (19.5, 19.83795753698309),
 (21.100000000000001, 20.798200291269204),
 (10.199999999999999, 5.7587017598443602),
 (17.300000000000001, 15.44060485712065),
 (24.5, 27.316255415971948),
 (8.3000000000000007, 12.690157332968468),
 (21.899999999999999, 37.297552920264359),
 (36.0, 36.095377578098137),
 (22.300000000000001, 26.97236580501934),
 (26.600000000000001, 28.300753269486762),
 (16.5, 11.532263935971553),
 (13.800000000000001, 15.433053809323788),
 (28.699999999999999, 28.072893615324109),
 (19.600000000000001, 22.111613919649173),
 (12.1, 18.05704090961428),
 (27.0, 31.69602988258216),
 (33.100000000000001, 32.311

#DATA

In [50]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      