In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [4]:
X_train, X_test, y_train, y_test = load_boston()

In [5]:
X_train.shape

(379L, 13L)

In [6]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
zip (y_test, clf.predict(X_test))

[(22.0, 27.684669932170635),
 (22.800000000000001, 28.894092534579542),
 (15.6, 21.541619860450904),
 (26.399999999999999, 22.849840283594617),
 (13.9, 13.20955207709423),
 (25.0, 24.447507101015709),
 (17.199999999999999, 14.404218637258861),
 (23.0, 23.42131921347368),
 (19.5, 18.345837879560484),
 (17.399999999999999, 23.375814000179044),
 (16.199999999999999, 20.746956720503107),
 (18.399999999999999, 19.295028486197531),
 (17.100000000000001, 19.38855624229102),
 (37.899999999999999, 33.959606505919837),
 (16.0, 18.894201552502466),
 (19.300000000000001, 20.783069270240134),
 (11.300000000000001, 13.790929585533188),
 (33.399999999999999, 35.801346433647865),
 (8.0999999999999996, 3.523801040724905),
 (10.199999999999999, 16.765986399707064),
 (24.0, 25.432697548350706),
 (12.6, 18.514163756779109),
 (28.600000000000001, 28.663522555686484),
 (15.4, 18.313890606563852),
 (8.5, 16.81150318018841),
 (21.399999999999999, 24.514577232218304),
 (27.5, 15.854132356637026),
 (30.10000000

In [11]:
r2 = r2_score(y_test, clf.predict(X_test))
print r2

0.747783986629


In [12]:
mse = mean_squared_error(y_test, clf.predict(X_test))
print mse

17.4820934637


In [14]:
import math
rmse = math.sqrt(mse)
print rmse

4.18115934446


In [37]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.748788984889 17.4124330434 4.17282075381


In [38]:
clf = Ridge(alpha=10.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.753844214964 17.062034972 4.13062162053


In [40]:
clf = Ridge(alpha=100.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

 0.752534672327 17.1528045725 4.1415944481


In [50]:
clf = Ridge(alpha=50.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.75652222522 16.8764114465 4.1080909735


In [41]:
clf = Ridge(alpha=30.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756631745474 16.8688201628 4.10716692658


In [51]:
clf = Ridge(alpha=29.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756591772427 16.8715908534 4.10750421222


In [53]:
clf = Ridge(alpha=32.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756694495936 16.8644706791 4.10663739319


In [55]:
clf = Ridge(alpha=34.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756736416116 16.8615650249 4.10628360259


In [58]:
clf = Ridge(alpha=36.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756759658734 16.8599539868 4.10608743049


In [69]:
clf = Ridge(alpha=38.0)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756766034526 16.8595120554 4.10603361596


In [70]:
clf = Ridge(alpha=38.1)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756765939113 16.8595186688 4.10603442129


In [73]:
clf = Ridge(alpha=37.9)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756766091653 16.8595080957 4.10603313377


In [74]:
clf = Ridge(alpha=37.8)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756766110318 16.8595068019 4.10603297623


In [75]:
clf = Ridge(alpha=37.7)
clf.fit(X_train, y_train)
r2 = r2_score(y_test, clf.predict(X_test))
mse = mean_squared_error(y_test, clf.predict(X_test))
rmse = math.sqrt(mse)
print r2, mse, rmse

0.756766090341 16.8595081865 4.10603314484


Best alpha is 37.8