In [1]:
import california_housing_data

# Load the housing data through the project-local helper module.
housing = california_housing_data.load_data()
# Hold out part of the data as a test set (test_ratio=0.2).
# NOTE(review): whether split_train_test shuffles with a fixed seed is not
# visible from this notebook -- confirm the split is reproducible on re-run.
train_set, test_set = california_housing_data.split_train_test(housing, test_ratio=0.2)

In [2]:
# Separate each split into samples and labels, then pass the samples through
# the project's preprocessing helper.
# NOTE(review): preprocess() is called independently on the train and test
# samples. If it fits anything (imputation values, scaling statistics), the
# test samples would be transformed with their own statistics rather than the
# training ones -- confirm against california_housing_data.preprocess.
train_data, train_labels = california_housing_data.split_sample_and_label(train_set)
train_data = california_housing_data.preprocess(train_data)

test_data, test_labels = california_housing_data.split_sample_and_label(test_set)
test_data = california_housing_data.preprocess(test_data)

In [3]:
from sklearn.linear_model import LinearRegression

# Step 1: Prediction with the linear regression model.

# Fit a linear regression baseline on the preprocessed training data.
# fit() is the cell's last expression, so the notebook displays the
# estimator's repr as the cell output.
lin_reg = LinearRegression()
lin_reg.fit(train_data, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [4]:
import numpy as np

# Spot-check the fitted linear model on the first five training samples.
some_data, some_labels = train_data[:5], train_labels[:5]

targets = np.array(some_labels)
housing_predictions = lin_reg.predict(some_data)
diff = targets - housing_predictions
# Relative error, measured against the predicted value (not the target).
error = np.abs(diff / housing_predictions)

print('Housing prediction with linear regression:\n', housing_predictions)
print('Targets:', targets)
print('Error:', error)

Housing prediction with linear regression:
 [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Targets: [286600. 340600. 196900.  46300. 254500.]
Error: [0.36058553 0.07184844 0.06663193 0.21815619 0.34125573]


In [5]:
from sklearn.metrics import mean_squared_error

# Measure the RMSE (root mean squared error) of the linear model on the
# training set.

housing_predictions = lin_reg.predict(train_data)
lin_mse = mean_squared_error(train_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

# The error is too large to be useful, so a more powerful model is tried next.
print('Linera regression model, RMSE:', lin_rmse)

Linera regression model, RMSE: 68628.19819848922


In [6]:
from sklearn.tree import DecisionTreeRegressor

# Step 2: Decision tree regression

# Fit a decision tree with every hyperparameter left at its default.
# fit() is the cell's last expression, so the notebook displays the
# estimator's repr as the cell output.
# NOTE(review): no random_state is passed, so the fitted tree may differ
# slightly between runs -- consider fixing a seed for reproducibility.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_data, train_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [7]:
# RMSE of the decision tree on the data it was trained on.
housing_predictions = tree_reg.predict(train_data)
tree_mse = mean_squared_error(train_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

# The model is overfitting: the tree is fitted extremely closely to the
# training data.
print('Decision tree regression model, RMSE:', tree_rmse)

# The same measurement on the held-out test set.
test_predictions = tree_reg.predict(test_data)
test_mse = mean_squared_error(test_labels, test_predictions)
test_rmse = np.sqrt(test_mse)

# The fitted tree does not match the unseen test samples nearly as well.
print('Decision tree regression model, test set, RMSE:', test_rmse)

Decision tree regression model, RMSE: 0.0
Decision tree regression model, test set, RMSE: 111945.7978216257


In [8]:
from sklearn.model_selection import cross_val_score

# Step 3: Estimate a decision tree regressor with K-fold cross validation

# scikit-learn's cross-validation expects a utility function (greater is
# better) rather than a cost function, so the MSE is reported negated. Take
# the magnitude before the square root to obtain one RMSE per fold.
scores = cross_val_score(tree_reg, train_data, train_labels,
                         scoring='neg_mean_squared_error',
                         cv=10)  # cv is the number of folds.
tree_rmse_scores = np.sqrt(np.abs(scores))

print('Scores:', tree_rmse_scores)
print('Mean:', tree_rmse_scores.mean())
print('Standard deviation:', tree_rmse_scores.std())

Scores: [70143.55050121 66301.28547665 70989.69822149 69170.67696083
 72792.05913736 73890.29137486 71147.32287011 70821.3281629
 75569.79007473 70265.70659548]
Mean: 71109.17093756306
Standard deviation: 2430.6341941137107


In [9]:
# Step 4: Estimate a linear regressor with K-fold cross validation

# The scorer returns negated MSE values (greater is better), so take the
# magnitude before the square root to obtain one RMSE per fold.
scores = cross_val_score(lin_reg, train_data, train_labels,
                         scoring='neg_mean_squared_error',
                         cv=10)  # cv is the number of folds.
lin_rmse_scores = np.sqrt(np.abs(scores))

print('Scores:', lin_rmse_scores)
print('Mean:', lin_rmse_scores.mean())
print('Standard deviation:', lin_rmse_scores.std())

Scores: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviation: 2731.674001798348


In [11]:
from sklearn.ensemble import RandomForestRegressor

# Step 5: Random forest regressor

# A random forest is stochastic (bootstrap sampling and randomised tree
# construction). Pin random_state so the fitted model and the scores below
# are repeatable under Restart & Run All.
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
# cross_val_score works on clones of the estimator, so this explicit fit is
# what leaves forest_reg itself trained on the full training set.
forest_reg.fit(train_data, train_labels)

# The scorer returns negated MSE values (greater is better), so take the
# magnitude before the square root to obtain one RMSE per fold.
scores = cross_val_score(estimator=forest_reg,
                         X=train_data,
                         y=train_labels,
                         scoring='neg_mean_squared_error',
                         cv=10)  # cv is the number of folds.
forest_rmse_scores = np.sqrt(np.abs(scores))

print('Scores:', forest_rmse_scores)
print('Mean:', forest_rmse_scores.mean())
print('Standard deviation:', forest_rmse_scores.std())

Scores: [51873.2105203  49773.9333049  53404.89104529 55083.20130695
 51198.93074718 56006.30095138 51353.87616934 50304.13060578
 54813.68849675 53110.34576746]
Mean: 52692.25089153416
Standard deviation: 2019.7649276830316


In [13]:
# scikit-learn deprecated its vendored copy of joblib (sklearn.externals.joblib)
# in 0.21 and removed it in 0.23. Import the standalone package, and fall back
# to the vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the sklearn models.
# Each fitted estimator is serialised to a .pkl file at the given relative path.

joblib.dump(lin_reg, 'linear_regressor.pkl')
joblib.dump(tree_reg, 'decision_tree_regressor.pkl')
joblib.dump(forest_reg, 'random_forest_regressor.pkl')

['random_forest_regressor.pkl']