# Chapter 2 Exercises

## Exercises
(code from the book ommitted, for all code see https://github.com/chrismarch/handson-ml2/tree/scratch/my_exercises)

### 1.
_Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?_

In [207]:
from sklearn.svm import SVR

''' 
linear C=1
Mean: 111814.1069260169
Standard deviation: 2600.8402690892553

linear C=.1
Mean: 118146.91107625126
Standard deviation: 2599.0466057977374

linear C=10
Mean: 83057.56340284644
Standard deviation: 2788.761505866687

rbf C=10 gamma=0.2
Mean: 116574.18557994801
Standard deviation: 2635.2238286729885

rbf C=1 gamma=.2
Mean: 118668.47850878662
Standard deviation: 2610.862036182229
'''

sv_reg = SVR(kernel="linear", C=1)
sv_reg.fit(housing_prepared, housing_labels)

sv_scores = cross_val_score(sv_reg, housing_prepared, housing_labels,
                              scoring="neg_mean_squared_error", cv=10)

sv_rmse_scores = np.sqrt(-sv_scores)
display_scores(sv_rmse_scores)

Scores: [110944.44375543 113447.25887258 107493.96870224 114116.08150269
 108232.88967584 116446.43416569 113762.33996303 112157.45620497
 113544.34715549 112375.60753671]
Mean: 112252.08275346753
Standard deviation: 2587.9343077143317


In [208]:
import joblib

joblib.dump(housing, "housing_test.pkl")

['housing_test.pkl']

### 5. (out of order)
_Automatically explore some preparation options using GridSearchCV._

In [228]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

### 2. 
_Try replacing GridSearchCV with RandomizedSearchCV._

In [287]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

n_est = [*range(3, 31, 1)]
print(n_est)
max_feat = [*range(2, 9, 1)]
print(max_feat)
distributions = dict(n_estimators=n_est,
                      max_features=max_feat)

forest_reg = RandomForestRegressor()

grid_search = RandomizedSearchCV(forest_reg, distributions, cv=5, random_state=42, n_iter=18,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)


[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
[2, 3, 4, 5, 6, 7, 8]


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=18,
                   param_distributions={'max_features': [2, 3, 4, 5, 6, 7, 8],
                                        'n_estimators': [3, 4, 5, 6, 7, 8, 9,
                                                         10, 11, 12, 13, 14, 15,
                                                         16, 17, 18, 19, 20, 21,
                                                         22, 23, 24, 25, 26, 27,
                                                         28, 29, 30]},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_squared_error')

### 3.
_Try adding a transformer in the preparation pipeline to select only the most important attributes._

In [229]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = col_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
list(zip(feature_importances, attributes))
#sorted(zip(feature_importances, attributes), reverse=True)

[(0.08251281948693298, 'longitude'),
 (0.07276896667079569, 'latitude'),
 (0.0488151659923637, 'housing_median_age'),
 (0.3499295013881789, 'total_rooms'),
 (0.05681923619581421, 'total_bedrooms'),
 (0.12382776136051535, 'population'),
 (0.0913804106897131, 'households'),
 (0.17394613821568616, 'median_income')]

### 4.
_Try creating a single pipeline that does the full data preparation plus the final prediction._

In [230]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)   # => evaluates to 47,730.2
final_rmse

47336.121936843396

In [214]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                          loc=squared_errors.mean(),
                          scale=stats.sem(squared_errors)))


array([45265.35214861, 49282.97892411])