# Model Evaluation & Fine-Tuning
* Choosing the right model, and improving its performance

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn import metrics
from pandas.tools.plotting import scatter_matrix
from IPython.display import display

%matplotlib inline

In [None]:
# Load the Boston housing dataset that ships with scikit-learn.
boston = datasets.load_boston()
print(type(boston)) # A 'Bunch' is a scikit-learn container, similar to a dict (it also allows attribute access)
print(boston.keys())

In [None]:
# Print the description text stored on the dataset's DESCR attribute.
print(boston.DESCR)

In [None]:
# Put the predictors into a DataFrame (one column per feature name) and
# attach the regression target as an extra 'MEDV' column.
features = boston['feature_names']
boston_df = pd.DataFrame(boston['data'], columns=features).assign(
    MEDV=boston['target']
)
print(features)

In [None]:
# Summary statistics of the regression target.
target = pd.Series(boston['target'])
print('min, max = ', target.min(), ',', target.max())
for label, stat in (('mean = ', target.mean()), ('std = ', target.std())):
    print(label, stat)

# target values are in thousands of USD

In [None]:
# There are too many features to visualize neatly in a single scatter matrix,
# so plot only a few that we think will be good predictors, plus the target (MEDV).
features_we_care_abt = ['RM', 'NOX', 'DIS', 'TAX', 'MEDV']
scatter_matrix(boston_df[features_we_care_abt], figsize=(12,8))

In [None]:
# Pairwise correlation between every pair of columns, then each column's
# correlation with the target (MEDV), sorted from highest to lowest.
corr_mx = boston_df.corr()
corr_mx['MEDV'].sort_values(ascending=False)

In [None]:
# Shuffle predictors and targets, then use the first 400 rows for training
# and the remaining rows as the held-out test set.
n_train = 400
data, target = shuffle(boston['data'], boston['target'])

X_train, y_train = data[:n_train], target[:n_train]
X_test, y_test = data[n_train:], target[n_train:]
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Two candidate regressors, both created with default settings and fitted
# on the same training split so they can be compared afterwards.
lin_reg = LinearRegression()
d_tree = DecisionTreeRegressor()

lin_reg.fit(X_train, y_train)
d_tree.fit(X_train, y_train)

# Evaluate on Test Set

In [None]:
# Predictions of both fitted models on the held-out test rows.
y_pred_lin = lin_reg.predict(X_test)
y_pred_tree = d_tree.predict(X_test)

def RMSE(MSE):
    """Convert a mean-squared error into a root-mean-squared error.

    The sign of ``MSE`` is dropped before the square root is taken, so
    negated scores (such as those obtained with
    ``scoring='neg_mean_squared_error'``) can be passed in unchanged.
    """
    magnitude = abs(MSE)
    return np.sqrt(magnitude)

# Test-set error of each model: first as MSE, then the same values as RMSE.
MSE_lin, MSE_tree = (
    metrics.mean_squared_error(y_test, predictions)
    for predictions in (y_pred_lin, y_pred_tree)
)

for label, mse in (('LinReg MSE: ', MSE_lin), ('D Tree MSE: ', MSE_tree)):
    print(label, mse)
print()
for label, mse in (('LinReg RMSE: ', MSE_lin), ('D Tree RMSE: ', MSE_tree)):
    print(label, RMSE(mse))

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print a short report for an array of cross-validation scores.

    Shows the raw scores, their mean and standard deviation, and the
    result of passing the mean score through ``RMSE``.
    """
    mean_score = scores.mean()
    spread = scores.std()
    print("Scores: ", scores)
    print("Mean: ", mean_score)
    print("Std Dev: ", spread)
    print("Mean RMSE: ", RMSE(mean_score))

# Score both models with the same 5-fold cross-validation on the training split.
cv_options = dict(cv=5, scoring='neg_mean_squared_error')
scores_tree = cross_val_score(d_tree, X_train, y_train, **cv_options)
scores_lin = cross_val_score(lin_reg, X_train, y_train, **cv_options)

for heading, fold_scores in (
    ("Decision Tree:", scores_tree),
    ("\nLinear Regression:", scores_lin),
):
    print(heading)
    display_scores(fold_scores)

# Fine Tuning with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate decision-tree settings: 3 depths x 4 leaf sizes = 12 combinations,
# each to be scored with 5-fold cross-validation.
param_grid = [
    dict(max_depth=[2, 5, None], min_samples_leaf=[1, 5, 10, 20]),
]

grid_search = GridSearchCV(
    estimator=d_tree,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the search.
grid_search.best_params_

In [None]:
# Take the best tree found by the search and evaluate it on the test set.
best_tree = grid_search.best_estimator_
best_tree.fit(X_train, y_train) # refit on whole training set
y_pred_gridSearch = best_tree.predict(X_test)

cv_res = pd.DataFrame(grid_search.cv_results_)
# The train-score columns are only present when the search was run with
# return_train_score=True (the default for that option has changed between
# scikit-learn releases). Show whichever of the wanted columns exist instead
# of failing with a KeyError.
wanted_cols = ['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
display(cv_res[[col for col in wanted_cols if col in cv_res.columns]])

print('Grid Search MSE: ', metrics.mean_squared_error(y_test, y_pred_gridSearch))

# Ensemble Methods

* Averaging predictions from several models to get a better result
* "Wisdom of the crowd"

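**Random Forest**: A collection of Decision Trees, each trained on a random (bootstrap) sample of the training data and considering a random subset of the features at each split; the trees' predictions are averaged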

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default settings, fitted on the training split.
forest = RandomForestRegressor().fit(X_train, y_train)

# 5-fold cross-validated scores on the training split.
scores_forest = cross_val_score(
    forest, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
display_scores(scores_forest)

# Error of the fitted forest on the held-out test rows.
y_pred_forest = forest.predict(X_test)
MSE_forest = metrics.mean_squared_error(y_test, y_pred_forest)
for label, value in (('Test MSE: ', MSE_forest), ('Test RMSE: ', RMSE(MSE_forest))):
    print(label, value)

# Grid Search CV on Random Forest

In [None]:
# Search over the number of trees and the max_features setting
# (3 x 4 = 12 combinations), each scored with 5-fold cross-validation.
param_grid_forest = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
]

grid_search_forest = GridSearchCV(
    estimator=forest,
    param_grid=param_grid_forest,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_forest.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the search, and the corresponding estimator.
print(grid_search_forest.best_params_)
print(grid_search_forest.best_estimator_)

In [None]:
cv_res_forest = pd.DataFrame(grid_search_forest.cv_results_)
# The train-score columns are only present when the search was run with
# return_train_score=True (the default for that option has changed between
# scikit-learn releases). Show whichever of the wanted columns exist instead
# of failing with a KeyError.
wanted_cols_forest = ['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
display(cv_res_forest[[col for col in wanted_cols_forest if col in cv_res_forest.columns]])

# Feature Importance

In [None]:
# Map each feature name to its importance in the best forest, most important first.
# The dict is keyed by feature name rather than by importance value, so features
# that share the same importance (e.g. several at 0.0) do not overwrite each other.
importances = grid_search_forest.best_estimator_.feature_importances_
feature_importances = dict(
    sorted(zip(features, importances), key=lambda pair: pair[1], reverse=True)
)
feature_importances

In [None]:
# let's try re-training our model on only the 'most important' features
data_pruned = boston_df[['LSTAT', 'RM', 'INDUS', 'PTRATIO']]
data_pruned = data_pruned.values
X_train_pruned, X_test_pruned = data_pruned[:400], data_pruned[400:]
print(X_train_pruned.shape, X_test_pruned.shape)

In [None]:
forest_p = RandomForestRegressor(max_features=6, n_estimators=30)
forest = RandomForestRegressor(max_features=6, n_estimators=30)

scores_forest = cross_val_score(forest, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
scores_forest_p = cross_val_score(forest_p, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

print('Pruned dataset: ')
display_scores(scores_forest_p)
print()
print('Unmodified dataset: ')
display_scores(scores_forest)

# Final performance evaluation on Test Set

In [None]:
# NOTE(review): forest_p is the model reported under 'Pruned dataset' above, but
# here it is fitted and evaluated on the full X_train / X_test - confirm which
# feature set is intended for the final model.
y_pred = forest_p.fit(X_train, y_train).predict(X_test)

MSE = metrics.mean_squared_error(y_test, y_pred)
for label, value in (('final MSE: ', MSE), ('final RMSE: ', RMSE(MSE))):
    print(label, value)
