# Simple notebook to get to a boosted tree regression with some CV tuning

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# get pandas
import pandas as pd
# we'll do some visual checks, get the necessary packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [2]:
# get the input df
df_raw = pd.read_csv('../data/train.csv')

In [3]:
# wrapper function to perform some simple cleanup
def clean_df(df):
    # keep only certain variable types
    df = df.select_dtypes(include=['int64','float64'])
    # drop everything with more than 10% of missing values
    df = df[df.columns[df.isnull().mean() < 0.1]]
    # looks like Garage built goes with Year built, replace NA using that
    df.loc[df['GarageYrBlt'].isnull(),'GarageYrBlt'] = df['YearBuilt']
    # put zero for MasVnrArea
    df.loc[df['MasVnrArea'].isnull(),'MasVnrArea'] = 0.0
    # drop id
    df = df.drop(['Id'], axis=1)    
    # remove outliers
    df = df[df['GrLivArea'] < 4676] 
    return df

In [4]:
df = clean_df(df_raw)

In [5]:
# now we will look at the variable ranking with a different approach, GBT
from sklearn import ensemble
# prep scikit
input_features = list(df.columns)
input_features.remove('SalePrice')
X = df[input_features].values
y = np.log(df['SalePrice']+1).values

In [6]:
# Fit regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [7]:
# to have an idea of the score
from sklearn.model_selection import cross_val_score
# check model performance
scores = cross_val_score(clf, X, y, cv=3, scoring='neg_mean_squared_error')
# transform scores into the root mean square
scores = np.sqrt(-scores)
print("Mean Squared Error (on Log): %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error (on Log): 0.13 (+/- 0.01)


## allrighty, we have a baseline for this model

In [8]:
# now we will perform some simple model tuning
from sklearn.model_selection import GridSearchCV

In [19]:
# create score
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer

def my_function(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

custom_scorer = make_scorer(my_function, greater_is_better=False)

In [38]:
# simple test to check machinery
params_grid = {
    'n_estimators': [600], 
    'max_depth': [4], 
    'min_samples_split': [2],#20
    'learning_rate': [0.01*2],#0.01
    'loss': ['ls'],
    'criterion': ['friedman_mse'],
    'min_samples_leaf': [3],
    'max_features': ['log2'],#auto,
    'warm_start': [False],#true
}

gs_clf = GridSearchCV(clf, params_grid, 
                      n_jobs=-1, cv=3,
                      scoring=custom_scorer)
gs_clf.fit(X, y)

print("Best parameters set found on development set:")
print()
print(gs_clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = gs_clf.cv_results_['mean_test_score']
stds = gs_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

Best parameters set found on development set:

{'max_depth': 4, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'n_estimators': 600, 'max_features': 'log2', 'min_samples_leaf': 3, 'loss': 'ls', 'warm_start': False, 'learning_rate': 0.02}

Grid scores on development set:

-0.125 (+/-0.012) for {'max_depth': 4, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'n_estimators': 600, 'max_features': 'log2', 'min_samples_leaf': 3, 'loss': 'ls', 'warm_start': False, 'learning_rate': 0.02}


In [39]:
# looks good, redo the model
params = {'max_depth': 4, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'n_estimators': 600, 'max_features': 'log2', 'min_samples_leaf': 3, 'loss': 'ls', 'warm_start': False, 'learning_rate': 0.02}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X, y)
scores = cross_val_score(clf, X, y, cv=3, scoring=custom_scorer)
print("Mean Squared Error (on Log): %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error (on Log): -0.125 (+/- 0.013)


In [40]:
# looks good save the model
import pickle
pickle.dump(clf, open('../models/gbt_reg_v2.pkl', 'wb'))