# House Price Hybrid Boosting Model

In [None]:
from house_price_utils import *

setup_notebook()

In [None]:
# Load the training and test frames, put the training columns in sorted
# order, then separate the SalePrice target from the predictors.
data, data_test = load_data()
data = data[sorted(data.columns)]
y = data["SalePrice"]
X_raw = data.drop(columns=["SalePrice"])

In [None]:
# Clean the concatenated train+test frame in a single pass, then split the
# cleaned rows back into train and test by matching index labels.
full_data = pd.concat([X_raw, data_test])
features = group_features(full_data)
full_clean_data = clean(full_data, features)
in_train = full_clean_data.index.isin(X_raw.index)
in_test = full_clean_data.index.isin(data_test.index)
X = full_clean_data[in_train]
X_test = full_clean_data[in_test]

# Model Experiments


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error

# 1. Absolute Error Loss

In [None]:
model = RandomForestRegressor(random_state=0, criterion="absolute_error")

In [None]:
# 10-fold cross-validation scored with negated MAE. Train-fold scores are
# returned too, so train and test error can be compared in the next cells.
scores = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=3,
    return_train_score=True,
)

In [None]:
# The scorer is negated MAE, so the mean is sign-flipped to report MAE on the
# training folds; the standard deviation does not depend on the sign.
-scores["train_score"].mean(), scores["train_score"].std()

In [None]:
# Same summary for the held-out folds: mean MAE (sign-flipped from the negated
# scorer) and the fold-to-fold standard deviation.
-scores["test_score"].mean(), scores["test_score"].std()

# Observations
* Still overfitting.
* Fitting with the `absolute_error` (MAE) criterion is much slower.

# 2. Grid Search
Let's see if some parameter tuning can help.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set, using a seeded shuffle so the
# split is reproducible.
split = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=0)
X_train, X_val, y_train, y_val = split

In [None]:
model = RandomForestRegressor(random_state=0)

In [None]:
model.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 5 * 5 * 2 * 4 = 200 parameter combinations, each
# evaluated with 2-fold cross-validation on the training split only.
param_grid = dict(
    min_samples_leaf=(1, 2, 4, 8, 16),
    max_depth=(None, 5, 10, 20, 30),
    max_features=("log2", "sqrt"),
    n_estimators=(10, 50, 100, 200),
)
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=2,
    n_jobs=4,
)
model_grid_search.fit(X_train, y_train)

In [None]:
model_grid_search.best_params_

In [None]:
model_grid_search.best_score_

In [None]:
mean_absolute_error(y_val, model_grid_search.predict(X_val))

In [None]:
# Copy the winning grid-search parameters onto the base forest and switch its
# split criterion to absolute error (the criterion entry is applied last).
model.set_params(
    **{**model_grid_search.best_params_, "criterion": "absolute_error"}
)

## Observations
* No improvement in MAE.

# Diagnostics

In [None]:
from sklearn.compose import TransformedTargetRegressor

In [None]:
# Wrap the forest so it is trained on log(y) and its predictions are mapped
# back to the original price scale with exp.
log_model = TransformedTargetRegressor(model, func=np.log, inverse_func=np.exp)

In [None]:
log_model.fit(X_train, y_train)

In [None]:
yhat = log_model.predict(X_val)
mean_absolute_error(y_val, yhat)

In [None]:
fig = residual_plots(y_val, yhat)

# 3. Additive Residual Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
class TwoStageModel:
    """Additive two-stage regressor: a base model plus a model of its residuals.

    Stage one fits ``model`` on ``(X, y)``. Stage two fits ``resid_model`` to
    the stage-one residuals ``y - yhat`` using ``[yhat, yhat**2]`` as its only
    features. The final prediction is the stage-one prediction plus the
    predicted residual.

    Parameters
    ----------
    model
        First-stage estimator exposing ``fit(X, y)`` and ``predict(X)``.
    resid_model
        Second-stage estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, model, resid_model):
        self.model = model
        self.resid_model = resid_model

    @staticmethod
    def _residual_features(yhat):
        """Return the two-column ``[yhat, yhat**2]`` matrix for the residual model."""
        return np.column_stack([yhat, yhat**2])

    def fit(self, X, y):
        """Fit both stages on ``(X, y)`` and return ``self``.

        NOTE: the residual model is trained on first-stage predictions for the
        same rows the first stage was fit on, i.e. on in-sample residuals.
        """
        self.model.fit(X, y)
        yhat = self.model.predict(X)
        self.resid_model.fit(self._residual_features(yhat), y - yhat)
        # Returning self allows the usual ``estimator.fit(X, y).predict(X)`` chaining.
        return self

    def predict(self, X):
        """Return the first-stage prediction plus the predicted residual."""
        yhat = self.model.predict(X)
        return yhat + self.resid_model.predict(self._residual_features(yhat))

In [None]:
resid_model = LinearRegression()

In [None]:
two_stage_model = TwoStageModel(log_model, resid_model)

In [None]:
two_stage_model.fit(X_train, y_train)
yhat = two_stage_model.predict(X_train)

In [None]:
mean_absolute_error(y_train, yhat)

In [None]:
fig = residual_plots(y_train, yhat)

In [None]:
yhat = two_stage_model.predict(X_val)

In [None]:
mean_absolute_error(y_val, yhat)

In [None]:
fig = residual_plots(y_val, yhat)

In [None]:
# Refit both stages on the full training set (this refits the same
# two_stage_model object in place), then predict on those same rows, so yhat
# here holds in-sample predictions.
two_stage_model.fit(X, y)
yhat = two_stage_model.predict(X)

In [None]:
mean_absolute_error(y, yhat)

In [None]:
fig = residual_plots(y, yhat)

In [None]:
# Per-row table of residual, actual and predicted price, joined with the raw
# (uncleaned) features on the index so large errors can be inspected in context.
resid_df = pd.DataFrame({"resid": y - yhat, "y": y, "yhat": yhat}).join(X_raw)

In [None]:
resid_df.sort_values("resid")

In [None]:
X_raw.SaleCondition.value_counts()

In [None]:
data_test.SaleCondition.value_counts()

# Submit

In [None]:
# Submission is disabled by default; change the guard to True to build the
# submission from the test features and upload it to the Kaggle competition.
if False:
    import kaggle  # only imported when actually submitting

    # presumably writes "submission.csv" — confirm in house_price_utils
    create_submission(two_stage_model, X_test)
    kaggle.api.competition_submit(
        "submission.csv", "Two stage model", "home-data-for-ml-course"
    )