# House Prices Regularized Regression

In [None]:
# Star import: every project helper called in this notebook (setup_notebook,
# load_data, group_features, report, residual_plots, evaluate,
# create_submission) comes from here, since nothing else in the notebook
# imports them.
# NOTE(review): `np` is used later but numpy is never imported explicitly, so
# it is presumably re-exported by house_price_utils as well -- confirm.
from house_price_utils import *

# NOTE(review): presumably applies notebook-wide display/plot settings; see
# house_price_utils for what it actually configures.
setup_notebook()

# Data

In [None]:
# load_data() returns two frames: `data` carries the SalePrice target and
# `data_test` is kept aside for the submission step.
data, data_test = load_data()

# Reorder the columns by sorted label so the column layout is stable.
data = data[sorted(data.columns)]

# Split into target (y) and predictors (X_raw).
y = data["SalePrice"]
X_raw = data.drop("SalePrice", axis="columns")

In [None]:
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Feature Grouping

In [None]:
# Group the predictor columns by kind. The result is indexed further down with
# the keys "continuous", "nominal" and "ordinal", each selecting the columns
# routed to the matching preprocessing branch.
features = group_features(X_raw)

# Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation. Rows are shuffled before splitting,
# and the fixed seed makes the split reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X_raw, y, test_size=0.2, shuffle=True, random_state=42
)

# Preprocessor

In [None]:
# Continuous columns: fill gaps with the column mean, then standardise.
continuous_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)

# Nominal columns: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored (encoded as all zeros) instead of
# raising at predict time.
nominal_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Ordinal columns: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit become -1.
ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Pair every branch with the column group it is responsible for.
column_assignments = [
    (continuous_transformer, features["continuous"]),
    (nominal_transformer, features["nominal"]),
    (ordinal_transformer, features["ordinal"]),
]
preprocessor = make_column_transformer(*column_assignments)

# Model

In [None]:
# Elastic net with built-in cross-validated hyper-parameter search, running up
# to 4 jobs in parallel.
regressor = linear_model.ElasticNetCV(n_jobs=4)

# Fit on log(SalePrice) and map predictions back with exp, so the linear model
# works on the log scale while callers still see prices.
log_target_regressor = TransformedTargetRegressor(
    regressor=regressor,
    func=np.log,
    inverse_func=np.exp,
)

# Full pipeline: column preprocessing followed by the log-target regressor.
model = make_pipeline(preprocessor, log_target_regressor)

# Evaluation

In [None]:
# Fit the whole pipeline (preprocessing + regressor) on the training split
# only; the validation split stays unseen for the evaluation cells.
model.fit(X_train, y_train)

In [None]:
# Project helper that receives the fitted model plus both splits.
# NOTE(review): presumably reports train vs. validation error side by side --
# confirm in house_price_utils.
report(model, X_train, y_train, X_val, y_val)

In [None]:
# Residual diagnostics on the held-out split: true validation targets against
# the model's predictions for the same rows.
# NOTE(review): `fig` is whatever residual_plots returns, presumably a figure
# object -- confirm in house_price_utils.
fig = residual_plots(y_val, model.predict(X_val))

# Summary
* Significantly less over-fitting with `ElasticNetCV` compared to `RandomForestRegressor`. This is likely due to better handling of correlated features and regularization.
* `ElasticNetCV` was better than other CV linear models such as `LassoCV` and `RidgeCV`.
* Best submission score so far.

# Submit

In [None]:
# Refit on every labelled row before predicting the test set.
model.fit(X_raw, y)

# In-sample score: the model is evaluated on the same rows it was just fitted
# on, so this number is optimistic compared with the validation score.
full_data_mae = evaluate(model, X_raw, y)
print(f"Full data MAE = {full_data_mae:.2f}")

In [None]:
# Hand the refitted model and the held-back test frame to the project helper.
# NOTE(review): presumably this writes the "submission.csv" file that the
# upload cell refers to -- confirm in house_price_utils.
create_submission(model, data_test)

In [None]:
# Manual switch for the upload step. It stays False so that re-running the
# notebook never submits by accident; set it to True to upload.
SUBMIT_TO_KAGGLE = False

if SUBMIT_TO_KAGGLE:
    # Imported here so the kaggle package is only required when submitting.
    import kaggle

    # The submission message embeds the regressor's repr to record which
    # estimator configuration produced this file.
    kaggle.api.competition_submit(
        "submission.csv",
        f"{regressor} with log transform of target",
        "home-data-for-ml-course",
    )