# Ансамбли

In [None]:
import time
import warnings

import catboost as cb
import numpy as np
import pandas as pd
import yaml
from houses_data_engineering import prepare_houses_data
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import (
    ElasticNetCV,
    LassoCV,
    LinearRegression,
    RidgeCV,
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    RepeatedKFold,
    train_test_split,
)

# Random seed used for the train/test split and for the seeded estimators.
SEED = 314159
# Fraction of the rows that goes to the training part of the split.
TRAIN_TEST_SPLIT = 0.80


# Read the notebook configuration (dataset locations).  The encoding is given
# explicitly so that parsing does not depend on the platform's default locale.
with open("../config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

### Данные

In [None]:
# Load the raw table, then separate the target from the feature columns.
df_houses_init = pd.read_csv(cfg["house_prices"]["train_dataset"])
y_houses = df_houses_init["SalePrice"]
df_houses_init = df_houses_init.drop(columns=["SalePrice", "Id"])

# Hold-out split on the raw features.
df_houses_train_init, df_houses_test, y_houses_train, y_houses_test = train_test_split(
    df_houses_init, y_houses, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED
)

# Feature engineering is delegated to the project helper; it returns the
# processed train/test frames together with the matching targets.
df_houses_train, df_houses_test, y_houses_train, y_houses_test = prepare_houses_data(
    df_train=df_houses_train_init,
    df_test=df_houses_test,
    y_train=y_houses_train,
    y_test=y_houses_test,
)

# Raw (unprocessed) datasets for boosting, selected by the index of the
# prepared frames so that their rows line up with the prepared targets.
# NOTE(review): presumably prepare_houses_data may drop rows, hence the lookup
# by index - confirm against the helper.
# `.copy()` makes these independent frames, so the column assignment below is
# unambiguous and cannot trigger pandas' SettingWithCopyWarning.
df_houses_boosting_train = df_houses_init.loc[df_houses_train.index].copy()
df_houses_boosting_test = df_houses_init.loc[df_houses_test.index].copy()
cat_cols = list(df_houses_boosting_train.select_dtypes(include=["object"]).columns)

# Replace missing values in the object-dtype columns with the literal string
# "nan"; these columns are later passed to CatBoost as categorical features.
df_houses_boosting_train[cat_cols] = df_houses_boosting_train[cat_cols].fillna("nan")
df_houses_boosting_test[cat_cols] = df_houses_boosting_test[cat_cols].fillna("nan")

### Моделирование

In [None]:
# --- Linear models ----------------------------------------------------------
# RepeatedKFold reshuffles the data on every repetition.  Seeding it makes the
# folds, and therefore the selected regularisation strengths, reproducible.
linreg = LinearRegression()
lasso_cv = LassoCV(
    cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED),
    random_state=SEED,
)
ridge_cv = RidgeCV(cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED))
# NOTE(review): l1_ratio=1.0 makes the elastic-net penalty a pure L1 penalty,
# so this model differs from lasso_cv only by its explicit alpha grid.
elastic_net_cv = ElasticNetCV(
    cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED),
    random_state=SEED,
    max_iter=1000,
    l1_ratio=1.0,
    alphas=[0.01, 0.1, 1.0],
)

# --- Stand-alone tree ensembles (all with the same number of trees) ---------
n_estimators = 250
random_forest = RandomForestRegressor(
    n_estimators=n_estimators, random_state=SEED, n_jobs=4
)
sk_gradient_boosting = GradientBoostingRegressor(
    n_estimators=n_estimators, random_state=SEED
)
hist_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=n_estimators, random_state=SEED
)
cb_gradient_boosting = cb.CatBoostRegressor(
    n_estimators=n_estimators, random_state=SEED, verbose=0
)

# --- Stacking: three smaller tree ensembles + a gradient-boosting meta model -
stacking_random_forest = RandomForestRegressor(
    n_estimators=100, random_state=SEED, n_jobs=4
)
stacking_sk_gradient_boosting = GradientBoostingRegressor(
    n_estimators=100, random_state=SEED
)
stacking_hist_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=100, random_state=SEED
)
stacking_meta_regressor = GradientBoostingRegressor(n_estimators=50, random_state=SEED)
stacking_regressor = StackingRegressor(
    estimators=[
        ("random_forest", stacking_random_forest),
        ("sk_gradient_boosting", stacking_sk_gradient_boosting),
        ("hist_gradient_boosting", stacking_hist_gradient_boosting),
    ],
    final_estimator=stacking_meta_regressor,
    cv=5,
    n_jobs=4,
)

# Registry of every model to fit and evaluate, keyed by the name used in the
# report.  Insertion order determines the fitting order.
models = {
    "linreg": linreg,
    "ridge_cv": ridge_cv,
    "lasso_cv": lasso_cv,
    "elastic_net_cv": elastic_net_cv,
    "random_forest": random_forest,
    "sk_gradient_boosting": sk_gradient_boosting,
    "hist_gradient_boosting": hist_gradient_boosting,
    "cb_gradient_boosting": cb_gradient_boosting,
    "stacking": stacking_regressor,
}

# Fit every model and report how long each fit took.  All warnings raised
# while fitting are silenced (this includes any convergence warnings).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for model_name, model in models.items():
        # perf_counter is a monotonic clock intended for measuring intervals.
        start = time.perf_counter()
        if model_name == "cb_gradient_boosting":
            # CatBoost is trained on the raw frame, with the object-dtype
            # columns declared as categorical features.
            train_pool = cb.Pool(
                data=df_houses_boosting_train,
                label=y_houses_train,
                cat_features=cat_cols,
            )
            model.fit(train_pool)
        else:
            model.fit(df_houses_train, y_houses_train)

        print(f"{model_name} fit in {time.perf_counter() - start:.4f} sec")

In [None]:
# Test-set predictions for every fitted model.  CatBoost receives the frame
# with the raw categorical columns; all other models receive the prepared one.
y_houses_preds = {
    model_name: model.predict(
        df_houses_boosting_test
        if model_name == "cb_gradient_boosting"
        else df_houses_test
    )
    for model_name, model in models.items()
}

# One row of quality metrics per model: name, RMSE, MAE, R2, MAPE.
data = [
    [
        model_name,
        np.sqrt(mean_squared_error(y_true=y_houses_test, y_pred=y_pred)),
        mean_absolute_error(y_true=y_houses_test, y_pred=y_pred),
        r2_score(y_true=y_houses_test, y_pred=y_pred),
        mean_absolute_percentage_error(y_true=y_houses_test, y_pred=y_pred),
    ]
    for model_name, y_pred in y_houses_preds.items()
]

df_res = pd.DataFrame(data, columns=["model_name", "RMSE", "MAE", "R2", "MAPE"])

In [None]:
# Show the metrics table with the highest R2 first.
df_res.sort_values("R2", ascending=False)