# Model

This notebook is a solution for the housing prices competition.

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import (
    OneHotEncoder,
    TargetEncoder,
    StandardScaler
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

## Data loading

In [2]:
# NOTE(review): both links carry a fixed `Expires=` timestamp and a `Signature=`
# query parameter, so they are presumably time-limited signed download URLs —
# once expired this cell will no longer run on a fresh kernel.
# TODO: replace with a stable data location (e.g. a configurable local path).
train_data_link = "https://storage.googleapis.com/kagglesdsdata/competitions/10211/111096/train.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1700329433&Signature=xWLQoWm5AFNswwtoSdz4fB5VtSvQRE4PESVJQE5oygFXQG4blCuSv36pLcY4An00SDKa6%2FdGgd6OBiNrJOAwqjmoNpq2Z44YXx6OpEcXAWi%2BTMV1ld7Uks10yaneHRDyxUvHnTqgREnyLrQ%2FExcFsf9%2FRy%2F5GhunNUQGocgD05CKTqsIefmWXC777xbD8aOpX7qIGw1LV3%2B09A53L%2BcM70ci9d%2Bs9TIfOYWd5EB8fSBM9lD2JUcEcuRDNDzgIE8hexPaz0C13CppBcYDmeVptQHJT2FqHMq2Et5xttJZixd4ERuST6EuBLdC6UOwfTMOoLfwoACb6ZN5kmdBo2BgnQ%3D%3D&response-content-disposition=attachment%3B+filename%3Dtrain.csv"
test_data_link = "https://storage.googleapis.com/kagglesdsdata/competitions/10211/111096/test.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1700329692&Signature=q4JbSnDbjzH5gl2d%2BEG%2B66jCygY%2BCg8EyYFni8aBYo5SWVVG1q9tD4ktcWVSlpuu4wiIqXDek5LGNdCcQmHW%2BKCrCl4aIdGXWjCzkNnO9lxCAHHsSiHi8q%2BjDw4JzIbjzD%2FC0FzEjif3wWeeVg5u701YMMsH1UbzMXf2L%2F%2BtQxIEzv7LjuN%2BlmJDwznyY6sw8wYKSU77nCzR%2Bs%2B8An8bIFHgbXeBsznmyhf5nglkeeg7%2B0t37m%2Bz4OqYrb9rM5A8iAFYZF6DSJg%2F%2B7zUuASgBSjAw%2F3EBdFqerEk%2F%2FRhMqkx4RWQqpr7hJkM7c0FwEi8OXmoAGUslt430tbGiLko3w%3D%3D&response-content-disposition=attachment%3B+filename%3Dtest.csv"

# Read both CSV files, using the first column of each as the row index.
train_data = pd.read_csv(train_data_link, index_col=0)
X_test = pd.read_csv(test_data_link, index_col=0)

# Separate the target column from the training features.
y_train = train_data["SalePrice"]
X_train = train_data.drop(columns="SalePrice")

# Stack train and test features (train rows first) into a single frame.
X_full = pd.concat([X_train, X_test], axis=0)

## Empty values

Start with simple approaches:

- All data (train and test together) is processed outside the model pipeline;
- Columns whose fraction of empty values exceeds `empty_fraction_treshold` are simply dropped;
- Empty values for numeric columns are replaced with the median;
- Empty values for categorical columns will be replaced with the most popular value.

In [3]:
# Drop every column whose fraction of empty values exceeds the threshold (15%).
empty_fraction_treshold = 0.15
empty_fraction = X_full.isna().mean()
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids chained-assignment warnings).
X_full = X_full.loc[:, empty_fraction <= empty_fraction_treshold].copy()

# Numeric and categorical columns are imputed in different ways.
numeric_columns = X_full.select_dtypes("number").columns.to_numpy()
categorial_columns = np.setdiff1d(X_full.columns, numeric_columns)

# Numeric gaps are filled with the column median. The statistic is computed
# over X_full, i.e. over train and test rows together.
X_full[list(numeric_columns)] = X_full[list(numeric_columns)].fillna(
    X_full[list(numeric_columns)].median()
)
# Categorical gaps are filled with the most frequent value of the column;
# the first row of .mode() holds one mode per column.
X_full[list(categorial_columns)] = X_full[list(categorial_columns)].fillna(
    X_full[list(categorial_columns)].mode().iloc[0]
)

assert not X_full.isna().any().any(), "empty values remain after imputation"

# Finally map the changes back to the train/test dataframes.
X_train = X_full.loc[X_train.index]
X_test = X_full.loc[X_test.index]

## Model pipeline

The `sklearn` tools below are combined into a single pipeline object that takes almost-raw data and returns the final prediction.

Main features:

- Categorical columns that have less than or equal to `ohe_max_counts` unique values are processed with `OneHotEncoder`;
- Other categorical columns are processed with `TargetEncoder`;
- `StandardScaler` is used for all numeric columns;
- `RandomForestRegressor` is used as the base model.

In [6]:
# Categorical columns with at most `ohe_max_counts` distinct values are
# one-hot encoded; all other categorical columns are target-encoded.
ohe_max_counts = 5
is_low_cardinality = (
    X_full[categorial_columns].nunique() <= ohe_max_counts
).to_numpy()
ohe_cat_columns = categorial_columns[is_low_cardinality]
mse_cat_columns = np.setdiff1d(categorial_columns, ohe_cat_columns)

# Fixed seed so the target-encoder folds and the forest are reproducible.
RANDOM_STATE = 42


def get_cols_inds(cols):
    """Return the positional index in `X_full` of each column name in `cols`."""
    return [X_full.columns.get_loc(col) for col in cols]


# Explicit category lists, one array per one-hot column, taken from X_full.
# Built with a comprehension rather than DataFrame.apply: apply returns a
# DataFrame (which has no .to_list()) whenever every column happens to have
# the same number of unique values.
ohe_categories = [X_full[col].unique() for col in ohe_cat_columns]

columns_transformer = ColumnTransformer(
    transformers = [
        (
            "one_hot_encoder",
            OneHotEncoder(categories = ohe_categories),
            get_cols_inds(ohe_cat_columns),
        ),
        (
            "mean_traget_encoder",
            TargetEncoder(
                target_type = "continuous",
                random_state = RANDOM_STATE,
            ),
            get_cols_inds(mse_cat_columns),
        ),
        (
            "standart_scaler",
            StandardScaler(),
            get_cols_inds(numeric_columns),
        ),
    ]
)

# Full model: column-wise preprocessing followed by a random forest.
pipe = Pipeline([
    ("columns_transformer", columns_transformer),
    ("model", RandomForestRegressor(random_state = RANDOM_STATE))
])

## Grid search

Search for the best hyperparameters for the model.

In [11]:
# Candidate values for the random-forest step; the "model__" prefix routes
# each parameter to the pipeline step named "model".
param_grid = dict(
    model__max_depth = [10, 15, 20],
    model__min_samples_split = [2, 5, 10],
    model__min_samples_leaf = [1, 3, 5],
)

# Exhaustive search over the grid, scored by negative mean squared error.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring = "neg_mean_squared_error",
    verbose = 2,
)
# The pipeline selects columns by position, so a plain array is passed in.
gs_result = grid_search.fit(X_train.to_numpy(), y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.4s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.4s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.4s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.5s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2; total time=   1.6s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5; total time=   1.4s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5; total time=   1.3s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5; total time=   1.4s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5; total time=   1.3s
[C

## Submission

In [15]:
# GridSearchCV was created with its default refit behaviour, so `gs_result`
# already holds the best pipeline refitted on the whole training set;
# predicting through it avoids fitting the same model a second time.
# The test features are passed as an array, matching how the search was fitted.
my_prediction = gs_result.predict(X_test.to_numpy())

# Write the predictions, indexed like X_test, to the submission file.
submission = pd.Series(
    my_prediction,
    index = X_test.index,
    name = "SalePrice"
)
submission.to_csv("submission.csv")