# House prices

In [10]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    TargetEncoder,
    StandardScaler
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

## Data loading

In [2]:
train_data_link = "https://storage.googleapis.com/kagglesdsdata/competitions/10211/111096/train.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1700329433&Signature=xWLQoWm5AFNswwtoSdz4fB5VtSvQRE4PESVJQE5oygFXQG4blCuSv36pLcY4An00SDKa6%2FdGgd6OBiNrJOAwqjmoNpq2Z44YXx6OpEcXAWi%2BTMV1ld7Uks10yaneHRDyxUvHnTqgREnyLrQ%2FExcFsf9%2FRy%2F5GhunNUQGocgD05CKTqsIefmWXC777xbD8aOpX7qIGw1LV3%2B09A53L%2BcM70ci9d%2Bs9TIfOYWd5EB8fSBM9lD2JUcEcuRDNDzgIE8hexPaz0C13CppBcYDmeVptQHJT2FqHMq2Et5xttJZixd4ERuST6EuBLdC6UOwfTMOoLfwoACb6ZN5kmdBo2BgnQ%3D%3D&response-content-disposition=attachment%3B+filename%3Dtrain.csv"
test_data_link = "https://storage.googleapis.com/kagglesdsdata/competitions/10211/111096/test.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1700329692&Signature=q4JbSnDbjzH5gl2d%2BEG%2B66jCygY%2BCg8EyYFni8aBYo5SWVVG1q9tD4ktcWVSlpuu4wiIqXDek5LGNdCcQmHW%2BKCrCl4aIdGXWjCzkNnO9lxCAHHsSiHi8q%2BjDw4JzIbjzD%2FC0FzEjif3wWeeVg5u701YMMsH1UbzMXf2L%2F%2BtQxIEzv7LjuN%2BlmJDwznyY6sw8wYKSU77nCzR%2Bs%2B8An8bIFHgbXeBsznmyhf5nglkeeg7%2B0t37m%2Bz4OqYrb9rM5A8iAFYZF6DSJg%2F%2B7zUuASgBSjAw%2F3EBdFqerEk%2F%2FRhMqkx4RWQqpr7hJkM7c0FwEi8OXmoAGUslt430tbGiLko3w%3D%3D&response-content-disposition=attachment%3B+filename%3Dtest.csv"

train_data = pd.read_csv(train_data_link, index_col = 0)
X_test = pd.read_csv(test_data_link, index_col = 0)

X_train = train_data.drop("SalePrice", axis = 1)
y_train = train_data["SalePrice"]

X_full = pd.concat([X_train, X_test])

## Empty values

In [4]:
# drop all columns that have more than 15% empty values
X_full = X_full.loc[:, X_full.isna().sum()/X_full.shape[0] <= 0.15]

# we will deal with categorical and numerical columns in different ways
numeric_columns = X_full.select_dtypes("number").columns.to_numpy()
categorial_columns = np.setdiff1d(X_full.columns, numeric_columns)

X_full[list(numeric_columns)] = (
    X_full[list(numeric_columns)].
    fillna(X_full[list(numeric_columns)].mean())
)
X_full[list(categorial_columns)] = X_full[
    list(categorial_columns)
].fillna(
    X_full[list(categorial_columns)].mode().iloc[0, :]
)

# finally map changes to the train/test dataframe
X_train = X_full.loc[X_train.index]
X_test = X_full.loc[X_test.index]

## Model pipeline

In [20]:
ohe_cat_columns = categorial_columns[X_full[categorial_columns].nunique() < 5]
mse_cat_columns = np.setdiff1d(categorial_columns, ohe_cat_columns)

get_cols_inds = lambda cols: [X_full.columns.get_loc(col) for col in cols]

columns_transformer = ColumnTransformer(
    transformers = [
        ("one_hot_encoder", OneHotEncoder(), get_cols_inds(ohe_cat_columns)),
        (
            "mean_traget_encoder", 
            TargetEncoder(
                target_type = "continuous"), 
                get_cols_inds(mse_cat_columns)
        ),
        ("standart_scaler", StandardScaler(), get_cols_inds(numeric_columns))
    ]
)

pipe = Pipeline([
    ("columns_transformer", columns_transformer),
    ("model", RandomForestRegressor())
])

In [23]:
from sklearn.metrics import mean_squared_error

In [28]:
model = pipe.fit(X_train, y_train)

In [32]:
y_train.mean()

180921.19589041095

In [30]:
mean_squared_error(
    y_train,
    model.predict(X_train)
)**(1/2)

13100.432152848658