In [1]:
from category_encoders.cat_boost import CatBoostEncoder

In [2]:
import typing
import pickle
import pandas as pd
import numpy as np
import logging

from lightgbm import LGBMRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.exceptions import NotFittedError

In [3]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Relative deviation (as a fraction of the true price) that is tolerated without penalty.
THRESHOLD = 0.15
# Multiplier applied to the penalty when the prediction is below the true price.
NEGATIVE_WEIGHT = 1.1


def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Custom hackathon metric for a single (true price, predicted price) pair.

    The relative deviation ``(y_pred - y_true) / max(1e-8, y_true)`` is scored as:
      * 0 when its absolute value does not exceed ``THRESHOLD``;
      * a quadratic penalty between ``THRESHOLD`` and ``4 * THRESHOLD``;
      * a flat penalty of 9 from ``4 * THRESHOLD`` upwards.
    Penalties for under-prediction are multiplied by ``NEGATIVE_WEIGHT``.

    :param y_true: float, real price
    :param y_pred: float, predicted price
    :return: float, metric value
    """
    relative_error = (y_pred - y_true) / np.maximum(1e-8, y_true)

    # Inside the tolerance band: no penalty.
    if np.abs(relative_error) <= THRESHOLD:
        return 0

    if relative_error < 0:
        # Under-prediction: capped at the flat penalty, otherwise quadratic.
        if relative_error <= - 4 * THRESHOLD:
            return 9 * NEGATIVE_WEIGHT
        return NEGATIVE_WEIGHT * ((relative_error / THRESHOLD) + 1) ** 2

    # Over-prediction (also reached for NaN, which fails every comparison above).
    if relative_error < 4 * THRESHOLD:
        return ((relative_error / THRESHOLD) - 1) ** 2
    return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """
    Mean of ``deviation_metric_one_sample`` over all samples.

    Inputs are converted with ``np.asarray`` so that elements are always picked by
    position. Without the conversion a pandas Series is indexed by label, so a Series
    whose index is not 0..n-1 (for example a filtered column) raises ``KeyError`` or
    pairs the wrong rows.

    :param y_true: array-like of real prices
    :param y_pred: array-like of predicted prices, at least as long as ``y_true``
    :return: float, mean metric value (NaN for empty input)
    """
    true_values = np.asarray(y_true)
    predicted_values = np.asarray(y_pred)
    per_sample = [
        deviation_metric_one_sample(true_values[n], predicted_values[n])
        for n in range(len(true_values))
    ]
    return np.array(per_sample).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    """
    Median of the absolute percentage errors ``|y_pred - y_true| / |y_true|``.

    The denominator is ``max(|y_true|, eps)`` with ``eps`` the float64 machine epsilon,
    the same guard scikit-learn uses for its mean absolute percentage error. This avoids
    NaN/inf for zero targets and negative "absolute" errors for negative targets.
    For strictly positive targets the result equals the unguarded ratio.

    :param y_true: array-like of real values
    :param y_pred: array-like of predicted values, same shape as ``y_true``
    :return: float, median absolute percentage error (as a fraction, not in percent)
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    denominator = np.maximum(np.abs(true_values), np.finfo(np.float64).eps)
    return np.median(np.abs(predicted_values - true_values) / denominator)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    """
    Compute the set of regression quality metrics reported in this notebook.

    :param y_true: array of real prices
    :param y_pred: array of predicted prices
    :return: dict with keys 'mape', 'mdape', 'rmse', 'r2' and 'raif_metric'
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    # `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4 and removed
    # in 1.6; taking the square root explicitly gives the same RMSE for 1-D targets on
    # every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

# Small positive constant. NOTE(review): no code visible in this notebook references it
# (deviation_metric_one_sample uses the literal 1e-8 instead).
EPS = 1e-8

In [4]:
from enum  import IntEnum

# Placeholder written into categorical columns in place of missing values.
UNKNOWN_VALUE = 'missing'


class PriceTypeEnum(IntEnum):
    """Kind of price attached to a record; compared against the ``price_type`` column."""

    # price taken from a listing
    OFFER_PRICE = 0
    # price obtained through manual valuation
    MANUAL_PRICE = 1

In [5]:
def prepare_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values in the categorical columns with ``UNKNOWN_VALUE``.

    The input frame is not modified; a filled copy is returned.

    :param df: dataframe containing the columns 'region', 'city', 'street' and 'realty_type'
    :return: dataframe, copy of ``df`` with those columns filled
    """
    filled = df.copy()
    for column in ('region', 'city', 'street', 'realty_type'):
        filled[column] = filled[column].fillna(UNKNOWN_VALUE)
    return filled

In [6]:
# Name of the column the model predicts.
TARGET = 'per_square_meter_price'

# Features routed to the 'ste' slot of BenchmarkModel. The slot is named after smoothed
# target encoding, but the class currently applies OrdinalEncoder to these columns.
CATEGORICAL_STE_FEATURES = ['region', 'city', 'realty_type']

# Features that are one hot encoded (none at the moment).
CATEGORICAL_OHE_FEATURES = []

# Numerical features, one per line, in the order they are fed to the preprocessor.
NUM_FEATURES = [
    'lat',
    'lng',
    'osm_amenity_points_in_0.001',
    'osm_amenity_points_in_0.005',
    'osm_amenity_points_in_0.0075',
    'osm_amenity_points_in_0.01',
    'osm_building_points_in_0.001',
    'osm_building_points_in_0.005',
    'osm_building_points_in_0.0075',
    'osm_building_points_in_0.01',
    'osm_catering_points_in_0.001',
    'osm_catering_points_in_0.005',
    'osm_catering_points_in_0.0075',
    'osm_catering_points_in_0.01',
    'osm_city_closest_dist',
    'osm_city_nearest_population',
    'osm_crossing_closest_dist',
    'osm_crossing_points_in_0.001',
    'osm_crossing_points_in_0.005',
    'osm_crossing_points_in_0.0075',
    'osm_crossing_points_in_0.01',
    'osm_culture_points_in_0.001',
    'osm_culture_points_in_0.005',
    'osm_culture_points_in_0.0075',
    'osm_culture_points_in_0.01',
    'osm_finance_points_in_0.001',
    'osm_finance_points_in_0.005',
    'osm_finance_points_in_0.0075',
    'osm_finance_points_in_0.01',
    'osm_healthcare_points_in_0.005',
    'osm_healthcare_points_in_0.0075',
    'osm_healthcare_points_in_0.01',
    'osm_historic_points_in_0.005',
    'osm_historic_points_in_0.0075',
    'osm_historic_points_in_0.01',
    'osm_hotels_points_in_0.005',
    'osm_hotels_points_in_0.0075',
    'osm_hotels_points_in_0.01',
    'osm_leisure_points_in_0.005',
    'osm_leisure_points_in_0.0075',
    'osm_leisure_points_in_0.01',
    'osm_offices_points_in_0.001',
    'osm_offices_points_in_0.005',
    'osm_offices_points_in_0.0075',
    'osm_offices_points_in_0.01',
    'osm_shops_points_in_0.001',
    'osm_shops_points_in_0.005',
    'osm_shops_points_in_0.0075',
    'osm_shops_points_in_0.01',
    'osm_subway_closest_dist',
    'osm_train_stop_closest_dist',
    'osm_train_stop_points_in_0.005',
    'osm_train_stop_points_in_0.0075',
    'osm_train_stop_points_in_0.01',
    'osm_transport_stop_closest_dist',
    'osm_transport_stop_points_in_0.005',
    'osm_transport_stop_points_in_0.0075',
    'osm_transport_stop_points_in_0.01',
    'reform_count_of_houses_1000',
    'reform_count_of_houses_500',
    'reform_house_population_1000',
    'reform_house_population_500',
    'reform_mean_floor_count_1000',
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
    'reform_mean_year_building_500',
    'total_square',
]

# Keyword arguments that BenchmarkModel forwards unchanged to LGBMRegressor.
MODEL_PARAMS = dict(
            n_estimators=2000,
            learning_rate=0.01,
            reg_alpha=2,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=-1,
            random_state=563,
            # `silent` was deprecated in the LightGBM scikit-learn API (3.3) and dropped from
            # later releases, where it is reported as an unknown parameter. `verbose=1` is
            # the supported way to keep the info-level training log that `silent=False`
            # asked for.
            verbose=1
        )

In [7]:
class BenchmarkModel():
    """
    Baseline price model built as an sklearn pipeline. Step by step:
      1) the regressor is trained on offer data (the caller passes rows with price_type=0);
      2) features are split into three groups and preprocessed by a ColumnTransformer:
          2.1) numerical_features - scaled with StandardScaler
          2.2) ohe_categorical_features - one hot encoded
          2.3) ste_categorical_features - integer-coded with OrdinalEncoder (categories unseen
               during fit become -1) and declared to LightGBM as categorical columns
      3) the transformed columns are concatenated and fed to LGBMRegressor;
      4) the fitted pipeline predicts on manually valued data (price_type=1) and the median
         relative deviation of real prices from predictions is stored as ``corr_coef``;
         ``predict`` multiplies raw predictions by ``1 + corr_coef``.

    :param numerical_features: list, names of numerical columns
    :param ohe_categorical_features: list, names of categorical columns for one hot encoding
    :param ste_categorical_features: list, names of categorical columns for the ordinal
                                     ("ste") encoder; they must come last in the feature order
    :param model_params: dict, keyword arguments passed to LGBMRegressor
    """

    def __init__(self, numerical_features: typing.List[str],
                 ohe_categorical_features: typing.List[str],
                 ste_categorical_features: typing.List[typing.Union[str, typing.List[str]]],
                 model_params: typing.Dict[str, typing.Union[str,int,float]]):
        self.num_features = numerical_features
        self.ohe_cat_features = ohe_categorical_features
        self.ste_cat_features = ste_categorical_features

        # Transformer order fixes the column order of the transformed matrix:
        # numerical block, then one-hot block, then ordinal-encoded block.
        # A CatBoostEncoder(handle_missing='value', handle_unknown='value') variant of the
        # 'ste' step was left commented out by the original author.
        self.preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), self.num_features),
            ('ohe', OneHotEncoder(), self.ohe_cat_features),
            ('ste', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
             self.ste_cat_features)])

        self.model = LGBMRegressor(**model_params)

        self.pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('model', self.model)])

        # Single-underscore name on purpose: a double-underscore attribute is name-mangled,
        # which previously made fit()/predict() use a different attribute than __init__.
        self._is_fitted = False
        self.corr_coef = 0

    def _feature_layout(self, X_offer: pd.DataFrame, y_offer: pd.Series):
        """Build the positional feature names passed to LightGBM.

        :param X_offer: pd.DataFrame used for training (only needed when one hot features exist)
        :param y_offer: pd.Series with the training target
        :return: tuple (all feature names, names of the categorical features); names are the
                 string positions '0', '1', ... of the transformed columns
        """
        # Assumes every entry of ste_cat_features is one column name (flat list).
        n_ste = len(self.ste_cat_features)
        if self.ohe_cat_features:
            # The width of the one-hot block depends on the categories present in the data,
            # so the preprocessor is fitted once to learn the total number of columns.
            probe = self.preprocessor.fit(X_offer, y_offer).transform(X_offer.iloc[:1])
            n_total = probe.shape[1]
        else:
            n_total = len(self.num_features) + n_ste
        feature_names = [f'{i}' for i in range(n_total)]
        # The ordinal-encoded columns are the trailing block of the transformed matrix.
        return feature_names, feature_names[n_total - n_ste:]

    def _find_corr_coefficient(self, X_manual: pd.DataFrame, y_manual: pd.Series):
        """Compute the correcting coefficient.

        Stores in ``corr_coef`` the median of ``(y_manual - prediction) / prediction``.

        :param X_manual: pd.DataFrame with manually valued objects
        :param y_manual: pd.Series - manual valuation prices
        """
        predictions = self.pipeline.predict(X_manual)
        deviation = ((y_manual - predictions)/predictions).median()
        self.corr_coef = deviation

    def fit(self, X_offer: pd.DataFrame, y_offer: pd.Series,
            X_manual: pd.DataFrame, y_manual: pd.Series):
        """Train the model.

        The ML model is trained on market offers (listing prices). Then the median relative
        deviation between manual valuations and predictions is computed and kept for
        correcting predicted prices.

        :param X_offer: pd.DataFrame with listings
        :param y_offer: pd.Series - offer price (from listings)
        :param X_manual: pd.DataFrame with manually valued objects
        :param y_manual: pd.Series - manual valuation prices
        """
        print('Fit lightgbm')
        # Feature names/categorical columns are derived from the configured feature lists
        # instead of a hard-coded 70 columns with categorical columns 67-69.
        feature_names, categorical_names = self._feature_layout(X_offer, y_offer)
        self.pipeline.fit(X_offer, y_offer,
                          model__feature_name=feature_names,
                          model__categorical_feature=categorical_names)
        print('Find corr coefficient')
        self._find_corr_coefficient(X_manual, y_manual)
        print(f'Corr coef: {self.corr_coef:.2f}')
        self._is_fitted = True

    def predict(self, X: pd.DataFrame) -> np.array:
        """Predict prices: raw pipeline predictions multiplied by ``1 + corr_coef``.

        :param X: pd.DataFrame
        :return: np.array, predictions (commercial real estate prices)
        :raises NotFittedError: if ``fit`` has not been called yet
        """
        if not self._is_fitted:
            raise NotFittedError(
                "This {} instance is not fitted yet! Call 'fit' with appropriate arguments before predict".format(
                    type(self).__name__
                )
            )
        predictions = self.pipeline.predict(X)
        corrected_price = predictions * (1 + self.corr_coef)
        return corrected_price

    def save(self, path: str):
        """Serialize the model to a pickle file.

        :param path: str, path to the file
        """
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str):
        """Deserialize a model from a pickle file.

        SECURITY: unpickling executes arbitrary code from the file; only load files
        produced by a trusted ``save`` call.

        :param path: str, path to the file
        :return: the unpickled model
        """
        with open(path, "rb") as f:
            model = pickle.load(f)
        return model

In [8]:
from pathlib import Path

In [9]:
# All input files live in one directory next to the notebook's parent folder.
DATA_DIR = Path('..') / 'data'

train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'
submission_path = DATA_DIR / 'test_submission.csv'

In [10]:
# Read the training table and fill missing categorical values.
train_df = prepare_categorical(pd.read_csv(train_path))

# Model inputs in preprocessor order: numerical, one-hot, ordinal-encoded.
feature_columns = NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
is_offer = train_df.price_type == PriceTypeEnum.OFFER_PRICE
is_manual = train_df.price_type == PriceTypeEnum.MANUAL_PRICE

# Listings train the regressor; manual valuations calibrate the correction coefficient.
X_offer = train_df.loc[is_offer, feature_columns]
y_offer = train_df.loc[is_offer, TARGET]
X_manual = train_df.loc[is_manual, feature_columns]
y_manual = train_df.loc[is_manual, TARGET]
print(f'X_offer {X_offer.shape}  y_offer {y_offer.shape}\tX_manual {X_manual.shape} y_manual {y_manual.shape}')


  exec(code_obj, self.user_global_ns, self.user_ns)


X_offer (275299, 70)  y_offer (275299,)	X_manual (4493, 70) y_manual (4493,)


In [11]:
# Build the benchmark pipeline from the notebook-level configuration and train it.
model = BenchmarkModel(
    numerical_features=NUM_FEATURES,
    ohe_categorical_features=CATEGORICAL_OHE_FEATURES,
    ste_categorical_features=CATEGORICAL_STE_FEATURES,
    model_params=MODEL_PARAMS,
)
model.fit(X_offer, y_offer, X_manual, y_manual)

Fit lightgbm




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11907
[LightGBM] [Info] Number of data points in the train set: 275299, number of used features: 70
[LightGBM] [Info] Start training from score 110865.208829
Find corr coefficient
Corr coef: -0.12


In [12]:
print('Save model')
model.save('model.bin')

# Offer prices are scored without the correction coefficient, so it is divided back out
# of the corrected predictions.
predictions_offer = model.predict(X_offer)
uncorrected_offer = predictions_offer/(1+model.corr_coef)
metrics = metrics_stat(y_offer.values, uncorrected_offer)
print(f'Metrics stat for training data with offers prices: {metrics}')

# Manual valuations are scored on the corrected predictions as returned by the model.
predictions_manual = model.predict(X_manual)
metrics = metrics_stat(y_manual.values, predictions_manual)
print(f'Metrics stat for training data with manual prices: {metrics}')

Save model
Metrics stat for training data with offers prices: {'mape': 1.5181998475868552, 'mdape': 0.3124046997858884, 'rmse': 73946.60385198555, 'r2': 0.8172793455588138, 'raif_metric': 3.4927653569833415}
Metrics stat for training data with manual prices: {'mape': 0.30487446341674596, 'mdape': 0.24923610546749322, 'rmse': 73209.74339526857, 'r2': 0.3295145576373988, 'raif_metric': 2.042364670980183}


In [34]:
# Keep the raw test table (it carries the `id` column needed for the submission) and
# derive the model input from it: categorical gaps filled, feature columns only.
orig_test_df = pd.read_csv(test_path)
test_feature_columns = NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
test_df = prepare_categorical(orig_test_df)[test_feature_columns]

In [18]:
# Corrected price predictions for the test features.
predictions_test = model.predict(test_df)

In [20]:
# Sanity check: smallest predicted price.
predictions_test.min()

7818.069157900964

In [35]:
# Attach the predictions to the raw test frame (mutates orig_test_df in place);
# the submission cell reads this column together with `id`.
orig_test_df['per_square_meter_price'] = predictions_test

In [23]:
# Load the submission file; its `id` column drives the final merge.
subm = pd.read_csv(submission_path)

In [28]:
# Display the configured submission path.
submission_path

PosixPath('../data/test_submission.csv')

In [29]:
# Peek at the first lines of the raw submission file.
# NOTE(review): the path is hard-coded here instead of being taken from submission_path.
!head -n 3 ../data/test_submission.csv

id,per_square_meter_price
COL_289284,0
COL_289305,0


In [25]:
# Preview the loaded submission frame.
subm.head()

Unnamed: 0,id,per_square_meter_price
0,COL_289284,0
1,COL_289305,0
2,COL_289318,0
3,COL_289354,0
4,COL_289399,0


In [33]:
# Display the test frame.
# NOTE(review): the saved output shows a per_square_meter_price column, which the
# feature-only selection that builds test_df does not produce, and the execution counts
# are out of order - restart the kernel and run all cells before relying on this output.
test_df

Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,...,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,total_square,region,city,realty_type,per_square_meter_price
0,51.709255,36.147908,7,55,85,117,0,0,0,0,...,743.0,4.325000,4.211268,1966.471591,1966.740260,156.148996,Курская область,Курск,100,32286.643467
1,61.233240,73.462509,8,70,112,140,0,0,0,0,...,1019.0,5.389831,5.500000,1988.259259,1989.068182,190.737943,Ханты-Мансийский АО,Сургут,110,57603.503596
2,57.143110,65.554573,3,28,67,122,0,0,0,0,...,1332.0,7.915493,8.250000,1985.880282,1991.458333,457.118051,Тюменская область,Тюмень,10,59107.697716
3,52.281380,104.282975,5,76,139,231,0,0,0,0,...,666.0,3.276860,3.012048,1947.073276,1941.657895,66.503622,Иркутская область,Иркутск,100,68222.288761
4,51.729706,36.194019,8,105,189,279,0,0,2,9,...,394.0,4.346154,4.827586,1948.764151,1946.689655,23.864915,Курская область,Курск,10,79765.711865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2969,55.978180,92.891457,0,3,6,16,0,0,0,0,...,240.0,5.730769,6.333333,1972.153846,1982.250000,2149.546362,Красноярский край,Красноярск,100,25173.083233
2970,56.459183,84.979872,2,33,111,222,0,0,1,1,...,979.0,5.714286,5.882353,1972.260870,1973.460000,359.371061,Томская область,Томск,10,39200.714660
2971,54.523247,36.295168,2,25,54,99,0,0,0,2,...,315.0,3.983871,4.909091,1966.390244,1966.904762,87.820453,Калужская область,Калуга,100,50979.549946
2972,56.328236,43.990039,13,70,114,158,0,0,0,0,...,370.0,3.829268,4.529412,1938.331361,1946.529412,208.625735,Нижегородская область,Нижний Новгород,10,65429.225467


In [37]:
# Align predictions to the ids of the submission file (left join keeps its row order
# and ids) and write the result without the index column.
submission_df = subm[['id']].merge(
    orig_test_df[['id', 'per_square_meter_price']],
    how='left',
    on='id',
)
submission_df.to_csv('submission_bl.csv', index=False)