In [1]:
from category_encoders.cat_boost import CatBoostEncoder

In [2]:
import faiss

In [3]:
import typing
import pickle
import pandas as pd
import numpy as np
import logging

from lightgbm import LGBMRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import KFold, StratifiedKFold

In [4]:
FOLDS = 5

In [5]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    return np.median(np.abs(y_pred-y_true)/y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

EPS = 1e-8

In [6]:
from enum  import IntEnum

UNKNOWN_VALUE = 'missing'

class PriceTypeEnum(IntEnum):

    OFFER_PRICE = 0 # цена из объявления
    MANUAL_PRICE = 1 # цена, полученная путем ручной оценки

In [7]:
def prepare_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Заполняет пропущенные категориальные переменные
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = df.copy()
    fillna_cols = ['region','city','street','realty_type']
    df_new[fillna_cols] = df_new[fillna_cols].fillna(UNKNOWN_VALUE)
    return df_new

In [8]:
# def prepare_numerrical(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     Заполняет пропущенные вещестенные переменные
#     :param df: dataframe, обучающая выборка
#     :return: dataframe
#     """
#     df_new = df.copy()
#     fillna_cols = ['region','city','street','realty_type']
#     df_new[fillna_cols] = df_new[fillna_cols].fillna(UNKNOWN_VALUE)
#     return df_new

In [9]:
TARGET = 'per_square_meter_price'
# признаки (или набор признаков), для которых применяем smoothed target encoding
CATEGORICAL_STE_FEATURES = ['region', 'city', 'realty_type', 'month']

# признаки, для которых применяем one hot encoding
CATEGORICAL_OHE_FEATURES = []

# численные признаки
NUM_FEATURES = ['month_num', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
      'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500','total_square']

MODEL_PARAMS = dict(
            n_estimators=1000,
            learning_rate=0.01,
            reg_alpha=1,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=-1,
            random_state=563,
            objective='regression_l1',
        )

MODEL_PARAMS_COEFF = dict(
            n_estimators=700,
            learning_rate=0.01,
            reg_alpha=1,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=-1,
            random_state=563,
            objective='regression_l1',
        )

In [10]:
class CoeffBoostingModel():
    """
    Модель представляет из себя sklearn pipeline. Пошаговый алгоритм:
      1) в качестве обучения выбираются все данные с price_type=0
      1) все фичи делятся на три типа (numerical_features, ohe_categorical_features, ste_categorical_features):
          1.1) numerical_features - применяется StandardScaler
          1.2) ohe_categorical_featires - кодируются через one hot encoding
          1.3) ste_categorical_features - кодируются через SmoothedTargetEncoder
      2) после этого все полученные фичи конкатенируются в одно пространство фичей и подаются на вход модели Lightgbm
      3) делаем предикт на данных с price_type=1, считаем среднее отклонение реальных значений от предикта. Вычитаем это отклонение на финальном шаге (чтобы сместить отклонение к 0)

    :param numerical_features: list, список численных признаков из датафрейма
    :param ohe_categorical_features: list, список категориальных признаков для one hot encoding
    :param ste_categorical_features, list, список категориальных признаков для smoothed target encoding.
                                     Можно кодировать сразу несколько полей (например объединять категориальные признаки)
    :
    """

    def __init__(self, numerical_features: typing.List[str],
                 ohe_categorical_features: typing.List[str],
                 ste_categorical_features: typing.List[typing.Union[str, typing.List[str]]],
                 model_params: typing.Dict[str, typing.Union[str,int,float]],
                 model_params_coef: typing.Dict[str, typing.Union[str,int,float]]):
        self.num_features = numerical_features
        self.ohe_cat_features = ohe_categorical_features
        self.ste_cat_features = ste_categorical_features

        self.preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), self.num_features),
            ('ohe', OneHotEncoder(), self.ohe_cat_features),
            ('ste', CatBoostEncoder(handle_missing='value', handle_unknown='value'), # OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), # CatBoostEncoder(handle_missing='value', handle_unknown='value'),
             self.ste_cat_features)])

        self.model = LGBMRegressor(**model_params)

        self.pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('model', self.model)])

        self._is_fitted = False
        
        self.coeff_model = LGBMRegressor(**model_params_coef)
        
        self.coef_preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), self.num_features+['predictions']),
            ('ohe', OneHotEncoder(), self.ohe_cat_features),
            ('ste', CatBoostEncoder(handle_missing='value', handle_unknown='value'), # OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), # CatBoostEncoder(handle_missing='value', handle_unknown='value'),
             self.ste_cat_features)])
        
        self.coeff_pipeline = Pipeline(steps=[
            ('preprocessor', self.coef_preprocessor),
            ('model', self.coeff_model)])

    def _find_corr_coefficient(self, X_manual: pd.DataFrame, y_manual: pd.Series):
        """Вычисление корректирующего коэффициента

        :param X_manual: pd.DataFrame с ручными оценками
        :param y_manual: pd.Series - цены ручника
        """
        predictions = np.expm1(np.clip(self.pipeline.predict(X_manual),5,15))
        X_manual['predictions'] = self.pipeline.predict(X_manual)
        self.coeff_pipeline.fit(X_manual, y_manual , model__feature_name=[f'{i}' for i in range(X_manual.shape[1])],
                 model__sample_weight=1/y_manual.values)
        self.__is_fitted = True

    def fit(self, X_offer: pd.DataFrame, y_offer: pd.Series,
            X_manual: pd.DataFrame, y_manual: pd.Series):
        """Обучение модели.
        ML модель обучается на данных по предложениям на рынке (цены из объявления)
        Затем вычисляется среднее отклонение между руяными оценками и предиктами для корректировки стоимости

        :param X_offer: pd.DataFrame с объявлениями
        :param y_offer: pd.Series - цена предложения (в объявлениях)
        :param X_manual: pd.DataFrame с ручными оценками
        :param y_manual: pd.Series - цены ручника
        """
        print('Fit lightgbm')
        self.pipeline.fit(X_offer, np.log1p(y_offer) , model__feature_name=[f'{i}' for i in range(X_offer.shape[1])],
                         model__sample_weight=1/np.log1p(y_offer.values)) # ,model__categorical_feature=None)
        print('Find corr coefficient')
        self._find_corr_coefficient(X_manual, y_manual)
        self.__is_fitted = True

    def predict(self, X: pd.DataFrame) -> np.array:
        """Предсказание модели Предсказываем преобразованный таргет, затем конвертируем в обычную цену через обратное
        преобразование.

        :param X: pd.DataFrame
        :return: np.array, предсказания (цены на коммерческую недвижимость)
        """
        if self.__is_fitted:
            X['predictions'] = np.expm1(np.clip(self.pipeline.predict(X), 5,15))
            predictions = self.coeff_pipeline.predict(X)
            return predictions
        else:
            raise NotFittedError(
                "This {} instance is not fitted yet! Call 'fit' with appropriate arguments before predict".format(
                    type(self).__name__
                )
            )

    def save(self, path: str):
        """Сериализует модель в pickle.

        :param path: str, путь до файла
        """
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(self, path: str):
        """Сериализует модель в pickle.

        :param path: str, путь до файла
        :return: Модель
        """
        with open(path, "rb") as f:
            model = pickle.load(f)
        return model

In [11]:
from tqdm.auto import tqdm
from typing import List
from sklearn.preprocessing import RobustScaler

In [12]:
def preprocess_data(train:pd.DataFrame, 
                    split_coulmn:str = 'price_type',
                    val_to_check:int = 0,
                    columns: List[str] = NUM_FEATURES,
                    strategy:str = 'drop',
                    sigma_tolerance:float = 3.0,
                   ) -> pd.DataFrame:
    
    train_to_check_orig = train[train[split_coulmn] == val_to_check]
    train_to_check = train_to_check_orig.copy()
    train_sample = train[train[split_coulmn] != val_to_check]
    
    stats = {}
    for col in tqdm(columns):
        mean = train_sample[col].mean()
        sigma = train_sample[col].std()
        min_val, max_val = mean - sigma_tolerance * sigma, mean + sigma_tolerance * sigma
        stats[col] = (min_val, max_val)
        train_to_check = train_to_check[(train_to_check[col] > min_val) & (train_to_check[col] < max_val)]
        
        
    train_result = pd.concat((train_to_check, train_sample)).reset_index(drop=True)
    
#     if strategy == 'impute':
#         train_missing = train_to_check_orig[~train_to_check_orig.id.isin(train_result.id.values)].reset_index(drop=True)
#         index_vecs = np.vstack(train_result[['lat', 'lng']].astype(np.float32).values)
#         index_find = np.vstack(train_missing[['lat', 'lng']].astype(np.float32).values)
#         index = faiss.IndexFlatL2(2)
#         index.add(index_vecs)
#         print("Creating index about 1 min")
#         D, I = index.search(index_find, 1) 
#         extract_df = train_result.loc[I.flatten()].reset_index(drop=True)
#         for col in tqdm(columns):
#             (min_val, max_val) = stats[col]
#             ind = train_missing[(train_missing[col] <= min_val) | (train_missing[col] >= max_val)].index
#             train_missing.loc[ind, col] = extract_df.loc[ind, col]
            
#         train_result = pd.concat((train_result, train_missing)).reset_index(drop=True)
    
    return train_result

In [13]:
from pathlib import Path

In [14]:
train_path = Path('../data/train.csv')
test_path = Path('../data/test.csv')
submission_path = Path('../data/test_submission.csv')

In [15]:
pd.options.display.max_rows = 100

In [16]:
train_df = pd.read_csv(train_path)
train_df['month'] = pd.to_datetime(train_df['date']).dt.month
train_df['month_num'] = pd.to_datetime(train_df['date']).dt.month

  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
train_df.shape

(279792, 79)

In [18]:
#%debug

In [19]:
#train_df_new = preprocess_data(train_df, strategy = 'drop')

In [20]:
train_df_new = train_df

In [21]:
train_df_new.shape

(279792, 79)

In [24]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=239)
metrics_arr = []
predicts_arr = []
for fold, (tr,va) in enumerate(kf.split(train_df_new, train_df_new['price_type'])):
    df_tr = prepare_categorical(train_df_new.loc[tr].reset_index(drop=True))
    df_vl = prepare_categorical(train_df_new.loc[va].reset_index(drop=True))
    
    X_offer_tr = df_tr[df_tr.price_type == PriceTypeEnum.OFFER_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
    y_offer_tr = df_tr[df_tr.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]
    
    X_offer_vl = df_vl[df_vl.price_type == PriceTypeEnum.OFFER_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
    y_offer_vl = df_vl[df_vl.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]
    
    
    X_manual_tr = df_tr[df_tr.price_type == PriceTypeEnum.MANUAL_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
    y_manual_tr = df_tr[df_tr.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]
    
    X_manual_vl = df_vl[df_vl.price_type == PriceTypeEnum.MANUAL_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
    y_manual_vl = df_vl[df_vl.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]
    X_manual_vl_id = df_vl[df_vl.price_type == PriceTypeEnum.MANUAL_PRICE]['id']
    
    
    model = CoeffBoostingModel(numerical_features=NUM_FEATURES, ohe_categorical_features=CATEGORICAL_OHE_FEATURES,
                          ste_categorical_features=CATEGORICAL_STE_FEATURES, model_params=MODEL_PARAMS,
                              model_params_coef=MODEL_PARAMS)
    
    model.fit(X_offer_tr, y_offer_tr, X_manual_tr, y_manual_tr)
    
    predictions_manual = model.predict(X_manual_vl)
    metrics = metrics_stat(y_manual_vl.values, predictions_manual)
    predicts_arr.append(pd.DataFrame((X_manual_vl_id, predictions_manual), columns=['id', f'predict_{fold}']))
    print(f'fold: {fold}, metrics {metrics}')
    metrics_arr.append(metrics)
    model.save(f"model_bst_nolg_{fold}.bin")

Fit lightgbm
Find corr coefficient
fold: 0, metrics {'mape': 0.3494856187040405, 'mdape': 0.2958462655232108, 'rmse': 97170.09041280294, 'r2': -0.08282244559999552, 'raif_metric': 3.132344825582455}
Fit lightgbm
Find corr coefficient
fold: 1, metrics {'mape': 0.3730217724226082, 'mdape': 0.2903982438464148, 'rmse': 98470.43642911204, 'r2': -0.0912134320004434, 'raif_metric': 2.904709734572059}
Fit lightgbm
Find corr coefficient
fold: 2, metrics {'mape': 0.34842560715310483, 'mdape': 0.2936839009494962, 'rmse': 74643.12162765785, 'r2': -0.13846377635590823, 'raif_metric': 3.1223763637120245}
Fit lightgbm
Find corr coefficient
fold: 3, metrics {'mape': 0.36298459885049733, 'mdape': 0.2919371162562766, 'rmse': 98281.43573441143, 'r2': -0.10044495850748048, 'raif_metric': 3.2045484738660845}
Fit lightgbm
Find corr coefficient
fold: 4, metrics {'mape': 0.3528777587917625, 'mdape': 0.30411875953473044, 'rmse': 97860.6103549137, 'r2': -0.10375538686135588, 'raif_metric': 3.0997900644488277}


In [25]:
np.mean([e['raif_metric'] for e in metrics_arr])

3.0927538924362903

In [25]:
# Local
# 1.320941136882037
#1.9456598148702269
#1.950442347124326
#1.9597450099363427

In [19]:
# LB
#1.8021098072169865

In [28]:
test_df = pd.read_csv(test_path)
test_df['month'] = pd.to_datetime(test_df['date']).dt.month
test_df['month_num'] = pd.to_datetime(test_df['date']).dt.month
pred_df = prepare_categorical(test_df)[NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]

In [29]:
pred = 0

for ifold in range(FOLDS):
    model = CoeffBoostingModel(numerical_features=NUM_FEATURES, ohe_categorical_features=CATEGORICAL_OHE_FEATURES,
                          ste_categorical_features=CATEGORICAL_STE_FEATURES, model_params=MODEL_PARAMS)
    model = model.load(f"model_bst_{fold}.bin")
    y_score = model.predict(pred_df)
    pred += y_score / FOLDS
test_sub = pd.read_csv(test_path)[['id']]
test_sub[TARGET] = pred
test_sub.to_csv('boots_folds.csv', index=False)

In [30]:
test_sub['per_square_meter_price'].max()

350709.5313423764

In [31]:
test_sub['per_square_meter_price'].min()

22168.363246630317