In [13]:
import typing
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autowoe import AutoWoE
from scipy.stats import rankdata
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import KBinsDiscretizer
from tqdm.notebook import tqdm

# Relative deviation tolerance: predictions whose relative error is within
# +/-THRESHOLD of the true value are scored 0 by the deviation metric below.
THRESHOLD = 0.15
# Multiplier applied to the penalty for negative deviations (under-predictions).
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Custom hackathon metric for a single observation.

    The relative error ``(y_pred - y_true) / max(1e-8, y_true)`` is mapped to a
    penalty: zero inside the +/-THRESHOLD band, quadratic growth outside it, and
    a cap of 9 once the error reaches 4 * THRESHOLD. Penalties for negative
    errors are multiplied by NEGATIVE_WEIGHT.

    :param y_true: float, actual price
    :param y_pred: float, predicted price
    :return: float, metric value
    """
    rel_error = (y_pred - y_true) / np.maximum(1e-8, y_true)
    cap_at = 4 * THRESHOLD

    # Inside the tolerance band: no penalty.
    if np.abs(rel_error) <= THRESHOLD:
        return 0
    # Strong under-prediction: capped, weighted penalty.
    if rel_error <= -cap_at:
        return 9 * NEGATIVE_WEIGHT
    # Moderate under-prediction: weighted quadratic penalty.
    if rel_error < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((rel_error / THRESHOLD) + 1) ** 2
    # Moderate over-prediction: quadratic penalty.
    if rel_error < cap_at:
        return ((rel_error / THRESHOLD) - 1) ** 2
    # Strong over-prediction (or a non-comparable value): capped penalty.
    return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """
    Mean of ``deviation_metric_one_sample`` over paired observations.

    Inputs are converted to NumPy arrays first so that elements are paired by
    position. Indexing a pandas Series with ``series[n]`` is label-based and
    fails (or pairs the wrong rows) when the index is not 0..n-1, e.g. for a
    validation fold sliced out of a larger frame.

    :param y_true: array-like, actual prices
    :param y_pred: array-like, predicted prices (at least as long as ``y_true``)
    :return: float, mean metric value
    """
    true_values = np.asarray(y_true)
    pred_values = np.asarray(y_pred)
    per_sample = [
        deviation_metric_one_sample(true_values[n], pred_values[n])
        for n in range(len(true_values))
    ]
    return np.array(per_sample).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    """
    Median of the absolute percentage errors.

    The denominator is ``max(|y_true|, eps)``, so negative targets do not
    produce negative "absolute" errors and zero targets do not produce
    inf/NaN.

    :param y_true: array-like, actual values
    :param y_pred: array-like, predicted values
    :return: float, median absolute percentage error (as a fraction, not %)
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    eps = np.finfo(np.float64).eps
    return np.median(np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps))

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    """
    Compute the regression quality metrics used in this notebook.

    RMSE is computed as ``sqrt(MSE)`` rather than through the ``squared=False``
    keyword of ``mean_squared_error``, which is deprecated in recent
    scikit-learn releases.

    :param y_true: array-like, actual values
    :param y_pred: array-like, predicted values
    :return: dict with keys 'mape', 'mdape', 'rmse', 'r2', 'raif_metric'
    """
    return {
        'mape': mean_absolute_percentage_error(y_true, y_pred),
        'mdape': median_absolute_percentage_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred),
        'raif_metric': deviation_metric(y_true, y_pred),
    }

def deviation_metric_vec(y_true: np.array, y_pred: np.array) -> float:
    """
    Vectorised version of the custom deviation metric (mean over all samples).

    Uses the same piecewise penalty as ``deviation_metric_one_sample``. The
    penalty array starts at the cap value 9 via ``np.full`` instead of
    ``deviation * 0.0 + 9``, so a non-finite deviation (inf/NaN) gets the cap
    instead of turning the whole mean into NaN, which matches the fall-through
    branch of the scalar implementation. Inputs are converted with
    ``np.asarray`` so plain lists and pandas objects are accepted and paired
    by position.

    :param y_true: array-like, actual prices
    :param y_pred: array-like, predicted prices
    :return: float, mean metric value
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)

    # Default: capped penalty for strong over-prediction.
    metr = np.full(deviation.shape, 9.0)

    # Inside the tolerance band: no penalty.
    metr[np.abs(deviation) <= THRESHOLD] = 0.0

    # Strong under-prediction: capped, weighted penalty.
    metr[deviation <= -4 * THRESHOLD] = 9 * NEGATIVE_WEIGHT

    # Moderate under-prediction: weighted quadratic penalty.
    under = (-4 * THRESHOLD < deviation) & (deviation < -THRESHOLD)
    metr[under] = NEGATIVE_WEIGHT * ((deviation[under] / THRESHOLD) + 1) ** 2

    # Moderate over-prediction: quadratic penalty.
    over = (THRESHOLD < deviation) & (deviation < 4 * THRESHOLD)
    metr[over] = ((deviation[over] / THRESHOLD) - 1) ** 2

    return metr.mean()

In [2]:
# Name of the regression target column in the training data.
TARGET = 'per_square_meter_price'

# Columns handled as categorical; a later cell casts them to strings.
cat_features = ['region', 'street', 'city', 'realty_type']
# Full list of model input columns: the categorical ones first, then the rest.
# Later cells append engineered features to this list.
features = cat_features + ['floor', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
      'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500','total_square']
# Name of the temporary binary target column that a later cell writes into
# `train` for each price bin.
TARGET2 = 'target'

In [3]:
# Load the training and test tables from the local data directory.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Out-of-fold KNN price feature: for every row, the mean target of its 5
# nearest neighbours by (lat, lng). Train rows get out-of-fold predictions,
# test rows get the average of the per-fold models.
f = 'score_knn_5'
coord_cols = ['lat', 'lng']

# Work on plain arrays so fold indices (positions) are never interpreted as
# index labels; `.loc[train_idx]` is only correct for a default RangeIndex.
train_coords = train[coord_cols].values
test_coords = test[coord_cols].values
train_target_values = train[TARGET].values

oof_pred = np.full(len(train), np.nan, dtype=np.float64)
test_pred = np.zeros(len(test), dtype=np.float64)
kfold = StratifiedKFold(n_splits=5 , shuffle=True, random_state=42)
clfs = []
for train_idx, valid_idx in tqdm(kfold.split(train[TARGET], train['city']), total=kfold.n_splits):
    train_data, valid_data = train_coords[train_idx], train_coords[valid_idx]
    train_target = train_target_values[train_idx]
    clf = KNeighborsRegressor(5)
    clf.fit(train_data, train_target)
    oof_pred[valid_idx] = clf.predict(valid_data)
    test_pred += clf.predict(test_coords) / kfold.n_splits
    clfs.append(clf)
train[f] = oof_pred
test[f] = test_pred
# Keep the cell idempotent: re-running it must not add the feature twice.
if f not in features:
    features.append(f)

  0%|          | 0/5 [00:00<?, ?it/s]



In [5]:
# Encode streets by frequency rank (1 = most frequent). Streets seen 10 times
# or fewer, and unseen streets, fall back to code 0 through the defaultdict.
street_counts = train['street'].value_counts()
frequent_street_ids = {
    street: rank
    for rank, (street, count) in enumerate(street_counts.items(), start=1)
    if count > 10
}
street_mapper = defaultdict(int, frequent_street_ids)

In [6]:
# Replace raw street names with their integer codes in both frames.
for frame in (train, test):
    frame['street'] = frame['street'].map(street_mapper)

In [7]:
def parse_floor(x):
    """
    Convert a raw ``floor`` value into an integer code.

    Encoding:
      * ``-10``: missing value (None / NaN / pd.NA);
      * ``100 + k``: several floors listed, where ``k`` is the number of
        ``,`` and ``+`` separators in the text;
      * the floor number itself for a single parseable value; basement and
        ground-floor keywords ('подвал', 'цоколь') map to ``-1``;
      * ``-11``: text that cannot be parsed as a number.

    Keyword matching is case-insensitive, so 'Подвал' and 'подвал' get the
    same code.

    :param x: raw floor value (str, number or missing)
    :return: int, encoded floor
    """
    if not isinstance(x, str) and pd.isna(x):
        return -10

    text = str(x).strip().lower()
    for token, replacement in (('этаж', ''), ('-й', ''), ('цоколь', '-1'), ('подвал', '-1')):
        text = text.replace(token, replacement)

    separators = text.count(',') + text.count('+')
    if separators > 0:
        return 100 + separators

    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not a number (or 'nan'); OverflowError: 'inf'.
        return -11

In [8]:
# Turn the raw floor column into integer codes in both frames.
for frame in (train, test):
    frame['floor'] = frame['floor'].map(parse_floor)

In [9]:
# Cast categorical columns to strings, marking missing values as '__NAN__'.
# The missing-value mask is taken BEFORE the cast: `astype(str)` turns NaN into
# the literal string 'nan', after which `fillna` has nothing left to fill.
for frame in (train, test):
    missing_mask = frame[cat_features].isna()
    frame[cat_features] = frame[cat_features].astype(str).mask(missing_mask, '__NAN__')

In [31]:
# Fit a 10-bin quantile discretizer on the training target (as a column vector).
target_column = train[TARGET].to_numpy().reshape(-1, 1)
est = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=10)
est.fit(target_column)

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [32]:
# Ordinal bin index of every training target value, as a 1-D array.
x = est.transform(train[[TARGET]].to_numpy()).ravel()

In [33]:
def get_oof_and_test_pred(tr, real_te, n_splits=5):
    """
    Fit AutoWoE on stratified folds and return out-of-fold and test predictions.

    For each fold a fresh AutoWoE model is fitted on the training part of
    ``tr``; its ``predict_proba`` output fills the out-of-fold vector for the
    validation part and is averaged over folds for ``real_te``. Per-fold and
    overall ROC AUC are printed.

    The overall AUC is computed against the target of ``tr`` itself rather
    than a global frame, so the function works for any frame passed in.

    :param tr: DataFrame with the feature columns and the binary ``TARGET2`` column
    :param real_te: DataFrame with the feature columns to score
    :param n_splits: int, number of outer cross-validation folds (default 5)
    :return: tuple ``(oof_preds_woe, real_test_preds_woe)`` of 1-D arrays with
             lengths ``len(tr)`` and ``len(real_te)``
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    oof_preds_woe = np.zeros(len(tr))
    real_test_preds_woe = np.zeros(len(real_te))

    y = tr[TARGET2].values

    for fold, (train_idx, val_idx) in tqdm(enumerate(skf.split(y, y)), total=n_splits):

        X_tr, X_val = tr.iloc[train_idx, :], tr.iloc[val_idx, :]

        auto_woe = AutoWoE(monotonic=False,
                           max_bin_count=10,
                           vif_th=10.,
                           imp_th=0,
                           th_const=32,
                           force_single_split=True,
                           min_bin_size = 0.005,
                           oof_woe=True,
                           n_folds=5,
                           n_jobs=8,
                           regularized_refit=True,
                           verbose=0)

        auto_woe.fit(X_tr, target_name=TARGET2)

        val_pred = auto_woe.predict_proba(X_val)
        print("FOLD {}, AUC_SCORE = {:.5f}".format(fold, roc_auc_score(X_val[TARGET2], val_pred)))

        oof_preds_woe[val_idx] = val_pred
        # Average the test predictions over the outer folds.
        real_test_preds_woe += auto_woe.predict_proba(real_te) / n_splits

    print("SCORE AUC_TRAIN = {:.5f}".format(roc_auc_score(y, oof_preds_woe)))

    return oof_preds_woe, real_test_preds_woe

In [34]:
# One-vs-rest pass over the target bins: for each bin, build a binary target
# "row falls into bin i", get AutoWoE out-of-fold / test probabilities and
# store them (plus their ranks) as new columns.
# The bin count comes from the fitted discretizer so it cannot drift from it.
n_target_bins = int(est.n_bins_[0])
for i in range(n_target_bins):
    train[TARGET2] = (x == i).astype(int)
    oof_preds_woe, real_test_preds_woe = get_oof_and_test_pred(train[[*features, TARGET2]], test[[*features]])
    train[f'oof_woe_{i}'] = oof_preds_woe
    test[f'oof_woe_{i}'] = real_test_preds_woe

    # Ranks are divided by the sample count so the train and test columns share
    # the same (0, 1] scale; raw ranks run 1..len(train) vs 1..len(test).
    train[f'rank_oof_woe_{i}'] = rankdata(oof_preds_woe) / len(oof_preds_woe)
    test[f'rank_oof_woe_{i}'] = rankdata(real_test_preds_woe) / len(real_test_preds_woe)

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17895, number of negative: 161171
[LightGBM] [Info] Total Bins 15281
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099935 -> initscore=-2.197945
[LightGBM] [Info] Start training from score -2.197945
FOLD 0, AUC_SCORE = 0.88362
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17842, number of negative: 161224
[LightGBM] [Info] Total Bins 15261
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099639 -> initscore=-2.201239
[LightGBM] [Info] Start training from score -2.201239
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17830, number of negative: 161236
[LightGBM] [Info] Total Bins 15267
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099572 -> initscore=-2.201987
[LightGBM] [Info] Start training from score -2.201987
FOLD 0, AUC_SCORE = 0.82321
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17861, number of negative: 161205
[LightGBM] [Info] Total Bins 15304
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099745 -> initscore=-2.200057
[LightGBM] [Info] Start training from score -2.200057
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17998, number of negative: 161068
[LightGBM] [Info] Total Bins 15249
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100510 -> initscore=-2.191566
[LightGBM] [Info] Start training from score -2.191566
FOLD 0, AUC_SCORE = 0.78564
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17954, number of negative: 161112
[LightGBM] [Info] Total Bins 15274
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100265 -> initscore=-2.194287
[LightGBM] [Info] Start training from score -2.194287
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17901, number of negative: 161165
[LightGBM] [Info] Total Bins 15248
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099969 -> initscore=-2.197572
[LightGBM] [Info] Start training from score -2.197572
FOLD 0, AUC_SCORE = 0.76841
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17921, number of negative: 161145
[LightGBM] [Info] Total Bins 15273
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100080 -> initscore=-2.196331
[LightGBM] [Info] Start training from score -2.196331
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 18049, number of negative: 161017
[LightGBM] [Info] Total Bins 15241
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100795 -> initscore=-2.188420
[LightGBM] [Info] Start training from score -2.188420
FOLD 0, AUC_SCORE = 0.77285
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17988, number of negative: 161078
[LightGBM] [Info] Total Bins 15271
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100455 -> initscore=-2.192184
[LightGBM] [Info] Start training from score -2.192184
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17909, number of negative: 161157
[LightGBM] [Info] Total Bins 15275
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100013 -> initscore=-2.197076
[LightGBM] [Info] Start training from score -2.197076
FOLD 0, AUC_SCORE = 0.79143
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17813, number of negative: 161253
[LightGBM] [Info] Total Bins 15272
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099477 -> initscore=-2.203046
[LightGBM] [Info] Start training from score -2.203046
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17814, number of negative: 161252
[LightGBM] [Info] Total Bins 15240
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099483 -> initscore=-2.202984
[LightGBM] [Info] Start training from score -2.202984
FOLD 0, AUC_SCORE = 0.81005
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17929, number of negative: 161137
[LightGBM] [Info] Total Bins 15301
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100125 -> initscore=-2.195835
[LightGBM] [Info] Start training from score -2.195835
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17850, number of negative: 161216
[LightGBM] [Info] Total Bins 15279
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099684 -> initscore=-2.200742
[LightGBM] [Info] Start training from score -2.200742
FOLD 0, AUC_SCORE = 0.85257
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17829, number of negative: 161237
[LightGBM] [Info] Total Bins 15260
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099567 -> initscore=-2.202049
[LightGBM] [Info] Start training from score -2.202049
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17891, number of negative: 161175
[LightGBM] [Info] Total Bins 15266
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099913 -> initscore=-2.198193
[LightGBM] [Info] Start training from score -2.198193
FOLD 0, AUC_SCORE = 0.91212
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17943, number of negative: 161123
[LightGBM] [Info] Total Bins 15284
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100203 -> initscore=-2.194968
[LightGBM] [Info] Start training from score -2.194968
FOLD 1, 

  0%|          | 0/5 [00:00<?, ?it/s]

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17882, number of negative: 161184
[LightGBM] [Info] Total Bins 15266
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099863 -> initscore=-2.198752
[LightGBM] [Info] Start training from score -2.198752
FOLD 0, AUC_SCORE = 0.97044
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 17868, number of negative: 161198
[LightGBM] [Info] Total Bins 15283
[LightGBM] [Info] Number of data: 179066, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099784 -> initscore=-2.199622
[LightGBM] [Info] Start training from score -2.199622
FOLD 1, 

In [42]:
# Names of the ten per-bin AutoWoE probability columns, as a 1-D string array.
woe_features = np.array([f'oof_woe_{bin_id}' for bin_id in range(10)])

In [43]:
# Persist the WoE probability columns of both frames without the index.
for frame, out_path in ((train, 'data/train_woe_features.csv'),
                        (test, 'data/test_woe_features.csv')):
    frame[woe_features].to_csv(out_path, index=False)