In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw competition data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; both frames use it so that a column such as
# `floor` gets the same dtype in train and test.
train = pd.read_csv('../data/train.csv', low_memory=False)
test = pd.read_csv('../data/test.csv', low_memory=False)

In [3]:
# Show the column names of the training frame.
train.columns

Index(['city', 'floor', 'id', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_name', 'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in

In [4]:
# Print every column that is present in the test frame and has at least one
# missing value in the training frame.
for column in test.columns:
    if train[column].isnull().any():
        print(column)

floor
osm_city_nearest_population
reform_house_population_1000
reform_house_population_500
reform_mean_floor_count_1000
reform_mean_floor_count_500
reform_mean_year_building_1000
reform_mean_year_building_500
street


In [5]:
import pandas as pd
UNKNOWN_VALUE = 'missing'

def prepare_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values of the categorical columns with a placeholder label.

    The columns 'region', 'city', 'street', 'realty_type' and 'floor' must be
    present in ``df``; every NaN in them is replaced by ``UNKNOWN_VALUE``.
    The input frame is not modified.

    :param df: dataframe, the sample to process
    :return: dataframe, a copy of ``df`` with the categorical gaps filled
    """
    filled = df.copy()
    for column in ('region', 'city', 'street', 'realty_type', 'floor'):
        filled[column] = filled[column].fillna(UNKNOWN_VALUE)
    return filled
# Replace both frames with copies whose categorical gaps are filled.
train, test = (prepare_categorical(frame) for frame in (train, test))

In [6]:
# Name of the regression target column.
TARGET = 'per_square_meter_price'
# features (or feature sets) to which smoothed target encoding is applied
# NOTE(review): in this notebook these columns are label-encoded and passed to
# TabNet as categorical indices; the wording above may be inherited from
# another pipeline - confirm.
CATEGORICAL_FEATURES = ['region', 'city', 'realty_type', 'street','floor']

# numerical features
NUM_FEATURES = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square']

In [7]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Relative deviations within +/-THRESHOLD are not penalised.
THRESHOLD = 0.15
# Extra multiplier applied to the penalty for under-prediction.
NEGATIVE_WEIGHT = 1.1


def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Custom hackathon metric for a single prediction.

    The relative deviation ``(y_pred - y_true) / max(1e-8, y_true)`` is mapped
    to a penalty: zero inside ``[-THRESHOLD, THRESHOLD]``, quadratic growth
    outside it, capped at 9 for over-prediction and at ``9 * NEGATIVE_WEIGHT``
    for under-prediction once the deviation reaches ``4 * THRESHOLD``.

    :param y_true: float, actual price
    :param y_pred: float, predicted price
    :return: float, metric value
    """
    # The denominator is floored at 1e-8 to avoid dividing by zero.
    relative_error = (y_pred - y_true) / np.maximum(1e-8, y_true)
    saturation = 4 * THRESHOLD

    if np.abs(relative_error) <= THRESHOLD:
        return 0
    if relative_error <= -saturation:
        return 9 * NEGATIVE_WEIGHT
    if relative_error < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((relative_error / THRESHOLD) + 1) ** 2
    if relative_error < saturation:
        return ((relative_error / THRESHOLD) - 1) ** 2
    return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """
    Mean of the per-sample hackathon metric over all predictions.

    Inputs are flattened to plain NumPy arrays first, so the values are paired
    by position. Indexing ``y_true[n]`` directly would be a label lookup on a
    pandas Series with a non-default index.

    :param y_true: array-like of actual prices
    :param y_pred: array-like of predicted prices, same number of elements
    :return: float, mean metric value (NaN for empty input)
    :raises ValueError: if the inputs do not hold the same number of elements
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_true and y_pred must have the same number of elements, '
            'got {} and {}'.format(actual.size, predicted.size)
        )
    return np.mean([
        deviation_metric_one_sample(true_value, pred_value)
        for true_value, pred_value in zip(actual, predicted)
    ])

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    """
    Median of the absolute errors divided by the actual values.

    :param y_true: array of actual values (used as the denominator as-is)
    :param y_pred: array of predicted values
    :return: float, median of ``|y_pred - y_true| / y_true``
    """
    absolute_error = np.abs(y_pred - y_true)
    return np.median(absolute_error / y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    """
    Compute the full set of regression metrics for a batch of predictions.

    :param y_true: array of actual values
    :param y_pred: array of predicted values
    :return: dict with keys 'mape', 'mdape', 'rmse', 'r2' and 'raif_metric'
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    # RMSE as the square root of the MSE. The `squared=False` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so this form works on both old and new releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

In [8]:
# Model inputs: drop the identifier, the target, the date and the nearest-city
# name from the training frame.
train_df = train.drop(columns=['id', 'per_square_meter_price', 'date', 'osm_city_nearest_name'])
# Drop the same non-feature columns from the test frame, then put its columns
# in the training order.
test_df = test.drop(columns=['id', 'date', 'osm_city_nearest_name'])
test_df = test_df[list(train_df.columns)]
# Sanity check: both frames expose the same columns in the same order.
(train_df.columns.to_numpy() == test_df.columns.to_numpy()).all()

True

In [9]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column in place and record how many
# distinct codes each one has (passed to TabNet as `cat_dims`).
dim_size_ = []
for feature in CATEGORICAL_FEATURES:
    encoder = LabelEncoder()
    # Fit on train and test together so every label in either frame gets a code.
    all_labels = train_df[feature].values.tolist() + test_df[feature].values.tolist()
    encoder.fit(all_labels)
    train_df[feature] = encoder.transform(train_df[feature])
    test_df[feature] = encoder.transform(test_df[feature])
    all_codes = np.concatenate([train_df[feature].values, test_df[feature].values])
    dim_size_.append(len(np.unique(all_codes)))

In [10]:
# Positions of the categorical columns inside the feature matrix, in the same
# order as CATEGORICAL_FEATURES.
feature_order = train_df.columns.tolist()
cat_idxs = list(map(feature_order.index, CATEGORICAL_FEATURES))
cat_idxs

[68, 0, 71, 70, 1]

In [11]:
# Number of distinct codes per categorical feature, in CATEGORICAL_FEATURES order.
dim_size_

[49, 4949, 3, 28964, 198]

In [12]:
# Categorical feature names, for reading cat_idxs and dim_size_ side by side.
CATEGORICAL_FEATURES

['region', 'city', 'realty_type', 'street', 'floor']

In [15]:
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

# Constructor arguments used for every TabNetRegressor in this notebook,
# both when training and when re-loading the saved fold models.
tabnet_params = dict(
    # Column positions and cardinalities of the categorical features.
    cat_idxs=cat_idxs,
    cat_dims=dim_size_,
    # One embedding width per categorical feature, in the same order as
    # cat_idxs / cat_dims (region, city, realty_type, street, floor).
    cat_emb_dim=[3,3,1,8,3],
    n_d = 8,
    n_a = 8,
    n_steps = 1,
    gamma = 4,
    n_independent = 2,
    n_shared = 2,
    lambda_sparse = 0,
    optimizer_fn = AdamW,
    optimizer_params = dict(lr = (1e-2), weight_decay=0.01),
    mask_type = "entmax",
    # `last_epoch=-1` and `verbose=False` were dropped: both are the scheduler
    # defaults, and `verbose` is deprecated in recent PyTorch releases, so
    # passing it explicitly only risks a warning or a TypeError.
    scheduler_params = dict(T_0=120, T_mult=1, eta_min=1e-4),
    scheduler_fn = CosineAnnealingWarmRestarts,
    seed = 239,
    verbose = 10
)


In [15]:

# 5-fold shuffled split; the fixed random_state keeps the folds the same between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=239)

class DevMetric(Metric):
    """
    Evaluation metric for TabNet: the hackathon deviation metric computed on
    the original price scale.

    Targets and predictions arrive on the log1p scale used for training.
    Predictions are clipped to [5, 15] before being mapped back with expm1.
    """

    def __init__(self):
        # Name shown in the training log; lower values are better.
        self._name = "DevMetric"
        self._maximize = False

    def __call__(self, y_true, y_score):
        """
        :param y_true: array of log1p targets
        :param y_score: array of log1p predictions
        :return: float, mean deviation metric on the price scale
        """
        actual_price = np.expm1(y_true.flatten())
        predicted_price = np.expm1(np.clip(y_score.flatten(), 5, 15))
        # Call deviation_metric directly instead of building the whole
        # metrics_stat dict: same value, without computing four unused metrics
        # on every evaluation.
        return deviation_metric(actual_price, predicted_price)
    

# Train one TabNet model per fold on the log1p target and save it to disk.
for ifold, (tr, va) in enumerate(kf.split(train_df)):
    # kf.split yields positional indices, so rows are selected with .iloc;
    # this stays correct even if the frames do not carry a default RangeIndex.
    fold_tr = train_df.iloc[tr]
    fold_va = train_df.iloc[va]
    df_tr = fold_tr.fillna(0).values
    df_va = fold_va.fillna(0).values
    # Targets are kept as 2-D column arrays.
    tr_y = np.log1p(train[[TARGET]].iloc[tr].values)
    va_y = np.log1p(train[[TARGET]].iloc[va].values)

    # Evaluation uses only rows with price_type == 1.
    # NOTE(review): presumably the subset the competition is scored on - confirm.
    tr_is_type_1 = fold_tr.price_type.values == 1
    va_is_type_1 = fold_va.price_type.values == 1

    va_y = va_y[va_is_type_1]
    df_va = df_va[va_is_type_1]

    c_1_tr_y = tr_y[tr_is_type_1]
    c_1_df_tr = df_tr[tr_is_type_1]

    clf = TabNetRegressor(**tabnet_params)
    clf.fit(
      df_tr, tr_y,
      # val_0 = price_type-1 part of the training fold, val_1 = validation fold.
      # The recorded log shows early stopping tracking val_1 (the last entry).
      eval_set=[(c_1_df_tr, c_1_tr_y), (df_va, va_y)],
      max_epochs = 120,
      patience = 50,
      batch_size = 1024,
      virtual_batch_size = 1024,
      num_workers = 4,
      drop_last = False,
      eval_metric=[DevMetric]
    )
    # save_model appends '.zip', so the file on disk is model_<fold>.pth.zip.
    clf.save_model('model_'+str(ifold)+'.pth')

Device used : cuda
epoch 0  | loss: 14.11548| val_0_DevMetric: 3.21479 | val_1_DevMetric: 3.43257 |  0:00:04s
epoch 10 | loss: 0.42043 | val_0_DevMetric: 2.18471 | val_1_DevMetric: 2.4995  |  0:00:46s
epoch 20 | loss: 0.36017 | val_0_DevMetric: 1.47189 | val_1_DevMetric: 1.95561 |  0:01:28s
epoch 30 | loss: 0.32789 | val_0_DevMetric: 1.42917 | val_1_DevMetric: 2.05211 |  0:02:09s
epoch 40 | loss: 0.30614 | val_0_DevMetric: 1.402   | val_1_DevMetric: 2.08661 |  0:02:51s
epoch 50 | loss: 0.29025 | val_0_DevMetric: 1.49652 | val_1_DevMetric: 2.27925 |  0:03:34s
epoch 60 | loss: 0.27731 | val_0_DevMetric: 1.23025 | val_1_DevMetric: 1.88761 |  0:04:16s

Early stopping occurred at epoch 67 with best_epoch = 17 and best_val_1_DevMetric = 1.80671
Best weights from best epoch are automatically used!
Successfully saved model at model_0.pth.zip
Device used : cuda
epoch 0  | loss: 14.08821| val_0_DevMetric: 3.16155 | val_1_DevMetric: 3.57503 |  0:00:04s
epoch 10 | loss: 0.42021 | val_0_DevMetric: 

In [16]:
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

# Average the fold models' predictions on the test set and write the submission.
n_folds = 5  # must match the number of models saved by the training loop
# Loop-invariant: build the imputed test matrix once instead of once per fold.
test_matrix = test_df.fillna(0).values
pred = 0
for ifold in range(n_folds):
    clf = TabNetRegressor(**tabnet_params)
    clf.load_model('model_'+str(ifold)+'.pth.zip')
    y_score = clf.predict(test_matrix)
    # Same clipping of the log1p prediction as in the validation metric.
    y_score = np.clip(y_score.flatten(),5,15)
    # Equal-weight average on the price scale; 1 / n_folds keeps the weight
    # tied to the fold count (1 / 5 is the same float as the literal 0.2).
    pred += np.expm1(y_score) * (1 / n_folds)
# Only the id column is needed for the submission, so parse just that column.
test_sub = pd.read_csv('../data/test.csv', usecols=['id'])
test_sub[TARGET] = pred
test_sub.to_csv('tabnet_base.csv', index=False)

Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda


In [17]:
# Preview the first rows of the submission frame.
test_sub.head()

Unnamed: 0,id,per_square_meter_price
0,COL_289284,37242.601562
1,COL_289305,59349.453125
2,COL_289318,31521.742188
3,COL_289354,73234.507812
4,COL_289399,47445.417969
