In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('../data/train.csv', low_memory=False)
test = pd.read_csv('../data/test.csv')

In [3]:
train.columns

Index(['city', 'floor', 'id', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_name', 'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in

In [4]:
for c in test.columns:
    if train.loc[train[c].isnull()].shape[0] > 0:
        print(c)

floor
osm_city_nearest_population
reform_house_population_1000
reform_house_population_500
reform_mean_floor_count_1000
reform_mean_floor_count_500
reform_mean_year_building_1000
reform_mean_year_building_500
street


In [5]:
train['floor'].value_counts()

1.0               72871
2.0                8102
-1.0               8065
3.0                3682
4.0                2284
                  ...  
1-3                   1
мансарда (4эт)        1
1, 2.                 1
9                     1
3, мансарда           1
Name: floor, Length: 168, dtype: int64

In [6]:
test['floor'].value_counts()

1.0                                   591
1                                     391
-1.0                                  102
2.0                                    99
2                                      66
                                     ... 
подвал, 1, 2, 3                         1
1,2,3, антресоль, технический этаж      1
3, 4                                    1
4, 5                                    1
подвал,1                                1
Name: floor, Length: 74, dtype: int64

In [7]:
import json
with open('../data/russian-cities.json', 'r') as json_file:
    cities_data = json.load(json_file)
population_dict = {city_dict['name']: city_dict['population'] for city_dict in cities_data}
train['population'] = train['city'].map(population_dict)
test['population'] = test['city'].map(population_dict)

In [8]:
import re
pattern = r'[a-zа-я\s]'

train['floor_count_comma'] = train['floor'].str.count(',').fillna(0)
test['floor_count_comma'] = test['floor'].str.count(',').fillna(0)

train['floor']=train['floor'].fillna('missing').str.lower()
test['floor']=test['floor'].fillna('missing').str.lower()
train['floor']=train['floor'].str.replace(' ','')
test['floor']=test['floor'].str.replace(' ','')
train['floor']=train['floor'].str.replace('.0','', regex=False)
test['floor']=test['floor'].str.replace('.0','', regex=False)

train['floor_under'] = (train['floor'].str.count('подвал')  > 0).astype(int)
train['floor_upper'] = (train['floor'].str.count('мансарда') > 0).astype(int)
train['floor_ground'] = (train['floor'].str.count('цоколь') > 0).astype(int)
test['floor_under'] = (test['floor'].str.count('подвал')  > 0).astype(int)
test['floor_upper'] = (test['floor'].str.count('мансарда') > 0).astype(int)
test['floor_ground'] = (test['floor'].str.count('цоколь') > 0).astype(int)

def min_floor(s):
    x = 0
    try:
        x = int(s)
    except:
        pass
    if x == 0:
        try:
            ss = re.sub(pattern, '', s)
            if ss[0] == ',':
                ss = ss[1:]
            x = int(ss.split(',')[0])
        except:
            pass
    return x

def max_floor(s):
    x = 0
    try:
        x = int(s)
    except:
        pass
    if x == 0:
        try:
            ss = re.sub(pattern, '', s)
            if ss[-1] == ',':
                ss = ss[1:]
            x = int(ss.split(',')[-1])
        except:
            pass
    return x

train['num_min_floor'] = train['floor'].apply(min_floor).fillna(0)
test['num_min_floor'] = test['floor'].apply(min_floor).fillna(0)
train['num_max_floor'] = train['floor'].apply(max_floor).fillna(0)
test['num_max_floor'] = test['floor'].apply(max_floor).fillna(0)

In [9]:
train[['floor','floor_count_comma', 'num_min_floor', 'num_max_floor', 'population']].sample(10)

Unnamed: 0,floor,floor_count_comma,num_min_floor,num_max_floor,population
61650,1,0.0,1,1,12655050.0
99284,missing,0.0,0,0,5384342.0
166298,missing,0.0,0,0,5384342.0
184479,1,0.0,1,1,
14297,1,0.0,1,1,343285.0
45453,-1,0.0,-1,-1,16395.0
112854,missing,0.0,0,0,88113.0
92608,-1,0.0,-1,-1,12655050.0
162522,missing,0.0,0,0,306703.0
14009,missing,0.0,0,0,1244254.0


In [10]:
TARGET = 'per_square_meter_price'
# признаки (или набор признаков), для которых применяем smoothed target encoding
CATEGORICAL_FEATURES = ['region', 'city', 'realty_type', 'street','floor','osm_city_nearest_name']

# численные признаки
NUM_FEATURES = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square']

In [11]:
import pandas as pd
UNKNOWN_VALUE = 'missing'

def prepare_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Заполняет пропущенные категориальные переменные
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = df.copy()
    fillna_cols = list(CATEGORICAL_FEATURES)
    df_new[fillna_cols] = df_new[fillna_cols].fillna(UNKNOWN_VALUE)
    return df_new
train = prepare_categorical(train)
test = prepare_categorical(test)

# в трейне те значения, которые встречаются только один раз заменяем на missing
for c in CATEGORICAL_FEATURES:
    q = train[c].value_counts()
    single = q.loc[q.values==1].index.tolist()
    changed_in_train = train.loc[train[c].isin(single),c].shape[0]
    changed_in_test = test.loc[train[c].isin(single),c].shape[0]
    train.loc[train[c].isin(single),c] = UNKNOWN_VALUE
    test.loc[test[c].isin(single),c] = UNKNOWN_VALUE
    print('category',c,'changed in train',changed_in_train,'changed in test',changed_in_test)

#убираем из теста те значения, которых нет в трейн
for c in CATEGORICAL_FEATURES:
    train_uniq = train[c].unique()
    count_changed = 0
    for v in test[c].unique():
        if v not in train_uniq:
            count_changed += test.loc[test[c] == v, c].shape[0]
            test.loc[test[c] == v, c] = UNKNOWN_VALUE
        if count_changed > 0:
            print('changed in category ', c, 'value', v, count_changed, 'times')

category region changed in train 0 changed in test 0
category city changed in train 1842 changed in test 18
category realty_type changed in train 0 changed in test 0
category street changed in train 14910 changed in test 343
category floor changed in train 35 changed in test 0
category osm_city_nearest_name changed in train 0 changed in test 0
changed in category  city value Ромашково 1 times
changed in category  city value Тобольск 1 times
changed in category  city value Курчатов 1 times
changed in category  city value Боброво 1 times
changed in category  city value Дрожжино 1 times
changed in category  city value Бердск 1 times
changed in category  city value Пенза 1 times
changed in category  city value Усинск 1 times
changed in category  city value Волжский 1 times
changed in category  city value Белокуриха 1 times
changed in category  city value Коммунар 1 times
changed in category  city value Нижнеудинск 1 times
changed in category  city value Кострома 1 times
changed in category

changed in category  street value S30079 23 times
changed in category  street value S10659 23 times
changed in category  street value S14813 23 times
changed in category  street value S32251 23 times
changed in category  street value S24638 23 times
changed in category  street value S19612 23 times
changed in category  street value S13951 23 times
changed in category  street value S6944 23 times
changed in category  street value S20753 23 times
changed in category  street value S27518 23 times
changed in category  street value S7934 23 times
changed in category  street value S25549 23 times
changed in category  street value S11220 23 times
changed in category  street value S24408 23 times
changed in category  street value S2095 23 times
changed in category  street value S11014 23 times
changed in category  street value S22411 23 times
changed in category  street value S13611 23 times
changed in category  street value S18524 23 times
changed in category  street value S16773 23 times
cha

changed in category  street value S6940 38 times
changed in category  street value S12688 38 times
changed in category  street value S18767 38 times
changed in category  street value S18555 38 times
changed in category  street value S23629 38 times
changed in category  street value S8184 38 times
changed in category  street value S29266 38 times
changed in category  street value S13304 38 times
changed in category  street value S21766 38 times
changed in category  street value S5324 38 times
changed in category  street value S23823 38 times
changed in category  street value S3973 38 times
changed in category  street value S32392 39 times
changed in category  street value S26616 39 times
changed in category  street value S5885 39 times
changed in category  street value S30243 40 times
changed in category  street value S24228 40 times
changed in category  street value S17829 40 times
changed in category  street value S23978 40 times
changed in category  street value S30356 40 times
chang

changed in category  street value S20512 62 times
changed in category  street value S14304 62 times
changed in category  street value S18691 62 times
changed in category  street value S3318 62 times
changed in category  street value S18312 62 times
changed in category  street value S11300 62 times
changed in category  street value S13411 62 times
changed in category  street value S29130 62 times
changed in category  street value S23364 62 times
changed in category  street value S6648 62 times
changed in category  street value S28462 62 times
changed in category  street value S6224 62 times
changed in category  street value S12196 62 times
changed in category  street value S3069 62 times
changed in category  street value S13735 62 times
changed in category  street value S2001 62 times
changed in category  street value S11219 62 times
changed in category  street value S7915 62 times
changed in category  street value S4336 62 times
changed in category  street value S20585 62 times
changed

changed in category  street value S20078 79 times
changed in category  street value S14692 81 times
changed in category  street value S10638 81 times
changed in category  street value S2693 81 times
changed in category  street value S16873 81 times
changed in category  street value S14485 81 times
changed in category  street value S588 81 times
changed in category  street value S4288 81 times
changed in category  street value S3433 81 times
changed in category  street value S1047 81 times
changed in category  street value S13652 81 times
changed in category  street value S32081 81 times
changed in category  street value S18373 81 times
changed in category  street value S9166 81 times
changed in category  street value S7637 81 times
changed in category  street value S23381 81 times
changed in category  street value S27508 81 times
changed in category  street value S23579 81 times
changed in category  street value S3469 81 times
changed in category  street value S13453 81 times
changed i

changed in category  street value S4639 108 times
changed in category  street value S31933 108 times
changed in category  street value S32115 108 times
changed in category  street value S10396 108 times
changed in category  street value S5099 112 times
changed in category  street value S31061 113 times
changed in category  street value S25836 113 times
changed in category  street value S3362 113 times
changed in category  street value S24059 113 times
changed in category  street value S31088 113 times
changed in category  street value S9126 113 times
changed in category  street value S7914 113 times
changed in category  street value S14811 114 times
changed in category  street value S1979 115 times
changed in category  street value S19407 116 times
changed in category  street value S3483 116 times
changed in category  street value S5291 116 times
changed in category  street value S2165 117 times
changed in category  street value S25354 117 times
changed in category  street value S12238

In [12]:
train['date'] = pd.to_datetime(train['date'])
train['month'] = train['date'].dt.month
test['date'] = pd.to_datetime(test['date'])
test['month'] = test['date'].dt.month

In [13]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1


def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.mean([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]) #.mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    return np.median(np.abs(y_pred-y_true)/y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

In [14]:
train_df = train.drop(columns=['id','per_square_meter_price','date'])
test_df = test.drop(columns=['id','date'])[train_df.columns.tolist()]
(np.array(train_df.columns.tolist()) == np.array(test_df.columns.tolist())).all()

True

In [15]:
from sklearn.preprocessing import LabelEncoder

dim_size_ = []
for c in CATEGORICAL_FEATURES:
    le = LabelEncoder()
    le.fit(train_df[c].values.tolist() + test_df[c].values.tolist())
    train_df[c] = le.transform(train_df[c])
    test_df[c] = le.transform(test_df[c])
    dim_size_.append(len(np.unique(train_df[c].values.tolist() + test_df[c].values.tolist())))

In [16]:
cat_idxs = [train_df.columns.tolist().index(x) for x in CATEGORICAL_FEATURES]
cat_idxs

[69, 0, 72, 71, 1, 17]

In [17]:
dim_size_

[49, 3107, 3, 13932, 109, 170]

In [18]:
CATEGORICAL_FEATURES

['region', 'city', 'realty_type', 'street', 'floor', 'osm_city_nearest_name']

In [19]:
for c in train_df.columns:
    median_val = train_df[c].median()
    train_df[c] = train_df[c].fillna(median_val)
    test_df[c] = test_df[c].fillna(median_val)

In [20]:
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=dim_size_,
    cat_emb_dim=5,
    n_d = 8,
    n_a = 8,
    n_steps = 1,
    gamma = 5,
    n_independent = 2,
    n_shared = 2,
    lambda_sparse = 0,
    optimizer_fn = AdamW,
    optimizer_params = dict(lr = (1e-2), weight_decay=0.0),
    mask_type = "entmax",
    scheduler_params = dict(T_0=120, T_mult=1, eta_min=1e-5, last_epoch=-1, verbose=False),
    scheduler_fn = CosineAnnealingWarmRestarts,
    seed = 42,
    verbose = 10
)

kf = KFold(n_splits=5, shuffle=True, random_state=239)

class DevMetric(Metric):
    def __init__(self):
        self._name = "DevMetric"
        self._maximize = False

    def __call__(self, y_true, y_score):
        #print(y_true.shape, y_score.shape, y_true.dtype)
        return metrics_stat(np.expm1(y_true.flatten()), 
                            np.expm1(np.clip(y_score.flatten(),5,15))
                           )['raif_metric']
    
def MAPELoss(y_pred, y_true):
    return torch.mean(torch.abs(y_true - y_pred) / y_true).clone()
    

ifold = 0
for tr,va in kf.split(train_df):
    df_tr = train_df.loc[tr].reset_index(drop=True).values
    df_va = train_df.loc[va].reset_index(drop=True).values
    tr_y = np.log1p(train.loc[tr,[TARGET]].values)
    va_y = np.log1p(train.loc[va,[TARGET]].values)
    
    va_y = va_y[train_df.loc[va].price_type.values == 1]
    df_va = df_va[train_df.loc[va].price_type.values == 1]
    
    c_1_tr_y = tr_y[train_df.loc[tr].price_type.values == 1]
    c_1_df_tr = df_tr[train_df.loc[tr].price_type.values == 1]
    
    clf = TabNetRegressor(**tabnet_params)
    clf.fit(
      df_tr, tr_y,
      eval_set=[(c_1_df_tr, c_1_tr_y), (df_va, va_y)],
      max_epochs = 120,
      patience = 15,
      batch_size = 256, 
      virtual_batch_size = 256,
      num_workers = 4,
      drop_last = False,
      eval_metric=[DevMetric],
      loss_fn=MAPELoss
    )
    clf.save_model('model_preproc_'+str(ifold)+'.pth')
    ifold += 1

Device used : cuda
epoch 0  | loss: 0.13052 | val_0_DevMetric: 2.16821 | val_1_DevMetric: 2.05997 |  0:00:13s
epoch 10 | loss: 0.04274 | val_0_DevMetric: 1.54192 | val_1_DevMetric: 1.69678 |  0:02:33s
epoch 20 | loss: 0.03998 | val_0_DevMetric: 1.76415 | val_1_DevMetric: 1.92404 |  0:04:54s

Early stopping occurred at epoch 29 with best_epoch = 14 and best_val_1_DevMetric = 1.45183
Best weights from best epoch are automatically used!
Successfully saved model at model_preproc_0.pth.zip
Device used : cuda
epoch 0  | loss: 0.13099 | val_0_DevMetric: 2.19852 | val_1_DevMetric: 2.29357 |  0:00:14s
epoch 10 | loss: 0.04274 | val_0_DevMetric: 1.43621 | val_1_DevMetric: 1.9091  |  0:02:41s
epoch 20 | loss: 0.03982 | val_0_DevMetric: 1.40056 | val_1_DevMetric: 2.05384 |  0:05:03s

Early stopping occurred at epoch 20 with best_epoch = 5 and best_val_1_DevMetric = 1.89629
Best weights from best epoch are automatically used!
Successfully saved model at model_preproc_1.pth.zip
Device used : cuda
ep

In [21]:
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

pred = 0
for ifold in range(5):
    clf = TabNetRegressor(**tabnet_params)
    clf.load_model('model_preproc_'+str(ifold)+'.pth.zip')
    y_score = clf.predict(test_df.values)
    y_score = np.clip(y_score.flatten(),5,15)
    pred += np.expm1(y_score) * 0.2
test_sub = pd.read_csv('../data/test.csv')[['id']]
test_sub[TARGET] = pred
test_sub.to_csv('tabnet_preproc_new_loss.csv', index=False)

Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda


In [22]:
test_sub.head()

Unnamed: 0,id,per_square_meter_price
0,COL_289284,38539.027344
1,COL_289305,51784.941406
2,COL_289318,36924.695312
3,COL_289354,68011.429688
4,COL_289399,48563.992188
