In [1]:
import pandas as pd
from sklearn import metrics,model_selection
from catboost import CatBoostRegressor
from tqdm import tqdm
from lightgbm import LGBMRegressor

In [2]:
import warnings
import matplotlib.pyplot as plt
plt.style.use('ggplot')

warnings.simplefilter('ignore')

In [3]:
df = pd.read_parquet('../../data/sample_data.parquet').dropna(subset='rubm2')
df = df.drop(['datetime','publish_delta','url','id','text','Город','title','price','img_list','metro_branch','metro_name','metro_dist'],axis=1)

In [46]:
df.shape

(8729, 21)

In [4]:
df['rubm2'].median()

286274.50980392157

In [5]:

df['postcode'] = df['postcode'].explode().astype(float).groupby(level=0).mean()

In [6]:
df['is_apart'] = df['rooms'].str.contains('апарт')

In [7]:
df['rooms'] = df['rooms'].str.extract('(\d+)').iloc[:,0]

In [8]:
def check_model_prefomance(dataset:pd.DataFrame,model):

    
    folds = model_selection.ShuffleSplit(n_splits= 10,train_size=.75,random_state=11)

    X = dataset.drop('rubm2',axis=1)
    Y = dataset['rubm2']

    result = []

    for train_idx,test_idx in tqdm(folds.split(dataset),total=10):

        _metrics = {}

        x,y = X.iloc[train_idx] , Y.iloc[train_idx]
        xv,yv = X.iloc[test_idx] , Y.iloc[test_idx]

        model.fit(x,y)
        yhat = model.predict(xv)

        _metrics.update({'mape': metrics.mean_absolute_percentage_error(yv, yhat)
        })

        _metrics.update({'rmse': metrics.mean_squared_error(yv, yhat)**.5})
        
        _metrics.update({'mae': metrics.mean_absolute_error(yv, yhat)})

        result.append(_metrics)
        
    return pd.DataFrame(result).agg(['mean','std'])



In [9]:
baza = check_model_prefomance(df.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:11<00:00,  1.11s/it]


In [10]:
baza

Unnamed: 0,mape,rmse,mae
mean,0.183506,101195.845931,58240.966773
std,0.008503,5501.860419,1353.167682


# baseline v1

In [11]:
base_data = df.copy()


In [12]:
base_data['advanced_home_info'].isna().sum()

326

In [13]:
advace_data = pd.json_normalize(base_data['advanced_home_info'].apply(lambda row: dict(zip(row['key'],row['value'])) if row is not None else {}))

In [14]:
advanced_data = advace_data.dropna(thresh=advace_data.shape[0]*.5,axis=1)

In [15]:
advanced_data['age'] = (advanced_data['Год_ввода_в_эксплуатацию'].astype(float) - 2023).abs().where(lambda x: x<100)

In [16]:
clean_advanced_data = advanced_data.apply(pd.to_numeric,errors='ignore',downcast='unsigned')

In [17]:
data_v2 = base_data.drop('advanced_home_info',axis=1).reset_index(drop=True)\
                    .join(clean_advanced_data).rename(columns= lambda x: x.replace(',','_'))\
                        .drop('jkh_url',axis=1)

In [18]:
v1 = check_model_prefomance(data_v2.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:18<00:00,  1.87s/it]


In [19]:
baza - v1

Unnamed: 0,mape,rmse,mae
mean,0.017638,5743.905991,5451.455029
std,-0.00106,1073.002248,466.540725


# target_encoding

In [20]:
data_v3 = data_v2.copy()

In [21]:
for col in data_v2.select_dtypes('O').columns:
    data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')

In [22]:
v2 = check_model_prefomance(data_v3.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:23<00:00,  2.38s/it]


In [23]:
baza - v2

Unnamed: 0,mape,rmse,mae
mean,0.073518,37174.366636,22188.700572
std,0.004141,3836.841698,743.613441


In [24]:
v1 - v2

Unnamed: 0,mape,rmse,mae
mean,0.055881,31430.460646,16737.245543
std,0.005201,2763.83945,277.072715


In [25]:
v2

Unnamed: 0,mape,rmse,mae
mean,0.109988,64021.479294,36052.266201
std,0.004362,1665.018721,609.554241


In [26]:
x,xv,y,yv = model_selection.train_test_split(data_v3.drop('rubm2',axis=1),data_v3['rubm2'],train_size=.75)

In [27]:
model_v2 = CatBoostRegressor(5000)

In [28]:
model_v2.fit(x,y,verbose=500)

Learning rate set to 0.014888
0:	learn: 160259.6315282	total: 53.9ms	remaining: 4m 29s
500:	learn: 57639.6864678	total: 1.03s	remaining: 9.25s
1000:	learn: 50529.5965972	total: 2.01s	remaining: 8.04s
1500:	learn: 45936.8764215	total: 2.94s	remaining: 6.84s
2000:	learn: 42084.1926073	total: 3.85s	remaining: 5.77s
2500:	learn: 39292.5721386	total: 4.81s	remaining: 4.81s
3000:	learn: 36910.8612430	total: 5.74s	remaining: 3.82s
3500:	learn: 34750.0994630	total: 6.66s	remaining: 2.85s
4000:	learn: 32910.1570361	total: 7.59s	remaining: 1.9s
4500:	learn: 31197.0638471	total: 8.56s	remaining: 950ms
4999:	learn: 29706.2134125	total: 9.58s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fe8b7148460>

In [29]:
best_score = metrics.mean_absolute_percentage_error(yv,model_v2.predict(xv))
best_score

0.10212955142984914

In [30]:
pd.Series(model_v2.feature_importances_,model_v2.feature_names_).sort_values()

Электроснабжение                                              0.020161
Тип_поквартирной_разводки_внутридомовой_системы_отопления     0.026068
Водоотведение                                                 0.034870
Состояние_дома                                                0.040441
Тип_системы_водоотведения                                     0.043234
                                                               ...    
Метро                                                         3.317067
m2                                                            5.899749
Район                                                         8.251873
Дом                                                           8.519775
Улица                                                        45.283886
Length: 84, dtype: float64

# v3 normalize

In [31]:
from sklearn import preprocessing

ss = preprocessing.StandardScaler()

In [32]:
data_v4 = data_v3.apply(lambda x: ss.fit_transform(x.to_frame()).ravel() if ((x.nunique()>2 ) & (x.name != 'rubm2')) else x).copy()

In [33]:
v2_normalize = check_model_prefomance(data_v4,LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:25<00:00,  2.50s/it]


In [34]:
v2 - v2_normalize

Unnamed: 0,mape,rmse,mae
mean,0.000242,-891.213703,56.086316
std,-0.000459,-113.598543,71.179734


# target_transform

In [35]:
data_v5 = data_v3.copy().query('rubm2 < rubm2.quantile(.9)')

In [36]:
v3_normalize = check_model_prefomance(data_v5,LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:17<00:00,  1.74s/it]


In [37]:
v2 - v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.0137,27570.442589,10370.343454
std,-0.001744,420.272865,62.156338


In [38]:
v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.096287,36451.036705,25681.922747
std,0.006106,1244.745856,547.397903


# drop street

In [43]:
data_v5_2 = data_v3.copy().query('rubm2 < rubm2.quantile(.9)').drop(['Улица'],axis=1)

In [44]:
v4_normalize = check_model_prefomance(data_v5_2,LGBMRegressor(n_estimators=1000))

100%|██████████| 10/10 [00:22<00:00,  2.22s/it]


In [45]:
v4_normalize

Unnamed: 0,mape,rmse,mae
mean,0.101337,37965.50998,27011.771522
std,0.006682,1466.030445,738.072221


# Result

In [39]:
x,xv,y,yv = model_selection.train_test_split(data_v5.drop('rubm2',axis=1),data_v5['rubm2'],train_size=.75,random_state=1434)

In [40]:

model_cat = CatBoostRegressor(5000,loss_function='MAE',eval_metric='MAPE',early_stopping_rounds=500,use_best_model=True)
model_cat.fit(x,y,verbose=500,eval_set=(xv,yv))
best_score = metrics.mean_absolute_percentage_error(yv,model_cat.predict(xv))
best_score

0:	learn: 0.2008186	test: 0.1954780	best: 0.1954780 (0)	total: 2.01ms	remaining: 10s
500:	learn: 0.0850476	test: 0.0938041	best: 0.0938041 (500)	total: 1.05s	remaining: 9.47s
1000:	learn: 0.0737272	test: 0.0913878	best: 0.0913838 (999)	total: 2.08s	remaining: 8.3s
1500:	learn: 0.0669547	test: 0.0906712	best: 0.0906681 (1493)	total: 3.11s	remaining: 7.25s
2000:	learn: 0.0620129	test: 0.0903342	best: 0.0903331 (1999)	total: 4.15s	remaining: 6.22s
2500:	learn: 0.0584406	test: 0.0901302	best: 0.0901302 (2500)	total: 5.18s	remaining: 5.17s
3000:	learn: 0.0559619	test: 0.0900653	best: 0.0900554 (2979)	total: 6.22s	remaining: 4.14s
3500:	learn: 0.0539515	test: 0.0899389	best: 0.0899389 (3500)	total: 7.38s	remaining: 3.16s
4000:	learn: 0.0521960	test: 0.0898679	best: 0.0898543 (3735)	total: 8.51s	remaining: 2.12s
4500:	learn: 0.0506545	test: 0.0897371	best: 0.0897361 (4497)	total: 9.65s	remaining: 1.07s
4999:	learn: 0.0493750	test: 0.0897016	best: 0.0896886 (4942)	total: 10.6s	remaining: 0us



0.08968855372694919

In [41]:

model_l = LGBMRegressor(n_estimators=5000,
                        learning_rate=.11,
                        objective = 'mae',
                        boosting_type='dart')

model_l.fit(x,y,eval_set =(xv,yv),verbose=200,eval_metric='mape')
best_score = metrics.mean_absolute_percentage_error(yv,model_l.predict(xv))
best_score

[200]	valid_0's mape: 0.0936945	valid_0's l1: 28567.9
[400]	valid_0's mape: 0.0875949	valid_0's l1: 25812.3
[600]	valid_0's mape: 0.0889957	valid_0's l1: 26607.8
[800]	valid_0's mape: 0.0878961	valid_0's l1: 26155.2
[1000]	valid_0's mape: 0.0871202	valid_0's l1: 25718.4
[1200]	valid_0's mape: 0.0873972	valid_0's l1: 25807
[1400]	valid_0's mape: 0.0870226	valid_0's l1: 25556.5
[1600]	valid_0's mape: 0.0868282	valid_0's l1: 25435
[1800]	valid_0's mape: 0.0871802	valid_0's l1: 25689.1
[2000]	valid_0's mape: 0.0866116	valid_0's l1: 25428.5
[2200]	valid_0's mape: 0.0865148	valid_0's l1: 25307.6
[2400]	valid_0's mape: 0.086484	valid_0's l1: 25304
[2600]	valid_0's mape: 0.0863612	valid_0's l1: 25269.3
[2800]	valid_0's mape: 0.0865145	valid_0's l1: 25310.5
[3000]	valid_0's mape: 0.0861232	valid_0's l1: 25150.5
[3200]	valid_0's mape: 0.0861807	valid_0's l1: 25204.8
[3400]	valid_0's mape: 0.0860867	valid_0's l1: 25083.8
[3600]	valid_0's mape: 0.0861195	valid_0's l1: 25101.6
[3800]	valid_0's mape

0.08598600173930403