In [32]:
import pandas as pd
from sklearn import metrics,model_selection
from catboost import CatBoostRegressor
from tqdm import tqdm
from lightgbm import LGBMRegressor

In [16]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [183]:
df = pd.read_parquet('../../data/sample_data.parquet').dropna(subset='rubm2')
df = df.drop(['datetime','publish_delta','url','id','text','Город','title','price','img_list','metro_branch','metro_name','metro_dist'],axis=1)

In [185]:
df['rubm2'].median()

285714.28571428574

In [186]:

df['postcode'] = df['postcode'].explode().astype(float).groupby(level=0).mean()

In [187]:
df['is_apart'] = df['rooms'].str.contains('апарт')

In [188]:
df['rooms'] = df['rooms'].str.extract('(\d+)').iloc[:,0]

In [189]:
def check_model_prefomance(dataset:pd.DataFrame,model):

    
    folds = model_selection.RepeatedKFold(n_splits= 5,n_repeats= 3,random_state=11)
    model_selection.ShuffleSplit()
    X = dataset.drop('rubm2',axis=1)
    Y = dataset['rubm2']

    result = []

    for train_idx,test_idx in tqdm(folds.split(dataset),total=15):

        _metrics = {}

        x,y = X.iloc[train_idx] , Y.iloc[train_idx]
        xv,yv = X.iloc[test_idx] , Y.iloc[test_idx]

        model.fit(x,y)
        yhat = model.predict(xv)

        _metrics.update({'mape': metrics.mean_absolute_percentage_error(yv, yhat)
        })

        _metrics.update({'rmse': metrics.mean_squared_error(yv, yhat)**.5})
        
        _metrics.update({'mae': metrics.mean_absolute_error(yv, yhat)})

        result.append(_metrics)
        
    return pd.DataFrame(result).agg(['mean','std'])



In [190]:
baza = check_model_prefomance(df.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 15/15 [00:27<00:00,  1.81s/it]


In [191]:
baza

Unnamed: 0,mape,rmse,mae
mean,0.180717,101697.154452,57552.43115
std,0.006253,8726.35684,2221.132673


# baseline v1

In [192]:
base_data = df.copy()

In [193]:
advace_data = pd.json_normalize(base_data['advanced_home_info'].apply(lambda x: pd.DataFrame(x).set_index('key')['value'].to_dict()))

In [194]:
advanced_data = advace_data.dropna(thresh=advace_data.shape[0]*.5,axis=1)

In [195]:
advanced_data['age'] = (advanced_data['Год_ввода_в_эксплуатацию'].astype(float) - 2023).abs().where(lambda x: x<100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  advanced_data['age'] = (advanced_data['Год_ввода_в_эксплуатацию'].astype(float) - 2023).abs().where(lambda x: x<100)


In [196]:
clean_advanced_data = advanced_data.apply(pd.to_numeric,errors='ignore',downcast='unsigned')

In [197]:
data_v2 = base_data.drop('advanced_home_info',axis=1).reset_index(drop=True)\
                    .join(clean_advanced_data).rename(columns= lambda x: x.replace(',','_'))\
                        .drop('jkh_url',axis=1)

In [198]:
v1 = check_model_prefomance(data_v2.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 15/15 [00:46<00:00,  3.10s/it]


In [199]:
baza - v1

Unnamed: 0,mape,rmse,mae
mean,0.020076,6971.250805,6500.354192
std,-0.002857,527.235718,216.847716


# target_encoding

In [200]:
data_v3 = data_v2.copy()

In [201]:
for col in data_v2.select_dtypes('O').columns:
    data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')

  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data

In [202]:
v2 = check_model_prefomance(data_v3.select_dtypes(exclude='O'),LGBMRegressor(n_estimators=1000))

100%|██████████| 15/15 [01:01<00:00,  4.12s/it]


In [203]:
baza - v2

Unnamed: 0,mape,rmse,mae
mean,0.070867,36595.683248,21648.720625
std,0.000944,3964.434992,451.110359


In [204]:
v1 - v2

Unnamed: 0,mape,rmse,mae
mean,0.050791,29624.432442,15148.366433
std,0.003801,3437.199274,234.262643


In [205]:
v2

Unnamed: 0,mape,rmse,mae
mean,0.10985,65101.471204,35903.710524
std,0.005308,4761.921849,1770.022314


In [206]:
x,xv,y,yv = model_selection.train_test_split(data_v3.drop('rubm2',axis=1),data_v3['rubm2'],train_size=.75)

In [207]:
model_v2 = CatBoostRegressor(5000)

In [208]:
model_v2.fit(x,y,verbose=500)

Learning rate set to 0.014282
0:	learn: 161742.2146197	total: 9.57ms	remaining: 47.8s
500:	learn: 55983.5876620	total: 1.59s	remaining: 14.3s
1000:	learn: 48971.3705754	total: 3.1s	remaining: 12.4s
1500:	learn: 43782.8193899	total: 4.61s	remaining: 10.7s
2000:	learn: 40039.4638874	total: 6.12s	remaining: 9.17s
2500:	learn: 36931.6403542	total: 7.64s	remaining: 7.64s
3000:	learn: 34359.6964697	total: 9.2s	remaining: 6.13s
3500:	learn: 32158.1552612	total: 10.7s	remaining: 4.58s
4000:	learn: 30362.9080260	total: 12.2s	remaining: 3.05s
4500:	learn: 28648.4320423	total: 13.7s	remaining: 1.52s
4999:	learn: 27135.2957811	total: 15.2s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fda5e55ee20>

In [209]:
best_score = metrics.mean_absolute_percentage_error(yv,model_v2.predict(xv))
best_score

0.10563732082076371

In [210]:
pd.Series(model_v2.feature_importances_,model_v2.feature_names_).sort_values()

Тип_системы_водоотведения          0.035760
Водоотведение                      0.047024
Тип_фундамента                     0.048100
Материал_теплоизоляции_сети        0.073445
Тип_наружного_утепления_фасада     0.085356
                                    ...    
Метро                              2.826788
m2                                 6.122657
Район                              7.594913
Дом                                8.175733
Улица                             46.891508
Length: 85, dtype: float64

# v3 normalize

In [214]:
from sklearn import preprocessing

ss = preprocessing.StandardScaler()

In [235]:
data_v4 = data_v3.apply(lambda x: ss.fit_transform(x.to_frame()).ravel() if ((x.nunique()>2 ) & (x.name != 'rubm2')) else x).copy()

In [238]:
v2_normalize = check_model_prefomance(data_v4,LGBMRegressor(n_estimators=1000))

100%|██████████| 15/15 [01:03<00:00,  4.23s/it]


In [239]:
v2 - v2_normalize

Unnamed: 0,mape,rmse,mae
mean,3.8e-05,-113.699178,72.533613
std,-0.000496,112.938522,135.619795


# target_transform

In [251]:
data_v5 = data_v3.copy().query('rubm2 < rubm2.quantile(.9)')

In [254]:
v3_normalize = check_model_prefomance(data_v5,LGBMRegressor(n_estimators=1000))

100%|██████████| 15/15 [01:00<00:00,  4.00s/it]


In [255]:
v2 - v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.014292,28794.997098,10572.560796
std,-0.002273,3258.608889,957.017


In [256]:
v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.095557,36306.474106,25331.149728
std,0.007582,1503.31296,813.005314


# Result

In [296]:
x,xv,y,yv = model_selection.train_test_split(data_v5.drop('rubm2',axis=1),data_v5['rubm2'],train_size=.75,random_state=1444)

In [340]:

model_cat = CatBoostRegressor(5000,loss_function='MAE',eval_metric='MAPE',early_stopping_rounds=500,use_best_model=True)
model_cat.fit(x,y,verbose=500,eval_set=(xv,yv))
best_score = metrics.mean_absolute_percentage_error(yv,model_cat.predict(xv))
best_score

0:	learn: 0.2031452	test: 0.1949486	best: 0.1949486 (0)	total: 5.93ms	remaining: 29.6s
500:	learn: 0.0802129	test: 0.0944278	best: 0.0944206 (499)	total: 1.51s	remaining: 13.6s
1000:	learn: 0.0681913	test: 0.0933895	best: 0.0933895 (1000)	total: 3.05s	remaining: 12.2s
1500:	learn: 0.0613380	test: 0.0931041	best: 0.0930916 (1489)	total: 4.54s	remaining: 10.6s
2000:	learn: 0.0559741	test: 0.0929235	best: 0.0929190 (1992)	total: 6.04s	remaining: 9.05s
2500:	learn: 0.0526172	test: 0.0927665	best: 0.0927508 (2490)	total: 7.53s	remaining: 7.52s
3000:	learn: 0.0495807	test: 0.0925954	best: 0.0925924 (2989)	total: 9.03s	remaining: 6.01s
3500:	learn: 0.0472222	test: 0.0925307	best: 0.0925161 (3471)	total: 10.5s	remaining: 4.51s
4000:	learn: 0.0455021	test: 0.0923713	best: 0.0923530 (3983)	total: 12s	remaining: 3s
4500:	learn: 0.0440214	test: 0.0922221	best: 0.0922127 (4469)	total: 13.8s	remaining: 1.53s
4999:	learn: 0.0426153	test: 0.0921489	best: 0.0921276 (4958)	total: 15.5s	remaining: 0us

b

0.09212764748371245

In [329]:

model_l = LGBMRegressor(n_estimators=5000,
                        learning_rate=.15,
                        objective = 'mae',
                        boosting_type='dart')

model_l.fit(x,y,eval_set =(xv,yv),verbose=200,eval_metric='mape')
best_score = metrics.mean_absolute_percentage_error(yv,model_l.predict(xv))
best_score



[200]	valid_0's mape: 0.0938659	valid_0's l1: 27382.8
[400]	valid_0's mape: 0.0909092	valid_0's l1: 25802.8
[600]	valid_0's mape: 0.0917965	valid_0's l1: 26355.9
[800]	valid_0's mape: 0.0907442	valid_0's l1: 25797.9
[1000]	valid_0's mape: 0.0902022	valid_0's l1: 25594.4
[1200]	valid_0's mape: 0.090216	valid_0's l1: 25578.4
[1400]	valid_0's mape: 0.0896688	valid_0's l1: 25292.8
[1600]	valid_0's mape: 0.0897174	valid_0's l1: 25275.1
[1800]	valid_0's mape: 0.0899062	valid_0's l1: 25420.8
[2000]	valid_0's mape: 0.0896917	valid_0's l1: 25280.3
[2200]	valid_0's mape: 0.0895415	valid_0's l1: 25177.6
[2400]	valid_0's mape: 0.0894583	valid_0's l1: 25165.4
[2600]	valid_0's mape: 0.0895098	valid_0's l1: 25145.6
[2800]	valid_0's mape: 0.089485	valid_0's l1: 25148.2
[3000]	valid_0's mape: 0.0893722	valid_0's l1: 25100.8
[3200]	valid_0's mape: 0.0895227	valid_0's l1: 25169.3
[3400]	valid_0's mape: 0.0893399	valid_0's l1: 25041.2
[3600]	valid_0's mape: 0.0894035	valid_0's l1: 25054.6
[3800]	valid_0's

0.08975988752603196