In [1]:
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from lightgbm import log_evaluation
from sklearn import metrics,model_selection
from tqdm import tqdm

In [2]:
import warnings
import matplotlib.pyplot as plt
# Use the ggplot look for any matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries used below are hidden as well — consider narrowing it.
warnings.simplefilter('ignore')

In [3]:
# Columns that are removed right after loading and never used as features here.
unused_columns = [
    'datetime', 'publish_delta', 'url', 'id', 'text', 'Город', 'title',
    'price', 'img_list', 'metro_branch', 'metro_name', 'metro_dist',
]

# Load the sample, keep only rows that have a target value (rubm2),
# then discard the unused columns in the same chain.
df = (
    pd.read_parquet('../../data/sample_data.parquet')
    .dropna(subset='rubm2')
    .drop(columns=unused_columns)
)

In [4]:
# Median of the target column; gives a sense of scale for the rmse/mae values
# reported by the evaluation cells further down.
df['rubm2'].median()

286274.50980392157

In [5]:

# One row per individual postcode value, cast to float; the original row label
# is kept in the index by explode().
postcode_values = df['postcode'].explode().astype(float)

# Collapse back to one number per original row: the mean of its postcodes.
df['postcode'] = postcode_values.groupby(level=0).mean()

In [6]:
# Flag listings whose `rooms` text contains the substring 'апарт'.
# na=False maps a missing `rooms` value to False instead of NaN. Without it a
# single missing value turns the column into object dtype, and the evaluation
# cells (which use select_dtypes(exclude='O')) would silently drop the feature.
# NOTE(review): the match is case-sensitive — confirm the source never
# capitalises this word.
df['is_apart'] = df['rooms'].str.contains('апарт', na=False)

In [7]:
# Keep only the first run of digits found in the `rooms` text (NaN when there
# is none). A raw string is required: '\d' in a normal string literal is an
# invalid escape sequence. expand=False returns the single capture group as a
# Series directly, so no positional column lookup is needed.
# The result still holds strings (object dtype), exactly as before.
df['rooms'] = df['rooms'].str.extract(r'(\d+)', expand=False)

In [8]:
def check_model_prefomance(dataset: pd.DataFrame, model, n_splits: int = 10,
                           train_size: float = .75, random_state: int = 11) -> pd.DataFrame:
    """Score a regressor on repeated random train/test splits of ``dataset``.

    The column ``rubm2`` is used as the target and every other column as a
    feature. For each split the model is fitted on the train part and scored
    on the test part with MAPE, RMSE and MAE.

    Parameters
    ----------
    dataset : pd.DataFrame
        Features plus the ``rubm2`` target column.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every split, so after the call it holds the last split's fit.
    n_splits : int, default 10
        Number of random splits.
    train_size : float, default 0.75
        Fraction of rows used for training in each split.
    random_state : int, default 11
        Seed for the split generator, so repeated calls see the same splits.

    Returns
    -------
    pd.DataFrame
        Rows ``mean`` and ``std``, columns ``mape``, ``rmse``, ``mae``,
        aggregated over the splits.
    """
    folds = model_selection.ShuffleSplit(n_splits=n_splits, train_size=train_size,
                                         random_state=random_state)

    X = dataset.drop('rubm2', axis=1)
    Y = dataset['rubm2']

    result = []

    # The progress-bar total follows n_splits instead of a second hard-coded 10.
    for train_idx, test_idx in tqdm(folds.split(X), total=n_splits):

        x, y = X.iloc[train_idx], Y.iloc[train_idx]
        xv, yv = X.iloc[test_idx], Y.iloc[test_idx]

        model.fit(x, y)
        yhat = model.predict(xv)

        result.append({
            'mape': metrics.mean_absolute_percentage_error(yv, yhat),
            # Square root of the MSE, i.e. RMSE in the units of the target.
            'rmse': metrics.mean_squared_error(yv, yhat) ** .5,
            'mae': metrics.mean_absolute_error(yv, yhat),
        })

    return pd.DataFrame(result).agg(['mean', 'std'])



In [9]:
# Baseline: only the columns that are not object dtype, scored with a
# 1000-tree LightGBM regressor.
baseline_features = df.select_dtypes(exclude='O')
baseline_model = LGBMRegressor(n_estimators=1000)
baza = check_model_prefomance(baseline_features, baseline_model)

100%|██████████| 10/10 [00:44<00:00,  4.48s/it]


In [10]:
# Baseline summary: mean and std of each metric across the random splits.
baza

Unnamed: 0,mape,rmse,mae
mean,0.189001,104111.288453,58586.097746
std,0.011903,7024.625883,1732.819644


# baseline v1

In [11]:
# Work on an independent copy so the following feature engineering does not
# modify `df`.
base_data = df.copy()

In [12]:
# Expand the per-listing `advanced_home_info` records into one wide table.
# For every row the records are loaded into a temporary DataFrame, indexed by
# their 'key' field and reduced to a {key: value} dict; json_normalize then
# turns those dicts into columns (one column per distinct key).
# NOTE(review): assumes each cell is a collection of records with 'key' and
# 'value' fields — confirm against the parquet schema.
# NOTE(review): the later join lines rows up by position via
# reset_index(drop=True); confirm this frame carries a fresh 0..n-1 index.
# The misspelled name `advace_data` is referenced by the next cell, so it is
# kept as is.
advace_data = pd.json_normalize(base_data['advanced_home_info'].apply(lambda x: pd.DataFrame(x).set_index('key')['value'].to_dict()))

In [13]:
# Keep only the columns that are filled in for at least half of the rows
# (thresh = minimum number of non-missing values a column needs to survive).
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
advanced_data = advace_data.dropna(thresh=advace_data.shape[0]*.5,axis=1).copy()

In [14]:
# Year the building age is measured against, and the cut-off above which an
# age is treated as implausible and replaced by NaN.
REFERENCE_YEAR = 2023
MAX_PLAUSIBLE_AGE = 100

# Absolute distance in years between the commissioning year and REFERENCE_YEAR.
building_age = (advanced_data['Год_ввода_в_эксплуатацию'].astype(float) - REFERENCE_YEAR).abs()

# assign() builds a new frame instead of writing into a possibly derived one,
# which avoids the SettingWithCopyWarning the in-place column write produced.
advanced_data = advanced_data.assign(age=building_age.where(building_age < MAX_PLAUSIBLE_AGE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  advanced_data['age'] = (advanced_data['Год_ввода_в_эксплуатацию'].astype(float) - 2023).abs().where(lambda x: x<100)


In [15]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted to the smallest fitting numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column, downcast='unsigned')
    except (ValueError, TypeError):
        # Not a numeric column: leave it untouched. This is what the
        # deprecated errors='ignore' option used to do.
        return column


# Convert every column that parses as numbers; keep the rest as they are.
clean_advanced_data = advanced_data.apply(_to_numeric_if_possible)

In [16]:
# Base features without the raw nested column, re-indexed 0..n-1 so the join
# below lines rows up by position.
base_features = base_data.drop(columns='advanced_home_info').reset_index(drop=True)

# Attach the expanded house attributes, replace commas in column names with
# underscores and drop the `jkh_url` column.
data_v2 = (
    base_features
    .join(clean_advanced_data)
    .rename(columns=lambda name: name.replace(',', '_'))
    .drop(columns='jkh_url')
)

In [17]:
# v1: same model and protocol as the baseline, now with the non-object
# columns of the enriched table.
v1_features = data_v2.select_dtypes(exclude='O')
v1_model = LGBMRegressor(n_estimators=1000)
v1 = check_model_prefomance(v1_features, v1_model)

100%|██████████| 10/10 [00:45<00:00,  4.56s/it]


In [18]:
# Element-wise difference of the two summary tables. All three metrics are
# errors, so a positive value in the 'mean' row means v1 is better than the
# baseline. The 'std' row is a difference of standard deviations, not the
# standard deviation of the differences.
baza - v1

Unnamed: 0,mape,rmse,mae
mean,0.02604,7868.871887,7265.199568
std,0.003582,347.955421,204.781318


# target_encoding

In [19]:
# Independent copy: the encoding applied next must not alter data_v2.
data_v3 = data_v2.copy()

In [20]:
# Mean-target encoding: replace every object-dtype column by the average
# rubm2 of the rows that share the same value in that column.
#
# Plain column assignment replaces the column together with its dtype.
# `data_v3.loc[:, col] = ...` writes into the existing object column on
# pandas >= 2.0, which leaves the dtype as object — the encoded features would
# then be removed by select_dtypes(exclude='O') in the evaluation cell.
#
# NOTE(review): the group means are computed on the full table, including rows
# that later land in the test part of every split, so the target leaks into
# the features and the scores below are optimistic. Fitting the encoding on
# the training part of each split would remove that bias.
for col in data_v2.select_dtypes('O').columns:
    data_v3[col] = data_v3.groupby(col)['rubm2'].transform('mean')

  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data_v3.loc[:,col] = data_v3.groupby(col)['rubm2'].transform('mean')
  data

In [21]:
# v2: evaluation on the target-encoded table (non-object columns only).
v2_features = data_v3.select_dtypes(exclude='O')
v2_model = LGBMRegressor(n_estimators=1000)
v2 = check_model_prefomance(v2_features, v2_model)

100%|██████████| 10/10 [01:13<00:00,  7.39s/it]


In [22]:
# Error reduction of v2 relative to the baseline (positive 'mean' = v2 better).
baza - v2

Unnamed: 0,mape,rmse,mae
mean,0.076433,38193.481654,22248.301
std,0.00554,3083.23326,492.667254


In [23]:
# Error reduction of v2 relative to v1 (positive 'mean' = v2 better).
v1 - v2

Unnamed: 0,mape,rmse,mae
mean,0.050393,30324.609766,14983.101432
std,0.001958,2735.277838,287.885936


In [24]:
# Absolute metric values for v2 (mean and std across splits).
v2

Unnamed: 0,mape,rmse,mae
mean,0.112568,65917.806799,36337.796745
std,0.006363,3941.392623,1240.15239


In [25]:
# Single 75/25 hold-out split of the target-encoded table.
# random_state is fixed so the CatBoost score below is reproducible across
# kernel restarts; the seed matches the one used for the final split in the
# "Result" section.
x,xv,y,yv = model_selection.train_test_split(
    data_v3.drop('rubm2',axis=1),
    data_v3['rubm2'],
    train_size=.75,
    random_state=1434,
)

In [26]:
# CatBoost regressor with 5000 boosting iterations; every other setting is
# left at the library default.
model_v2 = CatBoostRegressor(iterations=5000)

In [27]:
# Fit on the training part only; verbose=500 prints a progress line every
# 500 iterations.
model_v2.fit(x,y,verbose=500)

Learning rate set to 0.014474
0:	learn: 159184.1151782	total: 59ms	remaining: 4m 55s
500:	learn: 54437.9168175	total: 2.79s	remaining: 25.1s
1000:	learn: 46946.1745160	total: 5.24s	remaining: 20.9s
1500:	learn: 41858.7436979	total: 7.75s	remaining: 18.1s
2000:	learn: 38177.7432341	total: 10.2s	remaining: 15.3s
2500:	learn: 35209.1106562	total: 12.6s	remaining: 12.6s
3000:	learn: 32821.0095281	total: 15.2s	remaining: 10.1s
3500:	learn: 30675.7259104	total: 17.6s	remaining: 7.51s
4000:	learn: 28801.6959059	total: 19.9s	remaining: 4.97s
4500:	learn: 27246.6210716	total: 22.2s	remaining: 2.46s
4999:	learn: 25761.8774395	total: 24.6s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7ff3762ae730>

In [28]:
# MAPE of the fitted CatBoost model on the held-out part.
holdout_predictions = model_v2.predict(xv)
best_score = metrics.mean_absolute_percentage_error(yv, holdout_predictions)
best_score

0.10738368360320025

In [29]:
# Feature importances labelled by feature name, least important first.
importances = pd.Series(data=model_v2.feature_importances_, index=model_v2.feature_names_)
importances.sort_values()

Материал_теплоизоляции_сети      0.032185
Количество_вводов_в_дом__ед.     0.048411
Водоотведение                    0.050055
Форма_крыши                      0.056704
Тип_системы_газоснабжения        0.057339
                                  ...    
Метро                            3.011278
m2                               6.798541
Район                            6.883233
Дом                              7.623599
Улица                           46.945816
Length: 85, dtype: float64

# v3 normalize

In [30]:
from sklearn import preprocessing

# One scaler instance, refitted for every column in the next cell.
ss = preprocessing.StandardScaler()

In [31]:
def _standardize_feature(column):
    """Standardize a feature column; pass the target and columns with at most two distinct values through."""
    is_target = column.name == 'rubm2'
    if column.nunique() > 2 and not is_target:
        # The scaler expects 2-D input, so wrap the column in a frame and
        # flatten the result back to 1-D.
        return ss.fit_transform(column.to_frame()).ravel()
    return column


# Column-wise standardization of the target-encoded table.
# NOTE(review): the scaler is fitted on all rows, including rows that end up
# in the test part of the evaluation splits.
data_v4 = data_v3.apply(_standardize_feature).copy()

In [32]:
# Same protocol as v2, on the standardized table.
scaled_model = LGBMRegressor(n_estimators=1000)
v2_normalize = check_model_prefomance(data_v4, scaled_model)

100%|██████████| 10/10 [00:57<00:00,  5.73s/it]


In [33]:
# Effect of standardization: negative 'mean' values mean the scaled variant
# has the higher error.
v2 - v2_normalize

Unnamed: 0,mape,rmse,mae
mean,-0.001002,-1085.47269,-332.28079
std,0.000108,-50.38243,-19.770045


# target_transform (drop rows at or above the 0.9 quantile of rubm2)

In [34]:
# Keep only the rows whose target is strictly below the 0.9 quantile of rubm2.
# NOTE(review): this removes rows rather than transforming the target, and the
# removed rows are missing from the evaluation splits too — errors measured on
# data_v5 are therefore not directly comparable with those measured on data_v3.
target_cutoff = data_v3['rubm2'].quantile(.9)
data_v5 = data_v3[data_v3['rubm2'] < target_cutoff].copy()

In [35]:
# Same protocol on the trimmed table. Despite the variable name, data_v5 is
# derived from data_v3, i.e. the features are not standardized.
trimmed_model = LGBMRegressor(n_estimators=1000)
v3_normalize = check_model_prefomance(data_v5, trimmed_model)

100%|██████████| 10/10 [01:00<00:00,  6.00s/it]


In [36]:
# Difference between v2 (all rows) and the run on the trimmed table.
# NOTE(review): the two runs are scored on different row sets, so this is not
# a like-for-like comparison.
v2 - v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.017446,29352.733794,10741.106347
std,8.3e-05,2864.753873,779.344349


In [37]:
# Absolute metric values for the run on the trimmed table.
v3_normalize

Unnamed: 0,mape,rmse,mae
mean,0.095123,36565.073005,25596.690398
std,0.00628,1076.638751,460.808041


# Result

In [41]:
# Seeded 75/25 hold-out split of the trimmed table for the final models.
final_features = data_v5.drop(columns='rubm2')
final_target = data_v5['rubm2']
x, xv, y, yv = model_selection.train_test_split(
    final_features,
    final_target,
    train_size=.75,
    random_state=1434,
)

In [42]:

# CatBoost optimised for MAE and monitored with MAPE on the hold-out set.
# Training stops early after 500 rounds without improvement, and the best
# iteration on the eval set is the one kept.
model_cat = CatBoostRegressor(
    iterations=5000,
    loss_function='MAE',
    eval_metric='MAPE',
    early_stopping_rounds=500,
    use_best_model=True,
)
model_cat.fit(x, y, eval_set=(xv, yv), verbose=500)

# NOTE(review): the score is measured on the same (xv, yv) that selected the
# best iteration, so it is slightly optimistic.
cat_predictions = model_cat.predict(xv)
best_score = metrics.mean_absolute_percentage_error(yv, cat_predictions)
best_score

0:	learn: 0.2021153	test: 0.1979533	best: 0.1979533 (0)	total: 7.46ms	remaining: 37.3s
500:	learn: 0.0805644	test: 0.0960668	best: 0.0960668 (500)	total: 3.19s	remaining: 28.6s
1000:	learn: 0.0689388	test: 0.0941160	best: 0.0941160 (1000)	total: 6.13s	remaining: 24.5s
1500:	learn: 0.0625385	test: 0.0932339	best: 0.0932339 (1500)	total: 9.06s	remaining: 21.1s
2000:	learn: 0.0580216	test: 0.0927138	best: 0.0927063 (1995)	total: 11.7s	remaining: 17.5s
2500:	learn: 0.0546436	test: 0.0923325	best: 0.0923286 (2475)	total: 14.2s	remaining: 14.2s
3000:	learn: 0.0523334	test: 0.0919895	best: 0.0919845 (2996)	total: 16.7s	remaining: 11.1s
3500:	learn: 0.0502760	test: 0.0918380	best: 0.0918323 (3489)	total: 19.3s	remaining: 8.28s
4000:	learn: 0.0483800	test: 0.0915783	best: 0.0915737 (3989)	total: 22s	remaining: 5.48s
4500:	learn: 0.0467907	test: 0.0914258	best: 0.0914184 (4486)	total: 24.6s	remaining: 2.73s
4999:	learn: 0.0456554	test: 0.0912651	best: 0.0912646 (4998)	total: 27.1s	remaining: 0us

0.09126456033710632

In [44]:

# LightGBM with DART boosting, optimised for MAE.
model_l = LGBMRegressor(n_estimators=5000,
                        learning_rate=.11,
                        objective = 'mae',
                        boosting_type='dart')

# Progress logging goes through the log_evaluation callback: the `verbose`
# argument of fit() was removed in LightGBM 4.0. eval_set is passed as a list
# of (X, y) pairs, the documented form.
model_l.fit(x, y,
            eval_set=[(xv, yv)],
            eval_metric='mape',
            callbacks=[log_evaluation(period=200)])

# MAPE on the hold-out part, which is also the set monitored during training.
best_score = metrics.mean_absolute_percentage_error(yv,model_l.predict(xv))
best_score



[200]	valid_0's mape: 0.0989001	valid_0's l1: 29220.7
[400]	valid_0's mape: 0.0921559	valid_0's l1: 26311.4
[600]	valid_0's mape: 0.0932985	valid_0's l1: 27017.3
[800]	valid_0's mape: 0.0919214	valid_0's l1: 26420.5
[1000]	valid_0's mape: 0.0914923	valid_0's l1: 26175.4
[1200]	valid_0's mape: 0.0917374	valid_0's l1: 26228.2
[1400]	valid_0's mape: 0.0910025	valid_0's l1: 25905.5
[1600]	valid_0's mape: 0.090703	valid_0's l1: 25738
[1800]	valid_0's mape: 0.0914262	valid_0's l1: 26080.5
[2000]	valid_0's mape: 0.0908896	valid_0's l1: 25867.5
[2200]	valid_0's mape: 0.0905603	valid_0's l1: 25651.6
[2400]	valid_0's mape: 0.0907072	valid_0's l1: 25706.8
[2600]	valid_0's mape: 0.090585	valid_0's l1: 25660.7
[2800]	valid_0's mape: 0.0905743	valid_0's l1: 25628.9
[3000]	valid_0's mape: 0.0904446	valid_0's l1: 25536.4
[3200]	valid_0's mape: 0.0907515	valid_0's l1: 25669.1
[3400]	valid_0's mape: 0.09039	valid_0's l1: 25449.7
[3600]	valid_0's mape: 0.0905007	valid_0's l1: 25502.9
[3800]	valid_0's map

0.09057472111899124