In [172]:
import pandas as pd
from sklearn import metrics,model_selection

In [65]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [243]:
df = pd.read_parquet('../../data/sample_data.parquet').dropna(subset='rubm2')
df = df.drop(['datetime','publish_delta','url','id','text','Город','title','img_list','metro_branch','metro_name','metro_dist'],axis=1)

In [68]:
df['rubm2'].median()

290000.0

In [69]:
df['rooms'] = df['rooms'].where(lambda x: x.isin(df['rooms'].value_counts().iloc[:6].index),'Other')

In [70]:
df.groupby('rooms')['rubm2'].median()

rooms
1-комн. апарт.    308333.333333
1-комн. кв.       291428.571429
2-комн. апарт.    347406.914894
2-комн. кв.       284482.758621
3-комн. апарт.    476312.525837
3-комн. кв.       284854.070661
Other             338095.238095
Name: rubm2, dtype: float64

In [71]:
df['price'].isna().mean()

0.0

In [72]:
_metrics = {}

# baseline v0

In [74]:
v0 = metrics.mean_absolute_percentage_error(df['price'],df['m2']*df['rubm2'].median())
_metrics.update(
    {'baseline':v0}
)

# baseline v0.1

In [105]:
base_data = df.copy()

In [106]:
base_data['is_apart'] = base_data['rooms'].str.contains('апарт')

In [107]:
base_data['n_rooms'] = base_data['rooms'].str.extract('(\d)').fillna(0).astype(float)

In [184]:
target = base_data['price']
data = base_data.select_dtypes(exclude='O').drop(['datetime','lat','long','price','rubm2'],axis=1)
# data['rub_m2'] = base_data.groupby(['n_rooms','max_floor','is_apart'])['rubm2'].transform('mean')
data = data.fillna(data.median())

In [197]:
x,xv,y,yv = model_selection.train_test_split(data,target,train_size=.25)

In [202]:
from catboost import CatBoostRegressor

In [207]:
model = CatBoostRegressor(2000)

In [208]:
model.fit(x,y,verbose=500)

Learning rate set to 0.024107
0:	learn: 27179240.0953045	total: 1.6ms	remaining: 3.2s
500:	learn: 5832598.4640858	total: 944ms	remaining: 2.82s
1000:	learn: 3846050.6264637	total: 1.84s	remaining: 1.84s
1500:	learn: 3045229.7303033	total: 4.5s	remaining: 1.49s
1999:	learn: 2499985.9407437	total: 6.8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f2f0b2f2e20>

In [209]:
_metrics.update(
   {'base_model': metrics.mean_absolute_percentage_error(yv,model.predict(xv))}
)

# v1

In [289]:
v1_data = df.copy()

In [290]:
v1_data['advanced_home_info'] = v1_data['advanced_home_info'].apply(lambda x: pd.DataFrame(x).set_index('key')['value'].to_dict())
v1_data['year_of_build'] = v1_data['advanced_home_info'].apply(lambda x: x.get('Год_ввода_в_эксплуатацию'))
v1_data['rent_counts'] = v1_data['advanced_home_info'].apply(lambda x: x.get('Количество_квартир'))
v1_data['n_enterss'] = v1_data['advanced_home_info'].apply(lambda x: x.get('Количество_подъездов'))
v1_data['m2_house'] = v1_data['advanced_home_info'].apply(lambda x: x.get('Площадь_многоквартирного_дома,_кв.м'))
v1_data['n_rooms'] = v1_data['rooms'].str.extract('(\d)').fillna(0).astype(float)
v1_data['is_apart'] = v1_data['rooms'].str.contains('апарт')


In [291]:
v1_data['Округ'] = v1_data.groupby('Округ')['price'].transform('mean')
v1_data['Метро'] = v1_data.groupby('Метро')['price'].transform('mean')
v1_data['Район'] = v1_data.groupby('Район')['price'].transform('mean')
v1_data['postcode'] = v1_data['postcode'].explode().astype(float).groupby(level=0).mean() // 100

In [292]:
import numpy as np

In [295]:
data = v1_data.select_dtypes(exclude='O').drop(['price','rubm2'],axis=1)
target = v1_data['price']

In [296]:
x,xv,y,yv = model_selection.train_test_split(data,target,train_size=.25)

In [297]:
model_v1 = CatBoostRegressor(2000)

In [298]:
model.fit(x,y,verbose=500)

Learning rate set to 0.024107
0:	learn: 20730488.5235801	total: 8.93ms	remaining: 17.9s
500:	learn: 3415346.4902779	total: 523ms	remaining: 1.56s
1000:	learn: 2310476.6452199	total: 1.11s	remaining: 1.11s
1500:	learn: 1749910.6952148	total: 1.71s	remaining: 568ms
1999:	learn: 1365260.3651847	total: 2.4s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f2f0b2f2e20>

In [299]:
_metrics.update(
   {'model_v1': metrics.mean_absolute_percentage_error(yv,model.predict(xv))}
)

In [306]:
pd.Series(model.feature_importances_,model.feature_names_).sort_values()

is_max_floor       0.146777
is_apart           0.289264
has_park           0.382278
is_lot             0.425708
Округ              1.564464
lat                1.767759
postcode           1.825455
dist_to_center     2.643301
is_jk              2.925994
long               2.954842
n_rooms            3.415279
floor              3.581719
max_floor          5.907084
Метро              7.023930
Район             22.763073
m2                42.383075
dtype: float64

# Result

In [300]:
_metrics

{'baseline': 0.25244297885044187,
 'base_model': 0.20540931830145231,
 'model_v1': 0.13952217648451065}