In [1]:
import pandas as pd
from sklearn import metrics,model_selection,pipeline

In [2]:
import matplotlib.pyplot as plt
# Apply matplotlib's built-in 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [3]:
# Columns removed right after loading (identifiers, timestamps, free text,
# image lists and the raw metro fields).
DROPPED_COLUMNS = [
    'datetime', 'publish_delta', 'url', 'id', 'text', 'Город', 'title',
    'img_list', 'metro_branch', 'metro_name', 'metro_dist',
]

# Load the sample, keep only rows where 'rubm2' is present, then drop the
# columns listed above.  'rubm2' is presumably the price per square metre that
# is modelled later -- TODO confirm.
df = (
    pd.read_parquet('../../data/sample_data.parquet')
    .dropna(subset='rubm2')
    .drop(columns=DROPPED_COLUMNS)
)

In [110]:
def general_prepare(df, top_n_rooms=6):
    """Derive the basic listing features on a copy of ``df``.

    The following columns are (re)written:

    * ``rooms`` -- only the ``top_n_rooms`` most frequent labels are kept; every
      other value (missing values included) becomes ``'Other'``.
    * ``postcode`` -- every cell is exploded, cast to float, averaged per row
      and floor-divided by 100.
    * ``is_apart`` -- ``True`` where the ``rooms`` label contains ``'апарт'``.
    * ``n_rooms`` -- the first digit found in the ``rooms`` label as a float,
      ``0.0`` when the label has no digit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``rooms`` and ``postcode`` columns.  ``postcode`` cells
        are assumed to be list-like collections of numeric strings -- TODO
        confirm against the source data.
    top_n_rooms : int, default 6
        Number of most frequent ``rooms`` labels to keep as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    data = df.copy()

    # Collapse the long tail of rare room labels into a single bucket.
    frequent_rooms = data['rooms'].value_counts().iloc[:top_n_rooms].index
    data['rooms'] = data['rooms'].where(data['rooms'].isin(frequent_rooms), 'Other')

    # explode() repeats the row label for every element of a cell, so grouping
    # by index level 0 folds the elements back into one value per row.
    data['postcode'] = (
        data['postcode'].explode().astype(float).groupby(level=0).mean() // 100
    )

    data['is_apart'] = data['rooms'].str.contains('апарт')

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    # expand=False yields a Series rather than a one-column DataFrame, and
    # to_numeric converts the digit before filling so that the fill value never
    # has to be written into a string-typed column.
    first_digit = data['rooms'].str.extract(r'(\d)', expand=False)
    data['n_rooms'] = pd.to_numeric(first_digit).fillna(0).astype(float)

    return data

In [111]:
def get_meta_features(data, target='rubm2'):
    """Compute per-location count and mean-target features.

    For each of the grouping columns ``'Округ'``, ``'Метро'`` and ``'Район'``
    two row-aligned features are produced: the number of non-null ``target``
    values in the row's group, and the mean ``target`` of that group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the three grouping columns and the ``target`` column.
    target : str, default 'rubm2'
        Name of the column that is aggregated per group.

    Returns
    -------
    pandas.DataFrame
        Columns ``area_obj_cnt``, ``metro_obj_cnt``, ``district_obj_cnt``
        (group counts) followed by ``Округ``, ``Метро``, ``Район`` (group means
        of ``target``), indexed like the frame the features were computed from.

    Notes
    -----
    Earlier versions ignored ``data`` and always read the module-level ``df``.
    That behaviour is kept only as a fallback for callers that pass a frame
    without the ``target`` column; it emits a warning because the result then
    depends on notebook state rather than on the argument.
    """
    import warnings

    if target in data.columns:
        source = data
    else:
        warnings.warn(
            f"get_meta_features: column {target!r} is missing from the passed "
            "frame; falling back to the module-level `df`. Pass the target "
            "column together with the grouping columns instead.",
            stacklevel=2,
        )
        source = df  # legacy behaviour: notebook-global frame

    # grouping column -> name of its count feature
    count_names = {
        'Округ': 'area_obj_cnt',
        'Метро': 'metro_obj_cnt',
        'Район': 'district_obj_cnt',
    }

    # Features are collected in a fresh frame so `source` is never mutated and
    # the grouping columns stay intact while they are still needed as keys.
    features = pd.DataFrame(index=source.index)
    for group_col, count_name in count_names.items():
        features[count_name] = source.groupby(group_col)[target].transform('count')
    for group_col in count_names:
        features[group_col] = source.groupby(group_col)[target].transform('mean')

    return features[['area_obj_cnt', 'metro_obj_cnt', 'district_obj_cnt', 'Округ', 'Метро', 'Район']]

In [117]:
def adv_home_prepare(df):
    """Extract selected building attributes from the advanced-home-info cells.

    Every cell of ``df`` is turned into a ``{key: value}`` dict by loading it
    into a DataFrame and indexing its ``'value'`` column by its ``'key'``
    column.  Four entries are then looked up per row; an entry that is absent
    for a row yields a missing value.

    Parameters
    ----------
    df : pandas.Series
        One cell per listing.  Each cell must be something ``pd.DataFrame``
        can load into a table with ``'key'`` and ``'value'`` columns (for
        example a list of ``{'key': ..., 'value': ...}`` records).

    Returns
    -------
    pandas.DataFrame
        Columns ``year_of_build``, ``rent_counts``, ``n_enterss`` and
        ``m2_house``, indexed like ``df``.  Values are returned exactly as
        stored in the cells; no type conversion is applied.
    """
    def _as_mapping(cell):
        # key/value records -> plain dict for cheap lookups
        return pd.DataFrame(cell).set_index('key')['value'].to_dict()

    per_row_info = df.apply(_as_mapping).rename('advacned_info')

    # (output column, key inside the per-row dict)
    wanted = (
        ('year_of_build', 'Год_ввода_в_эксплуатацию'),
        ('rent_counts', 'Количество_квартир'),
        ('n_enterss', 'Количество_подъездов'),
        ('m2_house', 'Площадь_многоквартирного_дома,_кв.м'),
    )

    extracted = {}
    for column_name, source_key in wanted:
        # Bind the key as a default argument so each lambda keeps its own key.
        extracted[column_name] = per_row_info.apply(
            lambda info, key=source_key: info.get(key)
        )

    return pd.DataFrame(extracted)

In [118]:
def data_filter(df):
    """Keep the non-object columns of ``df`` and fill gaps with column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the object-dtype columns, where every missing
        value is replaced by the mean of its column.
    """
    # The keyword is `exclude`; the previous spelling `execlude` made
    # select_dtypes raise TypeError on every call.
    non_object = df.select_dtypes(exclude='O')
    return non_object.fillna(non_object.mean())

In [129]:
def data_pipeline(df):
    """Run the whole feature-preparation chain on a raw listings frame.

    Steps:

    1. ``general_prepare`` derives the basic listing features.
    2. ``adv_home_prepare`` extracts building attributes from the
       ``advanced_home_info`` column.
    3. ``get_meta_features`` builds per-location count / mean features.
    4. The raw location columns are dropped and both feature frames are joined
       back on the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings; must contain the columns read by the helpers above
        (``rooms``, ``postcode``, ``advanced_home_info``, ``Округ``, ``Метро``,
        ``Район``).

    Returns
    -------
    pandas.DataFrame
        The prepared frame with the location columns replaced by their
        aggregated counterparts.
    """
    location_cols = ['Округ', 'Метро', 'Район']
    target_col = 'rubm2'

    data = general_prepare(df)
    home_data = adv_home_prepare(data['advanced_home_info'])

    # get_meta_features aggregates 'rubm2' per location group, so the target
    # has to travel with the grouping columns.  It is forwarded only when it is
    # present, so frames without it are passed on exactly as before.
    meta_input_cols = location_cols + [c for c in [target_col] if c in data.columns]
    meta_data = get_meta_features(data[meta_input_cols])

    return data.drop(location_cols, axis=1).join(home_data).join(meta_data)

In [126]:
from catboost import CatBoostRegressor

In [None]:
def fit_model(data, target, train_size=.85, random_state=None):
    """Prepare features, hold out a validation split and fit a CatBoost regressor.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings frame accepted by ``data_pipeline``.
    target : str
        Name of the column to predict; it is removed from the feature matrix.
    train_size : float, default 0.85
        Share of rows used for training; the rest is passed to CatBoost as the
        evaluation set.
    random_state : int or None, default None
        Seed for the train/validation split.  ``None`` keeps the previous
        behaviour (a different split on every call); pass an integer to make
        the split reproducible.

    Returns
    -------
    catboost.CatBoostRegressor
        The fitted model.

    Notes
    -----
    NOTE(review): ``data_pipeline`` runs before the split, and its per-location
    features are group means of ``rubm2`` computed over all rows.  If ``target``
    is ``'rubm2'`` the validation rows therefore contribute to the features they
    are evaluated on -- confirm whether that is intended.
    """
    # A distinct local name avoids confusion with the notebook-level `df`.
    prepared = data_pipeline(data)
    X = prepared.drop(target, axis=1)
    Y = prepared[target]

    x, xv, y, yv = model_selection.train_test_split(
        X, Y, train_size=train_size, random_state=random_state
    )

    model = CatBoostRegressor(iterations=2000, verbose=500)
    model.fit(x, y, verbose=5000, eval_set=(xv, yv))

    return model