# Импорт библиотек

In [38]:
import pandas as pd
from catboost import *
from lightgbm import *
from xgboost import *
import os
import contextlib
from sklearn.model_selection import GridSearchCV, train_test_split,StratifiedKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_percentage_error as mape
import numpy as np
import pickle
import warnings

# Загрузка данных

In [8]:
df_le=pd.read_csv(r"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\папка датафреймов\df_after_delete_le.csv")
df_mte=pd.read_csv(r"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\папка датафреймов\df_after_delete_mte.csv")
df_le=df_le.drop('Unnamed: 0', axis=1)
df_mte=df_mte.drop('Unnamed: 0', axis=1)
df_le.columns

Index(['metro_name', 'time_to_metro', 'num_rooms', 'total_area', 'year_house',
       'kitchen_area', 'repair', 'price', 'floors', 'num_floor', 'address',
       'date', 'coord_width', 'coord_len', 'link', 'distance_to_Kremlin',
       'district', 'metro_line', 'metro_distance', 'distr*time', 'Kre*time',
       'Kre*area', 'year*area', 'year*floors', 'area/num+1'],
      dtype='object')

In [9]:
X_le, y_le=df_le.drop(['metro_name','price','address','date','coord_width','coord_len','link'], axis=1), df_le['price']
X_mte, y_mte=df_mte.drop(['metro_name','price','address','date','coord_width','coord_len','link'], axis=1), df_mte['price']

# Стеккинг моделей

Будем использовать три градиентных бустинга: catboost, xgboost, lightgbm.

In [10]:
cat=CatBoostRegressor(random_state=42, silent=True)
lgbm=LGBMRegressor(verbose=-1, random_state=42, bagging_fraction=0.5)
xgb=XGBRegressor(random_state=42)

Потюним модели. Для xgboost будем использовать данные с mean target encoding, так как xgboost лучше работает на этих данных. Для lightgbm и catboost будем использовать label encoding.

Подбор гиперпараметров для lightgbm.

In [11]:
param_grid={ 'lambda_l1':[1,5,15,30], 'learning_rate':[0.1, 0.2, 0.3],
                         'n_estimators':[1000, 2000, 4000]}
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    gd_lgbm=GridSearchCV(lgbm, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error',verbose=-1 )
    gd_lgbm.fit(X_le, y_le)


Подбор гиперпараметров для catboost.

In [12]:
grid = {'learning_rate': [0.03, 0.1, 0.2],
        'depth': [2,4, 6,8],
        'l2_leaf_reg': [1, 10, 30],
       'iterations':[1000,2000, 4000],
       'learning_rate':[0.1, 0.2, 0.3]}
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    grid_search_result = cat.grid_search(grid, X=X_le, y=y_le, plot=False, verbose=False, cv=5, calc_cv_statistics=False,log_cout=f)

Подбор гиперпараметров для xgboost.

In [13]:
param_grid={ 'lambda_l1':[1,15,30], 'learning_rate':[0.1, 0.2],
                         'n_estimators':[ 2000, 4000], 'depth':[2,4, 8]}
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    gd_xgb=GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error',verbose=1 )
    gd_xgb.fit(X_mte, y_mte)
 

Модели с подобранными гиперпараметрами.

In [14]:
cat=CatBoostRegressor(random_state=42, **grid_search_result['params'], verbose=False)  
xgb=XGBRegressor(random_state=42, **gd_xgb.best_params_)  
lgbm=LGBMRegressor(verbose=-1, random_state=42, bagging_fraction=0.5, **gd_lgbm.best_params_)   

Разделим данные на трейн, тест.

In [51]:
X_ltr,X_lv,y_ltr,y_lv=train_test_split(X_le, y_le, test_size=0.1, shuffle=True, random_state=42)
X_mtr,X_mv,y_mtr,y_mv=train_test_split(X_mte, y_mte, test_size=0.1, shuffle=True, random_state=42)

Обучим модели на кроссвалидации и сохраним их. Из предсказаний на кросс-валидации сделаем датасет для обучения стек-модели.

In [57]:
k=0
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
y,y1,y2,y3=np.zeros(X_ltr.shape[0]),np.zeros(X_ltr.shape[0]),np.zeros(X_ltr.shape[0]),np.zeros(X_ltr.shape[0])


with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for tr_ind, val_ind in skf.split(X_ltr, y_ltr):

            cat=CatBoostRegressor(random_state=42, **grid_search_result['params'], verbose=False)  
            xgb=XGBRegressor(random_state=42, **gd_xgb.best_params_)  
            lgbm=LGBMRegressor(verbose=-1, random_state=42, bagging_fraction=0.5, **gd_lgbm.best_params_)   
            cat.fit(X_ltr.iloc[tr_ind], y_ltr.iloc[tr_ind])
            xgb.fit(X_mtr.iloc[tr_ind],y_mtr.iloc[tr_ind])
            lgbm.fit(X_mtr.iloc[tr_ind], y_mtr.iloc[tr_ind])


            y[val_ind]=y_ltr.iloc[val_ind]
            y1[val_ind]=cat.predict(X_ltr.iloc[val_ind])
            y2[val_ind]=xgb.predict(X_mtr.iloc[val_ind])
            y3[val_ind]=lgbm.predict(X_mtr.iloc[val_ind])



            with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\cat{k}.pkl", "wb") as f:
                pickle.dump(cat, f)
            with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\xgb{k}.pkl", "wb") as f:
                pickle.dump(xgb, f)
            with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\lgbm{k}.pkl", "wb") as f:
                pickle.dump(lgbm, f)

            k+=1
    


Обучение стек модели на кросс-валидации

In [58]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
k=0

df_stack=pd.DataFrame(columns=['y1','y2','y3','y'])
df_stack['y']=y
df_stack['y1']=y1
df_stack['y2']=y2
df_stack['y3']=y3

errors=[]


with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for tr_ind, val_ind in skf.split(X_ltr, y_ltr):

            s_model=Ridge(alpha=1,fit_intercept=False )
            s_model.fit(df_stack.drop('y', axis=1).iloc[tr_ind], df_stack.iloc[tr_ind,3])
            with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\s_model{k}.pkl", "wb") as f:
                pickle.dump(s_model, f)
            k+=1
            errors.append(mape(df_stack.iloc[val_ind,3], s_model.predict(df_stack.drop('y', axis=1).iloc[val_ind])))
print(errors, np.mean(errors))
    


[0.14972913316615152, 0.14706691880950357, 0.14072319471489222, 0.14767801470563993, 0.14711528319328634] 0.14646250891789472


Протестируем нашу стек модель на отложенной выборке

In [59]:
y1=[]
y2=[]
y3=[]

for i in range(5):
    with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\cat{i}.pkl", "rb") as f:
        cat=pickle.load(f)
    with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\xgb{i}.pkl", "rb") as f:
        xgb=pickle.load(f)
    with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\lgbm{i}.pkl", "rb") as f:
        lgbm=pickle.load(f)
    y1.append(cat.predict(X_lv))
    y2.append(xgb.predict(X_mv))
    y3.append(lgbm.predict(X_mv))
    
y1=np.mean(y1, axis=0)
y2=np.mean(y2, axis=0)
y3=np.mean(y3, axis=0)

df_test=pd.DataFrame(columns=['y1','y2','y3'])

df_test['y1']=y1
df_test['y2']=y2
df_test['y3']=y3

In [60]:
res=[]

for i in range(5):
    with open(fr"C:\Users\Stepan\Desktop\ВУЗ\Клиент-серверные\Jupyter notebooks\utils\s_model{i}.pkl", "rb") as f:
        s_model=pickle.load(f)
    res.append(s_model.predict(df_test))

res=np.mean(res, axis=0)
mape(y_lv, res)

0.13736349061687375

0.13735134844436467 Такого качество нашей стек модели на отложенной выборке