# Библиотеки

In [None]:
!pip install catboost

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostRegressor, Pool
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Тюнер

In [5]:
def RMSE(y_true, y_pred):
    """
    Compute RMSE loss function.
    """

    return np.sqrt(np.mean(np.power(y_true - y_pred, 2)))



class Tuner:
    """
    Search best hyperparameters for model.
    """

    def __init__(self, model, train_pool, val_pool, space, common_params={}, fit_params={}, int_params=[]):
        """
        Initialization.

        Parameters:
            model (Model) - ML model with sklearn API.
            train_pool (catboost.Pool, dim=(n,m)) - training data.
            val_pool (catboost.Pool, dim=(k,m)) - validation data.
            space (dict) - space for searching hyperparameters.
            common_params (dict) - common model's parameters.
            fit_params (dict) - parameters for method 'fit'.
            int_params (list) - list of integer parameters.
        """
        
        self.__model = model
        self.__train_pool = train_pool
        self.__val_pool = val_pool
        self.__space = space
        self.__common_params = common_params
        self.__fit_params = fit_params
        self.__int_params = int_params
    

    def __params2int(self, params):
        """
        Transform some hyperopt formats to integer.

        Parameters:
            params (dict) - model parameters.
        """
        
        for par in self.__int_params:
            params[par] = int(params[par])

        return params
    

    def __score(self, params):
        """
        Compute score over dictionary of parameters.

        Parameters:
            params (dict) - model parameters.
        """

        model = self.__model(**self.__common_params, **self.__params2int(params))\
                    .fit(self.__train_pool, 
                         eval_set=self.__val_pool, 
                         **self.__fit_params)
        
        y_pred = model.predict(self.__val_pool)
        loss = - RMSE(self.__val_pool.get_label(), y_pred)
        
        return {"loss": loss, "status": STATUS_OK}
    

    def make_hyperopt(self, max_evals=50):
        """
        Make searching of hyperparameters.

        Parameters:
            max_evals (int) - max number of iterations for optimization.
        """

        self.trials = Trials()
        best = fmin(self.__score,
                    space=self.__space,
                    trials=self.trials,
                    algo=tpe.suggest,
                    max_evals=max_evals)
        self.best_parameters = self.__params2int(best)


# Загрузка данных

In [7]:
data = pd.read_csv("/content/drive/MyDrive/hacaton_pandemic/data/train.csv", sep=";")
data.head(2)

Unnamed: 0,id,position,region,industry,locality,locality_name,education_type,drive_licences,citizenship,schedule,employement_type,age,gender,experience,salary_desired,relocation_ready,travel_ready,retraining_ready,is_worldskills_participant,has_qualifications,completeness_rate,creation_date,modification_date,publish_date,salary
0,0,Специалист,Тульская область,"Государственная служба, некоммерческие организ...",7100000100000,Тула,Незаконченное высшее,[B],Российская Федерация,Полный рабочий день,Полная занятость,21.0,Мужской,3,35000,False,False,True,,,64.0,2020-05-07,2020-05-08,2020-05-07,37500
1,1,Лаборант,Алтайский край,"Государственная служба, некоммерческие организ...",2200000100000,Барнаул,Высшее,[B],Российская Федерация,Полный рабочий день,Полная занятость,26.0,Женский,4,15000,False,True,True,,,88.0,2020-10-21,2020-10-27,2020-10-27,14000


# Настройка параметров

In [13]:
TEXT_FEATURES = ["position"]

CAT_FEATURES = ["region", "industry", "locality", "locality_name", "education_type", "drive_licences",
                "citizenship", "schedule", "employement_type", "gender", "relocation_ready",
                "travel_ready", "retraining_ready", "is_worldskills_participant", "has_qualifications",
                "rr_cl", "tr_cl", "gr_cl", "rrr_cl"
]
CAT_FEATURES = ["region"]

SPACE = {
    "iterations": hp.quniform("iterations", 100, 1000, 50),
    "learning_rate": hp.uniform("learning_rate", 1e-1, 5e-1),
    "depth": hp.quniform("depth", 3, 16, 1),

    "min_child_samples": hp.quniform("min_child_samples", 10, 100, 10),

    "reg_lambda": hp.uniform("reg_lambda", 0, 10),

    
    "random_strength": hp.lognormal("random_strength", 1e-9, 1),
    "bagging_temperature": hp.quniform("bagging_temperature", 0, 10, 1),

    "colsample_bylevel": hp.uniform("colsample_bylevel", 0, 1),
    "subsample": hp.uniform("subsample", 0, 1)
}

COMMON_PARAMS = {
    "random_state": 42,
    "task_type": "CPU", #"GPU"
    "objective": "RMSE",
    "eval_metric": "RMSE",
    
    "od_type": "Iter",
    "od_wait": 20,
    "use_best_model": True
}

FIT_PARAMS = {
    "verbose": 0
}

INT_PARAMS = [
    "iterations", "depth", "min_child_samples", "bagging_temperature"
]


MODEL = CatBoostRegressor

# Подготовка данных

In [14]:
X, y = data.loc[:, ["region"]].iloc[:1000].fillna("unknown"), data["salary"].iloc[:1000]

kf = KFold(n_splits=3, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    shuffle=True,
                                                    random_state=42)

train_pool = Pool(data=X_train, label=y_train, cat_features=CAT_FEATURES)#, text_features=TEXT_FEATURES)
val_pool = Pool(data=X_test, label=y_test, cat_features=CAT_FEATURES)#, text_features=TEXT_FEATURES)

# Тюнинг гиперпараметров

In [15]:
tuner = Tuner(MODEL, train_pool, val_pool, SPACE, COMMON_PARAMS, FIT_PARAMS, INT_PARAMS)
tuner.make_hyperopt(max_evals=4)

100%|██████████| 4/4 [00:00<00:00, 13.46it/s, best loss: -37394.23824603276]
