In [1]:
import os
import random
import typing as tp
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

def set_seed(seed):
    """Seed NumPy's global RNG and the stdlib `random` module with `seed`.

    The seed is also exported as the PYTHONHASHSEED environment variable.
    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    this assignment presumably only affects child processes — confirm.
    """
    # Same call order as before: NumPy first, then the stdlib generator.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

# Presumably the hold-out fraction for a train/validation split; it is not
# referenced in the cells visible here.
TEST_SIZE = 0.15
# Seed the RNGs once, before any data is loaded or any model is built.
set_seed(seed=560)

In [2]:
# Parquet file holding the test split, relative to the working directory.
TEST_PATH = 'data/test_data.pqt'
test = pd.read_parquet(path=TEST_PATH)

In [3]:
def _attach_prev_month(current: pd.DataFrame, previous: pd.DataFrame) -> pd.DataFrame:
    """Rename `current`'s start_cluster to `target` and join the previous month's start_cluster as `prev_month`."""
    prev_cluster = previous[['id', 'start_cluster']].rename(columns={'start_cluster': 'prev_month'})
    # A left join keeps exactly the rows of `current`, in their original order.
    # validate= makes pandas raise if `previous` holds duplicate ids, which
    # would otherwise silently multiply rows.
    merged = current.rename(columns={'start_cluster': 'target'}).merge(
        prev_cluster, on='id', how='left', validate='many_to_one',
    )
    # Ids with no row in the previous month get the same 0 placeholder that
    # predict_start uses for every other missing value.
    merged['prev_month'] = merged['prev_month'].fillna(0)
    return merged


def predict_start(
    dataset_df: pd.DataFrame,
    cat_features: tp.List[str],
    task_type: str = 'GPU',
    model = None,
    params = None,
):
    """
    Fill in `start_cluster` for the month_6 rows with a classifier trained on month_5.

    Training rows are the month_5 rows: the target is their `start_cluster`
    and the month_4 `start_cluster` of the same `id` is added as the
    categorical feature `prev_month`. The fitted model is then applied to the
    month_6 rows, with the month_5 `start_cluster` as `prev_month`.

    Args:
        dataset_df: frame with at least `id`, `date` and `start_cluster`
            columns; `date` takes the values 'month_4', 'month_5', 'month_6'.
            It is not modified.
        cat_features: names of the categorical columns. 'start_cluster' is
            removed from the list and 'prev_month' is appended.
        task_type: CatBoost `task_type`, used only for the default `params`.
        model: optional estimator exposing `fit(X, y, verbose=...)` and
            `predict(X)`. When None, a CatBoostClassifier is built from `params`.
        params: optional CatBoostClassifier keyword arguments. When given they
            are used as-is, so the caller must include `cat_features` itself.

    Returns:
        (dataset, model): a copy of `dataset_df` with every NaN replaced by 0
        and the month_6 `start_cluster` overwritten by the predictions, plus
        the fitted model.

    Raises:
        pandas.errors.MergeError: if an `id` occurs more than once within
            month_4 or month_5.
    """
    # fillna() returns a new frame, so the caller's data stays untouched.
    dataset = dataset_df.fillna(0)
    cat_features = [feat for feat in cat_features if feat != 'start_cluster']
    cat_features = cat_features + ['prev_month']
    if not params:
        params = {
            'random_state': 560,
            'task_type': task_type,
            'cat_features': cat_features,
        }
    # `is None` rather than truthiness: a supplied estimator is always used.
    if model is None:
        model = CatBoostClassifier(**params)

    is_month_6 = dataset.date == 'month_6'
    month_4 = dataset[dataset.date == 'month_4']
    month_5 = dataset[dataset.date == 'month_5']
    month_6 = dataset[is_month_6]

    train_dataset = _attach_prev_month(month_5, month_4)
    pred_dataset = _attach_prev_month(month_6, month_5)

    service_cols = ['id', 'date', 'target']
    model.fit(train_dataset.drop(columns=service_cols), train_dataset['target'], verbose=False)

    if is_month_6.any():
        predicted = model.predict(pred_dataset.drop(columns=service_cols))
        # Multiclass predictions may come back as an (n, 1) array; flatten
        # them. pred_dataset is row-aligned with the month_6 rows, so a
        # positional write is safe.
        dataset.loc[is_month_6, 'start_cluster'] = np.ravel(predicted)
    return dataset, model

In [4]:
# Categorical columns handed to the classifier. predict_start() removes
# "start_cluster" from this list itself and appends its own "prev_month".
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

# Only the imputed frame is needed here; the second return value is discarded.
dataset_test, _ = predict_start(dataset_df=test, cat_features=cat_cols)
# Copy the imputed column back; the assignment aligns on the shared index.
# predict_start() also replaces NaNs with 0, so a missing start_cluster in
# an earlier month comes back as 0.
test['start_cluster'] = dataset_test['start_cluster']

In [None]:
# Persist the test split with start_cluster filled in; the index is not written.
test.to_parquet(path="data/test_data_filled.pqt", index=False)