### Импорт библиотек

In [1]:
import numpy as np
import pyarrow
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score

### Загрузка данных

In [2]:
# Train split: main features, extra features, and the labels used for fitting.
train = pd.read_parquet('data/train_main_features.parquet')
train_ext = pd.read_parquet('data/train_extra_features.parquet')
target = pd.read_parquet('data/train_target.parquet')

# Test split: the same two feature tables, plus the submission template
# that is later copied and filled with predictions.
test = pd.read_parquet('data/test_main_features.parquet')
test_ext = pd.read_parquet('data/test_extra_features.parquet')
sample_submit = pd.read_parquet('data/sample_submit.parquet')

# Report (rows, columns) of every feature table as a quick sanity check.
for caption, frame in (
    ('Тренировочные данные:', train),
    ('Тренировочные данные extra:', train_ext),
    ('Тестовые данные:', test),
    ('Тестовые данные extra:', test_ext),
):
    print(caption, frame.shape)

Тренировочные данные: (750000, 200)
Тренировочные данные extra: (750000, 2242)
Тестовые данные: (250000, 200)
Тестовые данные extra: (250000, 2242)


### Объединение данных

In [3]:
# Append the extra feature columns to the main tables. Columns that already
# exist in the main table are skipped so that no column name is duplicated.
cols_to_add_train = [col for col in train_ext.columns if col not in train.columns]
train_full = pd.concat([train, train_ext[cols_to_add_train]], axis=1)

cols_to_add_test = [col for col in test_ext.columns if col not in test.columns]
test_full = pd.concat([test, test_ext[cols_to_add_test]], axis=1)

# pd.concat(axis=1) aligns rows on the index with an outer join, so tables with
# different indexes would silently gain NaN-padded rows instead of failing.
assert len(train_full) == len(train) == len(train_ext), (
    "train main/extra tables are not row-aligned"
)
assert len(test_full) == len(test) == len(test_ext), (
    "test main/extra tables are not row-aligned"
)

# The model is fitted on train_full and applied to test_full, and the
# categorical column list is derived from train_full only, so both frames must
# expose the same columns in the same order.
assert list(train_full.columns) == list(test_full.columns), (
    "train and test feature columns differ"
)

In [4]:
# Show (rows, columns) of the merged train and test tables.
for caption, frame in (
    ('Полные тренировочные данные:', train_full),
    ('Полные тестовые данные:', test_full),
):
    print(caption, frame.shape)

Полные тренировочные данные: (750000, 2441)
Полные тестовые данные: (250000, 2441)


In [5]:
# Total number of missing cells across the whole merged training table
# (per-column counts first, then summed into a single number).
train_full.isna().sum().sum()

np.int64(1027782741)

In [6]:
# Categorical columns are identified purely by their name prefix.
cat_feature_names = [
    name for name in train_full.columns if name.startswith("cat_feature")
]

# Cast those columns to str in both splits, in place. After the cast, missing
# values are ordinary strings as well.
for frame in (train_full, test_full):
    frame[cat_feature_names] = frame[cat_feature_names].astype(str)

### Обучение модели (13 минут обучения на RTX 4080)

In [7]:
# Training pool: every column except the customer identifier is a feature, and
# the label table is used with its identifier column removed in the same way.
train_pool = Pool(
    data=train_full.drop(columns="customer_id"),
    label=target.drop(columns="customer_id"),
    cat_features=cat_feature_names,
)

# Hyperparameters gathered in one place so they are easy to find and tune.
catboost_params = {
    "iterations": 1000,
    "depth": 6,
    "learning_rate": 0.1,
    "loss_function": "MultiLogloss",
    "random_seed": 42,   # fixed seed for repeatable runs
    "verbose": 100,      # log the learn loss every 100 iterations
    "task_type": "GPU",  # training is configured to run on the GPU
}

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool)

0:	learn: 0.4839986	total: 1.08s	remaining: 17m 57s
100:	learn: 0.0855812	total: 1m 29s	remaining: 13m 15s
200:	learn: 0.0832911	total: 2m 56s	remaining: 11m 42s
300:	learn: 0.0818387	total: 4m 25s	remaining: 10m 15s
400:	learn: 0.0808172	total: 5m 51s	remaining: 8m 45s
500:	learn: 0.0801150	total: 7m 15s	remaining: 7m 13s
600:	learn: 0.0796137	total: 8m 35s	remaining: 5m 42s
700:	learn: 0.0791833	total: 9m 53s	remaining: 4m 13s
800:	learn: 0.0787886	total: 11m 8s	remaining: 2m 46s
900:	learn: 0.0784760	total: 12m 24s	remaining: 1m 21s
999:	learn: 0.0782026	total: 13m 39s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x171f50732f0>

### Предсказание и генерация submission

In [8]:
# Test pool built the same way as the training pool, without labels.
test_pool = Pool(
    data=test_full.drop(columns="customer_id"),
    cat_features=cat_feature_names,
)

# Raw model scores are requested instead of probabilities.
# NOTE(review): fine for rank-based metrics; confirm that the expected
# submission format does not require probabilities.
test_predict = model.predict(test_pool, prediction_type="RawFormulaVal")

# Predictions are written into the template purely by position, so the template
# rows must follow the same customer order as the test features and there must
# be one prediction column per template column after the identifier.
assert np.array_equal(
    sample_submit["customer_id"].to_numpy(),
    test_full["customer_id"].to_numpy(),
), "sample_submit rows are not in the same customer order as the test data"
assert np.shape(test_predict) == (len(sample_submit), sample_submit.shape[1] - 1), (
    "prediction matrix does not match the submission template shape"
)

result_df = sample_submit.copy()
# Column 0 holds customer_id; every remaining column receives a prediction.
result_df.iloc[:, 1:] = test_predict
result_df['customer_id'] = result_df['customer_id'].astype('int32')
result_df.to_parquet('submission.parquet', index=False)