In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import annotations

import os
from functools import partial
from typing import Optional

import catboost
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

from mts_ml_cup.modeling import catboost as cb
from mts_ml_cup.preprocessing import urls as u

# Подготовка

In [2]:
%%time
# Read the train and test parquet tables from the processed-data directory.
train, test = (
    pl.read_parquet(path)
    for path in ("../data/processed/train.pq", "../data/processed/test.pq")
)

CPU times: user 32.6 ms, sys: 20.5 ms, total: 53.1 ms
Wall time: 21.3 ms


In [5]:
%%time
# URL normalizer: u.clean_url bound to the preprocessors listed below.
host_preprocessors = [
    u.decode_from_punycode,
    u.lower,
    u.replace_hyphens_with_dots,
]
url_cleaner = partial(u.clean_url, preprocessors=host_preprocessors)

sessions = pl.read_parquet("../data/processed/sessions.pq")

# Run the (Python-level) cleaner once per distinct host instead of once per session row,
# then map the cleaned value back onto every row with a join.
cleaned_hosts = (
    sessions
    .select("url_host")
    .unique()
    .with_columns(pl.col("url_host").apply(url_cleaner).alias("url_cleaned"))
)
sessions = (
    sessions
    .join(other=cleaned_hosts, on="url_host", how="left")
    .select(pl.exclude("url_host"))
    # The cleaned value stays available as url_cleaned and is also exposed as url_host.
    .with_columns(pl.col("url_cleaned").alias("url_host"))
)

CPU times: user 3min 24s, sys: 1min 56s, total: 5min 20s
Wall time: 47.5 s


In [6]:
%%time
# Integer ids for hosts and dates, attached to every session row.
#
# The ids are row numbers over the distinct values. unique() does not promise a stable
# row order, so the distinct values are sorted first: the same input then always yields
# the same ids (and date_id follows the sort order of `date`). Without the sort, a re-run
# could renumber hosts and silently invalidate previously saved folds and models.
url_ids = (
    sessions
    .select("url_host")
    .unique()
    .sort("url_host")
    .with_row_count()
    .select(["url_host", pl.col("row_nr").alias("url_id")])
)
date_ids = (
    sessions
    .select("date")
    .unique()
    .sort("date")
    .with_row_count()
    .select(["date", pl.col("row_nr").alias("date_id")])
)

sessions = (
    sessions
    .join(other=url_ids, how="left", on="url_host")
    .join(other=date_ids, how="left", on="date")
)

CPU times: user 2min 51s, sys: 10min 53s, total: 13min 44s
Wall time: 39.9 s


In [9]:
%%time
# Sessions of users present in `train`, with both targets known, written to parquet.
train_flagged = train.with_columns(pl.lit(True).alias("is_train"))
both_targets_known = pl.col("is_male").is_not_null() & pl.col("age_bucket").is_not_null()

(
    sessions
    # Left join + filter on the flag keeps only rows whose user_id matched a train row.
    .join(train_flagged, on="user_id", how="left")
    .filter(pl.col("is_train"))
    .select(pl.exclude("is_train"))
    .filter(both_targets_known)
    # age_bucket values below 1 are raised to 1.
    .with_columns(pl.col("age_bucket").clip_min(1))
    .write_parquet("../data/len1/train.pq")
)

CPU times: user 1min 46s, sys: 2min 20s, total: 4min 7s
Wall time: 1min 31s


In [10]:
%%time
# Sessions of users present in `test`, written to parquet.
test_flagged = test.with_columns(pl.lit(True).alias("is_test"))

(
    sessions
    # Left join + filter on the flag keeps only rows whose user_id matched a test row.
    .join(test_flagged, on="user_id", how="left")
    .filter(pl.col("is_test"))
    .select(pl.exclude("is_test"))
    .write_parquet("../data/len1/test.pq")
)

CPU times: user 51.6 s, sys: 43.6 s, total: 1min 35s
Wall time: 43.5 s


# Нарезание фолдов

In [2]:
%%time
# Only the columns used for fold construction are read back from the train sessions file.
fold_source_columns = [
    "region_id", "city_id",
    "manufacturer_id", "model_id", "type_id", "os_id", "price",
    "date_id", "part_of_day_id", "request_cnt", "url_id",
    "user_id", "is_male", "age_bucket",
]
train = pl.read_parquet("../data/len1/train.pq", columns=fold_source_columns)

CPU times: user 21.4 s, sys: 4.14 s, total: 25.5 s
Wall time: 3.11 s


## kfold

In [3]:
# One row per (geo, device, url, targets) combination, carrying the mean price and the
# number of distinct users that produced the combination.
group_keys = [
    "region_id", "city_id",
    "manufacturer_id", "model_id", "type_id", "os_id",
    "url_id",
    "is_male", "age_bucket",
]
train_short = train.groupby(group_keys).agg(
    [
        pl.col("price").mean(),
        pl.col("user_id").n_unique().alias("n_users"),
    ]
)

In [6]:
from typing import Iterator, Tuple

from sklearn.model_selection import KFold


def kfold_split(
    x: pl.DataFrame,
    n_splits: int = 5,
    random_state: int = 777,
) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """Yield shuffled K-fold (train_indices, validation_indices) pairs over the rows of ``x``.

    Args:
        x: Frame whose rows are split; it is passed straight to ``KFold.split``.
        n_splits: Number of folds.
        random_state: Seed for the shuffle; the default keeps the splits reproducible.

    Returns:
        An iterator of ``n_splits`` pairs of index arrays, as produced by ``KFold.split``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return splitter.split(x)

In [8]:
# Row weight = the row's share of users within its split, scaled by the number of rows,
# so the weights of each written frame average to 1.
fold_weight = (
    pl.col("n_users") / pl.col("n_users").sum() * pl.col("n_users").count()
).alias("weight")

for i, (train_idx, val_idx) in tqdm(enumerate(kfold_split(train_short))):
    fold_dir = f"../data/len1/folds-k/{i}"
    # write_parquet does not create missing parent directories; make the loop work on a
    # fresh checkout.
    os.makedirs(fold_dir, exist_ok=True)

    # The weight expression is evaluated separately on each subset, so train and
    # validation weights are normalized independently.
    train_short[train_idx].with_columns(fold_weight).write_parquet(f"{fold_dir}/train.pq")
    train_short[val_idx].with_columns(fold_weight).write_parquet(f"{fold_dir}/val.pq")

5it [00:28,  5.67s/it]


## fold by user

In [6]:
# Precomputed per-user fold assignment table.
folds = pd.read_csv(filepath_or_buffer="../data/processed/folds.csv")

In [10]:
# Split the train sessions by user according to the precomputed fold table: for fold i,
# users flagged in fold_{i}_tr go to train.pq and users flagged in fold_{i}_va to val.pq.
for i in tqdm(range(5)):
    train_users = folds.loc[folds[f"fold_{i}_tr"] == 1, "user_id"].tolist()
    val_users = folds.loc[folds[f"fold_{i}_va"] == 1, "user_id"].tolist()

    fold_dir = f"../data/len1/folds/{i}"
    # write_parquet does not create missing parent directories; make the loop work on a
    # fresh checkout.
    os.makedirs(fold_dir, exist_ok=True)

    train.filter(pl.col("user_id").is_in(train_users)).write_parquet(f"{fold_dir}/train.pq")
    train.filter(pl.col("user_id").is_in(val_users)).write_parquet(f"{fold_dir}/val.pq")

100%|█████████████████████████████████████████████| 5/5 [02:19<00:00, 27.80s/it]


### Подготовка фолдов к обучению

In [5]:
# Columns kept in the deduplicated ("short") per-fold frames.
short_fold_columns = [
    "region_id", "city_id",
    "manufacturer_id", "model_id", "type_id", "os_id", "price",
    "url_id",
    "user_id", "is_male", "age_bucket",
]

for fold in tqdm(range(5)):
    train = pl.read_parquet(f"../data/len1/folds/{fold}/train.pq", columns=short_fold_columns)
    val = pl.read_parquet(f"../data/len1/folds/{fold}/val.pq", columns=short_fold_columns)

    # The sex and age frames are built from exactly the same columns and the same
    # deduplication, so each split is deduplicated once and the result is shared.
    # NOTE(review): both frames carry both targets, which makes the *-sex and *-age files
    # identical; possibly each was meant to drop the other target -- confirm before changing.
    train_sex = train_age = train.select(short_fold_columns).unique()
    val_sex = val_age = val.select(short_fold_columns).unique()

    out_dir = f"../data/len1/folds-short/{fold}"
    # write_parquet does not create missing parent directories; make the loop work on a
    # fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    train_sex.write_parquet(f"{out_dir}/train-sex.pq")
    val_sex.write_parquet(f"{out_dir}/val-sex.pq")

    train_age.write_parquet(f"{out_dir}/train-age.pq")
    val_age.write_parquet(f"{out_dir}/val-age.pq")

100%|█████████████████████████████████████████████| 5/5 [03:03<00:00, 36.64s/it]


# Обучение

In [2]:
def _fit_and_save(
    loss: str,
    target: str,
    model_path: str,
    train: pd.DataFrame,
    val: pd.DataFrame,
    features: list,
    cat_features: list,
) -> None:
    """Train one weighted CatBoost classifier, save it and print its feature importances.

    Args:
        loss: CatBoost loss name, used both as the training loss and the evaluation metric.
        target: Name of the label column in ``train`` / ``val``.
        model_path: Destination file for the trained model.
        train: Training frame with the feature columns, ``target`` and ``weight``.
        val: Validation frame with the same columns; used as the early-stopping eval set.
        features: Feature columns passed to the model, in order.
        cat_features: Subset of ``features`` treated as categorical.
    """
    model = catboost.CatBoostClassifier(
        loss_function=loss,
        eval_metric=loss,
        iterations=1_000,
        early_stopping_rounds=20,
        random_seed=777,
    )
    model.fit(
        catboost.Pool(
            data=train.loc[:, features],
            label=train[target],
            cat_features=cat_features,
            weight=train["weight"],
        ),
        eval_set=catboost.Pool(
            data=val.loc[:, features],
            label=val[target],
            cat_features=cat_features,
            weight=val["weight"],
        ),
        verbose=20,
    )
    model.save_model(model_path)
    print(pd.Series(model.feature_importances_, index=model.feature_names_).sort_values(ascending=False))


def fit(fold: int = 0) -> None:
    """Train and save the sex and age models for one k-fold split.

    Reads ``train.pq`` / ``val.pq`` of the given fold from ``../data/len1/folds-k``,
    fits a ``Logloss`` classifier on ``is_male`` and a ``MultiClass`` classifier on
    ``age_bucket`` (both weighted by the ``weight`` column, early-stopped on the
    validation part), and stores them as ``sex.cbm`` / ``age.cbm`` under
    ``../data/len1/models/<fold>``. Feature importances are printed after each fit.

    Args:
        fold: Index of the fold directory to train on.
    """
    cat_features = [
        "region_id", "city_id",
        "manufacturer_id", "model_id", "type_id", "os_id",
        "url_id",
    ]
    num_features = ["price"]
    features = cat_features + num_features
    columns = features + ["is_male", "age_bucket", "weight"]

    train = pd.read_parquet(f"../data/len1/folds-k/{fold}/train.pq", columns=columns)
    val = pd.read_parquet(f"../data/len1/folds-k/{fold}/val.pq", columns=columns)

    model_dir = f"../data/len1/models/{fold}"
    # Create the target directory up front so that saving the model cannot fail on a
    # missing directory after a long training run.
    os.makedirs(model_dir, exist_ok=True)

    _fit_and_save("Logloss", "is_male", f"{model_dir}/sex.cbm", train, val, features, cat_features)
    _fit_and_save("MultiClass", "age_bucket", f"{model_dir}/age.cbm", train, val, features, cat_features)

In [None]:
# Train the sex and age models for each of the five folds in turn.
for fold in range(5):
    fit(fold=fold)

Learning rate set to 0.330432
0:	learn: 0.6452697	test: 0.6432581	best: 0.6432581 (0)	total: 9.81s	remaining: 2h 43m 25s
20:	learn: 0.5551090	test: 0.5107918	best: 0.5107918 (20)	total: 2m 34s	remaining: 2h 16s
40:	learn: 0.5404688	test: 0.4953117	best: 0.4953117 (40)	total: 5m 33s	remaining: 2h 10m 5s
60:	learn: 0.5329458	test: 0.4873856	best: 0.4873856 (60)	total: 7m 58s	remaining: 2h 2m 44s
80:	learn: 0.5280933	test: 0.4815460	best: 0.4815460 (80)	total: 10m 54s	remaining: 2h 3m 47s
100:	learn: 0.5226146	test: 0.4761456	best: 0.4761456 (100)	total: 13m 20s	remaining: 1h 58m 46s
120:	learn: 0.5176886	test: 0.4709931	best: 0.4709931 (120)	total: 15m 56s	remaining: 1h 55m 48s
140:	learn: 0.5085624	test: 0.4612203	best: 0.4612203 (140)	total: 18m 36s	remaining: 1h 53m 19s
160:	learn: 0.5050536	test: 0.4578019	best: 0.4578019 (160)	total: 21m 34s	remaining: 1h 52m 24s
180:	learn: 0.5035963	test: 0.4559925	best: 0.4559925 (180)	total: 24m 12s	remaining: 1h 49m 33s
200:	learn: 0.4987559	te