In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import annotations

import functools as ft
import os

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

from mts_ml_cup.modeling import catboost as cb
from mts_ml_cup.modeling import validation as v

In [3]:
def join_precomputed_features(
    dataset: pl.DataFrame,
    black_list: set[str] | None = None,
    features_dir: str = "../data/feat",
) -> pl.DataFrame:
    """Left-join every precomputed feature table onto ``dataset`` by ``user_id``.

    The feature root is expected to contain one sub-directory per feature
    type, each holding ``*.pq`` parquet files. Every such file is read, its
    ``user_id`` column is cast to ``UInt32`` and the table is left-joined onto
    ``dataset``.

    Args:
        dataset: Frame to enrich; it is joined on its ``user_id`` column.
        black_list: File names (e.g. ``"some_features.pq"``) to skip.
            ``None`` means nothing is skipped.
        features_dir: Root directory with the per-type feature folders.

    Returns:
        ``dataset`` with the columns of all accepted feature files joined in.
        Directories and files are visited in sorted name order, so the
        resulting column order is the same on every run and machine.
    """
    black_list = black_list or set()
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for feat_type in sorted(os.listdir(features_dir)):
        type_dir = os.path.join(features_dir, feat_type)
        if not os.path.isdir(type_dir):
            # Stray files in the root (e.g. ``.gitkeep``) cannot be listed.
            continue
        for features_file in sorted(os.listdir(type_dir)):
            if not features_file.endswith(".pq") or features_file in black_list:
                continue

            features = (
                pl.read_parquet(os.path.join(type_dir, features_file))
                .with_columns(pl.col("user_id").cast(pl.UInt32))
            )
            dataset = dataset.join(features, how="left", on="user_id")
    return dataset

In [5]:
%%time
# Feature file names to leave out of the join (compared against the file
# names inside each feature folder). Empty: every feature file is used.
# Note: `{}` would be an empty dict, not a set, hence the explicit `set()`.
black_list = set()

# Train and test go through the same join with the same black list.
train = join_precomputed_features(
    pl.read_parquet("../data/processed/train.pq"),
    black_list=black_list,
)
test = join_precomputed_features(
    pl.read_parquet("../data/processed/test.pq"),
    black_list=black_list,
)

CPU times: user 57.9 s, sys: 10.8 s, total: 1min 8s
Wall time: 1min 47s


# Model

In [10]:
# Configure the project's `cb.CatBoostCV` wrapper:
#   * `pool_params`  - which columns are passed as categorical, text and
#                      embedding features;
#   * `model_params` - training settings (presumably forwarded to CatBoost -
#                      TODO confirm in `mts_ml_cup.modeling.catboost`);
#   * `splitter`     - `v.manual_split` with the fold assignment read from
#                      folds.csv bound in as `folds`.
model = cb.CatBoostCV(
    pool_params={
        "cat_features": [
            # time-derived columns
            "time_top_part_of_day",
            "time_first_day",
            "time_last_day",
            "time_first_month",
            "time_last_month",
            "time_total_months",
            "time_first_year",
            "time_last_year",
            "time_total_years",
            
            # geography
            "geo_top_city_id",
            "geo_top_region_id",
            
            # device
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",
        ] 
        # plus url_top_1_url ... url_top_120_url (range end is exclusive)
        + [f"url_top_{i}_url" for i in range(1, 121)],
        "text_features": ["url_all_visited_urls"],
        "embedding_features": ["mini_lm_embeddings", "ptls_embeddings"],
    },
    model_params={
        "iterations": 100_000,
        "early_stopping_rounds": 1_000,
        # fixed seed so repeated runs are comparable
        "random_seed": 777,
    },
    splitter=ft.partial(v.manual_split, folds=pd.read_csv("../data/processed/folds.csv")),
)

In [12]:
%%time
# `time_total_months` and `time_total_years` are listed as categorical
# features for the model; cast both to Int32 before fitting.
train_cast = train.with_columns(
    [pl.col(["time_total_months", "time_total_years"]).cast(pl.Int32)]
)
# Keep the call as the cell's last expression so its return value is displayed.
model.fit(train_cast)

Learning rate set to 0.006712
0:	learn: 0.6900032	test: 0.6899907	best: 0.6899907 (0)	total: 18.6ms	remaining: 31m 4s
1000:	learn: 0.4510995	test: 0.4560918	best: 0.4560918 (1000)	total: 45.9s	remaining: 1h 15m 39s
2000:	learn: 0.4425687	test: 0.4499387	best: 0.4499387 (2000)	total: 1m 44s	remaining: 1h 25m 31s
3000:	learn: 0.4362540	test: 0.4459944	best: 0.4459944 (3000)	total: 2m 27s	remaining: 1h 19m 26s
4000:	learn: 0.4316788	test: 0.4437244	best: 0.4437244 (4000)	total: 3m 15s	remaining: 1h 18m 4s
5000:	learn: 0.4278874	test: 0.4422237	best: 0.4422230 (4999)	total: 4m 2s	remaining: 1h 16m 47s
6000:	learn: 0.4245554	test: 0.4409968	best: 0.4409968 (6000)	total: 4m 54s	remaining: 1h 16m 49s
7000:	learn: 0.4214845	test: 0.4400397	best: 0.4400397 (7000)	total: 5m 42s	remaining: 1h 15m 52s
8000:	learn: 0.4186443	test: 0.4392560	best: 0.4392560 (8000)	total: 6m 29s	remaining: 1h 14m 42s
9000:	learn: 0.4159807	test: 0.4386026	best: 0.4386026 (9000)	total: 7m 18s	remaining: 1h 13m 57s
100

[{'sex ROC-AUC': 0.8818433295188401,
  'age F1 Weighted': 0.46564825435879326,
  'mts-ml-cup metric': 1.6949831677552667},
 {'sex ROC-AUC': 0.8866351874817093,
  'age F1 Weighted': 0.466254171300664,
  'mts-ml-cup metric': 1.7057787175647467},
 {'sex ROC-AUC': 0.8842707756820734,
  'age F1 Weighted': 0.468980718779262,
  'mts-ml-cup metric': 1.7065029889226708},
 {'sex ROC-AUC': 0.884443153594586,
  'age F1 Weighted': 0.4711750976426655,
  'mts-ml-cup metric': 1.711236502474503},
 {'sex ROC-AUC': 0.8848525866648471,
  'age F1 Weighted': 0.4629277700577184,
  'mts-ml-cup metric': 1.695560713445131}]

In [16]:
%%time
# Predict on the training frame via `predict_oof`, applying the same Int32
# cast of the two month/year counter columns that is used when fitting.
train_cast = train.with_columns(
    [pl.col(["time_total_months", "time_total_years"]).cast(pl.Int32)]
)
oof_preds = model.predict_oof(train_cast)
# Persist the predictions next to the notebook.
oof_preds.to_csv("fold_preds.csv", index=False)

CPU times: user 1h 16min 40s, sys: 16.3 s, total: 1h 16min 57s
Wall time: 9min 57s


In [17]:
%%time
# Score the test frame with the same Int32 cast that is applied to the
# training frame.
test_cast = test.with_columns(
    [pl.col(["time_total_months", "time_total_years"]).cast(pl.Int32)]
)
test_preds = model.predict(test_cast)
# Full prediction frame, kept locally.
test_preds.to_csv("test_preds.csv", index=False)
# Submission file: only the id and the two target columns.
submission_columns = ["user_id", "is_male", "age"]
test_preds[submission_columns].to_csv("../submissions/v3.csv", index=False)

CPU times: user 41min 13s, sys: 6.76 s, total: 41min 20s
Wall time: 5min 19s


In [18]:
%%time
# Persist the fitted models under ../models/v3 via the wrapper's `save_models`.
model.save_models("../models/v3")

CPU times: user 1.43 s, sys: 9.92 s, total: 11.3 s
Wall time: 1min 4s


# top-urls encoded

In [7]:
%%time
# Fit with the function-style API `cb.fit`, which returns the sex models, the
# age models and the metrics.
# NOTE(review): this cell uses `cb.fit` and the column names `top_part_of_day`
# / `urls_text`, whereas the `cb.CatBoostCV` cell earlier in the notebook uses
# `time_top_part_of_day` / `url_all_visited_urls`. It looks like it predates
# that cell - confirm it still runs against the current `train` frame.
models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params={
        "cat_features": [
            "top_part_of_day",
            # geo
            "geo_top_city_id",
            "geo_top_region_id",
            # device
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",
        ],
        "text_features": ["urls_text"],
        "embedding_features": ["mini_lm_embeddings", "ptls_embeddings"],
    },
    model_params={
        "iterations": 100_000,
        "early_stopping_rounds": 1_000,
        # fixed seed so repeated runs are comparable
        "random_seed": 777,
    },
    # fold assignment comes from folds.csv and is bound into `v.manual_split`
    splitter=ft.partial(v.manual_split, folds=pd.read_csv("../data/processed/folds.csv")),
)

Learning rate set to 0.006712
0:	learn: 0.6898803	test: 0.6898612	best: 0.6898612 (0)	total: 19ms	remaining: 31m 43s
1000:	learn: 0.4503746	test: 0.4552817	best: 0.4552817 (1000)	total: 17.2s	remaining: 28m 24s
2000:	learn: 0.4418514	test: 0.4494584	best: 0.4494584 (2000)	total: 34.5s	remaining: 28m 8s
3000:	learn: 0.4358314	test: 0.4461214	best: 0.4461214 (3000)	total: 51.3s	remaining: 27m 38s
4000:	learn: 0.4311140	test: 0.4440697	best: 0.4440697 (4000)	total: 1m 8s	remaining: 27m 13s
5000:	learn: 0.4271113	test: 0.4426846	best: 0.4426846 (5000)	total: 1m 24s	remaining: 26m 48s
6000:	learn: 0.4234809	test: 0.4416338	best: 0.4416334 (5999)	total: 1m 41s	remaining: 26m 30s
7000:	learn: 0.4201328	test: 0.4408182	best: 0.4408182 (7000)	total: 1m 58s	remaining: 26m 7s
8000:	learn: 0.4169743	test: 0.4401366	best: 0.4401366 (8000)	total: 2m 14s	remaining: 25m 50s
9000:	learn: 0.4139720	test: 0.4395662	best: 0.4395646 (8999)	total: 2m 31s	remaining: 25m 33s
10000:	learn: 0.4110443	test: 0.43

# +top-urls as is

In [8]:
%%time
# Second `cb.fit` run; it rebinds `models_sex`, `models_age` and `metrics`.
# NOTE(review): the code of this cell is identical to the `cb.fit` call in the
# "top-urls encoded" section; presumably the two runs differed only in the
# features present in `train` - confirm which run the saved artifacts belong to.
models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params={
        "cat_features": [
            "top_part_of_day",
            # geo
            "geo_top_city_id",
            "geo_top_region_id",
            # device
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",
        ],
        "text_features": ["urls_text"],
        "embedding_features": ["mini_lm_embeddings", "ptls_embeddings"],
    },
    model_params={
        "iterations": 100_000,
        "early_stopping_rounds": 1_000,
        # fixed seed so repeated runs are comparable
        "random_seed": 777,
    },
    # fold assignment comes from folds.csv and is bound into `v.manual_split`
    splitter=ft.partial(v.manual_split, folds=pd.read_csv("../data/processed/folds.csv")),
)

Learning rate set to 0.006712
0:	learn: 0.6899730	test: 0.6899407	best: 0.6899407 (0)	total: 38.2ms	remaining: 1h 3m 35s
1000:	learn: 0.4499896	test: 0.4549004	best: 0.4549004 (1000)	total: 29s	remaining: 47m 49s
2000:	learn: 0.4415848	test: 0.4489820	best: 0.4489820 (2000)	total: 1m 5s	remaining: 53m 5s
3000:	learn: 0.4354834	test: 0.4452224	best: 0.4452224 (3000)	total: 1m 31s	remaining: 49m 25s
4000:	learn: 0.4309941	test: 0.4430360	best: 0.4430360 (4000)	total: 2m	remaining: 48m 19s
5000:	learn: 0.4273377	test: 0.4415788	best: 0.4415788 (5000)	total: 2m 31s	remaining: 48m 6s
6000:	learn: 0.4240330	test: 0.4404612	best: 0.4404612 (6000)	total: 3m 1s	remaining: 47m 20s
7000:	learn: 0.4210521	test: 0.4396201	best: 0.4396201 (7000)	total: 3m 32s	remaining: 47m 5s
8000:	learn: 0.4182886	test: 0.4389278	best: 0.4389278 (8000)	total: 4m 3s	remaining: 46m 44s
9000:	learn: 0.4156395	test: 0.4383015	best: 0.4383014 (8999)	total: 4m 32s	remaining: 45m 59s
10000:	learn: 0.4131017	test: 0.43777

In [8]:
# Score each fold's validation rows with the single sex/age model pair stored
# at the same index (presumably the pair fitted with that fold held out -
# confirm against `cb.fit`), then stack the per-fold frames.
fold_frames = []
# tqdm wraps the split itself (not the enumerate object) so it can pick up the
# number of folds when the splitter returns a sized sequence.
for i, (train_idx, val_idx) in enumerate(tqdm(
    v.manual_split(train, folds=pd.read_csv("../data/processed/folds.csv"))
)):
    val_fold = train[val_idx]
    val_fold_pred = cb.predict(
        # the target columns are dropped before predicting
        test=val_fold
        .select(pl.exclude(["is_male", "age", "age_bucket"])),
        pool_params={
            "cat_features": [
                "top_part_of_day",
                # geo
                "geo_top_city_id",
                "geo_top_region_id",
                # device
                "device_manufacturer_id",
                "device_model_id",
                "device_os_id",
                "device_type_id",
            ],
            "text_features": ["urls_text"],
            "embedding_features": ["mini_lm_embeddings", "ptls_embeddings"],
        },
        # one-element slices: only the i-th model of each target is used
        models_sex=models_sex[i:i+1],
        models_age=models_age[i:i+1],
        type="val",
    )
    # Tag the rows with their fold index while the index is at hand, instead
    # of mutating the frames in a second pass.
    fold_frames.append(val_fold_pred.assign(fold=i))
# `fold_preds` is only ever bound to the final DataFrame.
fold_preds = pd.concat(fold_frames).reset_index(drop=True)

5it [01:51, 22.26s/it]


In [9]:
# Save the stacked per-fold predictions (including the `fold` column).
fold_preds.to_csv("fold_preds-cb-encoder.csv", index=False)

In [12]:
%%time
# Predict on the test frame with all sex and age models at once; the pool
# parameters repeat the feature lists passed to `cb.fit`.
submission = cb.predict(
    test=test,
    pool_params={
        "cat_features": [
            "top_part_of_day",
            # geo
            "geo_top_city_id",
            "geo_top_region_id",
            # device
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",

        ],
        "text_features": ["urls_text"],
        "embedding_features": ["mini_lm_embeddings", "ptls_embeddings"],
    },
    models_sex=models_sex,
    models_age=models_age,
    type="test",
)
# NOTE(review): disabled write left over from an earlier experiment; the
# submission is written by a later cell under a different file name.
# submission.to_csv("../submissions/mini-lm-2.csv", index=False)

CPU times: user 38min 1s, sys: 2.39 s, total: 38min 4s
Wall time: 4min 55s


In [13]:
# Submission file: keep only `user_id`, `is_male` and `age`.
submission[["user_id", "is_male", "age"]].to_csv("../submissions/top-120-encoded.csv", index=False)

In [14]:
# Keep the full prediction frame (all columns) next to the notebook.
submission.to_csv("test_preds-cb-encoder.csv", index=False)

In [20]:
# Average the per-feature importances over all sex models: each model
# contributes importance / number_of_models, aligned by feature name.
# The loop variable is deliberately not called `model`, so it does not
# overwrite the notebook-level `model` (the `cb.CatBoostCV` instance).
sex_fi = 0
for sex_model in models_sex:
    sex_fi += pd.Series(sex_model.feature_importances_, index=sex_model.feature_names_) / len(models_sex)

In [24]:
# Show the 50 features with the highest averaged importance.
sex_fi.sort_values(ascending=False).head(50)

urls_text                   60.315387
mini_lm_embeddings           2.255766
ptls_embeddings              2.141832
device_mean_price            1.566090
n_cities                     1.242033
n_urls                       1.119571
avg_url_length               1.064800
avg_requests_by_session      1.051927
morning_requests_share       1.006729
morning_urls_share           0.960265
device_model                 0.946565
morning_days_share           0.887833
day_urls_share               0.880050
night_requests_share         0.815331
night_urls_share             0.813550
day_requests_share           0.811611
day_days_share               0.785143
avg_urls_by_day_part         0.750446
evening_urls_share           0.750273
evening_requests_share       0.749193
avg_day_parts_by_day         0.721046
avg_urls_by_day              0.707422
n_days                       0.696851
n_day_parts                  0.676240
n_sessions                   0.660734
evening_days_share           0.599683
device_manuf

In [15]:
from pathlib import Path

In [16]:
# Scratch path used by the following cells to try out directory creation.
p = Path("../models/new")

In [17]:
# Create the directory idempotently: a bare `mkdir()` raises FileExistsError
# when the cell is re-run and FileNotFoundError when `../models` is missing.
p.mkdir(parents=True, exist_ok=True)

In [18]:
# Display the path; wrapping an existing Path in `Path(...)` shows the same path.
Path(p)

PosixPath('../models/new')

In [15]:
# NOTE(review): `submission` is also written to test_preds-cb-encoder.csv by
# an earlier cell, and the execution counters of these cells are out of order.
# Confirm which prediction run this file actually holds.
submission.to_csv("test_preds-ptls.csv", index=False)

In [25]:
# Save every sex model as model_<index>.cbm.
# The target directory is created first (harmless if it already exists) so
# the save does not depend on a manually prepared folder.
os.makedirs("../models/mini-lm-2-ptls-old-top-70/sex", exist_ok=True)
# The loop variable is not called `model`, so the notebook-level `model`
# (the `cb.CatBoostCV` instance) is left intact.
for i, sex_model in enumerate(models_sex):
    sex_model.save_model(f"../models/mini-lm-2-ptls-old-top-70/sex/model_{i}.cbm")

In [26]:
# Save every age model as model_<index>.cbm.
# The target directory is created first (harmless if it already exists) so
# the save does not depend on a manually prepared folder.
os.makedirs("../models/mini-lm-2-ptls-old-top-70/age", exist_ok=True)
# The loop variable is not called `model`, so the notebook-level `model`
# (the `cb.CatBoostCV` instance) is left intact.
for i, age_model in enumerate(models_age):
    age_model.save_model(f"../models/mini-lm-2-ptls-old-top-70/age/model_{i}.cbm")