In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import annotations

import functools as ft
import os

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

from mts_ml_cup.modeling import catboost as cb
from mts_ml_cup.modeling import validation as v

In [3]:
def join_precomputed_features(
    dataset: pl.DataFrame,
    black_list: set[str] | None = None,
    feat_dir: str = "../data/feat",
) -> pl.DataFrame:
    """Left-join every precomputed feature table onto ``dataset`` by ``user_id``.

    The feature store is walked as ``<feat_dir>/<feat_type>/<file>.pq``. Each
    selected parquet file is read, its ``user_id`` column is cast to ``UInt32``
    and the table is left-joined onto ``dataset``.

    Args:
        dataset: Frame to enrich; it is joined on its ``user_id`` column.
        black_list: File names (not paths, e.g. ``"some_features.pq"``) to skip.
            ``None`` or an empty collection skips nothing.
        feat_dir: Root directory of the feature store.

    Returns:
        ``dataset`` with the columns of every selected feature file appended.
        When no file is selected, ``dataset`` is returned unchanged.
    """
    black_list = black_list or set()

    # os.listdir yields entries in arbitrary order; sorting makes the order of
    # the joined columns reproducible across runs and machines.
    # NOTE(review): models saved before this change may have been trained on the
    # unsorted column order - confirm before reusing old snapshots.
    for feat_type in sorted(os.listdir(feat_dir)):
        type_dir = os.path.join(feat_dir, feat_type)
        if not os.path.isdir(type_dir):
            # Stray files in the root (e.g. .DS_Store) would make the inner
            # os.listdir raise NotADirectoryError.
            continue

        for features_file in sorted(os.listdir(type_dir)):
            if not features_file.endswith(".pq") or features_file in black_list:
                continue

            features = (
                pl.read_parquet(os.path.join(type_dir, features_file))
                # Presumably aligns the key dtype with dataset's user_id - TODO confirm.
                .with_columns(pl.col("user_id").cast(pl.UInt32))
            )
            dataset = dataset.join(features, how="left", on="user_id")
    return dataset

In [4]:
%%time
# Feature file names (e.g. "some_features.pq") to leave out of the join.
# Use set(): a bare `{}` is an empty dict, not an empty set.
black_list = set()

# Train and test are enriched with the same feature files so that both frames
# get the same feature columns.
train = join_precomputed_features(
    pl.read_parquet("../data/processed/train.pq"),
    black_list=black_list,
)
test = join_precomputed_features(
    pl.read_parquet("../data/processed/test.pq"),
    black_list=black_list,
)

CPU times: user 1min 3s, sys: 13.8 s, total: 1min 17s
Wall time: 56.7 s


# Model

In [7]:
# Build the project's CatBoostCV wrapper. Folds are read from a precomputed
# folds.csv and handed to v.manual_split.
model = cb.CatBoostCV(
    pool_params=dict(
        cat_features=[
            # time_* columns
            "time_top_part_of_day",
            "time_first_day",
            "time_last_day",
            "time_first_month",
            "time_last_month",
            "time_total_months",
            "time_first_year",
            "time_last_year",
            "time_total_years",
            # geo_* columns
            "geo_top_city_id",
            "geo_top_region_id",
            # device_* columns
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",
            # url_top_1_url ... url_top_120_url
            *(f"url_top_{rank}_url" for rank in range(1, 121)),
        ],
        text_features=["url_all_visited_urls", "url_all_visited_urls_2grams"],
        embedding_features=["mini_lm_embeddings", "ptls_embeddings"],
    ),
    model_params=dict(
        iterations=100_000,
        early_stopping_rounds=1_000,
        random_seed=777,
    ),
    splitter=ft.partial(
        v.manual_split,
        folds=pd.read_csv("../data/processed/folds.csv"),
    ),
)

In [8]:
%%time
# Both count columns are declared as categorical features in pool_params; cast
# them to Int32 first (presumably because CatBoost only accepts integer or
# string categoricals - confirm).
model.fit(
    train.with_columns(
        [pl.col(name).cast(pl.Int32) for name in ("time_total_months", "time_total_years")]
    )
)

Learning rate set to 0.006712
0:	learn: 0.6899085	test: 0.6898223	best: 0.6898223 (0)	total: 103ms	remaining: 2h 51m 33s
1000:	learn: 0.4496452	test: 0.4544755	best: 0.4544755 (1000)	total: 44.7s	remaining: 1h 13m 42s
2000:	learn: 0.4411951	test: 0.4485204	best: 0.4485204 (2000)	total: 1m 41s	remaining: 1h 23m 1s
3000:	learn: 0.4350979	test: 0.4447817	best: 0.4447817 (3000)	total: 2m 22s	remaining: 1h 16m 34s
4000:	learn: 0.4306506	test: 0.4426463	best: 0.4426463 (4000)	total: 3m 4s	remaining: 1h 13m 43s
5000:	learn: 0.4270334	test: 0.4412249	best: 0.4412249 (5000)	total: 3m 46s	remaining: 1h 11m 47s
6000:	learn: 0.4238732	test: 0.4401254	best: 0.4401254 (6000)	total: 4m 33s	remaining: 1h 11m 23s
7000:	learn: 0.4209851	test: 0.4393108	best: 0.4393107 (6998)	total: 5m 14s	remaining: 1h 9m 41s
8000:	learn: 0.4183368	test: 0.4386038	best: 0.4386038 (8000)	total: 5m 56s	remaining: 1h 8m 23s
9000:	learn: 0.4158030	test: 0.4380004	best: 0.4380001 (8999)	total: 6m 40s	remaining: 1h 7m 28s
100

[{'sex ROC-AUC': 0.8816769366808068,
  'age F1 Weighted': 0.46694164573919256,
  'mts-ml-cup metric': 1.6972371648399989},
 {'sex ROC-AUC': 0.886359137541354,
  'age F1 Weighted': 0.4694240810116773,
  'mts-ml-cup metric': 1.7115664371060626},
 {'sex ROC-AUC': 0.8843098716263631,
  'age F1 Weighted': 0.47050855160260735,
  'mts-ml-cup metric': 1.7096368464579408},
 {'sex ROC-AUC': 0.8842293703641484,
  'age F1 Weighted': 0.47183322816579315,
  'mts-ml-cup metric': 1.712125197059883},
 {'sex ROC-AUC': 0.8847180097553442,
  'age F1 Weighted': 0.46178461624998113,
  'mts-ml-cup metric': 1.6930052520106507}]

In [9]:
%%time
# Out-of-fold predictions on train, using the same Int32 cast of the two count
# columns that is applied before fitting.
oof_preds = model.predict_oof(
    train.with_columns(
        [pl.col(name).cast(pl.Int32) for name in ("time_total_months", "time_total_years")]
    )
)
oof_preds.to_csv("fold_preds-bigrams.csv", index=False)

CPU times: user 1h 19min 35s, sys: 21 s, total: 1h 19min 56s
Wall time: 10min 21s


In [10]:
%%time
# Predict on test with the same Int32 cast of the two count columns that is
# applied before fitting.
test_preds = model.predict(
    test.with_columns(
        [pl.col(name).cast(pl.Int32) for name in ("time_total_months", "time_total_years")]
    )
)

# Full prediction table, kept next to the notebook.
test_preds.to_csv("test_preds-bigrams.csv", index=False)

# Submission file: only the id and the two target columns.
submission_columns = ["user_id", "is_male", "age"]
test_preds[submission_columns].to_csv("../submissions/bigrams.csv", index=False)

CPU times: user 42min 45s, sys: 7.33 s, total: 42min 53s
Wall time: 5min 34s


In [11]:
%%time
# Persist the models to disk; a later cell restores them from this same
# directory via CatBoostCV.from_snapshot.
model.save_models("../models/bigrams")

CPU times: user 1.71 s, sys: 7.6 s, total: 9.3 s
Wall time: 44.5 s


In [5]:
%%time
# Restore the wrapper from the directory that model.save_models wrote to.
# The pool/model parameters and the splitter repeat the values used when the
# model was first constructed in this notebook.
model = cb.CatBoostCV.from_snapshot(
    "../models/bigrams/",
    pool_params=dict(
        cat_features=[
            # time_* columns
            "time_top_part_of_day",
            "time_first_day",
            "time_last_day",
            "time_first_month",
            "time_last_month",
            "time_total_months",
            "time_first_year",
            "time_last_year",
            "time_total_years",
            # geo_* columns
            "geo_top_city_id",
            "geo_top_region_id",
            # device_* columns
            "device_manufacturer_id",
            "device_model_id",
            "device_os_id",
            "device_type_id",
            # url_top_1_url ... url_top_120_url
            *(f"url_top_{rank}_url" for rank in range(1, 121)),
        ],
        text_features=["url_all_visited_urls", "url_all_visited_urls_2grams"],
        embedding_features=["mini_lm_embeddings", "ptls_embeddings"],
    ),
    model_params=dict(
        iterations=100_000,
        early_stopping_rounds=1_000,
        random_seed=777,
    ),
    splitter=ft.partial(
        v.manual_split,
        folds=pd.read_csv("../data/processed/folds.csv"),
    ),
)

IOStream.flush timed out
IOStream.flush timed out


CPU times: user 4.02 s, sys: 12.2 s, total: 16.2 s
Wall time: 2min 2s


In [10]:
%%time
# Re-run of the out-of-fold prediction cell, now with the model restored from
# the snapshot. NOTE: this writes to the same "fold_preds-bigrams.csv" as the
# earlier OOF cell and overwrites it.
oof_preds = model.predict_oof(
    train.with_columns(
        [pl.col(name).cast(pl.Int32) for name in ("time_total_months", "time_total_years")]
    )
)
oof_preds.to_csv("fold_preds-bigrams.csv", index=False)

CPU times: user 16min 32s, sys: 27.6 s, total: 17min
Wall time: 2min 44s
