In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import annotations

import os

import catboost
import numpy as np
import pandas as pd
import polars as pl

from mts_ml_cup import catboost as cb

In [2]:
def join_precomputed_features(
    dataset: pl.DataFrame,
    black_list: set[str] | None = None,
    features_dir: str = "../data/features",
) -> pl.DataFrame:
    """Left-join every precomputed feature file onto ``dataset`` by ``user_id``.

    ``features_dir`` is scanned one level deep: each sub-directory is a
    feature type and every ``*.pq`` file inside it is read as parquet, its
    ``user_id`` column is cast to ``UInt32`` and the file is left-joined onto
    the running result.

    Args:
        dataset: Frame with a ``user_id`` column to enrich.
            NOTE(review): presumably ``user_id`` is already ``UInt32`` here so
            that the join keys match - confirm against the callers' inputs.
        black_list: File names (not paths) to skip, e.g. ``{"vectors.pq"}``.
            ``None`` or an empty set skips nothing.
        features_dir: Root directory holding one sub-directory per feature type.

    Returns:
        ``dataset`` with the columns of every selected feature file appended.
        Because the join is a left join, no rows of ``dataset`` are dropped.
    """
    skipped = black_list or set()
    # os.listdir() returns entries in arbitrary order; sorting makes the join
    # order (and therefore the resulting column order) reproducible.
    for feat_type in sorted(os.listdir(features_dir)):
        type_dir = os.path.join(features_dir, feat_type)
        if not os.path.isdir(type_dir):
            # A stray file in the root would make the inner listdir() raise.
            continue
        for features_file in sorted(os.listdir(type_dir)):
            if not features_file.endswith(".pq") or features_file in skipped:
                continue

            features = pl.read_parquet(os.path.join(type_dir, features_file)).with_columns(
                pl.col("user_id").cast(pl.UInt32)
            )
            dataset = dataset.join(features, how="left", on="user_id")
    return dataset

In [3]:
%%time
# Feature files excluded from the join. "urls_text.pq" is deliberately not
# listed here, so it is joined; later cells pass `urls_text` as a text feature
# (presumably the column that file provides).
black_list = {
    "urls_text-cleaned.pq",
    "urls_text-filtered.pq",
    "urls_text-ultra-filtered.pq",
    "vectors.pq",
    "vectors-v2-hist.pq",
    "vectors-v2.pq",
    "vectors-v3.pq",
}

# Train and test frames go through the same enrichment, one after the other.
train, test = (
    join_precomputed_features(pl.read_parquet(source_path), black_list=black_list)
    for source_path in (
        "../data/processed/targets.pq",
        "../data/processed/test.pq",
    )
)

CPU times: user 33.3 s, sys: 4.97 s, total: 38.3 s
Wall time: 32 s


In [9]:
%%time
# Fit the sex and age models on the joined training frame, using raw URL text
# plus the MiniLM embedding column.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
    "embedding_features": ["mini_lm_embeddings"],
}
# Large iteration cap combined with early stopping and a fixed seed.
# NOTE(review): presumably cb.fit forwards these to CatBoost unchanged - confirm.
cb_params = {
    "iterations": 100_000,
    "early_stopping_rounds": 1_000,
    "random_seed": 777,
}

models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params=pool_params,
    cb_params=cb_params,
)

0:	learn: 0.6907620	test: 0.6907378	best: 0.6907378 (0)	total: 16.6ms	remaining: 27m 35s
1000:	learn: 0.4551641	test: 0.4562776	best: 0.4562776 (1000)	total: 14.9s	remaining: 24m 37s
2000:	learn: 0.4468782	test: 0.4502190	best: 0.4502190 (2000)	total: 30.1s	remaining: 24m 34s
3000:	learn: 0.4412135	test: 0.4463702	best: 0.4463702 (3000)	total: 44.3s	remaining: 23m 51s
4000:	learn: 0.4369266	test: 0.4438634	best: 0.4438634 (4000)	total: 58s	remaining: 23m 12s
5000:	learn: 0.4335411	test: 0.4421901	best: 0.4421901 (5000)	total: 1m 11s	remaining: 22m 44s
6000:	learn: 0.4305705	test: 0.4409187	best: 0.4409187 (6000)	total: 1m 25s	remaining: 22m 22s
7000:	learn: 0.4279347	test: 0.4398972	best: 0.4398972 (7000)	total: 1m 39s	remaining: 22m 1s
8000:	learn: 0.4255739	test: 0.4391336	best: 0.4391333 (7999)	total: 1m 53s	remaining: 21m 41s
9000:	learn: 0.4233257	test: 0.4383995	best: 0.4383995 (9000)	total: 2m 6s	remaining: 21m 23s
10000:	learn: 0.4211804	test: 0.4378429	best: 0.4378429 (10000)	

In [11]:
# Score the test frame with the fitted models. The pool settings repeat the
# ones given to the cb.fit call of this section.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
    "embedding_features": ["mini_lm_embeddings"],
}

submission = cb.predict(
    test=test,
    pool_params=pool_params,
    models_sex=models_sex,
    models_age=models_age,
)
# Write the submission without the index column.
submission.to_csv("../submissions/mini-lm-lr-0005.csv", index=False)

# Росстат как бейзлайн (Rosstat as baseline)

In [4]:
# Share columns extracted as 2-D numpy arrays, for train and test alike.
sex_share_columns = ["women_share", "men_share"]
age_share_columns = [f"age_bucket_{i}_share" for i in range(1, 7)]

train_sex_baseline, train_age_baseline = (
    train.select(columns).to_pandas().to_numpy()
    for columns in (sex_share_columns, age_share_columns)
)

test_sex_baseline, test_age_baseline = (
    test.select(columns).to_pandas().to_numpy()
    for columns in (sex_share_columns, age_share_columns)
)

In [5]:
# Column names of the Rosstat feature file, read from the parquet schema only
# (no data is loaded); the join key is left out.
rosstat_schema = pl.read_parquet_schema("../data/features/geo/rosstat.pq")
rosstat_features = [name for name in rosstat_schema if name != "user_id"]

In [15]:
%%time
# Fit the sex and age models with share columns supplied as baselines:
# `men_share` for sex, the six `age_bucket_<i>_share` columns for age.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
}
cb_params = {
    "iterations": 100_000,
    "early_stopping_rounds": 1_000,
    "random_seed": 777,
}
age_baseline_columns = [f"age_bucket_{i}_share" for i in range(1, 7)]

models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params=pool_params,
    cb_params=cb_params,
    sex_baseline="men_share",
    age_baseline=age_baseline_columns,
)

Learning rate set to 0.006712
0:	learn: 0.7109198	test: 0.7108430	best: 0.7108430 (0)	total: 16.4ms	remaining: 27m 19s
1000:	learn: 0.4537878	test: 0.4556966	best: 0.4556966 (1000)	total: 15.2s	remaining: 25m 2s
2000:	learn: 0.4452023	test: 0.4494775	best: 0.4494775 (2000)	total: 29.8s	remaining: 24m 20s
3000:	learn: 0.4392760	test: 0.4458533	best: 0.4458533 (3000)	total: 43.5s	remaining: 23m 27s
4000:	learn: 0.4348357	test: 0.4436680	best: 0.4436680 (4000)	total: 57.4s	remaining: 22m 56s
5000:	learn: 0.4311726	test: 0.4422103	best: 0.4422099 (4999)	total: 1m 11s	remaining: 22m 29s
6000:	learn: 0.4278826	test: 0.4410787	best: 0.4410787 (6000)	total: 1m 24s	remaining: 22m 6s
7000:	learn: 0.4248491	test: 0.4401552	best: 0.4401552 (7000)	total: 1m 38s	remaining: 21m 54s
8000:	learn: 0.4220671	test: 0.4393754	best: 0.4393751 (7995)	total: 1m 52s	remaining: 21m 35s
9000:	learn: 0.4194138	test: 0.4386938	best: 0.4386938 (9000)	total: 2m 6s	remaining: 21m 21s
10000:	learn: 0.4168624	test: 0.4

In [16]:
# Build the CatBoost test pools directly so that the share columns can be
# passed as `baseline`: `men_share` for the sex pool, the six
# `age_bucket_<i>_share` columns for the age pool.
# (`catboost` is imported in the notebook's import cell.)

# Convert the polars frame once; both pools are built from the same pandas frame.
test_pandas = test.to_pandas()

# Feature-role settings shared by both pools.
test_pool_params = {
    "cat_features": [
        "top_part_of_day",
        "top_city",
        "top_region",
        "device_manufacturer",
        "device_model",
    ],
    "text_features": ["urls_text"],
}
age_share_columns = [f"age_bucket_{i}_share" for i in range(1, 7)]

# NOTE(review): CatBoost documents `baseline` as initial formula values, while
# these columns look like shares - confirm cb.fit feeds the same untransformed
# columns at training time so that train and test stay consistent.
test_pool_sex = catboost.Pool(
    data=test_pandas,
    baseline=test["men_share"].to_numpy(),
    **test_pool_params,
)

test_pool_age = catboost.Pool(
    data=test_pandas,
    baseline=test.select(age_share_columns).to_numpy(),
    **test_pool_params,
)

# Drop the notebook-level reference so the pandas copy of the test set can be
# freed as soon as nothing else holds on to it.
del test_pandas

In [17]:
# Average the class-1 probability over all sex models (equal weights).
# NOTE(review): class 1 is presumably "male" - confirm against the target encoding.
n_sex_models = len(models_sex)
is_male = sum(
    model.predict_proba(test_pool_sex)[:, 1] / n_sex_models
    for model in models_sex
)

# Average the full class-probability matrix over all age models.
n_age_models = len(models_age)
age_probas = sum(
    model.predict_proba(test_pool_age) / n_age_models
    for model in models_age
)
# argmax yields 0-based column positions; the stored bucket ids start at 1.
age_bucket = np.argmax(age_probas, axis=1) + 1

submission = pd.DataFrame(
    {
        "user_id": test["user_id"].to_pandas(),
        "is_male": is_male,
        "age": age_bucket,
    }
)

In [18]:
# Write the current `submission` frame to CSV without the index column.
submission.to_csv("../submissions/rosstat-baseline-features.csv", index=False)

In [None]:
# Score the test frame through cb.predict with the current models.
# NOTE(review): the cb.fit call of this section passes sex_baseline/age_baseline,
# but no baseline is supplied here - confirm cb.predict handles such models.
# NOTE(review): the output path below is written again by later cells, so this
# file would be overwritten - confirm the intended file name.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
}

submission = cb.predict(
    test=test,
    pool_params=pool_params,
    models_sex=models_sex,
    models_age=models_age,
)
submission.to_csv("../submissions/rosstat-raw-urls.csv", index=False)

# Росстат как фичи (Rosstat as features)

In [None]:
%%time
# Fit the sex and age models without any baseline; every joined column of
# `train` is available to cb.fit as-is.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
}
cb_params = {
    "iterations": 100_000,
    "early_stopping_rounds": 1_000,
    "random_seed": 777,
}

models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params=pool_params,
    cb_params=cb_params,
)

Learning rate set to 0.006712
0:	learn: 0.6899492	test: 0.6899163	best: 0.6899163 (0)	total: 15.5ms	remaining: 25m 53s
1000:	learn: 0.4538486	test: 0.4556774	best: 0.4556774 (1000)	total: 15s	remaining: 24m 48s
2000:	learn: 0.4452672	test: 0.4494752	best: 0.4494752 (2000)	total: 29.6s	remaining: 24m 7s
3000:	learn: 0.4393482	test: 0.4458764	best: 0.4458764 (3000)	total: 43.2s	remaining: 23m 17s
4000:	learn: 0.4349058	test: 0.4436914	best: 0.4436914 (4000)	total: 57.2s	remaining: 22m 51s
5000:	learn: 0.4312189	test: 0.4422205	best: 0.4422203 (4999)	total: 1m 10s	remaining: 22m 26s
6000:	learn: 0.4279171	test: 0.4410652	best: 0.4410652 (6000)	total: 1m 24s	remaining: 22m 3s
7000:	learn: 0.4249056	test: 0.4401638	best: 0.4401638 (7000)	total: 1m 38s	remaining: 21m 43s
8000:	learn: 0.4221199	test: 0.4394084	best: 0.4394082 (7995)	total: 1m 51s	remaining: 21m 24s
9000:	learn: 0.4194425	test: 0.4387322	best: 0.4387322 (9000)	total: 2m 5s	remaining: 21m 8s
10000:	learn: 0.4169075	test: 0.4381

In [None]:
# Score the test frame with the current models and store the result.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text"],
}

submission = cb.predict(
    test=test,
    pool_params=pool_params,
    models_sex=models_sex,
    models_age=models_age,
)
# Write the submission without the index column.
submission.to_csv("../submissions/rosstat-raw-urls.csv", index=False)

In [9]:
# Write the current `submission` frame to CSV without the index column.
# NOTE(review): the preceding cell already ends with a write to this same
# path - this cell looks redundant; confirm before removing.
submission.to_csv("../submissions/rosstat-raw-urls.csv", index=False)

# Фильтрованные тексты URL (filtered URL texts, `urls_text_filtered`)

In [4]:
%%time
# Fit the sex and age models with the filtered URL text column as the text feature.
# NOTE(review): `urls_text_filtered` presumably comes from "urls_text-filtered.pq",
# which the black list defined near the top of this notebook excludes from the
# join - confirm which black list was active when this section was run.
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text_filtered"],
}
cb_params = {
    "iterations": 100_000,
    "early_stopping_rounds": 1_000,
    "random_seed": 777,
}

models_sex, models_age, metrics = cb.fit(
    train=train,
    pool_params=pool_params,
    cb_params=cb_params,
)

Learning rate set to 0.006712
0:	learn: 0.6899599	test: 0.6899248	best: 0.6899248 (0)	total: 15.5ms	remaining: 25m 53s
1000:	learn: 0.4537921	test: 0.4555206	best: 0.4555206 (1000)	total: 15s	remaining: 24m 39s
2000:	learn: 0.4453251	test: 0.4493076	best: 0.4493076 (2000)	total: 29.3s	remaining: 23m 54s
3000:	learn: 0.4394937	test: 0.4457622	best: 0.4457622 (3000)	total: 42.8s	remaining: 23m 4s
4000:	learn: 0.4351310	test: 0.4436022	best: 0.4436022 (4000)	total: 56.3s	remaining: 22m 30s
5000:	learn: 0.4315595	test: 0.4421647	best: 0.4421647 (5000)	total: 1m 9s	remaining: 22m 8s
6000:	learn: 0.4283641	test: 0.4410484	best: 0.4410484 (6000)	total: 1m 23s	remaining: 21m 48s
7000:	learn: 0.4254332	test: 0.4401483	best: 0.4401483 (7000)	total: 1m 37s	remaining: 21m 29s
8000:	learn: 0.4227638	test: 0.4394138	best: 0.4394135 (7999)	total: 1m 50s	remaining: 21m 10s
9000:	learn: 0.4201954	test: 0.4387803	best: 0.4387803 (9000)	total: 2m 4s	remaining: 20m 53s
10000:	learn: 0.4177389	test: 0.4382

In [5]:
# Score the test frame with the current models, using the filtered URL text
# column as the text feature (same pool settings as the fit of this section).
cat_features = [
    "top_part_of_day",
    "top_city",
    "top_region",
    "device_manufacturer",
    "device_model",
]
pool_params = {
    "cat_features": cat_features,
    "text_features": ["urls_text_filtered"],
}

submission = cb.predict(
    test=test,
    pool_params=pool_params,
    models_sex=models_sex,
    models_age=models_age,
)

In [7]:
# Write the current `submission` frame to CSV without the index column.
submission.to_csv("../submissions/only-test-urls.csv", index=False)