In [1]:
import os
import functools as ft

import catboost as cb
import pandas as pd
import numpy as np
import polars as pl
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
# Raw browsing sessions; later cells read its user_id, url_host and request_cnt columns.
df = pl.read_parquet("../data/processed/sessions-noncat.pq")

In [3]:
%%time
urls = (
    df
    .groupby(["user_id", "url_host"])
    .agg(pl.col("request_cnt").sum())
    .sort(["user_id", "request_cnt", "url_host"], reverse=[False, True, False])
    .groupby("user_id").agg(pl.col("url_host"))
    .with_columns(
        [
            pl.col("url_host").apply(lambda urls: " ".join(urls)).alias("urls_text"),
            pl.col("url_host").apply(
                lambda urls: " ".join(
                    set(ft.reduce(
                        lambda a, b: a + b, map(lambda url: url.split("."), urls)
                    ))
                )
            ).alias("url_tokens_text")
        ]
    )
)

CPU times: user 6min 54s, sys: 2min 16s, total: 9min 10s
Wall time: 3min 24s


In [3]:
# Per-user tables. `train` carries the is_male / age / age_bucket columns that
# later cells filter on and strip; both tables are joined to URL features on user_id.
train = pl.read_parquet("../data/processed/train-users.pq")
test = pl.read_parquet("../data/processed/test-users.pq")

In [4]:
def join_precomputed_features(
    dataset: pl.DataFrame, features_dir: str = "../data/features"
) -> pl.DataFrame:
    """Left-join every precomputed feature table onto ``dataset`` by ``user_id``.

    The feature store is expected to look like ``<features_dir>/<type>/<name>.pq``.
    Sub-directories and files are visited in sorted order so the resulting
    column order is reproducible across runs and machines (``os.listdir``
    alone returns entries in arbitrary order).

    Args:
        dataset: Frame with a ``user_id`` column to enrich.
        features_dir: Root directory of the feature store.

    Returns:
        ``dataset`` with the columns of every ``.pq`` file appended; users
        missing from a feature table get nulls for its columns.
    """
    for feat_type in sorted(os.listdir(features_dir)):
        type_dir = os.path.join(features_dir, feat_type)
        # Skip stray files in the root (e.g. .DS_Store); listing them would raise.
        if not os.path.isdir(type_dir):
            continue

        for features_file in sorted(os.listdir(type_dir)):
            if not features_file.endswith(".pq"):
                continue

            features = pl.read_parquet(os.path.join(type_dir, features_file))
            dataset = dataset.join(features, how="left", on="user_id")
    return dataset

# url as text

In [6]:
%%time
train_urls_as_text = (
    join_precomputed_features(train)
    .join(urls.select(["user_id", "urls_text"]), how="inner", on="user_id")
    
)
test_urls_as_text = (
    join_precomputed_features(test)
    .join(urls.select(["user_id", "urls_text"]), how="inner", on="user_id")
)

CPU times: user 9.79 s, sys: 6.2 s, total: 16 s
Wall time: 2.83 s


In [7]:
%%time
train_sex_text = (
    train_urls_as_text
    .filter(pl.col("is_male").is_not_null())
    .select(pl.exclude(["user_id", "age", "age_bucket"]))
)
train_age_text = (
    train_urls_as_text
    .filter(pl.col("age_bucket") > 0)
    .select(pl.exclude(["user_id", "age", "is_male"]))
)

CPU times: user 1.07 s, sys: 3.27 s, total: 4.35 s
Wall time: 1.18 s


In [8]:
# Build one CatBoost pool per task: the target column is split off as the
# label, everything else is passed as features with identical cat/text columns.
train_sex_text_pool, train_age_text_pool = (
    cb.Pool(
        data=frame.select(pl.exclude(target)).to_pandas(),
        label=frame[target].to_pandas(),
        cat_features=[
            "top_part_of_day",
            "top_city",
            "top_region",
            "device_manufacturer",
            "device_model",
        ],
        text_features=["urls_text"],
    )
    for frame, target in (
        (train_sex_text, "is_male"),
        (train_age_text, "age_bucket"),
    )
)

In [9]:
%%time
_, cv_models_sex_text = cb.cv(
    pool=train_sex_text_pool,
    params={
        "eval_metric": "Logloss",
        "loss_function": "Logloss",
        "iterations": 10_000,
        "early_stopping_rounds": 20,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	learn: 0.6929682	test: 0.6929323	best: 0.6929323 (0)	total: 138ms	remaining: 22m 58s
100:	learn: 0.5770304	test: 0.5779740	best: 0.5779740 (100)	total: 9.39s	remaining: 15m 20s
200:	learn: 0.5479977	test: 0.5500185	best: 0.5500185 (200)	total: 18.5s	remaining: 14m 59s
300:	learn: 0.5305322	test: 0.5337562	best: 0.5337562 (300)	total: 27.3s	remaining: 14m 40s
400:	learn: 0.5189949	test: 0.5231744	best: 0.5231744 (400)	total: 36.4s	remaining: 14m 30s
500:	learn: 0.5095985	test: 0.5151352	best: 0.5151352 (500)	total: 46s	remaining: 14m 32s
600:	learn: 0.5022368	test: 0.5089833	best: 0.5089833 (600)	total: 55.9s	remaining: 14m 33s
700:	learn: 0.4963636	test: 0.5043682	best: 0.5043682 (700)	total: 1m 4s	remaining: 14m 16s
800:	learn: 0.4911794	test: 0.5003375	best: 0.5003375 (800)	total: 1m 14s	remaining: 14m 11s
900:	learn: 0.4867332	test: 0.4970246	best: 0.4970246 (900)	total: 1m 22s	remaining: 13m 53s
1000:	learn: 0.4827016	test: 0.4940905	best: 0.4940905 (1000)

CatBoostError: catboost/libs/model/model.cpp:1837: Models summation is not supported for models with text features

In [10]:
%%time
_, cv_models_age_text = cb.cv(
    pool=train_age_text_pool,
    params={
        "eval_metric": "MultiClass",
        "loss_function": "MultiClass",
        "iterations": 10_000,
        "early_stopping_rounds": 20,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	learn: 1.7751695	test: 1.7752288	best: 1.7752288 (0)	total: 422ms	remaining: 1h 10m 20s
100:	learn: 1.3858756	test: 1.3910840	best: 1.3910840 (100)	total: 47s	remaining: 1h 16m 46s
200:	learn: 1.3434891	test: 1.3511746	best: 1.3511746 (200)	total: 1m 30s	remaining: 1h 13m 25s
300:	learn: 1.3227559	test: 1.3327021	best: 1.3327021 (300)	total: 2m 11s	remaining: 1h 10m 49s
400:	learn: 1.3072832	test: 1.3190720	best: 1.3190720 (400)	total: 2m 53s	remaining: 1h 9m 17s
500:	learn: 1.2941968	test: 1.3082818	best: 1.3082818 (500)	total: 3m 35s	remaining: 1h 7m 59s
600:	learn: 1.2843813	test: 1.3006943	best: 1.3006943 (600)	total: 4m 16s	remaining: 1h 6m 50s
700:	learn: 1.2763102	test: 1.2947668	best: 1.2947668 (700)	total: 4m 57s	remaining: 1h 5m 49s
800:	learn: 1.2695754	test: 1.2899941	best: 1.2899941 (800)	total: 5m 38s	remaining: 1h 4m 43s
900:	learn: 1.2633377	test: 1.2857955	best: 1.2857955 (900)	total: 6m 19s	remaining: 1h 3m 51s
1000:	learn: 1.2577522	test: 1.

In [33]:
# Unlabelled pool for inference: every test column except user_id is a feature.
test_text_pool = cb.Pool(
    data=test_urls_as_text.select(pl.all().exclude("user_id")).to_pandas(),
    cat_features=["top_part_of_day", "top_city", "top_region", "device_manufacturer", "device_model"],
    text_features=["urls_text"],
)

In [34]:
# Average, over the CV fold models, the probability in column 1 of each
# model's "Probability" output.
test_pred_sex_text = sum(
    fold_model.predict(test_text_pool, prediction_type="Probability")[:, 1]
    for fold_model in cv_models_sex_text
) / len(cv_models_sex_text)

In [51]:
# Average the per-class probabilities of the CV fold models, then take the
# most likely class per row.
_age_proba_text = sum(
    fold_model.predict(test_text_pool, prediction_type="Probability")
    for fold_model in cv_models_age_text
) / len(cv_models_age_text)
# + 1 turns the 0-based argmax column into a bucket id — assumes the classes
# are the contiguous buckets 1..K; TODO confirm against the training labels.
test_pred_age_text = _age_proba_text.argmax(axis=1) + 1

In [52]:
# Assemble the submission: user id, predicted age bucket, probability of is_male.
submission_text = pd.DataFrame(
    {
        "user_id": test_urls_as_text["user_id"].to_pandas(),
        "age": test_pred_age_text,
        "is_male": test_pred_sex_text,
    }
)

In [54]:
# Persist the host-text submission without the pandas index column.
submission_text.to_csv("../submissions/cb-text.csv", index=False)

# url as text with different validation (IDNA-decoded hosts; AUC / TotalF1 as the early-stopping metrics)

In [5]:
@ft.lru_cache(maxsize=None)
def decode_url(url: str) -> str:
    """Return ``url`` with IDNA/punycode labels decoded to Unicode.

    Decoding is best-effort: any input the ``idna`` codec rejects (for
    example a host that already contains non-ASCII characters) is returned
    unchanged instead of raising.

    Results are memoised because the function is applied once per session
    row while the number of distinct hosts is far smaller, so each host is
    decoded only once.

    Args:
        url: Host name, possibly containing ``xn--`` labels.

    Returns:
        The decoded host, or ``url`` itself when decoding fails.
    """
    try:
        return bytearray(url, "utf-8").decode("idna")
    except Exception:
        # Deliberate fallback: keep the raw host rather than lose the row.
        return url

In [6]:
%%time
urls_ = (
    df
    .with_columns(pl.col("url_host").apply(decode_url))
    .groupby(["user_id", "url_host"])
    .agg(pl.col("request_cnt").sum())
    .sort(["user_id", "request_cnt", "url_host"], reverse=[False, True, False])
    .groupby("user_id").agg(pl.col("url_host"))
    .with_columns(pl.col("url_host").apply(lambda urls: " ".join(urls)).alias("urls_text"))
)

CPU times: user 13min 48s, sys: 5min 28s, total: 19min 17s
Wall time: 12min 43s


In [19]:
%%time
# Copy of the sessions frame with an extra url_decoded column holding the
# decode_url() result for each row's url_host.
df_ = df.with_columns(pl.col("url_host").apply(decode_url).alias("url_decoded"))

CPU times: user 8min 15s, sys: 2min, total: 10min 16s
Wall time: 10min 19s


In [22]:
# Number of distinct users per decoded host (the count is stored in the user_id column).
users_by_url = df_.groupby("url_decoded").agg(pl.col("user_id").n_unique())

In [27]:
# Share of all distinct users seen on each host, most widespread hosts first.
users_by_url.sort("user_id", reverse=True).with_columns(pl.col("user_id") / df["user_id"].n_unique())

url_decoded,user_id
str,f64
"""googleads.g.do...",0.950026
"""yandex.ru""",0.930386
"""avatars.mds.ya...",0.921446
"""i.ytimg.com""",0.918017
"""yastatic.net""",0.915539
"""ad.mail.ru""",0.902294
"""vk.com""",0.900753
"""tpc.googlesynd...",0.843081
"""ads.adfox.ru""",0.798436
"""online.sberban...",0.759504


In [7]:
%%time
train_urls_as_text_ = (
    join_precomputed_features(train)
    .join(urls_.select(["user_id", "urls_text"]), how="inner", on="user_id")
    
)
test_urls_as_text_ = (
    join_precomputed_features(test)
    .join(urls_.select(["user_id", "urls_text"]), how="inner", on="user_id")
)

CPU times: user 9.87 s, sys: 3.37 s, total: 13.2 s
Wall time: 2.47 s


In [8]:
%%time
train_sex_text_ = (
    train_urls_as_text_
    .filter(pl.col("is_male").is_not_null())
    .select(pl.exclude(["user_id", "age", "age_bucket"]))
)
train_age_text_ = (
    train_urls_as_text_
    .filter(pl.col("age_bucket") > 0)
    .select(pl.exclude(["user_id", "age", "is_male"]))
)

CPU times: user 1.31 s, sys: 2.02 s, total: 3.33 s
Wall time: 1.19 s


In [9]:
# Three pools sharing the same cat/text column layout. For the two training
# pools the target column is split off as the label; the test pool only drops
# user_id and has no label (None is what cb.Pool uses when label is omitted).
train_sex_text_pool_, train_age_text_pool_, test_text_pool_ = (
    cb.Pool(
        data=frame.select(pl.exclude(dropped)).to_pandas(),
        label=None if target is None else frame[target].to_pandas(),
        cat_features=[
            "top_part_of_day",
            "top_city",
            "top_region",
            "device_manufacturer",
            "device_model",
        ],
        text_features=["urls_text"],
    )
    for frame, dropped, target in (
        (train_sex_text_, "is_male", "is_male"),
        (train_age_text_, "age_bucket", "age_bucket"),
        (test_urls_as_text_, "user_id", None),
    )
)

In [10]:
%%time
_, cv_models_sex_text_ = cb.cv(
    pool=train_sex_text_pool_,
    params={
        "eval_metric": "AUC",
        "loss_function": "Logloss",
        "iterations": 10_000,
        "early_stopping_rounds": 50,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	test: 0.6306302	best: 0.6306302 (0)	total: 144ms	remaining: 24m 2s
100:	test: 0.7923721	best: 0.7923721 (100)	total: 9.88s	remaining: 16m 8s
200:	test: 0.8130002	best: 0.8130002 (200)	total: 20.3s	remaining: 16m 29s
300:	test: 0.8230711	best: 0.8230711 (300)	total: 30s	remaining: 16m 6s
400:	test: 0.8296933	best: 0.8296933 (400)	total: 40.4s	remaining: 16m 6s
500:	test: 0.8344916	best: 0.8344916 (500)	total: 50.4s	remaining: 15m 56s
600:	test: 0.8380213	best: 0.8380213 (600)	total: 1m	remaining: 15m 43s
700:	test: 0.8408500	best: 0.8408500 (700)	total: 1m 9s	remaining: 15m 20s
800:	test: 0.8433848	best: 0.8433848 (800)	total: 1m 18s	remaining: 15m 3s
900:	test: 0.8456232	best: 0.8456232 (900)	total: 1m 27s	remaining: 14m 48s
1000:	test: 0.8472572	best: 0.8472572 (1000)	total: 1m 37s	remaining: 14m 35s
1100:	test: 0.8488054	best: 0.8488054 (1100)	total: 1m 47s	remaining: 14m 32s
1200:	test: 0.8500654	best: 0.8500654 (1200)	total: 1m 57s	remaining: 14m 18s
1300:

In [12]:
%%time
_, cv_models_age_text_ = cb.cv(
    pool=train_age_text_pool_,
    params={
        "eval_metric": "TotalF1",
        "loss_function": "MultiClass",
        "iterations": 10_000,
        "early_stopping_rounds": 100,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	learn: 0.2736465	test: 0.2733458	best: 0.2733458 (0)	total: 424ms	remaining: 1h 10m 35s
100:	learn: 0.3485671	test: 0.3486997	best: 0.3486997 (100)	total: 52.3s	remaining: 1h 25m 21s
200:	learn: 0.3790908	test: 0.3776639	best: 0.3776639 (200)	total: 1m 38s	remaining: 1h 20m 7s
300:	learn: 0.3971832	test: 0.3917956	best: 0.3917956 (300)	total: 2m 22s	remaining: 1h 16m 34s
400:	learn: 0.4091668	test: 0.4023217	best: 0.4023700 (399)	total: 3m 6s	remaining: 1h 14m 13s
500:	learn: 0.4182278	test: 0.4097129	best: 0.4097129 (500)	total: 3m 50s	remaining: 1h 12m 42s
600:	learn: 0.4249657	test: 0.4149265	best: 0.4149265 (600)	total: 4m 33s	remaining: 1h 11m 21s
700:	learn: 0.4311196	test: 0.4190512	best: 0.4191690 (697)	total: 5m 16s	remaining: 1h 10m
800:	learn: 0.4354184	test: 0.4223843	best: 0.4223843 (800)	total: 5m 59s	remaining: 1h 8m 48s
900:	learn: 0.4391975	test: 0.4243675	best: 0.4246602 (896)	total: 6m 42s	remaining: 1h 7m 44s
1000:	learn: 0.4429720	test: 0.

In [13]:
# Average, over the CV fold models, the probability in column 1 of each
# model's "Probability" output.
test_pred_sex_text_ = sum(
    fold_model.predict(test_text_pool_, prediction_type="Probability")[:, 1]
    for fold_model in cv_models_sex_text_
) / len(cv_models_sex_text_)

In [14]:
# Average the per-class probabilities of the CV fold models, then take the
# most likely class per row.
_age_proba_text_ = sum(
    fold_model.predict(test_text_pool_, prediction_type="Probability")
    for fold_model in cv_models_age_text_
) / len(cv_models_age_text_)
# + 1 turns the 0-based argmax column into a bucket id — assumes the classes
# are the contiguous buckets 1..K; TODO confirm against the training labels.
test_pred_age_text_ = _age_proba_text_.argmax(axis=1) + 1

In [15]:
# Assemble the submission: user id, predicted age bucket, probability of is_male.
submission_text_ = pd.DataFrame(
    {
        "user_id": test_urls_as_text_["user_id"].to_pandas(),
        "age": test_pred_age_text_,
        "is_male": test_pred_sex_text_,
    }
)

In [17]:
# Persist the decoded-host / alternative-metric submission without the pandas index column.
submission_text_.to_csv("../submissions/cb-text-dval.csv", index=False)

In [18]:
# Preview the first rows of the raw sessions frame. The bare `df.` that was
# left here is an incomplete expression and fails with a SyntaxError.
df.head(10)

region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
str,str,str,str,str,str,str,f32,datetime[ns],str,u8,i32
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""ad.adriver.ru""","""smartphone""","""iOS""",20368.0,2022-06-15 00:00:00,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""apple.com""","""smartphone""","""iOS""",20368.0,2022-06-19 00:00:00,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""avatars.mds.ya...","""smartphone""","""iOS""",20368.0,2022-06-12 00:00:00,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""googleads.g.do...","""smartphone""","""iOS""",20368.0,2022-05-16 00:00:00,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""googleads.g.do...","""smartphone""","""iOS""",20368.0,2022-05-30 00:00:00,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""i.ytimg.com""","""smartphone""","""iOS""",20368.0,2022-03-29 00:00:00,"""evening""",2,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""icloud.com""","""smartphone""","""iOS""",20368.0,2022-03-17 00:00:00,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""m.avito.ru""","""smartphone""","""iOS""",20368.0,2022-05-19 00:00:00,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""relap.io""","""smartphone""","""iOS""",20368.0,2022-03-29 00:00:00,"""night""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""sun9-5.userapi...","""smartphone""","""iOS""",20368.0,2022-06-16 00:00:00,"""day""",1,45098


# url as tokens

In [11]:
%%time
train_urls_as_tokens = (
    join_precomputed_features(train)
    .join(urls.select(["user_id", "url_tokens_text"]), how="inner", on="user_id")
    
)
test_urls_as_tokens = (
    join_precomputed_features(test)
    .join(urls.select(["user_id", "url_tokens_text"]), how="inner", on="user_id")
)

CPU times: user 9.03 s, sys: 2.46 s, total: 11.5 s
Wall time: 2.11 s


In [14]:
%%time
train_sex_tokens = (
    train_urls_as_tokens
    .filter(pl.col("is_male").is_not_null())
    .select(pl.exclude(["user_id", "age", "age_bucket"]))
)
train_age_tokens = (
    train_urls_as_tokens
    .filter(pl.col("age_bucket") > 0)
    .select(pl.exclude(["user_id", "age", "is_male"]))
)

CPU times: user 889 ms, sys: 1.27 s, total: 2.16 s
Wall time: 612 ms


In [15]:
# Build one CatBoost pool per task: the target column is split off as the
# label, everything else is passed as features with identical cat/text columns.
train_sex_tokens_pool, train_age_tokens_pool = (
    cb.Pool(
        data=frame.select(pl.exclude(target)).to_pandas(),
        label=frame[target].to_pandas(),
        cat_features=[
            "top_part_of_day",
            "top_city",
            "top_region",
            "device_manufacturer",
            "device_model",
        ],
        text_features=["url_tokens_text"],
    )
    for frame, target in (
        (train_sex_tokens, "is_male"),
        (train_age_tokens, "age_bucket"),
    )
)

In [16]:
%%time
_, cv_models_sex_tokens = cb.cv(
    pool=train_sex_tokens_pool,
    params={
        "eval_metric": "Logloss",
        "loss_function": "Logloss",
        "iterations": 10_000,
        "early_stopping_rounds": 20,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	learn: 0.6930788	test: 0.6930021	best: 0.6930021 (0)	total: 86.6ms	remaining: 14m 25s
100:	learn: 0.5747810	test: 0.5759673	best: 0.5759673 (100)	total: 9.06s	remaining: 14m 48s
200:	learn: 0.5445128	test: 0.5471663	best: 0.5471663 (200)	total: 18s	remaining: 14m 37s
300:	learn: 0.5290514	test: 0.5326568	best: 0.5326568 (300)	total: 26.6s	remaining: 14m 18s
400:	learn: 0.5188423	test: 0.5233124	best: 0.5233124 (400)	total: 35.5s	remaining: 14m 10s
500:	learn: 0.5098133	test: 0.5155341	best: 0.5155341 (500)	total: 44.5s	remaining: 14m 3s
600:	learn: 0.5024415	test: 0.5094397	best: 0.5094397 (600)	total: 53.1s	remaining: 13m 51s
700:	learn: 0.4964505	test: 0.5046268	best: 0.5046268 (700)	total: 1m 1s	remaining: 13m 40s
800:	learn: 0.4916171	test: 0.5008927	best: 0.5008869 (799)	total: 1m 10s	remaining: 13m 29s
900:	learn: 0.4873344	test: 0.4977547	best: 0.4977547 (900)	total: 1m 19s	remaining: 13m 18s
1000:	learn: 0.4834743	test: 0.4949668	best: 0.4949668 (1000)

In [17]:
%%time
_, cv_models_age_tokens = cb.cv(
    pool=train_age_tokens_pool,
    params={
        "eval_metric": "MultiClass",
        "loss_function": "MultiClass",
        "iterations": 10_000,
        "early_stopping_rounds": 20,
        "random_seed": 777,
        "allow_writing_files": False,
    },
    folds=KFold(5, shuffle=True, random_state=777),
    verbose=100,
    return_models=True,
)

Training on fold [0/5]
0:	learn: 1.7756977	test: 1.7758303	best: 1.7758303 (0)	total: 503ms	remaining: 1h 23m 54s
100:	learn: 1.3934243	test: 1.3980664	best: 1.3980664 (100)	total: 47.9s	remaining: 1h 18m 15s
200:	learn: 1.3497837	test: 1.3569849	best: 1.3569849 (200)	total: 1m 32s	remaining: 1h 15m 4s
300:	learn: 1.3277835	test: 1.3370490	best: 1.3370490 (300)	total: 2m 13s	remaining: 1h 11m 34s
400:	learn: 1.3126306	test: 1.3240232	best: 1.3240232 (400)	total: 2m 54s	remaining: 1h 9m 32s
500:	learn: 1.2998901	test: 1.3135461	best: 1.3135461 (500)	total: 3m 35s	remaining: 1h 8m 2s
600:	learn: 1.2898288	test: 1.3058308	best: 1.3058308 (600)	total: 4m 16s	remaining: 1h 6m 46s
700:	learn: 1.2817605	test: 1.3000452	best: 1.3000452 (700)	total: 4m 56s	remaining: 1h 5m 37s
800:	learn: 1.2747830	test: 1.2952900	best: 1.2952900 (800)	total: 5m 36s	remaining: 1h 4m 27s
900:	learn: 1.2686001	test: 1.2912223	best: 1.2912223 (900)	total: 6m 17s	remaining: 1h 3m 29s
1000:	learn: 1.2632603	test: 1.

In [57]:
# Unlabelled pool for inference: every test column except user_id is a feature.
test_tokens_pool = cb.Pool(
    data=test_urls_as_tokens.select(pl.all().exclude("user_id")).to_pandas(),
    cat_features=["top_part_of_day", "top_city", "top_region", "device_manufacturer", "device_model"],
    text_features=["url_tokens_text"],
)

In [58]:
# Average, over the CV fold models, the probability in column 1 of each
# model's "Probability" output.
test_pred_sex_tokens = sum(
    fold_model.predict(test_tokens_pool, prediction_type="Probability")[:, 1]
    for fold_model in cv_models_sex_tokens
) / len(cv_models_sex_tokens)

In [60]:
# Average the per-class probabilities of the CV fold models, then take the
# most likely class per row.
_age_proba_tokens = sum(
    fold_model.predict(test_tokens_pool, prediction_type="Probability")
    for fold_model in cv_models_age_tokens
) / len(cv_models_age_tokens)
# + 1 turns the 0-based argmax column into a bucket id — assumes the classes
# are the contiguous buckets 1..K; TODO confirm against the training labels.
test_pred_age_tokens = _age_proba_tokens.argmax(axis=1) + 1

In [62]:
# Assemble the submission: user id, predicted age bucket, probability of is_male.
submission_tokens = pd.DataFrame(
    {
        "user_id": test_urls_as_tokens["user_id"].to_pandas(),
        "age": test_pred_age_tokens,
        "is_male": test_pred_sex_tokens,
    }
)

In [64]:
# Persist the host-token submission without the pandas index column.
submission_tokens.to_csv("../submissions/cb-tokens.csv", index=False)