In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from functools import partial

import joblib as jbl
import polars as pl

from mts_ml_cup import feature_engineering as fe
from mts_ml_cup.preprocessing import urls as u
from mts_ml_cup import utils

In [3]:
%%time
# `u.clean_url` with its `preprocessors` argument pre-bound to the three helpers below.
url_cleaner = partial(
    u.clean_url,
    preprocessors=[u.decode_from_punycode, u.lower, u.replace_hyphens_with_dots],
)

sessions = pl.read_parquet("../data/processed/sessions.pq")

# Run the Python-level cleaner once per distinct host instead of once per row;
# the cleaned value is mapped back onto every row by the join below.
cleaned_hosts = (
    sessions
    .select("url_host")
    .unique()
    .with_columns(pl.col("url_host").apply(url_cleaner).alias("url_cleaned"))
)

# Replace the raw `url_host` with the cleaned value. Note that `url_cleaned`
# is kept as well, so the frame ends up with both columns holding the same data.
sessions = (
    sessions
    .join(cleaned_hosts, on="url_host", how="left")
    .select(pl.exclude("url_host"))
    .with_columns(pl.col("url_cleaned").alias("url_host"))
)

# The lookup table is only needed for the join above.
del cleaned_hosts

CPU times: user 2min 9s, sys: 44.2 s, total: 2min 53s
Wall time: 1min 45s


# device

In [4]:
%%time
# Run `fe.device.manufacturer_by_user` on `sessions` and store its output as parquet.
(
    fe.device.manufacturer_by_user(sessions)
    .write_parquet("../data/feat/device/manufacturer.pq")
)

CPU times: user 18.2 s, sys: 6.44 s, total: 24.6 s
Wall time: 3.34 s


In [5]:
%%time
# Run `fe.device.os_by_user` on `sessions` and store its output as parquet.
(
    fe.device.os_by_user(sessions)
    .write_parquet("../data/feat/device/os.pq")
)

CPU times: user 18.1 s, sys: 6.18 s, total: 24.3 s
Wall time: 3.22 s


In [6]:
%%time
# Run `fe.device.model_by_user` on `sessions` and store its output as parquet.
(
    fe.device.model_by_user(sessions)
    .write_parquet("../data/feat/device/model.pq")
)

CPU times: user 18.3 s, sys: 6.02 s, total: 24.3 s
Wall time: 3.24 s


In [7]:
%%time
# Left-join the output of `price_by_model` onto the output of `model_by_user`
# via `device_model_id`, then drop that join key before writing.
(
    fe.device.model_by_user(sessions)
    .join(fe.device.price_by_model(sessions), on="device_model_id", how="left")
    .drop("device_model_id")
    .write_parquet("../data/feat/device/price.pq")
)

CPU times: user 39.3 s, sys: 10.8 s, total: 50.1 s
Wall time: 9.61 s


In [8]:
%%time
# Run `fe.device.type_by_user` on `sessions` and store its output as parquet.
(
    fe.device.type_by_user(sessions)
    .write_parquet("../data/feat/device/type.pq")
)

CPU times: user 18 s, sys: 5.91 s, total: 23.9 s
Wall time: 3.17 s


# geo

In [9]:
%%time
# Run `fe.geo.region_stats_by_user` on `sessions` and store its output as parquet.
(
    fe.geo.region_stats_by_user(sessions)
    .write_parquet("../data/feat/geo/region-stats.pq")
)

CPU times: user 20.1 s, sys: 6.51 s, total: 26.6 s
Wall time: 3.62 s


In [10]:
%%time
# Run `fe.geo.city_stats_by_user` on `sessions` and store its output as parquet.
(
    fe.geo.city_stats_by_user(sessions)
    .write_parquet("../data/feat/geo/city-stats.pq")
)

CPU times: user 21.9 s, sys: 6.14 s, total: 28.1 s
Wall time: 3.96 s


# rosstat

In [11]:
# Read the Rosstat table, then left-join a `region_mts` -> `region_id` lookup
# built by `utils.polars_map` from the saved regions mapping.
# NOTE: `jbl.load` unpickles the file, so it must only be pointed at trusted artifacts.
stats = pl.read_csv("../data/processed/rosstat-mts.csv")
stats = stats.join(
    utils.polars_map(
        jbl.load("../data/mappings/regions.jbl"),
        key_name="region_mts",
        id_name="region_id",
        id_dtype=pl.UInt8,
    ),
    on="region_mts",
    how="left",
)

# Keep just the user key and the `geo_top_region_id` column; the rosstat cells
# below join region-level tables on the latter.
user_regions = (
    fe.geo.region_stats_by_user(sessions)
    .select(["user_id", "geo_top_region_id"])
)

In [12]:
%%time
# Left-join the output of `fe.rosstat.region_stats` to each user through
# `geo_top_region_id` = `region_id`, then drop the join key before writing.
(
    user_regions
    .join(
        fe.rosstat.region_stats(stats),
        left_on="geo_top_region_id",
        right_on="region_id",
        how="left",
    )
    .drop("geo_top_region_id")
    .write_parquet("../data/feat/rosstat/region-stats.pq")
)

CPU times: user 58.1 ms, sys: 20.7 ms, total: 78.9 ms
Wall time: 52.7 ms


In [13]:
%%time
# Left-join the output of `fe.rosstat.sex_share_by_region` to each user through
# `geo_top_region_id` = `region_id`, then drop the join key before writing.
(
    user_regions
    .join(
        fe.rosstat.sex_share_by_region(stats),
        left_on="geo_top_region_id",
        right_on="region_id",
        how="left",
    )
    .drop("geo_top_region_id")
    .write_parquet("../data/feat/rosstat/sex-share.pq")
)

CPU times: user 49.2 ms, sys: 0 ns, total: 49.2 ms
Wall time: 29.7 ms


In [14]:
%%time
# Left-join the output of `fe.rosstat.age_share_by_region` to each user through
# `geo_top_region_id` = `region_id`, then drop the join key before writing.
(
    user_regions
    .join(
        fe.rosstat.age_share_by_region(stats),
        left_on="geo_top_region_id",
        right_on="region_id",
        how="left",
    )
    .drop("geo_top_region_id")
    .write_parquet("../data/feat/rosstat/age-share.pq")
)

CPU times: user 133 ms, sys: 55.6 ms, total: 189 ms
Wall time: 166 ms


In [15]:
%%time
# Left-join the output of `fe.rosstat.sex_age_share_by_region` to each user through
# `geo_top_region_id` = `region_id`, then drop the join key before writing.
(
    user_regions
    .join(
        fe.rosstat.sex_age_share_by_region(stats),
        left_on="geo_top_region_id",
        right_on="region_id",
        how="left",
    )
    .drop("geo_top_region_id")
    .write_parquet("../data/feat/rosstat/sex-age-share.pq")
)

CPU times: user 226 ms, sys: 36.5 ms, total: 262 ms
Wall time: 230 ms


# url

In [16]:
%%time
# Run `fe.url.urls_stats_by_user` on `sessions` and store its output as parquet.
fe.url.urls_stats_by_user(sessions).write_parquet("../data/feat/url/stats.pq")

CPU times: user 2min, sys: 18.9 s, total: 2min 19s
Wall time: 1min 10s


In [17]:
%%time
# Run `fe.url.top_n_urls_by_user` with `top_n=120` on `sessions` and store its output as parquet.
fe.url.top_n_urls_by_user(sessions, top_n=120).write_parquet("../data/feat/url/top-120.pq")

CPU times: user 3min 8s, sys: 22.2 s, total: 3min 30s
Wall time: 50.1 s


In [19]:
%%time
# Run `fe.url.all_urls_by_user_as_text` on `sessions` and store its output as parquet.
fe.url.all_urls_by_user_as_text(sessions).write_parquet("../data/feat/url/all.pq")

CPU times: user 3min 30s, sys: 16 s, total: 3min 46s
Wall time: 1min 37s


In [None]:
%%time
# `u.clean_url` with its `preprocessors` argument pre-bound. This is the same
# definition as in the loading cell and is repeated so this cell can run on its own.
url_cleaner = partial(
    u.clean_url,
    preprocessors=[
        u.decode_from_punycode,
        u.lower,
        u.replace_hyphens_with_dots,
    ],
)

# Only `user_id` and `url_host` are read for this feature. The projection is bound
# to its own name rather than to `sessions`: later cells (time, usage) are handed
# `sessions`, and rebinding it here would pass them this two-column frame on a
# top-to-bottom run.
# NOTE(review): this keeps the full `sessions` frame alive while the combinations
# are computed. If memory is tight, run this cell in a fresh kernel instead.
url_sessions = pl.read_parquet("../data/processed/sessions.pq", columns=["user_id", "url_host"])

# Clean each distinct host once, map the result back by joining on the raw host,
# then replace `url_host` with the cleaned value (`url_cleaned` is kept as well).
url_sessions = (
    url_sessions
    .join(
        other=url_sessions
            .select("url_host")
            .unique()
            .with_columns(pl.col("url_host").apply(url_cleaner).alias("url_cleaned")),
        on="url_host",
        how="left",
    )
    .select(pl.exclude("url_host"))
    .with_columns(pl.col("url_cleaned").alias("url_host"))
)

(
    fe.url.all_urls_combinations_as_text(url_sessions, k=2)
    .write_parquet("../data/feat/url/combinations-k2.pq")
)

# Release the projection once the feature has been written.
del url_sessions

# time

In [4]:
%%time
# Run `fe.time.time_period_by_user` on `sessions` and store its output as parquet.
(
    fe.time.time_period_by_user(sessions)
    .write_parquet("../data/feat/time/period.pq")
)

CPU times: user 1min 20s, sys: 3.97 s, total: 1min 24s
Wall time: 19 s


In [21]:
%%time
# Run `fe.time.top_part_of_day_by_user` on `sessions` and store its output as parquet.
(
    fe.time.top_part_of_day_by_user(sessions)
    .write_parquet("../data/feat/time/top-part-of-day.pq")
)

CPU times: user 26.2 s, sys: 5.78 s, total: 32 s
Wall time: 4.32 s


In [22]:
%%time
# Run `fe.time.part_of_day_distribution_by_user` on `sessions` and store its output as parquet.
(
    fe.time.part_of_day_distribution_by_user(sessions)
    .write_parquet("../data/feat/time/part-of-day-dist.pq")
)

CPU times: user 38.3 s, sys: 7.77 s, total: 46.1 s
Wall time: 9.63 s


# usage

In [23]:
%%time
# Run `fe.usage.total_usage_stats_by_user` on `sessions` and store its output as parquet.
(
    fe.usage.total_usage_stats_by_user(sessions)
    .write_parquet("../data/feat/usage/total.pq")
)

CPU times: user 4min 58s, sys: 4.12 s, total: 5min 2s
Wall time: 38.2 s


In [24]:
%%time
# Run `fe.usage.usage_stats_per_date` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_date(sessions)
    .write_parquet("../data/feat/usage/per-date.pq")
)

CPU times: user 10min 44s, sys: 9.45 s, total: 10min 54s
Wall time: 1min 23s


In [25]:
%%time
# Run `fe.usage.usage_stats_per_part_of_day` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_part_of_day(sessions)
    .write_parquet("../data/feat/usage/per-part-of-day.pq")
)

CPU times: user 3min 12s, sys: 20.9 s, total: 3min 32s
Wall time: 29.7 s


In [26]:
%%time
# Run `fe.usage.usage_stats_per_url` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_url(sessions)
    .write_parquet("../data/feat/usage/per-url.pq")
)

CPU times: user 18min 17s, sys: 16.1 s, total: 18min 33s
Wall time: 2min 22s


In [27]:
%%time
# Run `fe.usage.usage_stats_per_session` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_session(sessions)
    .write_parquet("../data/feat/usage/per-session.pq")
)

CPU times: user 4min 5s, sys: 13.7 s, total: 4min 18s
Wall time: 35.3 s


In [28]:
%%time
# Run `fe.usage.usage_stats_per_daily_visit` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_daily_visit(sessions)
    .write_parquet("../data/feat/usage/per-daily-visit.pq")
)

CPU times: user 8min 44s, sys: 47.9 s, total: 9min 32s
Wall time: 1min 44s


In [29]:
%%time
# Run `fe.usage.usage_stats_per_partly_visit` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_partly_visit(sessions)
    .write_parquet("../data/feat/usage/per-partly-visit.pq")
)

CPU times: user 4min 27s, sys: 23.1 s, total: 4min 51s
Wall time: 46.1 s


In [None]:
%%time
# Run `fe.usage.usage_stats_per_visit` on `sessions` and store its output as parquet.
(
    fe.usage.usage_stats_per_visit(sessions)
    .write_parquet("../data/feat/usage/per-visit.pq")
)