In [11]:
from functools import partial

import implicit
import polars as pl
import pandas as pd
import numpy as np
import scipy.sparse

from mts_ml_cup.preprocessing import urls as u

In [4]:
%%time
# Host normaliser: `u.clean_url` with a fixed preprocessor pipeline (punycode
# decoding, lower-casing, hyphen -> dot replacement, going by the helper
# names -- see mts_ml_cup.preprocessing.urls for the exact semantics).
url_preprocessors = [
    u.decode_from_punycode,
    u.lower,
    u.replace_hyphens_with_dots,
]
url_cleaner = partial(u.clean_url, preprocessors=url_preprocessors)

# Read only the three columns this notebook uses.
sessions = pl.read_parquet(
    "../data/processed/sessions.pq",
    columns=["user_id", "url_host", "request_cnt"],
)

# Run the Python-level cleaner once per distinct host rather than once per
# row, then attach the cleaned value to every session row via a left join.
cleaned_hosts = (
    sessions
    .select("url_host")
    .unique()
    .with_columns(pl.col("url_host").apply(url_cleaner).alias("url_cleaned"))
)
sessions = (
    sessions
    .join(other=cleaned_hosts, on="url_host", how="left")
    .drop("url_host")
    # Re-create `url_host` from the cleaned values; `url_cleaned` stays too.
    .with_columns(pl.col("url_cleaned").alias("url_host"))
)

CPU times: user 3min 42s, sys: 1min 21s, total: 5min 3s
Wall time: 1min 8s


In [6]:
%%time
# Collapse the session log to one row per (user, host) pair carrying the
# summed request count.
pair_keys = ["user_id", "url_host"]
data_agg = sessions.groupby(pair_keys).agg(pl.col("request_cnt").sum())

CPU times: user 2min 57s, sys: 42.4 s, total: 3min 40s
Wall time: 34.9 s


In [10]:
# Persist the aggregated (user, host, request_cnt) table to disk.
data_agg_target = "../data/processed/data-agg.pq"
data_agg.write_parquet(data_agg_target)

In [2]:
# Load the aggregated (user, host, request_cnt) table from disk.
data_agg_source = "../data/processed/data-agg.pq"
data_agg = pl.read_parquet(data_agg_source)

In [3]:
# Build dense, zero-based index mappings: host -> column index (`url_dict`)
# and user id -> row index (`usr_dict`).
#
# The distinct values are sorted before being numbered. Numbering them by
# iterating a Python `set` follows hash order, and string hashes are salted
# per process, so the host numbering would change on every kernel restart.
# Sorting makes the mapping reproducible from the data alone.
url_values = data_agg["url_host"].unique().sort().to_list()
url_set = set(url_values)
print(f"{len(url_set)} urls")
url_dict = {url: url_idx for url_idx, url in enumerate(url_values)}

usr_values = data_agg["user_id"].unique().sort().to_list()
usr_set = set(usr_values)
print(f"{len(usr_set)} users")
usr_dict = {usr: usr_idx for usr_idx, usr in enumerate(usr_values)}

199508 urls
415317 users


In [4]:
%%time
# Translate ids to the dense indices from `usr_dict` / `url_dict`; each
# aggregated (user, host) pair becomes one (row, col, value) triple.
user_index = data_agg["user_id"].to_pandas().map(usr_dict)
host_index = data_agg["url_host"].to_pandas().map(url_dict)

values = data_agg["request_cnt"].to_pandas().to_numpy()
rows = user_index.to_numpy()
cols = host_index.to_numpy()

# Users are rows, hosts are columns; the shape is taken from the largest index
# seen on each axis. Assembled in COO form, then converted to CSR.
n_rows = rows.max() + 1
n_cols = cols.max() + 1
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols)).tocsr()

CPU times: user 23.6 s, sys: 3.59 s, total: 27.2 s
Wall time: 27.1 s


In [5]:
%%time
# Hyper-parameters for the ALS factorisation: 50 factors, 30 iterations,
# CPU only, training loss not computed.
als_params = dict(
    factors=50,
    iterations=30,
    regularization=0.1,
    use_gpu=False,
    calculate_training_loss=False,
)
als = implicit.approximate_als.FaissAlternatingLeastSquares(**als_params)

# `mat` has users as rows and hosts as columns.
als.fit(mat)

100%|███████████████████████████████████████████| 30/30 [05:57<00:00, 11.92s/it]


CPU times: user 1h 31min 53s, sys: 1h 15min 4s, total: 2h 46min 57s
Wall time: 6min 3s


In [6]:
# Take the learned factor matrices from the model wrapped by `als`:
# `u_factors` for users, `d_factors` for items (the hosts).
u_factors, d_factors = als.model.user_factors, als.model.item_factors

In [14]:
%%time
# Invert `usr_dict` (user id -> row index) into a list ordered by row index,
# so that position i corresponds to row i of `u_factors`.
users = [uid for uid, _ in sorted(usr_dict.items(), key=lambda pair: pair[1])]

# One float32 vector per user, as nested Python lists so the column converts
# to a polars List column.
user_vectors = np.asarray(u_factors, dtype=np.float32).tolist()
user_embs_pd = pd.DataFrame({"user_id": users, "als_embeddings": user_vectors})

# Move to polars and pin the output dtypes explicitly.
user_embs = pl.from_pandas(user_embs_pd).select(
    [
        pl.col("user_id").cast(pl.UInt32),
        pl.col("als_embeddings").cast(pl.List(pl.Float32)),
    ]
)

CPU times: user 2.94 s, sys: 647 ms, total: 3.58 s
Wall time: 3.6 s


In [17]:
%%time
# Invert `url_dict` (host -> column index) into a list ordered by column
# index, so that position i corresponds to row i of `d_factors`.
urls = [host for host, _ in sorted(url_dict.items(), key=lambda pair: pair[1])]

# One float32 vector per host, as nested Python lists so the column converts
# to a polars List column.
url_vectors = np.asarray(d_factors, dtype=np.float32).tolist()
url_embs_pd = pd.DataFrame({"url_host": urls, "als_embeddings": url_vectors})

# Move to polars and pin the output dtypes explicitly.
url_embs = pl.from_pandas(url_embs_pd).select(
    [
        pl.col("url_host").cast(pl.Utf8),
        pl.col("als_embeddings").cast(pl.List(pl.Float32)),
    ]
)

CPU times: user 2.73 s, sys: 406 ms, total: 3.13 s
Wall time: 3.34 s


In [19]:
# Save the per-user ALS embeddings as a feature file.
user_embs_target = "../data/features/als-user-embeddings.pq"
user_embs.write_parquet(user_embs_target)

In [20]:
# Save the per-host ALS embeddings as a feature file.
url_embs_target = "../data/features/als-urls-embeddings.pq"
url_embs.write_parquet(url_embs_target)