In [19]:
import os

import joblib as jbl
import numpy as np
import pandas as pd
import polars as pl
import scipy.sparse as sp
from tqdm import tqdm

In [3]:
%%time
# Load batch 0 of the mapped k=2 combinations into a polars DataFrame.
# NOTE(review): `df` is not referenced again in the visible cells; presumably
# this is only a load-time probe for a single batch.
df = pl.read_parquet("../data/combinations/k=2/mapped/0.pq")

CPU times: user 1.47 s, sys: 599 ms, total: 2.07 s
Wall time: 2.06 s


In [21]:
# One pass over the 42 mapped batches, accumulating two totals:
#   n_users     - sum of the per-batch distinct `user_id` counts
#   max_comb_id - largest integer id found in any `url_combination_id` cell
n_users = 0
max_comb_id = 0
for i in tqdm(range(42)):
    batch = pl.read_parquet(f"../data/combinations/k=2/mapped/{i}.pq")
    n_users += batch["user_id"].n_unique()
    # Each cell is a space-separated string of ids: split it, flatten the
    # resulting lists into one column, and take the numeric maximum.
    max_comb_id = max(
        max_comb_id,
        batch["url_combination_id"].str.split(" ").explode().cast(pl.Int32).max(),
    )

100%|███████████████████████████████████████████| 42/42 [12:02<00:00, 17.21s/it]


In [22]:
# Largest combination id found by the scan over all batches.
max_comb_id

163791897

In [23]:
# Sum of the per-batch distinct user counts from the scan over all batches.
n_users

415317

In [2]:
def to_sparse_batch(i: int, n_combinations: int = 163791897 + 1) -> None:
    """Convert one mapped parquet batch into a sparse user x combination count matrix.

    Reads ``../data/combinations/mapped/{i}.pq``. Every row is unpacked as
    ``(user_id, combs)``, where ``combs`` is a whitespace-separated string of
    integer combination ids. Two artefacts are written to
    ``../data/combinations/sparse/{i}/`` (created if missing):

    * ``users.jbl`` - dict mapping each user id to its row index in the matrix;
    * ``combinations.npz`` - int32 CSR matrix in which entry ``(row, comb)``
      counts how many times ``comb`` occurs for that user.

    Args:
        i: Batch number; selects both the input file and the output directory.
        n_combinations: Number of matrix columns. The default is the literal
            the notebook originally hard-coded (largest combination id + 1).
    """
    # NOTE(review): the other cells of this notebook read their batches from
    # "../data/combinations/k=2/mapped/"; confirm this path is the intended input.
    batch = pl.read_parquet(f"../data/combinations/mapped/{i}.pq")

    # Rows get dense indices in first-seen order. For unique user ids this is
    # exactly the row position in the batch; a repeated user id is folded into
    # its first row instead of pointing past the end of the matrix.
    users = {}
    row_of_record = []
    col_chunks = []
    for user_id, combs in tqdm(batch.iter_rows(), desc=f"batch={i}", total=len(batch)):
        row_of_record.append(users.setdefault(user_id, len(users)))
        col_chunks.append(np.fromiter(map(int, combs.split()), dtype=np.int64))

    if col_chunks:
        cols = np.concatenate(col_chunks)
        rows = np.repeat(row_of_record, [chunk.size for chunk in col_chunks])
    else:
        cols = np.empty(0, dtype=np.int64)
        rows = np.empty(0, dtype=np.int64)

    # Build the matrix in one shot rather than incrementing a DOK matrix one
    # element at a time: COO -> CSR conversion sums duplicate (row, col) pairs,
    # which yields the same per-user occurrence counts.
    combinations_matrix = sp.coo_matrix(
        (np.ones(cols.size, dtype=np.int32), (rows, cols)),
        shape=(len(users), n_combinations),
        dtype=np.int32,
    ).tocsr()

    out_dir = f"../data/combinations/sparse/{i}"
    # Neither joblib.dump nor save_npz creates missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    jbl.dump(users, f"{out_dir}/users.jbl")
    sp.save_npz(f"{out_dir}/combinations.npz", combinations_matrix)

In [None]:
# Convert all 42 batches, running up to 14 joblib jobs at a time.
# to_sparse_batch returns None, so the value of this cell is a list of 42 Nones.
jbl.Parallel(n_jobs=14)(jbl.delayed(to_sparse_batch)(i) for i in range(42))

batch=1:  93%|█████████▎| 9288/10000 [15:29<01:09, 10.20it/s]]  
batch=0:  99%|█████████▉| 9914/10000 [16:18<00:11,  7.70it/s]]]
batch=1:  98%|█████████▊| 9835/10000 [16:24<00:05, 29.49it/s]]
batch=1:  99%|█████████▊| 9864/10000 [16:27<00:09, 14.71it/s]]
batch=12:  95%|█████████▌| 9515/10000 [16:55<01:08,  7.08it/s]
batch=12:  96%|█████████▌| 9571/10000 [17:02<00:58,  7.28it/s]]
batch=14:  11%|█▏        | 1134/10000 [01:37<10:14, 14.42it/s]
batch=16:   5%|▍         | 477/10000 [00:46<15:19, 10.35it/s]]]
batch=20:   1%|          | 111/10000 [00:11<34:47,  4.74it/s]] 
batch=21:   2%|▏         | 193/10000 [00:20<14:45, 11.08it/s]] 
batch=22:   2%|▏         | 175/10000 [00:19<22:52,  7.16it/s]]
batch=14:  15%|█▌        | 1545/10000 [02:17<11:35, 12.15it/s]  
batch=19:   4%|▍         | 410/10000 [00:48<27:37,  5.79it/s]]]
batch=18:  10%|▉         | 973/10000 [01:21<09:24, 15.99it/s]] 
batch=17:  85%|████████▍ | 8489/10000 [14:15<01:28, 17.01it/s]] 
batch=19:  98%|█████████▊| 9783/10000 [15:

In [5]:
# Stack the per-batch matrices vertically in batch order (0..41), so global
# row order is "batch number first, then row index within the batch".
combinations_matrix = sp.vstack(
    [
        sp.load_npz(f"../data/combinations/sparse/{i}/combinations.npz")
        for i in tqdm(range(42))
    ]
)

100%|███████████████████████████████████████████| 42/42 [01:18<00:00,  1.86s/it]


In [7]:
%%time
# Persist the stacked count matrix. save_npz compresses by default
# (compressed=True); the recorded run of this cell took about 34 minutes.
sp.save_npz("../data/combinations/matrix.npz", combinations_matrix)

CPU times: user 33min 47s, sys: 11.7 s, total: 33min 59s
Wall time: 34min 4s


In [15]:
%%time
# Largest value anywhere in the matrix (implicit zeros included); used to
# decide how narrow an integer dtype the counts can be stored in.
combinations_matrix.max()

CPU times: user 3min 12s, sys: 2.43 s, total: 3min 14s
Wall time: 3min 15s


1

In [16]:
%%time
# Shrink the stored values to one byte each; the preceding max check
# returned 1, so every value fits in uint8.
combinations_matrix = combinations_matrix.astype(np.uint8)

CPU times: user 9.56 s, sys: 8.42 s, total: 18 s
Wall time: 18 s


In [19]:
%%time
# Persist the uint8 version of the matrix (compressed .npz, the save_npz default).
sp.save_npz("../data/combinations/matrix-uint8.npz", combinations_matrix)

CPU times: user 21min 24s, sys: 9.13 s, total: 21min 33s
Wall time: 21min 37s


In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
# TF-IDF re-weighting of the count matrix, with all scikit-learn defaults.
tfidf = TfidfTransformer()

In [24]:
%%time
# Fit the transformer on the full matrix and transform it in the same call;
# the result is then cast to float32.
combinations_tfidf = tfidf.fit_transform(combinations_matrix).astype(np.float32)

CPU times: user 48min 52s, sys: 4min 44s, total: 53min 37s
Wall time: 53min 44s


In [27]:
%%time
# Persist the TF-IDF matrix so the SVD step can start from this file.
sp.save_npz("../data/combinations/matrix-tfidf.npz", combinations_tfidf)

CPU times: user 28min 26s, sys: 27.5 s, total: 28min 53s
Wall time: 28min 57s


In [2]:
%%time
# Reload the persisted TF-IDF matrix.
# NOTE(review): execution counts restart at this cell, so it presumably
# begins a fresh kernel session; confirm before relying on earlier variables.
combinations_tfidf = sp.load_npz("../data/combinations/matrix-tfidf.npz")

CPU times: user 2min 2s, sys: 14.9 s, total: 2min 17s
Wall time: 2min 17s


In [3]:
# Show the sparse matrix summary (shape, dtype, stored elements, format).
combinations_tfidf

<415317x163791898 sparse matrix of type '<class 'numpy.float32'>'
	with 2259411066 stored elements in Compressed Sparse Row format>

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
# Reduce the TF-IDF matrix to 32 components; random_state is fixed so the
# decomposition is repeatable between runs.
svd = TruncatedSVD(n_components=32, random_state=777)

In [6]:
%%time
# Fit the SVD on the TF-IDF matrix and project every row onto the 32
# components. The recorded run took about 3.5 hours of wall time.
combinations_svd_32 = svd.fit_transform(combinations_tfidf)

CPU times: user 3h 59min 57s, sys: 31min 34s, total: 4h 31min 32s
Wall time: 3h 31min 17s


In [10]:
# Persist the dense embedding array in NumPy's .npy format.
np.save("../data/combinations/svd-32.npy", combinations_svd_32)

In [12]:
# Shape check: one row per matrix row, one column per SVD component.
combinations_svd_32.shape

(415317, 32)

In [13]:
# Rebuild the global "matrix row -> user_id" list in the same order in which
# the per-batch matrices were stacked (batch 0 first, then batch 1, ...).
users = []
for i in tqdm(range(42)):
    batch_users = jbl.load(f"../data/combinations/sparse/{i}/users.jbl")
    # Invert this batch's {user_id: row index} mapping locally, then append.
    # Appending batch by batch (instead of writing to `i * 10_000 + idx` in a
    # list pre-sized to 415317) keeps ids aligned with matrix rows even when a
    # batch does not hold exactly 10,000 users. A row index outside the batch's
    # own range raises IndexError rather than landing in another batch's slot.
    batch_rows = [None] * len(batch_users)
    for user_id, idx in batch_users.items():
        batch_rows[idx] = user_id
    users.extend(batch_rows)

100%|███████████████████████████████████████████| 42/42 [00:01<00:00, 22.06it/s]


In [20]:
# Pair every user id with its SVD embedding, stored as a plain Python list
# of float32-rounded values (one list per row).
embs = pd.DataFrame(
    {
        "user_id": users,
        "url_combinations_svd_embeddings": combinations_svd_32.astype(np.float32).tolist(),
    }
)

In [23]:
# Convert the pandas frame to polars and pin the column dtypes for storage:
# user_id -> UInt32, embeddings -> List(Float32).
# NOTE(review): the UInt32 cast assumes every user id is a non-negative
# integer that fits in 32 bits - TODO confirm.
e =(
    pl.from_pandas(embs)
    .select(
        [
            pl.col("user_id").cast(pl.UInt32),
            pl.col("url_combinations_svd_embeddings").cast(pl.List(pl.Float32))
        ]
    )
)

In [25]:
# Write the per-user embedding table to the features directory as parquet.
e.write_parquet("../data/features/svd-32.pq")