### Data loading
Cell bellow will load small preprocessed part of VK dataset with 1% of random users and 1% of most frequent items

In [1]:
from huggingface_hub import hf_hub_download
import polars as pl
import numpy as np

subsample_name = 'up0.01_ip0.01'
# Original script fores us to use embedding with size 32
content_embedding_size = 32
# That is suboptimal cause we loose data, I will use 64
# content_embedding_size = 64

train_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'
                            for i in range(25)]
val_interactions_file = [f'subsamples/{subsample_name}/validation/week_25.parquet']

metadata_files = ['metadata/users_metadata.parquet',
                  'metadata/items_metadata.parquet',
                  'metadata/item_embeddings.npz']

for file in (train_interactions_files +
             val_interactions_file +
             metadata_files):
    hf_hub_download(
        repo_id='deepvk/VK-LSVD', repo_type='dataset',
        filename=file, local_dir='VK-LSVD'
    )

train_interactions = pl.concat([pl.scan_parquet(f'VK-LSVD/{file}')
                                for file in train_interactions_files])
# Train raw data
train_interactions = train_interactions.collect(engine='streaming')
# Validataion raw data
val_interactions = pl.read_parquet(f'VK-LSVD/{val_interactions_file[0]}')

# List of unique users in train dataset
train_users = train_interactions.select('user_id').unique()
# List of unique items in train dataset
train_items = train_interactions.select('item_id').unique()

# Metadata of 1% of most frequent videos
item_ids = np.load('VK-LSVD/metadata/item_embeddings.npz')['item_id']
# VK prearanged embeddings that describe video data
item_embeddings = np.load('VK-LSVD/metadata/item_embeddings.npz')['embedding']

# Select only items in our subset from global items metadata
mask = np.isin(item_ids, train_items.to_numpy())
# Leave only train metadata indices
item_ids = item_ids[mask]
# Leave only train metadata embeddings
item_embeddings = item_embeddings[mask]

# Here we chose embedding size; In exampe script we crop embedding to 32 positions
# That is suboptimal approach, we have full 64 positions, so here we loose useful data
# However it is unclear what features lie in second half
print(f"Crop embedding - item embedding shape: {item_embeddings.shape}")
item_embeddings = item_embeddings[:, :content_embedding_size]
# Temporary disable normalization to test GMM for class selection
print(f"Item embedding example before normalization {item_embeddings[0]}")
# item_embeddings = item_embeddings / np.linalg.norm(item_embeddings, axis=1).reshape((item_embeddings.shape[0], 1))
# print(f"Item embedding example after normalization {item_embeddings[0]}")

users_metadata = pl.read_parquet('VK-LSVD/metadata/users_metadata.parquet')
items_metadata = pl.read_parquet('VK-LSVD/metadata/items_metadata.parquet')

users_metadata = users_metadata.join(train_users, on='user_id')
items_metadata = items_metadata.join(train_items, on='item_id')
items_metadata = items_metadata.join(pl.DataFrame({'item_id': item_ids, 
                                                   'embedding': item_embeddings}), 
                                                    on='item_id')


  from .autonotebook import tqdm as notebook_tqdm


Crop embedding - item embedding shape: (196277, 64)
Item embedding example before normalization [-0.5225   -0.1632    0.133    -0.007618  0.1466    0.3093    0.01971
 -0.0708    0.02953   0.2052   -0.1664    0.213     0.013504  0.1641
 -0.2498   -0.146    -0.0669   -0.004204  0.03156  -0.02571  -0.0659
 -0.1031    0.09924   0.06976   0.10284  -0.0633    0.0561   -0.002018
  0.0637    0.03955  -0.0916    0.02654 ]


In [2]:
users_metadata

user_id,age,gender,geo,train_interactions_rank
u32,u8,u8,u8,u32
136302664,18,1,0,64701
347489880,18,1,0,40800
200182184,18,1,1,82233
202612548,18,1,1,965
417607951,18,1,1,17023
…,…,…,…,…
220172774,70,2,79,8146
310638477,70,2,79,1363
361209246,70,2,79,31077
368937023,70,2,79,70482


In [3]:
items_metadata

item_id,author_id,duration,train_interactions_rank,embedding
u32,u32,u8,u32,"array[f16, 32]"
1222,274696,35,98839,"[-0.522461, -0.163208, … 0.026535]"
2376,936009,9,97985,"[-0.270752, 0.321533, … 0.064697]"
2425,219847,63,149209,"[-0.505859, -0.174438, … -0.032288]"
5967,504767,59,56611,"[-0.541016, -0.079773, … 0.003593]"
8553,687320,28,142411,"[-0.307129, -0.172974, … 0.035645]"
…,…,…,…,…
608049069,330884,12,23765,"[-0.317627, -0.120422, … 0.098633]"
608053295,946780,13,114670,"[-0.522949, -0.110229, … 0.003149]"
608059538,930725,5,155307,"[-0.36792, 0.172363, … 0.067444]"
608061840,1222813,59,64327,"[-0.631348, -0.17749, … 0.011719]"


In [4]:
train_interactions

user_id,item_id,place,platform,agent,timespent,like,dislike,share,bookmark,click_on_author,open_comments
u32,u32,u8,u8,u8,u8,bool,bool,bool,bool,bool,bool
4862415,175404824,0,0,0,48,false,false,false,false,false,true
276873582,97755319,1,1,1,44,false,false,false,false,false,false
434112541,254862034,1,1,1,41,false,false,false,false,false,false
37377677,132750843,1,1,1,8,false,false,false,false,false,false
425914526,163619500,1,0,0,27,false,false,false,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…
309417267,42883716,1,0,0,59,false,false,false,false,false,false
226669287,532713837,1,0,0,17,false,false,false,false,false,false
216075342,115770061,1,0,0,36,false,false,false,false,false,false
509866772,205500526,1,0,0,1,false,false,false,false,false,false


In [5]:
train_users

user_id
u32
11398714
174616941
4624110
9864944
44979817
…
257191124
35013803
168071981
216696997


In [6]:
train_items

item_id
u32
238164879
154677717
288483185
57214553
375514613
…
35766595
207035619
253149666
221615638


In [2]:
def ndcg_100(submission, intereactions):
    final_metric = 0
    for j, entry in enumerate(submission.iter_rows(named=True)):
        dcg = 0
        idcg = 0
        iid = entry["item_id"]
        for i, uid in enumerate(entry["user_id"]):
            row = intereactions.filter((pl.col("user_id") == uid) & (pl.col("item_id") == iid),)
            row_arr = np.array([row["like"], row["dislike"], row["share"], row["bookmark"], row["click_on_author"], row["open_comments"]])
            idcg += pow(2.0, np.int32(row_arr.any()) - 1) / (np.log2(i + 2))
            dcg += np.int32(row_arr.any()) / (np.log2(i + 2))
        final_metric += dcg / idcg
        if (j % 1000 == 0):
            print(final_metric)
    return final_metric / submission.shape[0]

## Ideas to test
### First user preference embedding
We should take 100 clips given to us and create several features out of it.
First we must found average user embedding. For that we take items that user
Have watched and calculate average embedding. Also we may calculate average embedding
with videos with which he intereacted most. I.e only liked, shared. 
And one with wich he don't want to intereact: disliked.

Average 100 items embedding: [0.5, xxx, xxx, ..., xxx] - len 64

Average liked and shared and watched embedding: [xxx, xxx, xxx, ..., xxx] - len 64

Average disliked embedding: [xxx, xxx, xxx, ..., xxx] - len 64

Problem with idea - I don't have guarantee that average embedding will hold some real meaning.
Hope that tree will filter meaningless embedding positions.
### Second idea use HSNW to cluster the user pseudoclass into "abstract bubbles" and later average this bubbles
Find all clips watched by some user and split them into interest.Then for each interest we will have separate average embedding

This approach already exists and called Gaussian Mixture GM

After we split dataset to number of classes we must calculate classes for each user.

Later we must use XGBoost to build trees that will predict users for given item based on created labels, classes.

### First idea realization

In [None]:
# print(train_users)
result = pl.DataFrame({"user_id": np.uint32(0), "embedding": [np.zeros(64, dtype=np.float32)]})
# print(result)
for tu in train_users.to_numpy():
    # print(tu)
    mask = np.isin(train_interactions["user_id"], np.asarray([tu,]))
    if not mask.any():
        continue
    user_watched_items = train_interactions.filter(mask)
    all_unique_watched_items = user_watched_items["item_id"].unique()
    mask = np.isin(item_ids, all_unique_watched_items.to_numpy())
    watched_item_ids_within_all = item_ids[mask]
    watched_item_embeddings_within_all = item_embeddings[mask]
    # TODO(d-desiatkin): Here we must add normalization, cause extended embedding is not normalized
    average_embedding = watched_item_embeddings_within_all.mean(axis=0)
    single_user_processing_result = pl.DataFrame({"user_id": tu, "embedding": [average_embedding]})
    result = result.extend(single_user_processing_result)
result = result[1:]

In [None]:
np.save("average_user_embedding.npy", result, allow_pickle=True)

In [None]:
users_metadata = users_metadata.join(result, on='user_id')

In [None]:
users_metadata

In [None]:
items_metadata

In [None]:
def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

def cosine_distance(a, b):
    return 1 - cosine_similarity(a, b)

def length_simmilarity(a,b):
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = min(norm_a, norm_b) / max(norm_a, norm_b)
    return similarity

def total_simmilarity(a,b):
    first = cosine_similarity(a,b)
    second = length_simmilarity(a,b)
    # Angle is 80% and Distance 20% of final simmilarity 
    return first * 0.8 + second * 0.2


In [None]:
result = pl.DataFrame({"item_id": np.uint32(0), "user_id": [np.zeros(100, dtype=np.uint32)]})
for item_entry in items_metadata.iter_rows(named=True):
    simmilarity_list = []
    for user_entry in users_metadata.iter_rows(named=True):
        simmilarity_list.append([user_entry["user_id"], cosine_distance(item_entry["embedding"], user_entry["embedding"])])
    simmilarity_list = sorted(simmilarity_list, key=lambda entry: entry[1])
    closest_user_ids = np.array([x[0] for x in simmilarity_list[:100]], dtype=np.uint32)
    single_item_processing_result = pl.DataFrame({"item_id": np.uint32(item_entry["item_id"]), "user_id": [closest_user_ids]})
    result = result.extend(single_item_processing_result)
result = result[1:]

In [None]:
print(result)

In [None]:
np.save("embeedding_similarity_based_submission.npy", result, allow_pickle=True)

In [None]:
tmp = np.load("embeedding_similarity_based_submission.npy", allow_pickle=True)
result = pl.DataFrame({"item_id": tmp.T[0].astype(np.uint32), "user_id": tmp.T[1].tolist()})
result.write_parquet('submission.parquet')

In [None]:
ndcg_100(result, train_interactions)

### Second Idea Realization

In [3]:
import hnswlib
import pickle

dim = 32
# num_elements = 10000
num_elements = item_embeddings.shape[0]

# Aparantly algorythm requires indices...
# ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'cosine', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements, ef_construction = 4000, M = 640)

# Element insertion (can be called several times):
p.add_items(item_embeddings)

# Controlling the recall by setting ef:
p.set_ef(1000) # ef should always be > k

# Query dataset, k - number of the closest elements (returns 2 numpy arrays)
labels, distances = p.knn_query(item_embeddings[1000:1010], k = 500)

# Index objects support pickling
# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method!
# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load
p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip

### Index parameters are exposed as class properties:
print(f"Parameters passed to constructor:  space={p_copy.space}, dim={p_copy.dim}") 
print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}")
print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}")
print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}")

Parameters passed to constructor:  space=cosine, dim=32
Index construction: M=640, ef_construction=4000
Index size is 196277 and index capacity is 196277
Search speed/quality trade-off parameter: ef=1000


In [None]:
distances

In [8]:
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=128,
                     covariance_type="full",
                     tol=1e-6,
                     reg_covar=1e-16,
                     n_init=1, 
                     max_iter=2000,
                     init_params="k-means++",
                     random_state=1094).fit(item_embeddings)

In [10]:
if gm.converged_:
    print(f"Score lower bound: {gm.lower_bound_}")
    print(f"Number of iterations to converge: {gm.n_iter_}")
else:
    print("Error! Gaussian Mixture not converged!")
print(gm.get_params())
print(gm.means_[:1])
print(gm.covariances_[:1])

Score lower bound: 32.40050297306434
Number of iterations to converge: 641
{'covariance_type': 'full', 'init_params': 'k-means++', 'max_iter': 2000, 'means_init': None, 'n_components': 128, 'n_init': 1, 'precisions_init': None, 'random_state': 1094, 'reg_covar': 1e-16, 'tol': 1e-06, 'verbose': 0, 'verbose_interval': 10, 'warm_start': False, 'weights_init': None}
[[-0.41019333  0.22170685  0.31923749  0.07133626  0.09925637 -0.0421611
  -0.04554408 -0.0899527   0.10958078 -0.06040252  0.04602672 -0.01733871
  -0.00628473 -0.01694732 -0.105821    0.05195312 -0.01966997 -0.0652695
  -0.00567655 -0.01537108  0.06947336  0.07542811  0.04550196 -0.03104853
  -0.04413151  0.00611915 -0.02442662 -0.01587331 -0.05129029  0.02467013
   0.05349485 -0.00157624]]
[[[ 0.01695752  0.00250496  0.00333141 ...  0.00186054 -0.000286
   -0.00082551]
  [ 0.00250496  0.01306879 -0.00240423 ...  0.00191387 -0.0026256
   -0.00021778]
  [ 0.00333141 -0.00240423  0.01233728 ... -0.00093395  0.00072251
   -0.002

In [12]:
import pickle
with open('gaussian_mixture_128_full.pickle', 'wb') as f:
    pickle.dump(gm, f, protocol=None)

### Second Idea Results

[x] 64  components - spherical - final loss: 25.78482731680628 // Powers of two

[x] 123 components - spherical - final loss: 26.612402741531618 // Lucas

[x] 128 components -   full    - final loss: 32.40050297306434 // Powers of two

[x] 128 components - spherical - final loss: 26.66317279508462 // Powers of two

[x] 144 components - spherical - final loss: 26.822238865843392 // Fibonacchi

[x] 199 components - spherical - final loss: 27.235624615038315 // Lucas

[x] 233 components - spherical - final loss: 27.44773975338467 // Fibonacchi

[x] 377 components - spherical - final loss: 28.065711959610123 // Fibonacchi

In [7]:
import pickle
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)

In [15]:
# result = pl.DataFrame({"user_id": np.uint32(0)})
# for i in range(128):
#     tmp_subres = pl.DataFrame({"user_id": np.uint32(0), f"{i}": np.float64(0.0)})
#     result = result.join(tmp_subres, on='user_id')

# for counter, tu in enumerate(train_users.to_numpy()):
#     # print(tu)
#     mask = np.isin(train_interactions["user_id"], np.asarray([tu,]))
#     if not mask.any():
#         continue
#     user_watched_items = train_interactions.filter(mask)
#     all_unique_watched_items = user_watched_items["item_id"].unique()
#     mask = np.isin(item_ids, all_unique_watched_items.to_numpy())
#     watched_item_ids_within_all = item_ids[mask]
#     watched_item_embeddings_within_all = item_embeddings[mask]
#     labels = gm.predict(watched_item_embeddings_within_all)
#     probability = gm.predict_proba(watched_item_embeddings_within_all)
#     single_user_processing_result = pl.DataFrame({"user_id": tu})
#     for i in range(128):
#         indices = np.where(labels == i)
#         selected_prob = probability[indices]
#         if (selected_prob.size == 0): 
#             mean_prob = np.float64(0.0)
#         else:
#             mean_prob = (selected_prob).mean()
#         tmp_subres = pl.DataFrame({"user_id": tu, f"{i}": mean_prob})
#         single_user_processing_result = single_user_processing_result.join(tmp_subres, on='user_id')
#     result = result.extend(single_user_processing_result)
#     if(counter == 10):
#         result.write_parquet('user_classes_checkpoint.parquet')
#         break
# result = result[1:]

In [27]:
# Read and learn how to use polars efficiently... And write smth in python for God Sake... It could be a game
# Assume gm, train_users, train_interactions, item_embeddings, and item_ids are defined.
# We also assume 'result' is initialized correctly (e.g., result = pl.DataFrame()).

item_ids = train_items["item_id"].to_numpy()

# Pre-calculate GMM probabilities for all items
all_item_labels = gm.predict(item_embeddings)
all_item_probabilities = gm.predict_proba(item_embeddings)
num_components = all_item_probabilities.shape[1] # Should be 128

# Create a Polars DataFrame mapping every item_id to its GMM results
item_gmm_results = pl.DataFrame({
    "item_id": item_ids,
    "label": all_item_labels,
})
# Append probability columns efficiently using numpy to Polars conversion
prob_df = pl.DataFrame(all_item_probabilities, schema={str(i): pl.Float64 for i in range(num_components)})
item_gmm_results = pl.concat([item_gmm_results, prob_df], how="horizontal")

# Iterate through users and aggregate results using efficient Polars operations
user_results_list = []
checkpoint_counter = 0

for tu in train_users["user_id"].to_numpy():
    # Filter user interactions efficiently using Polars' native filtering
    user_watched_items_df = train_interactions.filter(
        pl.col("user_id") == tu
    )

    if user_watched_items_df.is_empty():
        continue
    
    user_items_with_probs = user_watched_items_df.join(
        item_gmm_results, on="item_id", how="inner"
    )
    
    # Calculate the mean probability for each component in a single aggregation step
    # The resulting dataframe will have one row for the user, and N columns for the means
    mean_probs_df = user_items_with_probs.group_by("user_id").agg(
        [pl.mean(str(i)).alias(str(i)) for i in range(num_components)]
    )
    
    user_results_list.append(mean_probs_df)

    checkpoint_counter += 1
    if checkpoint_counter == 10000:
        # Checkpoint logic (if needed)
        pl.concat(user_results_list).write_parquet(f'user_embeddings_checkpoints/user_classes_checkpoint_{checkpoint_counter}.parquet')

# Combine all results outside the loop
result = pl.concat(user_results_list)

print(result)


shape: (100_000, 129)
┌───────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐
│ user_id   ┆ 0        ┆ 1        ┆ 2        ┆ … ┆ 124      ┆ 125      ┆ 126      ┆ 127      │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u32       ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞═══════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡
│ 289340968 ┆ 0.009508 ┆ 0.008891 ┆ 0.011923 ┆ … ┆ 0.007554 ┆ 0.009235 ┆ 0.006205 ┆ 0.009471 │
│ 425277009 ┆ 0.008624 ┆ 0.008808 ┆ 0.014256 ┆ … ┆ 0.008797 ┆ 0.009919 ┆ 0.007015 ┆ 0.008722 │
│ 142975948 ┆ 0.01011  ┆ 0.008743 ┆ 0.012185 ┆ … ┆ 0.009537 ┆ 0.009418 ┆ 0.007246 ┆ 0.008156 │
│ 312217955 ┆ 0.011273 ┆ 0.008195 ┆ 0.013888 ┆ … ┆ 0.010035 ┆ 0.010922 ┆ 0.006945 ┆ 0.009624 │
│ 421832753 ┆ 0.009773 ┆ 0.008709 ┆ 0.012444 ┆ … ┆ 0.007505 ┆ 0.008635 ┆ 0.006358 ┆ 0.008614 │
│ …         ┆ …        ┆ …  

In [28]:
print(result.glimpse())

Rows: 100000
Columns: 129
$ user_id <u32> 289340968, 425277009, 142975948, 312217955, 421832753, 59400572, 1721738, 418725112, 475356093, 237128298
$ 0       <f64> 0.009507613075687007, 0.008623661803511443, 0.01011038345661317, 0.011273278918878214, 0.009772568256754764, 0.010358335191715104, 0.011395766174367482, 0.009592418852777009, 0.011527279963901321, 0.01048617782744945
$ 1       <f64> 0.008890555884012531, 0.008807884576614826, 0.008742928712338177, 0.008195345444905822, 0.008708635832665033, 0.009482166024138927, 0.007705813231315855, 0.008423163158835213, 0.0081991595203696, 0.008787433169665822
$ 2       <f64> 0.011922815685329885, 0.014255816030476418, 0.012185116440803502, 0.013887914141222993, 0.01244437676207402, 0.011990060523624857, 0.012013027019351785, 0.012256935570770334, 0.011669601124806837, 0.013048086777027823
$ 3       <f64> 0.00755919478323653, 0.008033169340362953, 0.007425544850904302, 0.006268333331259937, 0.008193426266729427, 0.007099437811764313, 0.007

In [29]:
result.write_parquet("calculated_user_classes.parquet")

In [9]:
result = pl.read_parquet("calculated_user_classes.parquet")

In [10]:
# Here we sort items_metadata based on train_interactions_rank I will traverse
# this array item by item and create array of users item pairs that will serve as y for predict
# I will use information bellow
sorted_items_metadata_subset = items_metadata.select(["item_id", "train_interactions_rank"]).sort("train_interactions_rank").select("item_id")

# Here we get only item with which user had intereaction
train_interactions_subset = train_interactions.select(["user_id", "item_id", "like", "dislike", "share", "bookmark", "click_on_author", "open_comments"])
popular_items_train_intereactions = train_interactions_subset.filter(
    pl.any_horizontal("like", "dislike", "share", "bookmark", "click_on_author", "open_comments")
)
popular_items_train_intereactions = popular_items_train_intereactions.select(["user_id", "item_id"])

user_item_pairs = (
    sorted_items_metadata_subset
    .join(popular_items_train_intereactions, on="item_id", how="full")
    .filter(
        pl.all_horizontal("item_id", "user_id", "item_id_right")
    ).select("user_id", "item_id")
)

result = (
    result.join(user_item_pairs, on="user_id", how="inner")
)

result.write_parquet("my_training_data.parquet")

### Train XGBoost ranker model here
This allow us to have analogue of train_intereactions_rank

In [1]:
import polars as pl
import numpy as np
import pickle
import xgboost as xgb

my_training_data = pl.read_parquet("my_training_data.parquet")
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)
    
selection_list = [f"{i}" for i in range(128)]
selection_list = ["user_id"] + selection_list
X = my_training_data.select(selection_list).to_numpy()
del selection_list
y = my_training_data.select(["item_id"]).to_numpy()
del my_training_data

In [2]:
seed = 1996
rng = np.random.default_rng(seed)
n_query_groups = 511331
qid = rng.integers(0, n_query_groups, size=X.shape[0])

In [3]:
# Sort the inputs based on query index
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid = qid[sorted_idx]

In [4]:
ranker = xgb.XGBRanker(tree_method="hist", lambdarank_num_pair_per_sample=100, objective="rank:ndcg", lambdarank_pair_method="topk", ndcg_exp_gain=False, n_jobs=70)
ranker.fit(X, y, qid=qid)

0,1,2
,objective,'rank:ndcg'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [15]:
scores = ranker.predict(X[0:100, :])
sorted_idx = np.argsort(scores)[::-1]
scores = scores[sorted_idx]
y_res = y[sorted_idx]

In [5]:
X

NameError: name 'X' is not defined

In [6]:
ranker.save_model("xgboost_ranker_model.ubj") # Saves as a binary file

### Verify everything with xgboost model


In [2]:
from huggingface_hub import hf_hub_download
import polars as pl
import numpy as np

subsample_name = 'up0.01_ip0.01'
content_embedding_size = 32
val_interactions_file = [f'subsamples/{subsample_name}/validation/week_25.parquet']
# Validataion raw data
val_interactions = pl.read_parquet(f'VK-LSVD/{val_interactions_file[0]}')

# List of unique users in validation dataset
val_users = val_interactions.select('user_id').unique()
# List of unique items in validation dataset
val_items = val_interactions.select('item_id').unique()

# Metadata of 1% of most frequent videos
item_ids = np.load('VK-LSVD/metadata/item_embeddings.npz')['item_id']
# VK prearanged embeddings that describe video data
item_embeddings = np.load('VK-LSVD/metadata/item_embeddings.npz')['embedding']

# Select only items in our subset from global items metadata
mask = np.isin(item_ids, val_items.to_numpy())
# Leave only validation metadata indices
item_ids = item_ids[mask]
# Leave only validation metadata embeddings
item_embeddings = item_embeddings[mask]

print(f"Crop embedding - item embedding shape: {item_embeddings.shape}")
item_embeddings = item_embeddings[:, :content_embedding_size]

print(f"Item embedding example before normalization {item_embeddings[0]}")
# item_embeddings = item_embeddings / np.linalg.norm(item_embeddings, axis=1).reshape((item_embeddings.shape[0], 1))
# print(f"Item embedding example after normalization {item_embeddings[0]}")

users_metadata = pl.read_parquet('VK-LSVD/metadata/users_metadata.parquet')
items_metadata = pl.read_parquet('VK-LSVD/metadata/items_metadata.parquet')

users_metadata = users_metadata.join(val_users, on='user_id')
items_metadata = items_metadata.join(val_items, on='item_id')
data_for_query_search = pl.DataFrame({'item_id': item_ids, 
                                      'embedding': item_embeddings})
items_metadata = items_metadata.join(data_for_query_search,
                                     on='item_id')


Crop embedding - item embedding shape: (113102, 64)
Item embedding example before normalization [-0.5225   -0.1632    0.133    -0.007618  0.1466    0.3093    0.01971
 -0.0708    0.02953   0.2052   -0.1664    0.213     0.013504  0.1641
 -0.2498   -0.146    -0.0669   -0.004204  0.03156  -0.02571  -0.0659
 -0.1031    0.09924   0.06976   0.10284  -0.0633    0.0561   -0.002018
  0.0637    0.03955  -0.0916    0.02654 ]


In [2]:
val_users

user_id
u32
43166063
256826987
487540233
409752952
214168995
…
59396112
148858155
117012554
121166607


In [3]:
import pickle
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)

In [4]:
# Read and learn how to use polars efficiently... And write smth in python for God Sake... It could be a game
# Assume gm, val_users, val_interactions, item_embeddings, and item_ids are defined.
# We also assume 'result' is initialized correctly (e.g., result = pl.DataFrame()).

item_ids = val_items["item_id"].to_numpy()

# Pre-calculate GMM probabilities for all items
all_item_labels = gm.predict(item_embeddings)
all_item_probabilities = gm.predict_proba(item_embeddings)
num_components = all_item_probabilities.shape[1] # Should be 128

# Create a Polars DataFrame mapping every item_id to its GMM results
item_gmm_results = pl.DataFrame({
    "item_id": item_ids,
    "label": all_item_labels,
})
# Append probability columns efficiently using numpy to Polars conversion
prob_df = pl.DataFrame(all_item_probabilities, schema={str(i): pl.Float64 for i in range(num_components)})
item_gmm_results = pl.concat([item_gmm_results, prob_df], how="horizontal")

user_results_list = []
checkpoint_counter = 0

for tu in val_users["user_id"].to_numpy():
    # Filter user interactions efficiently using Polars' native filtering
    user_watched_items_df = val_interactions.filter(
        pl.col("user_id") == tu
    )

    if user_watched_items_df.is_empty():
        continue
    
    user_items_with_probs = user_watched_items_df.join(
        item_gmm_results, on="item_id", how="inner"
    )
    
    # Calculate the mean probability for each component in a single aggregation step
    # The resulting dataframe will have one row for the user, and N columns for the means
    mean_probs_df = user_items_with_probs.group_by("user_id").agg(
        [pl.mean(str(i)).alias(str(i)) for i in range(num_components)]
    )
    
    user_results_list.append(mean_probs_df)

    checkpoint_counter += 1
    if checkpoint_counter % 10000 == 0:
        pl.concat(user_results_list).write_parquet(f'user_embeddings_checkpoints/user_val_classes_checkpoint_{checkpoint_counter}.parquet')

result = pl.concat(user_results_list)

print(result)

shape: (98_725, 129)
┌───────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐
│ user_id   ┆ 0        ┆ 1        ┆ 2        ┆ … ┆ 124      ┆ 125      ┆ 126      ┆ 127      │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u32       ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞═══════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡
│ 289523827 ┆ 0.007418 ┆ 0.015644 ┆ 0.010816 ┆ … ┆ 0.000037 ┆ 0.003784 ┆ 0.012866 ┆ 0.01561  │
│ 128563318 ┆ 0.002439 ┆ 0.000077 ┆ 0.017689 ┆ … ┆ 0.006396 ┆ 0.003059 ┆ 0.004455 ┆ 0.012153 │
│ 47393521  ┆ 0.00149  ┆ 0.009897 ┆ 0.029279 ┆ … ┆ 0.005068 ┆ 0.005996 ┆ 0.007327 ┆ 0.007399 │
│ 383942659 ┆ 0.00553  ┆ 0.007999 ┆ 0.025506 ┆ … ┆ 0.004179 ┆ 0.021648 ┆ 0.008246 ┆ 0.004669 │
│ 440752566 ┆ 0.012066 ┆ 0.008416 ┆ 0.017751 ┆ … ┆ 0.003315 ┆ 0.011614 ┆ 0.006415 ┆ 0.008823 │
│ …         ┆ …        ┆ …   

In [64]:
item_embeddings

array([[-0.5225  , -0.1632  ,  0.133   , ...,  0.03955 , -0.0916  ,
         0.02654 ],
       [-0.2708  ,  0.3215  ,  0.09204 , ..., -0.07715 ,  0.003315,
         0.0647  ],
       [-0.506   , -0.1744  ,  0.0882  , ...,  0.02196 , -0.10596 ,
        -0.0323  ],
       ...,
       [-0.66    , -0.4836  ,  0.1611  , ...,  0.01613 , -0.04727 ,
         0.05542 ],
       [-0.6313  , -0.1775  , -0.0776  , ...,  0.00652 ,  0.007195,
         0.01172 ],
       [-0.3877  ,  0.0655  ,  0.0761  , ..., -0.1558  ,  0.02031 ,
        -0.06665 ]], shape=(113102, 32), dtype=float16)

In [6]:
result.write_parquet("calculated_user_val_classes.parquet")

In [3]:
from huggingface_hub import hf_hub_download
import polars as pl
import numpy as np

result = pl.read_parquet("calculated_user_val_classes.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
uid_iid_pairs = val_interactions.select(["user_id", "item_id"])
result = (
    result.join(
        uid_iid_pairs, on="user_id", how="inner"
    )
)
result.write_parquet("my_validation_data.parquet")

In [9]:
result

user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,…,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,item_id
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32
123183820,0.000009,0.01923,0.005777,2.0597e-7,0.009902,8.7081e-9,2.6929e-7,0.007962,0.000264,0.007772,0.03847,0.00483,3.9568e-7,3.5604e-8,0.007911,0.015754,0.008277,0.020308,0.007979,0.001583,0.008429,1.5827e-24,0.014107,0.000226,0.017102,0.000055,0.022624,0.008458,9.4034e-13,0.000188,0.000164,0.014637,2.1479e-15,0.012573,0.010539,0.001057,…,0.00491,0.029204,0.000027,4.0527e-7,0.007937,0.010883,0.00796,0.000252,0.015323,0.023623,0.023413,0.003077,5.7360e-14,0.007792,0.023572,0.00024,0.007574,0.001449,0.004188,0.006242,0.010262,0.007456,0.023356,0.000009,0.000002,0.014711,0.015873,0.000189,0.000473,3.0328e-7,0.000858,0.007937,0.000021,0.013432,0.010173,0.007937,165415337
271257689,0.004111,0.008598,0.012953,0.011554,0.011689,0.002418,0.008068,0.002484,0.004479,0.00237,0.019827,0.003932,3.5270e-7,0.005041,0.0048,0.010295,0.010478,0.013373,0.009412,0.012296,0.000467,0.004729,0.007954,0.006953,0.004009,0.004024,0.006468,0.005817,0.003133,0.008948,0.013323,0.021256,2.8103e-13,0.009166,0.011669,0.004699,…,0.004421,0.013084,0.00499,0.002522,1.5381e-7,0.007147,0.007477,0.007904,0.007934,0.010578,0.012117,0.012066,0.007108,0.000007,0.008278,0.008119,0.007063,0.000155,0.007059,0.009499,0.013247,0.00424,0.002839,0.004836,0.005651,0.010431,0.011726,0.004007,0.010599,0.005619,0.032811,0.01029,0.008467,0.006666,0.013186,0.005407,124655928
75566669,0.000694,0.006927,0.012812,0.012005,0.020896,1.4352e-9,0.012811,0.007974,0.003594,0.003648,0.023715,0.007728,2.8027e-8,0.007769,0.000148,0.006295,0.003982,0.018802,0.015948,0.014318,0.000016,0.00009,0.010021,0.001232,0.006424,0.005671,0.004091,0.000007,0.007275,0.00722,0.003856,0.004338,5.4250e-12,0.003522,0.009116,0.00375,…,0.008862,0.007413,0.011898,0.008006,6.7269e-7,0.012622,0.000747,0.000021,0.016596,0.008574,0.003917,0.009031,0.003984,0.00432,0.009876,0.009509,0.004873,0.000526,0.000969,0.003824,0.006876,0.002628,0.023625,0.003651,0.014451,0.012074,6.4634e-9,0.002222,0.008541,0.004325,0.01262,0.003998,0.011245,0.005424,0.007638,0.015219,3588592
99224499,0.009379,0.011645,0.010549,0.010622,0.025892,0.006096,0.006018,0.001459,0.008471,0.005607,0.01401,0.008776,7.8061e-11,0.002868,0.001851,0.008079,0.002781,0.014483,0.011537,0.015019,0.00366,0.005514,0.00928,0.003169,0.00773,0.00168,0.001909,0.005699,0.001074,0.018898,0.007353,0.010012,1.5699e-7,0.013379,0.006475,0.000044,…,0.012816,0.032294,0.00255,0.000072,2.9525e-8,0.0086,0.007596,0.018701,0.01223,0.013387,0.003281,0.005676,0.002874,0.002873,0.005968,0.003889,0.003372,0.001886,0.00456,0.006165,0.007863,0.00026,0.009208,0.004638,0.00207,0.002943,0.005684,0.009563,0.002872,0.017337,0.009101,0.000014,0.010696,0.01118,0.016614,0.006015,316673998
300381746,0.005452,0.004053,0.019858,0.009658,0.006095,0.000037,0.008086,0.002396,0.00485,6.8473e-8,0.0256,0.006108,0.004384,0.004408,0.004317,0.009969,0.008328,0.011531,0.009299,0.005314,0.004327,0.002154,0.010414,0.007178,0.010213,0.005698,0.007196,0.012351,0.006561,0.016598,0.005078,0.013273,1.1940e-10,0.005377,0.010351,0.003863,…,0.004531,0.017718,0.00794,0.004126,0.002122,0.004253,0.011548,0.016502,0.007542,0.007859,0.01101,0.008977,0.004405,0.004443,0.005543,0.004526,0.011037,0.001287,0.012408,0.007497,0.003481,0.000001,0.001479,0.008737,0.005765,0.007567,0.002197,0.00479,0.002738,0.007091,0.011455,0.000076,0.006073,0.006235,0.014907,0.002454,288980844
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
335007791,0.006339,0.014779,0.019745,0.004568,0.012455,0.009299,0.003499,0.003647,0.007862,0.006469,0.015539,0.005287,0.002093,0.009066,0.005891,0.010902,0.004432,0.019943,0.004763,0.017974,0.000493,0.002107,0.006198,0.008436,0.007724,0.00486,0.010042,0.005725,0.005091,0.005623,0.005001,0.008396,3.7474e-10,0.009067,0.016537,0.002536,…,0.006667,0.005387,0.008911,0.00549,0.002384,0.002205,0.010425,0.007035,0.010448,0.011763,0.014306,0.01031,0.008777,0.00538,0.010626,0.005159,0.001686,0.003366,0.006054,0.008404,0.008211,0.00433,0.008887,0.009183,0.005394,0.010092,0.001055,0.002699,0.002663,0.013455,0.015977,0.007567,0.007114,0.007815,0.00781,0.006269,486073769
116938272,0.007726,0.006446,0.015608,0.005231,0.006282,0.005236,0.009962,0.009815,0.002234,0.005188,0.044901,0.002938,0.007258,0.009285,0.001225,0.0009,0.002338,0.016931,0.005447,0.015075,0.004683,0.005236,0.006186,0.008655,0.006855,0.010752,0.009841,0.000838,0.009859,0.023816,0.000008,0.01049,1.9639e-9,0.011608,0.012678,0.00573,…,0.004473,0.005677,0.000472,0.000029,0.001928,0.005697,0.009637,0.005296,0.014372,0.015056,1.1964e-8,0.002583,1.4597e-8,0.000009,0.010431,0.000362,0.000597,0.001849,0.010764,0.014929,0.002439,0.010512,0.005723,0.005532,0.000004,0.009751,5.4926e-17,0.000182,0.009436,0.009106,0.00812,0.010431,0.001626,0.000011,0.004992,0.004681,247349299
257744051,0.000328,0.01531,0.019429,0.005272,0.01421,0.00476,0.000291,0.000164,0.007293,0.014254,0.027168,0.009978,2.3031e-11,0.000481,0.000095,0.00907,0.008483,0.020022,0.004511,0.006653,0.000084,4.5008e-25,0.002086,0.006985,0.018538,0.000195,0.00023,0.000011,0.004737,0.00879,0.000032,0.006925,3.2344e-38,0.00669,0.00721,0.007805,…,0.002259,0.018211,0.009606,0.009663,0.004764,0.008699,0.00314,0.014777,0.024495,0.005473,0.01974,0.00686,0.000075,0.000008,7.1230e-10,0.011847,0.013877,8.4799e-12,0.003794,0.003694,0.004277,0.003336,0.019356,0.007763,0.004625,0.004929,2.2946e-14,0.005236,1.4406e-7,0.013692,0.014942,6.1488e-7,0.005486,0.000559,0.009906,0.009565,445418458
454044079,0.006445,0.007031,0.016647,0.002284,0.008373,0.003814,0.003807,0.003289,0.010739,0.002931,0.004077,0.011989,0.00376,0.007603,0.007684,0.005308,0.006061,0.018554,0.012479,0.007087,0.001932,9.7937e-12,0.011975,0.000016,0.009262,0.004775,0.008059,0.010933,0.003316,0.00873,0.004569,0.006439,1.5142e-17,0.000713,0.011828,0.000529,…,0.013809,0.018469,0.00594,0.000038,0.007498,0.004355,0.008183,0.011531,0.008285,0.004689,0.010606,0.012625,0.003794,0.00379,0.010558,0.000012,0.000505,0.008629,0.012082,0.013497,0.007044,0.005941,0.013351,0.004663,0.015414,0.00722,0.003802,0.003783,0.000972,0.015192,0.007051,0.007618,0.008939,0.007873,0.007797,0.011434,509125296


In [1]:
import polars as pl
import numpy as np
import pickle
import xgboost as xgb

my_validation_data = pl.read_parquet("my_validation_data.parquet")
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)
    
selection_list = [f"{i}" for i in range(128)]
selection_list = ["user_id"] + selection_list
X = my_validation_data.select(selection_list).to_numpy()
del selection_list
y = my_validation_data.select(["item_id"]).to_numpy()
del my_validation_data

In [2]:
seed = 1996
rng = np.random.default_rng(seed)
n_query_groups = 35035
qid = rng.integers(0, n_query_groups, size=X.shape[0])

In [3]:
ranker = xgb.XGBRanker()
ranker.load_model("xgboost_ranker_model.ubj")

In [4]:
# Sort the inputs based on query index
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid = qid[sorted_idx]

In [5]:
scores = ranker.predict(X)
sorted_idx = np.argsort(scores)[::-1]
scores = scores[sorted_idx]
y_pred = y[sorted_idx]

In [74]:
calculated_user_classes = pl.read_parquet("calculated_user_val_classes.parquet")
selection_list = [f"{i}" for i in range(128)]
calculated_user_classes.filter(
    pl.all_horizontal(selection_list)
)
print(calculated_user_classes)
user_embeddings = calculated_user_classes.select(selection_list).to_numpy()
labels = calculated_user_classes.select("user_id").to_numpy().flatten()
print(labels)
# Map your custom IDs to simple integers for hnswlib
reverse_map = {i: custom_id for i, custom_id in enumerate(labels)}
# Reverse map for retrieval
label_map = {custom_id: i for i, custom_id in enumerate(labels)}

shape: (98_725, 129)
┌───────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐
│ user_id   ┆ 0        ┆ 1        ┆ 2        ┆ … ┆ 124      ┆ 125      ┆ 126      ┆ 127      │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u32       ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞═══════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡
│ 289523827 ┆ 0.007418 ┆ 0.015644 ┆ 0.010816 ┆ … ┆ 0.000037 ┆ 0.003784 ┆ 0.012866 ┆ 0.01561  │
│ 128563318 ┆ 0.002439 ┆ 0.000077 ┆ 0.017689 ┆ … ┆ 0.006396 ┆ 0.003059 ┆ 0.004455 ┆ 0.012153 │
│ 47393521  ┆ 0.00149  ┆ 0.009897 ┆ 0.029279 ┆ … ┆ 0.005068 ┆ 0.005996 ┆ 0.007327 ┆ 0.007399 │
│ 383942659 ┆ 0.00553  ┆ 0.007999 ┆ 0.025506 ┆ … ┆ 0.004179 ┆ 0.021648 ┆ 0.008246 ┆ 0.004669 │
│ 440752566 ┆ 0.012066 ┆ 0.008416 ┆ 0.017751 ┆ … ┆ 0.003315 ┆ 0.011614 ┆ 0.006415 ┆ 0.008823 │
│ …         ┆ …        ┆ …   

In [7]:
import hnswlib
import pickle

dim = 128
# num_elements = 10000
num_elements = user_embeddings.shape[0]

# Aparantly algorythm requires indices...
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements, ef_construction = 4000, M = 640)

p.set_num_threads(70)

# Element insertion (can be called several times):
p.add_items(user_embeddings)

# Controlling the recall by setting ef:
p.set_ef(1000) # ef should always be > k

# Index objects support pickling
# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method!
# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load
# p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip
p.save_index("search_index.bin")

### Index parameters are exposed as class properties:
print(f"Parameters passed to constructor:  space={p.space}, dim={p.dim}") 
print(f"Index construction: M={p.M}, ef_construction={p.ef_construction}")
print(f"Index size is {p.element_count} and index capacity is {p.max_elements}")
print(f"Search speed/quality trade-off parameter: ef={p.ef}")

Parameters passed to constructor:  space=l2, dim=128
Index construction: M=640, ef_construction=4000
Index size is 98725 and index capacity is 98725
Search speed/quality trade-off parameter: ef=1000


In [76]:
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)

query_iid = data_for_query_search.select(["item_id"])
query_raw_embeddings = data_for_query_search.select(["embedding"]).to_numpy()
query_raw_embeddings = np.array([x.tolist() for x in query_raw_embeddings]).reshape(113102, 32)
query_embeddings = gm.predict_proba(query_raw_embeddings)
del query_raw_embeddings

# Query dataset, k - number of the closest elements (returns 2 numpy arrays)
mangled_labels, distances = p.knn_query(query_embeddings, k = 100)

In [88]:
result = []
for mls in mangled_labels:
    unmangled_labels = []
    for ml in mls:
        unmangled_labels.append(int(reverse_map[ml]))
    result.append(unmangled_labels)
result = np.array(result, dtype=np.uint32)

In [89]:
result

array([[ 70542769,  82071495, 428101019, ..., 119318263, 206626927,
        427365766],
       [493276314, 219109825, 503599384, ..., 348818878, 224672049,
         81535518],
       [237344043, 108252095, 432066226, ...,  51871241, 375051437,
        422415021],
       ...,
       [ 71090609,   8715459, 499084668, ...,  70393226, 293765405,
        397624747],
       [312000463, 399034155, 431993294, ..., 379758735, 219154633,
        346642565],
       [150280213,  72554860, 148087833, ..., 487610884, 248865161,
         54656266]], shape=(113102, 100), dtype=uint32)

In [91]:
submission = pl.DataFrame({"item_id": query_iid, "user_id": result})

In [92]:
submission.write_parquet('submission.parquet')