### Data loading
The cell below loads a small preprocessed subsample of the VK-LSVD dataset: 1% of users (chosen at random) and the 1% most frequent items.

In [1]:
from huggingface_hub import hf_hub_download
import polars as pl
import numpy as np

subsample_name = 'up0.01_ip0.01'
# The original example script forces us to use embeddings of size 32.
content_embedding_size = 32
# That is suboptimal because we lose data; uncomment to keep all 64 positions.
# content_embedding_size = 64

# Weekly train shards (week_00 .. week_24) and the single validation shard (week_25).
train_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'
                            for i in range(25)]
val_interactions_file = [f'subsamples/{subsample_name}/validation/week_25.parquet']

metadata_files = ['metadata/users_metadata.parquet',
                  'metadata/items_metadata.parquet',
                  'metadata/item_embeddings.npz']

# Fetch every required file from the Hugging Face dataset repo into ./VK-LSVD.
for file in (train_interactions_files +
             val_interactions_file +
             metadata_files):
    hf_hub_download(
        repo_id='deepvk/VK-LSVD', repo_type='dataset',
        filename=file, local_dir='VK-LSVD'
    )

# Train raw data: scan all weekly shards lazily, then materialise them in one
# streaming collect.
train_interactions = pl.concat([pl.scan_parquet(f'VK-LSVD/{file}')
                                for file in train_interactions_files])
train_interactions = train_interactions.collect(engine='streaming')
# Validation raw data
val_interactions = pl.read_parquet(f'VK-LSVD/{val_interactions_file[0]}')

# Unique users in the train dataset
train_users = train_interactions.select('user_id').unique()
# Unique items in the train dataset
train_items = train_interactions.select('item_id').unique()

# Item ids and the pre-computed VK content embeddings live in the same archive.
# Open it once and close it when done (the original opened it twice and never
# closed either handle); the arrays are read into memory on indexing.
with np.load('VK-LSVD/metadata/item_embeddings.npz') as embeddings_archive:
    item_ids = embeddings_archive['item_id']
    item_embeddings = embeddings_archive['embedding']

# Select only the items of our subset from the global item metadata
mask = np.isin(item_ids, train_items.to_numpy())
# Keep only the ids that occur in train ...
item_ids = item_ids[mask]
# ... and the matching embedding rows (item_ids and item_embeddings stay row-aligned)
item_embeddings = item_embeddings[mask]

# Here we choose the embedding size. The example script crops embeddings to 32
# positions, which is suboptimal: the full 64 positions are available, so useful
# data is lost. However it is unclear what features lie in the second half.
print(f"Crop embedding - item embedding shape: {item_embeddings.shape}")
item_embeddings = item_embeddings[:, :content_embedding_size]
# Normalization is temporarily disabled to test GMM for class selection
print(f"Item embedding example before normalization {item_embeddings[0]}")
# item_embeddings = item_embeddings / np.linalg.norm(item_embeddings, axis=1).reshape((item_embeddings.shape[0], 1))
# print(f"Item embedding example after normalization {item_embeddings[0]}")

users_metadata = pl.read_parquet('VK-LSVD/metadata/users_metadata.parquet')
items_metadata = pl.read_parquet('VK-LSVD/metadata/items_metadata.parquet')

# Restrict metadata to the users/items seen in train, then attach the (cropped)
# content embedding to every item row.
users_metadata = users_metadata.join(train_users, on='user_id')
items_metadata = items_metadata.join(train_items, on='item_id')
items_metadata = items_metadata.join(pl.DataFrame({'item_id': item_ids, 
                                                   'embedding': item_embeddings}), 
                                     on='item_id')


  from .autonotebook import tqdm as notebook_tqdm


Crop embedding - item embedding shape: (196277, 64)
Item embedding example before normalization [-0.5225   -0.1632    0.133    -0.007618  0.1466    0.3093    0.01971
 -0.0708    0.02953   0.2052   -0.1664    0.213     0.013504  0.1641
 -0.2498   -0.146    -0.0669   -0.004204  0.03156  -0.02571  -0.0659
 -0.1031    0.09924   0.06976   0.10284  -0.0633    0.0561   -0.002018
  0.0637    0.03955  -0.0916    0.02654 ]


In [2]:
users_metadata

user_id,age,gender,geo,train_interactions_rank
u32,u8,u8,u8,u32
136302664,18,1,0,64701
347489880,18,1,0,40800
200182184,18,1,1,82233
202612548,18,1,1,965
417607951,18,1,1,17023
…,…,…,…,…
220172774,70,2,79,8146
310638477,70,2,79,1363
361209246,70,2,79,31077
368937023,70,2,79,70482


In [3]:
items_metadata

item_id,author_id,duration,train_interactions_rank,embedding
u32,u32,u8,u32,"array[f16, 32]"
1222,274696,35,98839,"[-0.522461, -0.163208, … 0.026535]"
2376,936009,9,97985,"[-0.270752, 0.321533, … 0.064697]"
2425,219847,63,149209,"[-0.505859, -0.174438, … -0.032288]"
5967,504767,59,56611,"[-0.541016, -0.079773, … 0.003593]"
8553,687320,28,142411,"[-0.307129, -0.172974, … 0.035645]"
…,…,…,…,…
608049069,330884,12,23765,"[-0.317627, -0.120422, … 0.098633]"
608053295,946780,13,114670,"[-0.522949, -0.110229, … 0.003149]"
608059538,930725,5,155307,"[-0.36792, 0.172363, … 0.067444]"
608061840,1222813,59,64327,"[-0.631348, -0.17749, … 0.011719]"


In [4]:
train_interactions

user_id,item_id,place,platform,agent,timespent,like,dislike,share,bookmark,click_on_author,open_comments
u32,u32,u8,u8,u8,u8,bool,bool,bool,bool,bool,bool
4862415,175404824,0,0,0,48,false,false,false,false,false,true
276873582,97755319,1,1,1,44,false,false,false,false,false,false
434112541,254862034,1,1,1,41,false,false,false,false,false,false
37377677,132750843,1,1,1,8,false,false,false,false,false,false
425914526,163619500,1,0,0,27,false,false,false,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…
309417267,42883716,1,0,0,59,false,false,false,false,false,false
226669287,532713837,1,0,0,17,false,false,false,false,false,false
216075342,115770061,1,0,0,36,false,false,false,false,false,false
509866772,205500526,1,0,0,1,false,false,false,false,false,false


In [11]:
train_users

user_id
u32
470205700
379558691
393239373
176058686
213504679
…
437706787
433823280
172760320
504110562


In [22]:
train_items

item_id
u32
502206635
330672826
360757243
212351180
495351156
…
230248039
583493481
28424285
22806726


In [2]:
def ndcg_100(submission, intereactions, k=100):
    """Mean NDCG@k of a per-item user ranking, using binary relevance.

    A (user, item) pair is relevant when at least one of its interaction rows
    has any feedback flag set (like, dislike, share, bookmark, click_on_author,
    open_comments) -- the same relevance rule the original implementation used.

    Parameters
    ----------
    submission : polars.DataFrame
        One row per item with columns ``item_id`` and ``user_id``; ``user_id``
        holds the ranked sequence of recommended users, best first.
    intereactions : polars.DataFrame
        Ground-truth interactions with ``user_id``, ``item_id`` and the six
        boolean feedback columns.
    k : int, default 100
        Ranking cut-off; only the first ``k`` users of each list are scored.

    Returns
    -------
    float
        NDCG@k averaged over all submitted items. Items with no relevant user
        contribute 0. Returns 0.0 for an empty submission.
    """
    feedback_columns = ["like", "dislike", "share", "bookmark",
                        "click_on_author", "open_comments"]
    n_items = submission.shape[0]
    if n_items == 0:
        return 0.0

    # Collect the relevant pairs in ONE pass over the interactions instead of
    # filtering the whole frame once per (user, item) pair.
    has_feedback = pl.col(feedback_columns[0])
    for column in feedback_columns[1:]:
        has_feedback = has_feedback | pl.col(column)
    positive_pairs = (
        intereactions
        .filter(has_feedback)
        .select("item_id", "user_id")
        .unique()
    )
    relevant_users = {}
    for item_id, user_id in positive_pairs.iter_rows():
        relevant_users.setdefault(item_id, set()).add(user_id)

    # discounts[r] is the gain of a hit at rank r; ideal_dcg[n - 1] is the DCG of
    # a ranking that places n relevant users at the very top.
    discounts = 1.0 / np.log2(np.arange(k) + 2)
    ideal_dcg = np.cumsum(discounts)

    final_metric = 0.0
    for j, entry in enumerate(submission.iter_rows(named=True)):
        relevant = relevant_users.get(entry["item_id"], ())
        dcg = 0.0
        already_ranked = set()
        for rank, uid in enumerate(list(entry["user_id"])[:k]):
            # A user repeated in the list is only credited once.
            if uid in relevant and uid not in already_ranked:
                dcg += discounts[rank]
            already_ranked.add(uid)
        n_ideal = min(len(relevant), k)
        if n_ideal > 0:
            final_metric += dcg / ideal_dcg[n_ideal - 1]
        if (j % 1000 == 0):
            # Progress indicator: running (un-normalised) sum of per-item NDCG.
            print(final_metric)
    return final_metric / n_items

## Ideas to test
### First user preference embedding
We take the 100 clips given to us and derive several features from them.
First we must find the average user embedding: we take the items that the user
has watched and average their embeddings. We may also compute an average embedding
over only the videos the user interacted with most, i.e. liked or shared,
and one over the videos the user does not want to interact with, i.e. disliked.

Average 100 items embedding: [0.5, xxx, xxx, ..., xxx] - len 64

Average liked and shared and watched embedding: [xxx, xxx, xxx, ..., xxx] - len 64

Average disliked embedding: [xxx, xxx, xxx, ..., xxx] - len 64

Problem with this idea: there is no guarantee that an averaged embedding holds any real meaning.
Hopefully the tree will filter out the meaningless embedding positions.
### Second idea: use HNSW to cluster the user pseudo-classes into "abstract bubbles" and later average these bubbles
Find all clips watched by a user and split them into interests. Then for each interest we will have a separate average embedding.

This approach already exists and is called a Gaussian Mixture Model (GMM).

After we split the dataset into a number of classes, we must calculate the classes for each user.

Later we must use XGBoost to build trees that predict users for a given item based on the created labels (classes).

### First idea realization

In [None]:
# Average content embedding of the items each user has watched (one row per user).
#
# Computed for all users at once. The previous per-user loop rescanned every
# interaction for each user, grew `result` one row at a time, and seeded it with
# a hard-coded 64-dim float32 placeholder whose schema does not match the actual
# embedding width/dtype.

# Distinct (user, item) pairs, so every watched item counts once per user.
watched_pairs = train_interactions.select("user_id", "item_id").unique()
pair_users = watched_pairs["user_id"].to_numpy()
pair_items = watched_pairs["item_id"].to_numpy()

# Map each watched item id to its row in `item_embeddings` (row-aligned with
# `item_ids`). `item_ids` is not assumed to be sorted, hence the sorter.
item_order = np.argsort(item_ids, kind="stable")
slots = np.searchsorted(item_ids, pair_items, sorter=item_order)
slots = np.minimum(slots, len(item_ids) - 1)
embedding_rows = item_order[slots]
# Drop pairs whose item has no embedding.
has_embedding = item_ids[embedding_rows] == pair_items
pair_users = pair_users[has_embedding]
embedding_rows = embedding_rows[has_embedding]

# Sum embeddings per user in float64, chunked to bound the temporary memory.
unique_users, user_slot = np.unique(pair_users, return_inverse=True)
embedding_sums = np.zeros((len(unique_users), item_embeddings.shape[1]), dtype=np.float64)
PAIRS_PER_CHUNK = 1_000_000
for start in range(0, len(user_slot), PAIRS_PER_CHUNK):
    stop = start + PAIRS_PER_CHUNK
    np.add.at(embedding_sums, user_slot[start:stop],
              item_embeddings[embedding_rows[start:stop]])
watched_counts = np.bincount(user_slot, minlength=len(unique_users))

# TODO(d-desiatkin): Here we must add normalization, cause extended embedding is not normalized
average_embeddings = (embedding_sums / watched_counts[:, None]).astype(item_embeddings.dtype)

# Users without a single embedded item are simply absent from `result`.
result = pl.DataFrame({"user_id": unique_users, "embedding": average_embeddings})

In [None]:
# Persist the per-user average embeddings computed above.
# `result` is a polars DataFrame, so np.save has to convert it to a NumPy array first.
# NOTE(review): with an embedding column this presumably becomes an object array that
# needs pickle to store/load -- confirm; `result.write_parquet(...)` may be a safer format.
np.save("average_user_embedding.npy", result, allow_pickle=True)

In [None]:
# Attach each user's average embedding to the user metadata.
# NOTE(review): polars joins default to an inner join, so users without a row in
# `result` would be dropped -- confirm this is intended.
# NOTE(review): the cell overwrites `users_metadata`, so running it twice joins the
# embedding column a second time; re-run the data-loading cell first if needed.
users_metadata = users_metadata.join(result, on='user_id')

In [None]:
users_metadata

In [None]:
items_metadata

In [None]:
def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Inputs are converted to float64 so that low-precision inputs (e.g. float16
    embeddings) do not lose accuracy in the dot product or the norms.

    Returns 0.0 when either vector has zero length: the angle is undefined
    there and the plain formula would evaluate 0/0 to NaN.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

def cosine_distance(a, b):
    """Cosine distance between ``a`` and ``b``: one minus their cosine similarity."""
    similarity = cosine_similarity(a, b)
    return 1 - similarity

def length_simmilarity(a,b):
    """Ratio of the shorter vector's Euclidean norm to the longer one's, in [0, 1].

    Returns 1.0 when both vectors have zero length (their lengths are equal);
    the plain ratio would evaluate 0/0 to NaN in that case.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    longer = max(norm_a, norm_b)
    if longer == 0:
        return 1.0
    return min(norm_a, norm_b) / longer

def total_simmilarity(a,b):
    """Blend of angular and length similarity of ``a`` and ``b``.

    The cosine similarity contributes 80% and the length similarity 20%
    of the final score.
    """
    angle_part = cosine_similarity(a, b)
    length_part = length_simmilarity(a, b)
    weighted_angle = angle_part * 0.8
    weighted_length = length_part * 0.2
    return weighted_angle + weighted_length


In [None]:
# For every item, pick the TOP_K users whose average embedding is closest to the
# item embedding in cosine distance (closest first).
#
# The similarities are computed as chunked matrix products with a partial sort
# per item. The previous version did the same thing with a pure-Python loop over
# every (item, user) pair and re-sorted all users for every item.
TOP_K = 100
ITEMS_PER_CHUNK = 256  # bounds the (chunk x n_users) similarity matrix held in memory

candidate_user_ids = users_metadata["user_id"].to_numpy().astype(np.uint32)
user_vectors = np.array(users_metadata["embedding"].to_list(), dtype=np.float32)
item_vectors = np.array(items_metadata["embedding"].to_list(), dtype=np.float32)

# Scale rows to unit length so that a dot product equals the cosine similarity.
# Zero-length rows are left as zeros (similarity 0) instead of producing NaN.
user_norms = np.linalg.norm(user_vectors, axis=1, keepdims=True)
item_norms = np.linalg.norm(item_vectors, axis=1, keepdims=True)
user_units = user_vectors / np.where(user_norms == 0, 1.0, user_norms)
item_units = item_vectors / np.where(item_norms == 0, 1.0, item_norms)

top_k = min(TOP_K, len(candidate_user_ids))
top_user_ids = np.empty((len(item_units), top_k), dtype=np.uint32)
for start in range(0, len(item_units), ITEMS_PER_CHUNK):
    stop = start + ITEMS_PER_CHUNK
    similarities = item_units[start:stop] @ user_units.T
    # Partial selection of the top_k most similar users, then an exact ordering
    # of just those top_k (highest similarity == smallest cosine distance first).
    candidates = np.argpartition(-similarities, top_k - 1, axis=1)[:, :top_k]
    candidate_similarities = np.take_along_axis(similarities, candidates, axis=1)
    ranking = np.argsort(-candidate_similarities, axis=1, kind="stable")
    best_users = np.take_along_axis(candidates, ranking, axis=1)
    top_user_ids[start:stop] = candidate_user_ids[best_users]

# One row per item; `user_id` holds the ranked array of recommended users.
result = pl.DataFrame({"item_id": items_metadata["item_id"],
                       "user_id": list(top_user_ids)})

In [None]:
print(result)

In [None]:
# Checkpoint the similarity-based submission to disk.
# `result` is a polars DataFrame, so np.save converts it to a NumPy array first.
# NOTE(review): the per-item user lists presumably force an object array stored
# via pickle -- confirm.
np.save("embeedding_similarity_based_submission.npy", result, allow_pickle=True)

In [None]:
# Reload the checkpointed submission and export it as the final parquet file.
# NOTE(security): allow_pickle=True executes pickled code on load -- only use it
# on files produced by this notebook.
tmp = np.load("embeedding_similarity_based_submission.npy", allow_pickle=True)
# NOTE(review): assumes `tmp` is a 2-column array (item id, user-id list) so that
# tmp.T[0] / tmp.T[1] are the two columns -- confirm against the saved file.
result = pl.DataFrame({"item_id": tmp.T[0].astype(np.uint32), "user_id": tmp.T[1].tolist()})
result.write_parquet('submission.parquet')

In [None]:
# Score the submission with ndcg_100 against the *train* interactions.
# NOTE(review): `val_interactions` is loaded in the data-loading cell but is not
# used here; confirm whether the evaluation should use the validation week instead.
ndcg_100(result, train_interactions)

### Second Idea Realization

In [3]:
import hnswlib
import pickle

# Take the index dimensionality from the data instead of hard-coding 32, so the
# cell keeps working when content_embedding_size is switched to 64.
dim = item_embeddings.shape[1]
# num_elements = 10000
num_elements = item_embeddings.shape[0]

# Apparently the algorithm can take explicit indices; without them add_items
# assigns ids itself.
# ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'cosine', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
# NOTE(review): M = 640 and ef_construction = 4000 are very large settings;
# confirm they are intended, since they drive index memory and build time.
p.init_index(max_elements = num_elements, ef_construction = 4000, M = 640)

# Element insertion (can be called several times):
p.add_items(item_embeddings)

# Controlling the recall by setting ef:
p.set_ef(1000) # ef should always be > k

# Query dataset, k - number of the closest elements (returns 2 numpy arrays)
labels, distances = p.knn_query(item_embeddings[1000:1010], k = 500)

# Index objects support pickling
# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method!
# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load
p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip

### Index parameters are exposed as class properties:
print(f"Parameters passed to constructor:  space={p_copy.space}, dim={p_copy.dim}") 
print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}")
print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}")
print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}")

Parameters passed to constructor:  space=cosine, dim=32
Index construction: M=640, ef_construction=4000
Index size is 196277 and index capacity is 196277
Search speed/quality trade-off parameter: ef=1000


In [None]:
distances

In [8]:
from sklearn.mixture import GaussianMixture

# Fit a 128-component, full-covariance Gaussian mixture on the item embeddings.
# The settings are collected first so they can be read (and tweaked) in one place.
gm_settings = dict(
    n_components=128,
    covariance_type="full",
    tol=1e-6,
    reg_covar=1e-16,
    n_init=1,
    max_iter=2000,
    init_params="k-means++",
    random_state=1094,
)
gm = GaussianMixture(**gm_settings)
gm.fit(item_embeddings)

In [10]:
# Report the fit: convergence status first, then the estimator parameters and
# the mean / covariance of the first mixture component.
if not gm.converged_:
    print("Error! Gaussian Mixture not converged!")
else:
    print(f"Score lower bound: {gm.lower_bound_}")
    print(f"Number of iterations to converge: {gm.n_iter_}")
for fit_detail in (gm.get_params(), gm.means_[:1], gm.covariances_[:1]):
    print(fit_detail)

Score lower bound: 32.40050297306434
Number of iterations to converge: 641
{'covariance_type': 'full', 'init_params': 'k-means++', 'max_iter': 2000, 'means_init': None, 'n_components': 128, 'n_init': 1, 'precisions_init': None, 'random_state': 1094, 'reg_covar': 1e-16, 'tol': 1e-06, 'verbose': 0, 'verbose_interval': 10, 'warm_start': False, 'weights_init': None}
[[-0.41019333  0.22170685  0.31923749  0.07133626  0.09925637 -0.0421611
  -0.04554408 -0.0899527   0.10958078 -0.06040252  0.04602672 -0.01733871
  -0.00628473 -0.01694732 -0.105821    0.05195312 -0.01966997 -0.0652695
  -0.00567655 -0.01537108  0.06947336  0.07542811  0.04550196 -0.03104853
  -0.04413151  0.00611915 -0.02442662 -0.01587331 -0.05129029  0.02467013
   0.05349485 -0.00157624]]
[[[ 0.01695752  0.00250496  0.00333141 ...  0.00186054 -0.000286
   -0.00082551]
  [ 0.00250496  0.01306879 -0.00240423 ...  0.00191387 -0.0026256
   -0.00021778]
  [ 0.00333141 -0.00240423  0.01233728 ... -0.00093395  0.00072251
   -0.002

In [12]:
import pickle
# Persist the fitted mixture model so later cells can reuse it without refitting.
# protocol=None lets pickle use its default protocol.
# NOTE(review): the file name says "128_full"; keep it in sync with the
# n_components / covariance_type actually used when fitting.
with open('gaussian_mixture_128_full.pickle', 'wb') as f:
    pickle.dump(gm, f, protocol=None)

### Second Idea Results

[x] 64  components - spherical - final loss: 25.78482731680628 // Powers of two

[x] 123 components - spherical - final loss: 26.612402741531618 // Lucas

[x] 128 components - spherical - final loss: 26.66317279508462 // Powers of two

[x] 144 components - spherical - final loss: 26.822238865843392 // Fibonacci

[x] 199 components - spherical - final loss: 27.235624615038315 // Lucas

[x] 233 components - spherical - final loss: 27.44773975338467 // Fibonacci

[x] 377 components - spherical - final loss: 28.065711959610123 // Fibonacci

In [17]:
import pickle
# Restore the previously fitted mixture model, replacing any `gm` in memory.
# NOTE(security): pickle.load executes arbitrary code from the file -- only load
# pickles that were written by this notebook.
with open('gaussian_mixture_128_full.pickle', 'rb') as f:
    gm = pickle.load(f)

In [15]:
# Per-user GMM class features (small preview run).
#
# For every user and every mixture component i, the feature is the mean
# probability of component i over the watched items whose predicted label is i
# (0.0 when the user has no such item).
#
# The previous version averaged the complete probability rows of those items;
# each row sums to 1, so every non-empty class received the constant
# 1 / n_components.
# NOTE(review): the vectorised cell further down averages each component's
# probability over ALL watched items instead -- confirm which definition is wanted.
n_components = gm.n_components
class_columns = [str(i) for i in range(n_components)]
result_schema = {"user_id": pl.UInt32, **{name: pl.Float64 for name in class_columns}}


def _rows_to_frame(rows):
    """Build the (user_id, "0".."n-1") result frame from a list of row tuples."""
    return pl.DataFrame(rows, schema=result_schema, orient="row")


user_rows = []
for counter, user_id in enumerate(train_users["user_id"].to_numpy()):
    watched_item_ids = (
        train_interactions
        .filter(pl.col("user_id") == user_id)["item_id"]
        .unique()
        .to_numpy()
    )
    if watched_item_ids.size == 0:
        continue
    watched_embeddings = item_embeddings[np.isin(item_ids, watched_item_ids)]

    class_means = np.zeros(n_components, dtype=np.float64)
    # A user may have watched only items without an embedding; the mixture
    # cannot predict on an empty batch, so such users keep an all-zero row.
    if len(watched_embeddings):
        labels = gm.predict(watched_embeddings)
        probability = gm.predict_proba(watched_embeddings)
        for i in range(n_components):
            assigned = labels == i
            if assigned.any():
                class_means[i] = probability[assigned, i].mean()
    user_rows.append((int(user_id), *class_means))

    # Preview only: checkpoint and stop once the user at index 10 is processed.
    if(counter == 10):
        _rows_to_frame(user_rows).write_parquet('user_classes_checkpoint.parquet')
        break

result = _rows_to_frame(user_rows)

In [None]:
# My code optimised by Yandex AI assistant. Smart him... AI me: 1-0.
# Read and learn how to use polars efficiently... And write smth in python for God Sake... It could be a game
# Assume gm, train_users, train_interactions, item_embeddings, and item_ids are defined.
# We also assume 'result' is initialized correctly (e.g., result = pl.DataFrame()).

# `item_ids` must remain the array produced by the data-loading cell: it is
# row-aligned with `item_embeddings`. Rebuilding it from `train_items` (as this
# cell used to do) changes the order and possibly the length, which attaches
# the GMM probabilities to the wrong items.
assert len(item_ids) == len(item_embeddings), \
    "item_ids and item_embeddings must be row-aligned; re-run the data-loading cell"

# 1. Pre-calculate GMM labels and probabilities for all items
all_item_labels = gm.predict(item_embeddings)
all_item_probabilities = gm.predict_proba(item_embeddings)
num_components = all_item_probabilities.shape[1] # Should be 128
component_columns = [str(i) for i in range(num_components)]

# Polars DataFrame mapping every item_id to its GMM results:
# item_id, label, then one probability column per component ("0", "1", ...).
item_gmm_results = pl.DataFrame({
    "item_id": item_ids,
    "label": all_item_labels,
})
prob_df = pl.DataFrame(all_item_probabilities,
                       schema={name: pl.Float64 for name in component_columns},
                       orient="row")
item_gmm_results = pl.concat([item_gmm_results, prob_df], how="horizontal")

# 2. Mean probability of every component over each user's interactions, for all
# users in ONE join + group_by. The lazy query is collected with the streaming
# engine so the wide (interactions x components) join is not materialised at once.
# The former per-user loop and its one-off checkpoint file are no longer needed.
# Users whose interactions match no embedded item are absent from the result,
# exactly as with the inner join used before.
result = (
    train_interactions.lazy()
    .select("user_id", "item_id")
    .join(item_gmm_results.lazy().drop("label"), on="item_id", how="inner")
    .group_by("user_id")
    .agg([pl.col(name).mean() for name in component_columns])
    .collect(engine='streaming')
)

print(result)


In [None]:
# glimpse() prints the preview itself and returns None by default, so wrapping
# it in print() only appended a stray "None" line to the output.
result.glimpse()

In [None]:
result.write_parquet("calculated_user_classes.parquet")