In [None]:
import pandas as pd
import polars as pl
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from typing import List, Any
from gensim.models import Word2Vec
import optuna
import random

## Task Description

As ML engineers at a music streaming service similar to Spotify, we are tasked with building the recommendation system for our service. We have no knowledge about the content itself, but we do have each user's history of listened artists.

Our task is to improve the algorithm that will determine the most relevant recommendations for each user based on their listening history.

#### Data Description
The train.parquet file provides us with data about users listening to artists on the service.

| Field     | Type | Description               |
|-----------|------|---------------------------|
| user_id   | str  | User ID                   |
| artist_id | str  | Artist ID               |

### Quality Metrics
We will use the ndcg@20 metric, which is often applied in ranking tasks. The more relevant objects are at the beginning of the recommendation list, the higher it is. Session validation with the last N artists from the listening history is used for evaluation.

The code for calculating ndcg is as follows:

```python
def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = 20) -> float:
    """
    Binary-relevance NDCG@k for a single user.

    :param y_rel: relevant items
    :param y_rec: recommended items, best first
    :param k: number of top recommended items taken into account
    :return: DCG of the top-k recommendations divided by the ideal DCG
    """
    # Every recommended item that is relevant earns 1 / log2(rank + 2),
    # where rank is its zero-based position in the list.
    gains = (1. / np.log2(rank + 2) for rank, rec in enumerate(y_rec[:k]) if rec in y_rel)
    # The ideal ranking places all relevant items (at most k) at the top.
    ideal_gains = (1. / np.log2(rank + 2) for rank in range(min(len(y_rel), k)))
    return sum(gains) / sum(ideal_gains)
    

```

## Read the dataset

In [None]:
# Load the listening history from the parquet file with polars.
data = pl.read_parquet('train_session_based.parquet')
# Bare expression: displayed as the output of the notebook cell.
data

## Metrics

Our task will be the optimization of the ndcg@20 metric. Nevertheless, such a metric is difficult to interpret, so we will also have access to the hitrate@20 metric value.

In [None]:
TOP_K = 20


def user_hitrate(y_relevant: List[str], y_recs: List[str], k: int = TOP_K) -> int:
    """
    Hit rate@k for a single user.

    :param y_relevant: relevant items
    :param y_recs: recommended items, best first
    :param k: number of top recommended items taken into account
    :return: 1 if at least one of the top-k recommendations is relevant, else 0
    """
    relevant = set(y_relevant)
    top_recs = y_recs[:k]
    return int(not relevant.isdisjoint(top_recs))

def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = 20) -> float:
    """
    Binary-relevance NDCG@k for a single user.

    The default cutoff is 20 so that calls relying on the default compute
    the ndcg@20 metric this notebook optimises and reports (it was 10,
    which silently produced NDCG@10 under an "NDCG@20" label).

    :param y_rel: relevant items
    :param y_rec: recommended items, best first
    :param k: number of top recommended items taken into account
    :return: ndcg metric for user recommendations; 0.0 when there are no
        relevant items or ``k`` is not positive (the ideal DCG would be 0)
    """
    # The ideal ranking puts every relevant item (at most k) at the top.
    ideal_hits = min(len(y_rel), k)
    if ideal_hits <= 0:
        # Nothing can be hit: avoid dividing zero by zero.
        return 0.0

    # A relevant item at zero-based position idx contributes 1 / log2(idx + 2).
    dcg = sum(1. / np.log2(idx + 2) for idx, item in enumerate(y_rec[:k]) if item in y_rel)
    idcg = sum(1. / np.log2(idx + 2) for idx in range(ideal_hits))
    return dcg / idcg

In this dataset, identifiers are presented as strings, but for working with them, it might be easier to convert them into numbers (for instance, for matrix factorization algorithms).

In [None]:
# Give every distinct user a dense integer index (position in the unique
# list) and keep the reverse lookup to restore the original identifiers.
unique_user_ids = data['user_id'].unique().to_list()
user_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
user_mapping_inverse = dict(enumerate(unique_user_ids))

# Same for artists.
unique_artist_ids = data['artist_id'].unique().to_list()
artist_mapping = dict(zip(unique_artist_ids, range(len(unique_artist_ids))))
artist_mapping_inverse = dict(enumerate(unique_artist_ids))

In [None]:
# Replace the string identifiers with their integer indices.
id_encoding_exprs = [
    pl.col('user_id').apply(user_mapping.get),
    pl.col('artist_id').apply(artist_mapping.get),
]

# For every user, keep the last 3 items as the test sample
# and use everything before them for training.
history_split_exprs = [
    pl.col('artist_id').apply(lambda items: items[:-3]).alias('train_item_ids'),
    pl.col('artist_id').apply(lambda items: items[-3:]).alias('test_item_ids'),
]

grouped_df_with_inds = (
    data
    .with_columns(id_encoding_exprs)
    .groupby('user_id')
    .agg(history_split_exprs)
)

grouped_df_with_inds

In [None]:
# Median number of training items per user; used below to over-fetch
# candidates so that enough remain after filtering out seen items.
median_seq_len = int(grouped_df_with_inds['train_item_ids'].apply(len).median())
# The value is a median, so the label says "median" rather than "mean".
print(f"медианная длина сессии {median_seq_len}")

In [None]:
# Collect (row, col, value) triplets for the sparse user-item matrix:
# one entry with value 1 per (user, train item) occurrence.
rows = []
cols = []
values = []
for user_id, train_ids, _ in grouped_df_with_inds.rows():
    rows.extend([user_id] * len(train_ids))
    cols.extend(train_ids)
    values.extend([1] * len(train_ids))

# Pass the shape explicitly: when it is inferred from the largest index seen,
# users with an empty train split and artists that occur only in test sets
# shrink the matrix, so it no longer lines up with the id mappings.
user_item_data = sp.csr_matrix(
    (values, (rows, cols)),
    shape=(len(user_mapping), len(artist_mapping)),
)

## Baselines

As a simple baseline, we will recommend the most popular artists.

We want to first validate such a solution, which means we will consider only those artists who appear most frequently in `train_item_ids` as popular artists.

In [None]:
# Most popular artists according to the training split only, so that the
# held-out test items do not influence the baseline being validated.
top_artists = (
    grouped_df_with_inds
    .select(pl.col('train_item_ids').alias('artist_id'))
    .explode('artist_id')
    # Exploding an empty train list yields a null row; without this filter
    # "null" could be counted as a popular artist and recommended.
    .drop_nulls('artist_id')
    .groupby('artist_id')
    .count()
    .sort('count', descending=True)
    # Over-fetch so that candidates remain after removing seen items.
    .head(TOP_K + median_seq_len)
)['artist_id'].to_list()

In [None]:
ndcg_list = []
hitrate_list = []

# Popularity baseline: every user receives the same list, so it is passed
# as-is (the metrics do not modify it, no per-user copy is needed).
for _, _, y_rel in grouped_df_with_inds.rows():
    # Pass the cutoff explicitly so the numbers match the @TOP_K labels below.
    ndcg_list.append(user_ndcg(y_rel, top_artists, k=TOP_K))
    hitrate_list.append(user_hitrate(y_rel, top_artists, k=TOP_K))

print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.5f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.5f}')

Don't forget about filtering out what has already been viewed (for different domains and approaches, this doesn't always improve recommendations, but in this case, it provided a boost).

In [None]:
ndcg_list = []
hitrate_list = []

# Popularity baseline with the user's already-seen artists filtered out.
for _, user_history, y_rel in grouped_df_with_inds.rows():
    # Set membership is O(1); the history list would be rescanned per candidate.
    seen = set(user_history)
    y_rec = [artist_id for artist_id in top_artists if artist_id not in seen]

    # Pass the cutoff explicitly so the numbers match the @TOP_K labels below.
    ndcg_list.append(user_ndcg(y_rel, y_rec, k=TOP_K))
    hitrate_list.append(user_hitrate(y_rel, y_rec, k=TOP_K))

print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.5f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.5f}')

## Building a Recommendations file

To build recommendations, we can now consider all possible data. It is important to note that previously, to optimize memory, ids were converted to an integer format. However, for the production display, it is necessary to convert them back to the original identifiers.

In [None]:
# Popularity over the complete data set, keyed by the original artist ids.
artist_counts = data.groupby('artist_id').count()
ranked_artists = artist_counts.sort('count', descending=True)
# Keep the same number of candidates as in the validation baseline.
top_artists = ranked_artists.head(TOP_K + median_seq_len)['artist_id'].to_list()

In [None]:
# One row per user; the aggregated history itself is not used because
# the popularity baseline recommends the same list to everybody.
per_user_history = data.groupby('user_id').agg(pl.col('artist_id'))

# Each user gets an independent copy of the popular-artist list.
submission = [(user_id, top_artists.copy()) for user_id, _ in per_user_history.rows()]

submission = pl.DataFrame(submission, schema=('user_id', 'y_rec'))
submission.write_parquet('sample_submission.parquet')
submission

! It's important to remember that the recommendations file should contain the original identifiers (strings), not those converted to numbers!

### W2V in RecSys using Gensim library

To apply the W2V (Word2Vec) algorithm without hyperparameter tuning, all we need is the algorithm itself and an array of sessions to use as training data.

With Gensim, the model can be trained on this dataset in a single line.

To validate this model, for each user's training session we call the model's `predict_output_word` method.

If the model returns nothing (none of the session's items are in the model's vocabulary, e.g. they were filtered out as too rare or never seen during training), we'll set the hit rate to zero for this example and move on to the next one.

If we have recommendations, using them, we'll filter out those objects that were already in the training sample, and then assess their quality.

In [None]:
def evaluate_model(model):
    """
    Score a trained Word2Vec model on the held-out items of every user.

    For each row of ``grouped_df_with_inds`` the training items are passed
    to ``model.predict_output_word`` as context, items already present in
    the training history are removed from the predictions, and the rest is
    compared with the user's test items.

    Users for whom the model returns ``None`` (gensim documents this for a
    context with no in-vocabulary items) count as zero for BOTH metrics.
    Previously they were dropped from the NDCG average only, which inflated
    NDCG for settings that shrink the vocabulary (e.g. a large min_count).

    :param model: trained model exposing ``predict_output_word``
    :return: tuple ``(mean NDCG@TOP_K, mean hit rate@TOP_K)`` over all users
    """
    ndcg_list = []
    hitrate_list = []
    for train_ids, y_rel in grouped_df_with_inds.select('train_item_ids', 'test_item_ids').rows():
        # Over-fetch by the history length so that TOP_K candidates can
        # remain after the seen items are filtered out.
        model_preds = model.predict_output_word(
            train_ids, topn=(TOP_K + len(train_ids))
        )
        if model_preds is None:
            ndcg_list.append(0.0)
            hitrate_list.append(0)
            continue

        # Set membership is O(1); the list would be rescanned per prediction.
        seen = set(train_ids)
        y_rec = [pred[0] for pred in model_preds if pred[0] not in seen]
        # Pass the cutoff explicitly so the result is really NDCG@TOP_K.
        ndcg_list.append(user_ndcg(y_rel, y_rec, k=TOP_K))
        hitrate_list.append(user_hitrate(y_rel, y_rec, k=TOP_K))
    return np.mean(ndcg_list), np.mean(hitrate_list)

# Train word2vec with its default parameters on the training sessions
# and report the validation metrics.
train_sessions = grouped_df_with_inds['train_item_ids'].to_list()
model = Word2Vec(sentences=train_sessions)
mean_ndcg, mean_hitrate = evaluate_model(model)
print(f'NDCG@{TOP_K} = {mean_ndcg:.5f}, Hitrate@{TOP_K} = {mean_hitrate:.5f}')

#MAP@10 = 0.0033 Hitrate@10 = 0.1210

To find the optimal hyperparameters, we're starting from a baseline where NDCG was 0.0174.

To adjust the parameters, we'll use Optuna.

We have a set of hyperparameters to consider:

- SKIP-GRAM algorithm (whether to use it or not)
- The window parameter (the length of the window used for training)
- The ns_exponent and negative parameters
- The min_count parameter (filters objects that appear less than a certain number of times)
- The vector_size parameter (determines the dimensionality of the embedding space; the larger it is, the more parameters can be trained, but this does not mean that the final model will be better)

In [None]:
SEED = 42


def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's global random generator."""
    random.seed(seed)
    np.random.seed(seed)

def objective(trial):
    """
    Optuna objective: train Word2Vec with sampled hyperparameters and
    return the validation NDCG@TOP_K.

    The model is fitted on the training part of every user's history only.
    The held-out ``test_item_ids`` must not be part of the training corpus:
    they are exactly what ``evaluate_model`` scores the model on, so
    including them leaks the answers into the score being optimised.

    :param trial: optuna trial used to sample the hyperparameters
    :return: mean NDCG@TOP_K reported by ``evaluate_model``
    """
    sg = trial.suggest_categorical('sg', [0, 1])
    window = trial.suggest_int('window', 1, 10)
    ns_exponent = trial.suggest_float('ns_exponent', -3, 3)
    negative = trial.suggest_int('negative', 8, 20)
    min_count = trial.suggest_int('min_count', 3, 20)
    vector_size = trial.suggest_categorical('vector_size', [64, 128])

    print({
        'sg': sg,
        'window_len': window,
        'ns_exponent': ns_exponent,
        'negative': negative,
        'min_count': min_count,
        'vector_size': vector_size,
    })

    # NOTE(review): gensim documents that a fully reproducible run also needs
    # workers=1 (and a fixed PYTHONHASHSEED) - confirm whether that is wanted.
    set_seed(SEED)
    model = Word2Vec(
        grouped_df_with_inds['train_item_ids'].to_list(),
        window=window,
        sg=sg,
        hs=0,  # negative sampling only; predict_output_word relies on it
        min_count=min_count,
        vector_size=vector_size,
        negative=negative,
        ns_exponent=ns_exponent,
        seed=SEED,
        epochs=50,
    )

    mean_ndcg, mean_hitrate = evaluate_model(model)

    print(f'NDCG@{TOP_K} = {mean_ndcg:.5f}, Hitrate@{TOP_K} = {mean_hitrate:.5f}')

    return mean_ndcg
    
    
# Single-objective study that maximises the validation NDCG. The sampler is
# seeded like everything else in this cell, so the sequence of suggested
# hyperparameters does not change from run to run (TPE is also the sampler
# optuna uses by default for single-objective studies).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=1000)

study.best_params


Выведем гиперпараметры лучшей версии модели:

In [None]:
# Bare expression: shows the parameters of the study's best trial as the cell output.
study.best_params

Trial 368 finished with value: 0.06800619114110919 and parameters: {'sg': 0, 'window': 10, 'ns_exponent': 0.07411320142850895, 'negative': 17, 'min_count': 11, 'vector_size': 128}. Best is trial 368 with value: 0.06800619114110919.
NDCG@20 = 0.06801, Hitrate@20 = 0.37866
{'sg': 0, 'window_len': 10, 'ns_exponent': 0.21706766856178566, 'negative': 20, 'min_count': 11, 'vector_size': 128}

Перейдем от integer идентификаторов к исходным:

In [None]:
# Materialise the per-user rows once: they feed both training and inference.
user_rows = grouped_df_with_inds.select('user_id', 'train_item_ids', 'test_item_ids').rows()

# Validation is over, so the final model is trained on every user's complete
# history (train + held-out items). Otherwise the most recent interactions,
# which are also used as prediction context below, are unknown to the model.
full_histories = [
    train_item_ids + test_item_ids
    for _, train_item_ids, test_item_ids in user_rows
]

set_seed(SEED)
model = Word2Vec(
    full_histories,
    **study.best_params,
    hs=0,
    seed=SEED,
    epochs=50
)

submission_check = []
for (user_id, _, _), history in zip(user_rows, full_histories):
    combined_known_items = set(history)
    # Over-fetch by the number of known items so that TOP_K candidates can
    # remain after they are filtered out.
    model_preds = model.predict_output_word(combined_known_items, topn=(TOP_K + len(combined_known_items)))
    if model_preds is None:
        # No prediction for this user; the user is left out of the file.
        # (The earlier code appended to `ndcg_list` here, a leftover global
        # from the validation cells that has no meaning for the submission.)
        # NOTE(review): consider a popularity fallback so every user gets a row.
        continue

    y_rec = [pred[0] for pred in model_preds if pred[0] not in combined_known_items]

    # Convert the integer indices back to the original string identifiers.
    mapped_user_id = user_mapping_inverse[user_id]
    mapped_y_rec = [artist_mapping_inverse[artist_id] for artist_id in y_rec]

    submission_check.append((mapped_user_id, mapped_y_rec))

submission_check = pl.DataFrame(submission_check, schema=('user_id', 'y_rec'))
submission_check



Save the final result:

In [None]:
# Write the final recommendations; this uses the same file name as the
# popularity-baseline submission written earlier, so that file is replaced.
submission_check.write_parquet('sample_submission.parquet')

#### Result

NDCG@20 = 0.08397145641325111

Hitrate@20 = 0.36992