In [1]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pandas as pd
import joblib
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ndcg_at_k
import optuna

RANDOM_SEED = 42

In [2]:
df_train = joblib.load(r"..\assets\combined\train_feature_engineered.pkl")
df_test = joblib.load(r"..\assets\combined\test_feature_engineered.pkl")

print(df_train.shape, df_test.shape)

(2643279, 141) (694638, 141)


In [3]:
print(len(df_train['user_id'].unique()), len(df_train['app_id'].unique()))

3232 24301


## Model Training

In [None]:
def build_user_item_matrix(df, user2idx, game2idx, n_users, n_games):
    """
    Return a user * game CSR matrix
    """
    user_idx = df['user_id'].map(user2idx)
    game_idx = df['app_id'].map(game2idx)
    relevance = df['relevance_score'].values
    
    mat_coo = coo_matrix(
        (relevance, (user_idx, game_idx)),
        shape=(n_users, n_games)
    )
    return mat_coo.tocsr()

def train_als_model(train_matrix, factors, reg, alpha, iters=15):
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iters,
        alpha=alpha,
        use_gpu=False,
        random_state=RANDOM_SEED
    )
    model.fit(train_matrix, show_progress=False)
    return model

def evaluate_ndcg(model, train_matrix, val_matrix, K=10):
    return ndcg_at_k(
        model,
        train_user_items=train_matrix,
        test_user_items=val_matrix,
        K=K,
        show_progress=False
    )

def user_based_cv_folds(unique_users, n_folds=5):
    """
    Returns a list of folds, each fold is a set of user_ids.
    """
    rng = np.random.default_rng(RANDOM_SEED)
    shuffled_users = rng.permutation(unique_users)
    fold_size = len(shuffled_users) // n_folds
    
    folds = []
    start = 0
    for i in range(n_folds):
        end = start + fold_size
        if i == n_folds - 1:
            end = len(shuffled_users)  # Get the rest if it is final fold
        fold_users = shuffled_users[start:end]
        folds.append(set(fold_users))
        start = end
    return folds

In [5]:
unique_train_users = df_train['user_id'].unique()

unique_games = df_train['app_id'].unique()
game2idx = {g: i for i, g in enumerate(unique_games)}
n_games = len(unique_games)

# CV folds
n_folds = 5
folds = user_based_cv_folds(unique_train_users, n_folds)

### Cross validation using Optuna

In [None]:
def objective(trial):
    # Hyperparameters to fine tune
    factors_val = trial.suggest_int("factors", 32, 128, step=32)
    reg_val = trial.suggest_float("regularization", 1e-3, 1e-1, log=True)
    alpha_val = trial.suggest_float("alpha", 1.0, 50, log=True)
    
    fold_ndcgs = []
    
    # Perform user-based cross validation
    for fold_idx, fold_users in enumerate(folds):
        # Split into train and validation
        mask_fold_val = df_train['user_id'].isin(fold_users)        
        df_fold_val = df_train[mask_fold_val]
        df_fold_train = df_train[~mask_fold_val]

        # Build local user index mapper for each CV
        unique_fold_train_users = df_fold_train['user_id'].unique()
        unique_fold_val_users = df_fold_val['user_id'].unique()

        user2idx_fold_train = {u: i for i, u in enumerate(unique_fold_train_users)}
        user2idx_fold_val = {u: i for i, u in enumerate(unique_fold_val_users)}
        
        # Build user by item matrix for each
        train_matrix_cv = build_user_item_matrix(
            df_fold_train, user2idx_fold_train, game2idx, len(unique_fold_train_users), n_games
        )
        val_matrix_cv = build_user_item_matrix(
            df_fold_val, user2idx_fold_val, game2idx, len(unique_fold_val_users), n_games
        )
        
        # Train
        model_cv = train_als_model(
            train_matrix_cv,
            factors=factors_val,
            reg=reg_val,
            alpha=alpha_val
        )
        
        # Evaluate NDCG on fold's validation
        ndcg_val = evaluate_ndcg(model_cv, train_matrix_cv, val_matrix_cv)
        fold_ndcgs.append(ndcg_val)
    
    # Average across folds
    mean_ndcg = np.mean(fold_ndcgs)

    return mean_ndcg

In [7]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best trial:")
best_trial = study.best_trial
print(f"  Value (NDCG): {best_trial.value:.4f}")
print("  Params:", best_trial.params)

[I 2025-03-29 14:30:44,064] A new study created in memory with name: no-name-5ecbdac6-b7fc-4231-b2db-fd3621aa82d1
[I 2025-03-29 14:30:59,117] Trial 0 finished with value: 0.3609986628219534 and parameters: {'factors': 128, 'regularization': 0.029047679694155292, 'alpha': 2.52845664390175}. Best is trial 0 with value: 0.3609986628219534.
[I 2025-03-29 14:31:14,827] Trial 1 finished with value: 0.3556861926257836 and parameters: {'factors': 96, 'regularization': 0.01608214884599777, 'alpha': 6.494388138956376}. Best is trial 0 with value: 0.3609986628219534.
[I 2025-03-29 14:31:32,019] Trial 2 finished with value: 0.2979784822492202 and parameters: {'factors': 128, 'regularization': 0.01729446748627178, 'alpha': 29.249916570393545}. Best is trial 0 with value: 0.3609986628219534.
[I 2025-03-29 14:31:47,879] Trial 3 finished with value: 0.38238331685860294 and parameters: {'factors': 64, 'regularization': 0.01843530113448585, 'alpha': 2.473863756986004}. Best is trial 3 with value: 0.3823

Best trial:
  Value (NDCG): 0.4087
  Params: {'factors': 32, 'regularization': 0.0024319028717811305, 'alpha': 1.7214833724374905}


### Retrain on the entire training set with the best hyperparameters

In [8]:
user2idx_train = {u: i for i, u in enumerate(unique_train_users)}

# Build user * game csr matrix for all training data
full_train_matrix = build_user_item_matrix(
    df_train,
    user2idx_train,
    game2idx,
    len(unique_train_users),
    n_games
)

final_model = train_als_model(
    full_train_matrix,
    factors=study.best_params["factors"],
    reg=study.best_params["regularization"],
    alpha=study.best_params["alpha"]
)

## Training Evaluation

In [9]:
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_slice
)

# Optimization History
plot_optimization_history(study).show()

In [10]:
# Hyperparameter Importances
plot_param_importances(study).show()

In [11]:
# Parallel Coordinate
plot_parallel_coordinate(study).show()

In [12]:
# Slice Plot
plot_slice(study).show()

## Extract embedding

### Get user and game embedding for train set

In [13]:
user_embeddings_train = final_model.user_factors
game_embeddings = final_model.item_factors

df_train['user_emb'] = df_train['user_id'].map(lambda x: user_embeddings_train[user2idx_train[x]].tolist())
df_train['game_emb'] = df_train['app_id'].map(lambda x: game_embeddings[game2idx[x]].tolist())

In [14]:
df_train[['user_id', 'app_id', 'user_emb', 'game_emb']].sample(5)

Unnamed: 0,user_id,app_id,user_emb,game_emb
418397,76561198062248931,60350,"[1.6727778911590576, 1.938355565071106, -1.099...","[0.049368780106306076, -0.032816044986248016, ..."
1370325,76561198087681716,24240,"[0.04799453169107437, 0.17479655146598816, 0.1...","[-0.01666131801903248, 0.1066851019859314, -0...."
2254767,76561198052327526,221040,"[0.12639452517032623, 0.05996783450245857, -0....","[0.12829409539699554, 0.06835531443357468, -0...."
2490956,76561198043360595,482400,"[1.0927624702453613, 0.9949914216995239, 0.171...","[-0.00370860961265862, 0.0018093361286446452, ..."
1853356,76561198074261126,429680,"[5.133449554443359, 1.8881070613861084, 0.7728...","[0.0062119909562170506, 0.053033702075481415, ..."


### Get user and game embedding for test set

In [15]:
# Only include data with games that exist in training mapping
df_test = df_test[df_test['app_id'].isin(game2idx.keys())].copy()

unique_test_users = df_test['user_id'].unique()
user2idx_test = {u: i for i, u in enumerate(unique_test_users)}

# Build user * game csr matrix for test data
test_matrix = build_user_item_matrix(
    df_test,
    user2idx_test,
    game2idx,
    len(unique_test_users),
    n_games
)

user_embedding_test = final_model.recalculate_user(unique_test_users, test_matrix)

df_test['user_emb'] = df_test['user_id'].map(lambda x: user_embedding_test[user2idx_test[x]].tolist())
df_test['game_emb'] = df_test['app_id'].map(lambda x: game_embeddings[game2idx[x]].tolist())

In [16]:
df_test[['user_id', 'app_id', 'user_emb', 'game_emb']].sample(5)

Unnamed: 0,user_id,app_id,user_emb,game_emb
179219,76561197993525512,659790,"[1.654205083847046, 1.1851140260696411, 1.0396...","[-0.005370128899812698, 0.007361488416790962, ..."
677371,76561198048806066,34010,"[0.7943929433822632, 1.248560905456543, -0.372...","[0.12801960110664368, 0.062387287616729736, -0..."
378112,76561197990231526,275490,"[0.3505335748195648, -0.5510267615318298, 0.26...","[0.07910574972629547, 0.024123266339302063, -0..."
492735,76561198044589452,1048540,"[1.6673423051834106, 0.9415253400802612, -1.25...","[0.020088914781808853, -0.09536014497280121, 0..."
671437,76561197978425746,1510,"[-0.19167281687259674, -0.16418370604515076, 0...","[0.12016583234071732, -0.07844189554452896, 0...."


### NDCG on test set

In [17]:
evaluate_ndcg(final_model, full_train_matrix, test_matrix)

0.404195959579037

In [18]:
# Check for embedding dim consistence
print(
    sum(df_train['user_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_test['user_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_train['game_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_test['game_emb'].apply(lambda x: len(x) != final_model.factors))
)

0 0 0 0


In [19]:
joblib.dump(df_train, r"..\assets\combined\train_CF.pkl")
joblib.dump(df_test, r"..\assets\combined\test_CF.pkl")

['..\\assets\\combined\\test_CF.pkl']

In [20]:
pip freeze > ..\requirements.txt

Note: you may need to restart the kernel to use updated packages.
