In [1]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pandas as pd
import joblib
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ndcg_at_k
import optuna

RANDOM_SEED = 42

In [2]:
df_train = joblib.load(r"..\assets\combined\train_scaled.pkl")
df_test = joblib.load(r"..\assets\combined\test_scaled.pkl")

print(df_train.shape, df_test.shape)

(2643279, 131) (694638, 131)


In [3]:
print(len(df_train['user_id'].unique()), len(df_train['app_id'].unique()))

3232 24301


## Model Training

In [4]:
def build_user_item_matrix(df, user2idx, game2idx, n_users, n_games):
    """
    Return a user * game CSR matrix
    """
    user_idx = df['user_id'].map(user2idx)
    game_idx = df['app_id'].map(game2idx)
    relevance = df['relevance_score'].values
    
    mat_coo = coo_matrix(
        (relevance, (user_idx, game_idx)),
        shape=(n_users, n_games)
    )
    return mat_coo.tocsr()

def train_als_model(train_matrix, factors, reg, alpha, iters=15):
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iters,
        alpha=alpha,
        use_gpu=False,
        random_state=RANDOM_SEED
    )
    model.fit(train_matrix, show_progress=False)
    return model

def evaluate_ndcg(model, train_matrix, val_matrix, K=10):
    return ndcg_at_k(
        model,
        train_user_items=train_matrix,
        test_user_items=val_matrix,
        K=K,
        show_progress=False
    )

def user_based_cv_folds(unique_users, n_folds=5):
    """
    Returns a list of folds, each fold is a set of user_ids.
    """
    rng = np.random.default_rng(RANDOM_SEED)
    shuffled_users = rng.permutation(unique_users)
    fold_size = len(shuffled_users) // n_folds
    
    folds = []
    start = 0
    for i in range(n_folds):
        end = start + fold_size
        if i == n_folds - 1:
            end = len(shuffled_users)  # Get the rest if it is final fold
        fold_users = shuffled_users[start:end]
        folds.append(set(fold_users))
        start = end
    return folds

In [5]:
unique_train_users = df_train['user_id'].unique()

unique_games = df_train['app_id'].unique()
game2idx = {g: i for i, g in enumerate(unique_games)}
n_games = len(unique_games)

# CV folds
n_folds = 5
folds = user_based_cv_folds(unique_train_users, n_folds)

### Cross validation using Optuna

In [6]:
def objective(trial):
    # Hyperparameters to fine tune
    factors_val = trial.suggest_categorical("factors", [2, 4, 8, 16, 32])
    reg_val = trial.suggest_float("regularization", 1e-4, 1e-2, log=True)
    alpha_val = trial.suggest_float("alpha", 1.0, 20, log=True)
    
    fold_ndcgs = []
    
    # Perform user-based cross validation
    for fold_idx, fold_users in enumerate(folds):
        # Split into train and validation
        mask_fold_val = df_train['user_id'].isin(fold_users)        
        df_fold_val = df_train[mask_fold_val]
        df_fold_train = df_train[~mask_fold_val]

        # Build local user index mapper for each CV
        unique_fold_train_users = df_fold_train['user_id'].unique()
        unique_fold_val_users = df_fold_val['user_id'].unique()

        user2idx_fold_train = {u: i for i, u in enumerate(unique_fold_train_users)}
        user2idx_fold_val = {u: i for i, u in enumerate(unique_fold_val_users)}
        
        # Build user by item matrix for each
        train_matrix_cv = build_user_item_matrix(
            df_fold_train, user2idx_fold_train, game2idx, len(unique_fold_train_users), n_games
        )
        val_matrix_cv = build_user_item_matrix(
            df_fold_val, user2idx_fold_val, game2idx, len(unique_fold_val_users), n_games
        )
        
        # Train
        model_cv = train_als_model(
            train_matrix_cv,
            factors=factors_val,
            reg=reg_val,
            alpha=alpha_val
        )
        
        # Evaluate NDCG on fold's validation
        ndcg_val = evaluate_ndcg(model_cv, train_matrix_cv, val_matrix_cv)
        fold_ndcgs.append(ndcg_val)
    
    # Average across folds
    mean_ndcg = np.mean(fold_ndcgs)

    return mean_ndcg

In [7]:
sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=30)

print("Best trial:")
best_trial = study.best_trial
print(f"  Value (NDCG): {best_trial.value:.4f}")
print("  Params:", best_trial.params)

[I 2025-04-02 11:27:44,032] A new study created in memory with name: no-name-cf8a1429-5baa-44ea-baf1-a17584eb0a50
[I 2025-04-02 11:28:32,708] Trial 0 finished with value: 0.5300477012662834 and parameters: {'factors': 4, 'regularization': 0.00020511104188433984, 'alpha': 1.1900590783184244}. Best is trial 0 with value: 0.5300477012662834.
[I 2025-04-02 11:28:52,753] Trial 1 finished with value: 0.40760163953356543 and parameters: {'factors': 32, 'regularization': 0.004622589001020831, 'alpha': 1.8891200276189386}. Best is trial 0 with value: 0.5300477012662834.
[I 2025-04-02 11:29:13,064] Trial 2 finished with value: 0.4121993893187721 and parameters: {'factors': 16, 'regularization': 0.0003823475224675188, 'alpha': 6.252287916406214}. Best is trial 0 with value: 0.5300477012662834.
[I 2025-04-02 11:29:32,910] Trial 3 finished with value: 0.3902935656932331 and parameters: {'factors': 32, 'regularization': 0.00025081156860452336, 'alpha': 4.666963767236923}. Best is trial 0 with value:

Best trial:
  Value (NDCG): 0.5663
  Params: {'factors': 2, 'regularization': 0.0003117234774171904, 'alpha': 1.585965518808334}


### Retrain on the entire training set with the best hyperparameters

In [8]:
user2idx_train = {u: i for i, u in enumerate(unique_train_users)}

# Build user * game csr matrix for all training data
full_train_matrix = build_user_item_matrix(
    df_train,
    user2idx_train,
    game2idx,
    len(unique_train_users),
    n_games
)

final_model = train_als_model(
    full_train_matrix,
    factors=study.best_params["factors"],
    reg=study.best_params["regularization"],
    alpha=study.best_params["alpha"]
)

## Training Evaluation

In [9]:
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_slice
)

# Optimization History
plot_optimization_history(study).show()

In [10]:
# Hyperparameter Importances
plot_param_importances(study).show()

In [11]:
# Parallel Coordinate
plot_parallel_coordinate(study).show()

In [12]:
# Slice Plot
plot_slice(study).show()

## Extract embedding

### Get user and game embedding for train set

In [13]:
user_embeddings_train = final_model.user_factors
game_embeddings = final_model.item_factors

df_train['user_emb'] = df_train['user_id'].map(lambda x: user_embeddings_train[user2idx_train[x]].tolist())
df_train['game_emb'] = df_train['app_id'].map(lambda x: game_embeddings[game2idx[x]].tolist())


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [14]:
df_train[['user_id', 'app_id', 'user_emb', 'game_emb']].sample(5)

Unnamed: 0,user_id,app_id,user_emb,game_emb
1871819,76561198057305726,389530,"[7.749264717102051, 7.5037617683410645]","[2.439454888092385e-18, -2.2768245622195593e-18]"
593003,76561197960403751,1184370,"[2.594144344329834, 0.9178107380867004]","[0.021229607984423637, -0.00064380734693259]"
574778,76561198059597267,236730,"[6.904428958892822, 5.949845314025879]","[0.09133350849151611, -0.034515611827373505]"
1695403,76561198016546516,459820,"[8.085650444030762, 6.69266939163208]","[0.016255322843790054, 0.0033440301194787025]"
1055888,76561197960288870,498720,"[0.05990280583500862, 0.0027458106633275747]","[-5.353248226647178e-19, 6.505213034913027e-19]"


### Get user and game embedding for test set

In [15]:
# Only include data with games that exist in training mapping
df_test = df_test[df_test['app_id'].isin(game2idx.keys())].copy()

unique_test_users = df_test['user_id'].unique()
user2idx_test = {u: i for i, u in enumerate(unique_test_users)}

# Build user * game csr matrix for test data
test_matrix = build_user_item_matrix(
    df_test,
    user2idx_test,
    game2idx,
    len(unique_test_users),
    n_games
)

user_embedding_test = final_model.recalculate_user(unique_test_users, test_matrix)

df_test['user_emb'] = df_test['user_id'].map(lambda x: user_embedding_test[user2idx_test[x]].tolist())
df_test['game_emb'] = df_test['app_id'].map(lambda x: game_embeddings[game2idx[x]].tolist())

In [16]:
df_test[['user_id', 'app_id', 'user_emb', 'game_emb']].sample(5)

Unnamed: 0,user_id,app_id,user_emb,game_emb
499332,76561197982715591,239030,"[5.441211700439453, 3.897188425064087]","[0.2645011246204376, -0.21440424025058746]"
213697,76561197971373352,575920,"[12.317736625671387, 11.378521919250488]","[-0.02346072532236576, 0.05162185803055763]"
28119,76561198032867520,601840,"[2.125602960586548, 0.4374443292617798]","[0.012516040354967117, 0.0011081225238740444]"
283972,76561198107441026,843810,"[2.693979263305664, 0.8504762649536133]","[-6.776263578034403e-21, 6.776263578034403e-21]"
257487,76561198009676887,750920,"[8.718300819396973, 7.18441104888916]","[0.27644044160842896, -0.22152864933013916]"


### NDCG on test set

In [17]:
evaluate_ndcg(final_model, full_train_matrix, test_matrix)

0.5511105334369449

In [18]:
# Check for embedding dim consistence
print(
    sum(df_train['user_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_test['user_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_train['game_emb'].apply(lambda x: len(x) != final_model.factors)),
    sum(df_test['game_emb'].apply(lambda x: len(x) != final_model.factors))
)

0 0 0 0


In [20]:
# Forgot to remove redundant one hot columns previously
df_train.drop(columns=['game_esrb_rating', 'game_available_parent_platforms', 'game_genres'], inplace=True)
df_test.drop(columns=['game_esrb_rating', 'game_available_parent_platforms', 'game_genres'], inplace=True)

In [21]:
joblib.dump(df_train, r"..\assets\combined\train_ready.pkl")
joblib.dump(df_test, r"..\assets\combined\test_ready.pkl")

['..\\assets\\combined\\test_ready.pkl']

In [22]:
pip freeze > ..\requirements.txt

Note: you may need to restart the kernel to use updated packages.
