In [None]:
import os
import sys
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import BPR, SLRC, NBRKNN
import torch
import random
import numpy as np
import optuna
import warnings
warnings.filterwarnings("ignore")

# TaFeng

Fix seed:

In [None]:
# Single seed shared by every random number generator used in the TaFeng section.
seed = 10

# Seed the Python, NumPy and PyTorch generators; the three calls are independent.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

Read the interaction data, filtering out users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The train set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Location of the raw data and name of the dataset to preprocess.
corpus_path = "./data/"
dataset_name = "ta_feng"

# Load the raw interactions with filtering enabled.
# NOTE(review): the positional thresholds are presumably the minimum number of
# transactions per user (5) and per item (10), as described in the text above --
# confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
# Write the preprocessed split to disk under corpus_path.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 32266, #items = 23812, #clicks = 817741 (#illegal records = 0)
After preprocessing: #users = 7358, #items = 11202, #clicks = 368951
Saving dataset in ./data//data_ta_feng/...


In [None]:
# Build the corpus object for the dataset at corpus_path and load its data.
# Later cells read n_users, n_items, n_clicks and total_avg_interval from it.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train the SLRC model with the best hyperparameters (see ./testing_slrc.ipynb):

In [None]:
# Trainer used to fit the SLRC model: at most 20 epochs.
# NOTE(review): topk=10 is presumably the cutoff for precision/recall/NDCG and
# early_stop_num=3 the number of epochs without dev improvement tolerated before
# stopping -- confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3
)

train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 638.75it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4386.31it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2830.84it/s]


In [None]:
# Tuned SLRC hyperparameters for TaFeng (found in ./testing_slrc.ipynb).
slrc_best_params = {
    "emb_size": 64,
    "batch_size": 64,
    "lr": 0.0006142297613045982,
    "l2_reg_coef": 0.0047331742711911855,
}

# SLRC wraps a BPR base model; corpus statistics size the embeddings and
# provide the average repeat interval.
params = {
    "model": SLRC(
        base_model_class=BPR,
        base_model_config=dict(
            emb_size=slrc_best_params["emb_size"],
            user_num=corpus.n_users,
            item_num=corpus.n_items,
            click_num=corpus.n_clicks,
        ),
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval,
    ),
    # The remaining training settings are forwarded unchanged from the tuned values.
    **{name: slrc_best_params[name] for name in ("batch_size", "lr", "l2_reg_coef")},
}

trainer.init_hyperparams(**params)

In [None]:
# Fit the SLRC model configured above; dev metrics are printed after every epoch.
trainer.train()

Epoch 1:


Batch loss = 0.648382: 100%|██████████| 4445/4445 [00:46<00:00, 95.77it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 94.15it/s] 


 {'precision': 0.05296996058175887, 'recall': 0.11711213174337326, 'ndcg': 0.10312238567113029}
Epoch 2:



Batch loss = 0.620238: 100%|██████████| 4445/4445 [00:44<00:00, 99.37it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.87it/s]


 {'precision': 0.055008835123012106, 'recall': 0.12688101431125864, 'ndcg': 0.10693213499769577}





Epoch 3:


Batch loss = 0.516222: 100%|██████████| 4445/4445 [00:44<00:00, 99.62it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:15<00:00, 97.55it/s] 


 {'precision': 0.05620497485388066, 'recall': 0.13271546807521978, 'ndcg': 0.1095805846360861}
Epoch 4:



Batch loss = 0.485183: 100%|██████████| 4445/4445 [00:44<00:00, 100.69it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 95.05it/s]


 {'precision': 0.05596030990893027, 'recall': 0.13146033295519385, 'ndcg': 0.10894357560113532}
Epoch 5:



Batch loss = 0.424968: 100%|██████████| 4445/4445 [00:44<00:00, 100.76it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:14<00:00, 98.15it/s] 


 {'precision': 0.055892347424221826, 'recall': 0.1296131429002266, 'ndcg': 0.10869963177684373}
Epoch 6:



Batch loss = 0.407367: 100%|██████████| 4445/4445 [00:44<00:00, 99.59it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 95.38it/s] 


 {'precision': 0.05601467989669703, 'recall': 0.12955432394369182, 'ndcg': 0.10851308820929444}





SLRC(
  (base_model): BPR(
    (user_emb): Embedding(7358, 64)
    (item_emb): Embedding(11202, 64)
  )
)

Save the users' embeddings for the validation step (a user's embedding at the validation step is their vector of recommendation scores over all items at the time of validation):

In [None]:
# Predictions of the trained SLRC model on the dev split; they are passed to
# NBRKNN as `user_emb` when tuning the KNN hyperparameters below.
dev_user_emb = trainer.get_predictions(mode="dev")

100%|██████████| 7357/7357 [01:27<00:00, 84.54it/s] 


Tune KNN hyperparams on validation dataset:

In [None]:
# Fresh trainer used only to evaluate NBRKNN candidates (the cells below call
# init_hyperparams/evaluate, never train), hence no epoch or early-stop settings.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 645.05it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4430.63it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 3869.20it/s]


In [None]:
def objective(trial):
    """Optuna objective: dev-set NDCG of one NBRKNN configuration.

    Samples the number of nearest neighbours and the mixing weight ``alpha``
    from ``trial``, builds an NBRKNN model on top of ``dev_user_emb``, and
    returns the ``"ndcg"`` entry of the dev evaluation metrics.
    """
    # Keep the suggestion order (neighbours first, then alpha) so that a seeded
    # sampler proposes the same sequence of configurations.
    neighbors_num = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    alpha = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn_model = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors_num,
        alpha=alpha,
        user_emb=dev_user_emb,
    )

    trainer.init_hyperparams(model=knn_model)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seeded TPE sampler so that the sequence of proposed configurations is reproducible.
sampler = optuna.samplers.TPESampler(seed=seed)
# Maximise the objective (dev NDCG) over 25 trials; `study.best_params` is read by the test loop below.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[32m[I 2023-04-23 17:16:00,621][0m A new study created in memory with name: no-name-81d739ff-b480-460a-bfa0-1ef674af2e74[0m
100%|██████████| 7357/7357 [06:17<00:00, 19.47it/s]
[32m[I 2023-04-23 17:22:23,884][0m Trial 0 finished with value: 0.0653908618873418 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.0653908618873418.[0m
100%|██████████| 7357/7357 [06:11<00:00, 19.79it/s]
[32m[I 2023-04-23 17:28:39,754][0m Trial 1 finished with value: 0.11081685513222486 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.11081685513222486.[0m
100%|██████████| 7357/7357 [06:46<00:00, 18.10it/s]
[32m[I 2023-04-23 17:35:31,426][0m Trial 2 finished with value: 0.09549438245097476 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.11081685513222486.[0m
100%|██████████| 7357/7357 [06:38<00:00, 18.45it/s]
[32m[I 2023-04-23 17:42:14,985][0m Trial 3 finished with valu

Test SLRCKNN (calculate scores for different seeds):

In [None]:
# One independent list of per-seed test scores for each reported metric.
test_metrics = {metric: [] for metric in ("precision", "recall", "ndcg")}

In [None]:
# Repeat the full SLRC -> KNN pipeline for several seeds and collect the test metrics.
# The loop variable is deliberately not called `seed`: that name holds the
# notebook-level seed read by the Optuna sampler cell, and overwriting it here would
# silently change the result of re-running the tuning cell after this loop.
for run_seed in range(5):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Stage 1: train SLRC from scratch with the tuned hyperparameters.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": SLRC(
            base_model_class=BPR,
            base_model_config={
                "emb_size": slrc_best_params["emb_size"],
                "user_num": corpus.n_users,
                "item_num": corpus.n_items,
                "click_num": corpus.n_clicks
            },
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # SLRC predictions on the test split serve as the user vectors for the KNN stage.
    test_user_emb = trainer.get_predictions(mode="test")

    # Stage 2: evaluate NBRKNN with the hyperparameters selected on the dev split.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=test_user_emb
        )
    }

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    # Record this seed's scores; they are averaged in the next cell.
    test_metrics["precision"].append(metrics["precision"])
    test_metrics["recall"].append(metrics["recall"])
    test_metrics["ndcg"].append(metrics["ndcg"])


___SEED___0
train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 646.87it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4732.17it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4056.85it/s]

Epoch 1:



Batch loss = 0.647844: 100%|██████████| 4445/4445 [00:45<00:00, 97.26it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 94.54it/s] 


 {'precision': 0.05306510806035069, 'recall': 0.11760657494409904, 'ndcg': 0.10338726165353773}
Epoch 2:



Batch loss = 0.613529: 100%|██████████| 4445/4445 [00:46<00:00, 96.43it/s] 



Evaluation (dev):


100%|██████████| 7357/7357 [01:15<00:00, 97.08it/s] 


 {'precision': 0.055280685061845865, 'recall': 0.12787036008393035, 'ndcg': 0.10752063171562534}
Epoch 3:



Batch loss = 0.484176: 100%|██████████| 4445/4445 [00:45<00:00, 98.08it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 94.87it/s] 


 {'precision': 0.05630012233247248, 'recall': 0.13254850400181878, 'ndcg': 0.1095571150683061}
Epoch 4:



Batch loss = 0.412783: 100%|██████████| 4445/4445 [00:45<00:00, 97.76it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 95.60it/s]


 {'precision': 0.056069049884463776, 'recall': 0.13089943342672652, 'ndcg': 0.10860200062750996}
Epoch 5:



Batch loss = 0.40947: 100%|██████████| 4445/4445 [00:46<00:00, 95.95it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 94.70it/s] 


 {'precision': 0.055579719994563, 'recall': 0.12869468613279975, 'ndcg': 0.1081829218646605}





Epoch 6:


Batch loss = 0.405149: 100%|██████████| 4445/4445 [00:46<00:00, 96.27it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 95.65it/s] 


 {'precision': 0.05598749490281364, 'recall': 0.13105695275419954, 'ndcg': 0.1091629061070286}



100%|██████████| 7357/7357 [01:16<00:00, 96.12it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 643.62it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4485.42it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3025.67it/s]
100%|██████████| 7357/7357 [04:59<00:00, 24.59it/s]



___SEED___1
train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 655.68it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4528.72it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 3957.81it/s]

Epoch 1:



Batch loss = 0.648115: 100%|██████████| 4445/4445 [00:45<00:00, 96.66it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.14it/s]


 {'precision': 0.052983553078700556, 'recall': 0.11730884025233931, 'ndcg': 0.10326253450365228}
Epoch 2:



Batch loss = 0.618138: 100%|██████████| 4445/4445 [00:46<00:00, 96.61it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:15<00:00, 96.99it/s] 


 {'precision': 0.05518553758325404, 'recall': 0.12778208290447507, 'ndcg': 0.10736117864801158}
Epoch 3:



Batch loss = 0.49879: 100%|██████████| 4445/4445 [00:46<00:00, 95.71it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [01:19<00:00, 92.99it/s] 


 {'precision': 0.05597390240587196, 'recall': 0.13212535969929048, 'ndcg': 0.10932099089009956}
Epoch 4:



Batch loss = 0.498291: 100%|██████████| 4445/4445 [00:46<00:00, 95.60it/s] 



Evaluation (dev):


100%|██████████| 7357/7357 [01:18<00:00, 93.31it/s] 


 {'precision': 0.056286529835530785, 'recall': 0.1320072938810533, 'ndcg': 0.10893477085005902}
Epoch 5:



Batch loss = 0.393214: 100%|██████████| 4445/4445 [00:47<00:00, 94.24it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 93.94it/s]


 {'precision': 0.05566127497621313, 'recall': 0.128561368615953, 'ndcg': 0.10802543395474196}
Epoch 6:



Batch loss = 0.372129: 100%|██████████| 4445/4445 [00:46<00:00, 96.03it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.74it/s] 


 {'precision': 0.05598749490281364, 'recall': 0.12938509283272898, 'ndcg': 0.1089098140710593}



100%|██████████| 7357/7357 [01:19<00:00, 92.99it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:10<00:00, 681.96it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3364.72it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4130.50it/s]
100%|██████████| 7357/7357 [05:04<00:00, 24.20it/s]



___SEED___2
train dataset preparing...


100%|██████████| 7358/7358 [00:09<00:00, 788.49it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3245.65it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2626.51it/s]

Epoch 1:



Batch loss = 0.64851: 100%|██████████| 4445/4445 [00:47<00:00, 93.74it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.64it/s]


 {'precision': 0.052942775587875496, 'recall': 0.11726026876172108, 'ndcg': 0.10313874782031339}
Epoch 2:



Batch loss = 0.613944: 100%|██████████| 4445/4445 [00:47<00:00, 94.10it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.47it/s] 


 {'precision': 0.0552263150740791, 'recall': 0.12784914315486998, 'ndcg': 0.10745646263383438}
Epoch 3:



Batch loss = 0.515254: 100%|██████████| 4445/4445 [00:47<00:00, 93.29it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:17<00:00, 94.64it/s] 


 {'precision': 0.05615060486611391, 'recall': 0.1321609177451479, 'ndcg': 0.10964041700697584}
Epoch 4:



Batch loss = 0.40877: 100%|██████████| 4445/4445 [00:46<00:00, 95.43it/s] 



Evaluation (dev):


100%|██████████| 7357/7357 [01:19<00:00, 92.41it/s]


 {'precision': 0.05593312491504689, 'recall': 0.12980271658558626, 'ndcg': 0.10834286228903398}
Epoch 5:



Batch loss = 0.396547: 100%|██████████| 4445/4445 [00:46<00:00, 94.78it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.63it/s] 


 {'precision': 0.056082642381405465, 'recall': 0.13033290741866962, 'ndcg': 0.10892434725283648}
Epoch 6:



Batch loss = 0.359622: 100%|██████████| 4445/4445 [00:47<00:00, 94.53it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.66it/s] 


 {'precision': 0.056259344841647414, 'recall': 0.13130733438514197, 'ndcg': 0.1093480871326376}



100%|██████████| 7357/7357 [01:16<00:00, 96.13it/s]


train dataset preparing...


100%|██████████| 7358/7358 [00:10<00:00, 733.09it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 3753.85it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:03<00:00, 2366.92it/s]
100%|██████████| 7357/7357 [05:07<00:00, 23.91it/s]



___SEED___3
train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 664.57it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4125.51it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4222.65it/s]

Epoch 1:



Batch loss = 0.648213: 100%|██████████| 4445/4445 [00:47<00:00, 93.10it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.32it/s]


 {'precision': 0.05292918309093381, 'recall': 0.11722428625684035, 'ndcg': 0.10334206215700821}





Epoch 2:


Batch loss = 0.613215: 100%|██████████| 4445/4445 [00:47<00:00, 92.61it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 95.60it/s] 


 {'precision': 0.055307870055729236, 'recall': 0.12817648204870014, 'ndcg': 0.10749804528714656}
Epoch 3:



Batch loss = 0.522081: 100%|██████████| 4445/4445 [00:49<00:00, 90.52it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [01:18<00:00, 93.57it/s] 


 {'precision': 0.056096234878347154, 'recall': 0.1320079945827257, 'ndcg': 0.10941848721221557}
Epoch 4:



Batch loss = 0.455008: 100%|██████████| 4445/4445 [00:47<00:00, 93.76it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 93.87it/s]


 {'precision': 0.05605545738752209, 'recall': 0.13052547489044342, 'ndcg': 0.10846417290285777}
Epoch 5:



Batch loss = 0.462366: 100%|██████████| 4445/4445 [00:48<00:00, 91.65it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.29it/s] 


 {'precision': 0.05559331249150469, 'recall': 0.12858402374791061, 'ndcg': 0.10827293938903636}
Epoch 6:



Batch loss = 0.455538: 100%|██████████| 4445/4445 [00:47<00:00, 93.36it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.84it/s] 


 {'precision': 0.056449639798831046, 'recall': 0.13171414304460102, 'ndcg': 0.10979318641736421}
Epoch 7:



Batch loss = 0.354771: 100%|██████████| 4445/4445 [00:47<00:00, 93.40it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 94.29it/s] 


 {'precision': 0.056123419872230525, 'recall': 0.13106735390259025, 'ndcg': 0.10969199779528051}
Epoch 8:



Batch loss = 0.431846: 100%|██████████| 4445/4445 [00:47<00:00, 92.62it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.11it/s]


 {'precision': 0.056272937338589096, 'recall': 0.13061539721897414, 'ndcg': 0.10898434919210259}
Epoch 9:



Batch loss = 0.355546: 100%|██████████| 4445/4445 [00:47<00:00, 93.40it/s] 


Evaluation (dev):



100%|██████████| 7357/7357 [01:16<00:00, 96.34it/s] 


 {'precision': 0.05598749490281364, 'recall': 0.1302838275905165, 'ndcg': 0.10907176493139453}



100%|██████████| 7357/7357 [01:19<00:00, 92.64it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 641.83it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4361.14it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4029.14it/s]
100%|██████████| 7357/7357 [05:06<00:00, 24.01it/s]



___SEED___4
train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 661.08it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4747.38it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4052.67it/s]

Epoch 1:



Batch loss = 0.648428: 100%|██████████| 4445/4445 [00:47<00:00, 93.27it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.68it/s]


 {'precision': 0.05283403561234199, 'recall': 0.11688602227934589, 'ndcg': 0.10315941097580893}
Epoch 2:



Batch loss = 0.617539: 100%|██████████| 4445/4445 [00:47<00:00, 93.14it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 94.18it/s] 


 {'precision': 0.055280685061845865, 'recall': 0.12808553041197648, 'ndcg': 0.1075739078236309}
Epoch 3:



Batch loss = 0.526964: 100%|██████████| 4445/4445 [00:48<00:00, 92.45it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 92.80it/s]


 {'precision': 0.05610982737528884, 'recall': 0.1318116758528083, 'ndcg': 0.10937556278952634}
Epoch 4:



Batch loss = 0.467776: 100%|██████████| 4445/4445 [00:48<00:00, 91.22it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [01:16<00:00, 96.35it/s] 


 {'precision': 0.05589234742422183, 'recall': 0.13093346527700872, 'ndcg': 0.10873333472870504}
Epoch 5:



Batch loss = 0.472686: 100%|██████████| 4445/4445 [00:48<00:00, 91.82it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.81it/s] 


 {'precision': 0.05564768247927144, 'recall': 0.12844266576522984, 'ndcg': 0.10794656347332765}
Epoch 6:



Batch loss = 0.3681: 100%|██████████| 4445/4445 [00:48<00:00, 91.89it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:18<00:00, 93.75it/s] 



 {'precision': 0.05593312491504689, 'recall': 0.12913710209741236, 'ndcg': 0.10902443637702317}


100%|██████████| 7357/7357 [01:15<00:00, 97.40it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:13<00:00, 558.20it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4603.56it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2605.24it/s]
100%|██████████| 7357/7357 [05:08<00:00, 23.85it/s]


In [None]:
# Average each test metric over the seeds evaluated above.
{
    metric: np.array(test_metrics[metric]).mean()
    for metric in ("precision", "recall", "ndcg")
}

{'precision': 0.06425717004213674,
 'recall': 0.1559840085486025,
 'ndcg': 0.12672619419372802}

# TaoBao

Fix seed:

In [None]:
# Single seed shared by every random number generator used in the TaoBao section.
seed = 10

# Seed the Python, NumPy and PyTorch generators; the three calls are independent.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

Read the interaction data, filtering out users with fewer than 10 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The train set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Location of the raw data and name of the dataset to preprocess.
corpus_path = "./data/"
dataset_name = "taobao"

# Load the raw interactions with filtering enabled.
# NOTE(review): the positional thresholds are presumably the minimum number of
# transactions per user (10) and per item (10), as described in the text above --
# confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(10, 10, filt=True)
# Write the preprocessed split to disk under corpus_path.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 672404, #items = 638962, #clicks = 2015807 (#illegal records = 0)
After preprocessing: #users = 10092, #items = 22286, #clicks = 67991
Saving dataset in ./data//data_taobao/...


In [None]:
# Build the corpus object for the TaoBao dataset at corpus_path and load its data.
# Later cells read n_users, n_items, n_clicks and total_avg_interval from it.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train the SLRC model with the best hyperparameters (see ./testing_slrc.ipynb):

In [None]:
# Trainer used to fit the SLRC model on TaoBao: at most 20 epochs.
# NOTE(review): topk=10 is presumably the cutoff for precision/recall/NDCG and
# early_stop_num=3 the number of epochs without dev improvement tolerated before
# stopping -- confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3
)

train dataset preparing...


100%|██████████| 10092/10092 [00:41<00:00, 242.25it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 27015.42it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 29580.62it/s]


In [None]:
# Tuned SLRC hyperparameters for TaoBao (found in ./testing_slrc.ipynb).
slrc_best_params = {
    "emb_size": 32,
    "batch_size": 256,
    "lr": 1.0851391597925009e-05,
    "l2_reg_coef": 0.03239377807560215,
}

# SLRC wraps a BPR base model; corpus statistics size the embeddings and
# provide the average repeat interval.
params = {
    "model": SLRC(
        base_model_class=BPR,
        base_model_config=dict(
            emb_size=slrc_best_params["emb_size"],
            user_num=corpus.n_users,
            item_num=corpus.n_items,
            click_num=corpus.n_clicks,
        ),
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval,
    ),
    # The remaining training settings are forwarded unchanged from the tuned values.
    **{name: slrc_best_params[name] for name in ("batch_size", "lr", "l2_reg_coef")},
}

trainer.init_hyperparams(**params)

In [None]:
# Fit the SLRC model configured above; dev metrics are printed after every epoch.
trainer.train()

Epoch 1:


Batch loss = 0.671828: 100%|██████████| 191/191 [00:03<00:00, 57.04it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:58<00:00, 52.10it/s]


 {'precision': 0.010443752014612655, 'recall': 0.09904193975860463, 'ndcg': 0.07283291962948076}





Epoch 2:


Batch loss = 0.671776: 100%|██████████| 191/191 [00:03<00:00, 63.08it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:54<00:00, 53.42it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09898821675441424, 'ndcg': 0.0728535311003784}
Epoch 3:



Batch loss = 0.671721: 100%|██████████| 191/191 [00:03<00:00, 56.73it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:47<00:00, 55.46it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09898821675441424, 'ndcg': 0.07282528539007627}
Epoch 4:



Batch loss = 0.671692: 100%|██████████| 191/191 [00:03<00:00, 56.97it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 55.01it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07280322602580534}
Epoch 5:



Batch loss = 0.671657: 100%|██████████| 191/191 [00:02<00:00, 64.91it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 54.78it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07288007857110268}
Epoch 6:



Batch loss = 0.671621: 100%|██████████| 191/191 [00:02<00:00, 64.77it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:48<00:00, 55.09it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.07279130981357429}
Epoch 7:



Batch loss = 0.671538: 100%|██████████| 191/191 [00:03<00:00, 62.79it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.51it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.07284312215258729}
Epoch 8:



Batch loss = 0.671486: 100%|██████████| 191/191 [00:03<00:00, 55.48it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:48<00:00, 55.36it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.0728146605521322}





SLRC(
  (base_model): BPR(
    (user_emb): Embedding(10092, 32)
    (item_emb): Embedding(22286, 32)
  )
)

Save the users' embeddings for the validation step (a user's embedding at the validation step is their vector of recommendation scores over all items at the time of validation):

In [None]:
# Predictions of the trained SLRC model on the dev split; they are passed to
# NBRKNN as `user_emb` when tuning the KNN hyperparameters below.
dev_user_emb = trainer.get_predictions(mode="dev")

100%|██████████| 9307/9307 [02:48<00:00, 55.35it/s]


Tune KNN hyperparams on validation dataset:

In [None]:
# Fresh trainer used only to evaluate NBRKNN candidates (the cells below call
# init_hyperparams/evaluate, never train), hence no epoch or early-stop settings.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 10092/10092 [00:36<00:00, 279.85it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 31671.81it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 24917.03it/s]


In [None]:
def objective(trial):
    """Optuna objective: dev-set NDCG of an NBRKNN built from sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample `nearest_neighbors_num` and `alpha`.

    Returns:
        The "ndcg" entry of the metrics returned by `trainer.evaluate(mode="dev")`.
    """
    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            # The grid starts at 10 rather than 0: with zero neighbours the KNN
            # has nothing to aggregate, so such a trial would be wasted (or fail).
            nearest_neighbors_num=trial.suggest_int("nearest_neighbors_num", low=10, high=200, step=10),
            alpha=trial.suggest_float("alpha", 0.0, 1.0, step=0.05),
            user_emb=dev_user_emb
        )
    }

    # Swap the freshly built model into the shared trainer and score it on dev.
    trainer.init_hyperparams(**params)
    metrics = trainer.evaluate(mode="dev")
    score = metrics["ndcg"]
    return score

In [None]:
# Seeded TPE search that maximises the value returned by `objective`
# (dev-set NDCG) over 25 trials; the best parameters are read later from
# `study.best_params`.
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[32m[I 2023-04-22 11:10:59,225][0m A new study created in memory with name: no-name-b2391149-ef11-47d7-bee4-7c156b63cb88[0m
100%|██████████| 9307/9307 [21:44<00:00,  7.14it/s]
[32m[I 2023-04-22 11:32:58,461][0m Trial 0 finished with value: 0.05488958889693859 and parameters: {'nearest_neighbors_num': 160, 'alpha': 0.0}. Best is trial 0 with value: 0.05488958889693859.[0m
100%|██████████| 9307/9307 [22:13<00:00,  6.98it/s]
[32m[I 2023-04-22 11:55:32,208][0m Trial 1 finished with value: 0.0739815303147157 and parameters: {'nearest_neighbors_num': 130, 'alpha': 0.75}. Best is trial 1 with value: 0.0739815303147157.[0m
100%|██████████| 9307/9307 [21:59<00:00,  7.05it/s]
[32m[I 2023-04-22 12:17:52,354][0m Trial 2 finished with value: 0.07189376061244337 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.0739815303147157.[0m
100%|██████████| 9307/9307 [21:24<00:00,  7.25it/s]
[32m[I 2023-04-22 12:39:37,152][0m Trial 3 finished with value

Test SLRCKNN (calculate scores for different seeds):

In [None]:
# One empty list per reported metric; the seed loop below appends one value per run.
test_metrics = {metric: [] for metric in ("precision", "recall", "ndcg")}

In [None]:
# Repeat the full pipeline (train SLRC -> predict on test -> evaluate NBRKNN)
# for five seeds and append each run's test metrics to `test_metrics`.
# The loop variable is deliberately not named `seed`: that notebook-level name
# seeds the Optuna sampler, and overwriting it here would make a later re-run
# of the tuning cell use the last loop value instead.
for run_seed in range(5):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Stage 1: train SLRC with the best hyperparameters found earlier.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": SLRC(
            base_model_class=BPR,
            base_model_config={
                "emb_size": slrc_best_params["emb_size"],
                "user_num": corpus.n_users,
                "item_num": corpus.n_items,
                "click_num": corpus.n_clicks
            },
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # The trained model's test-split predictions serve as the KNN user embeddings.
    test_user_emb = trainer.get_predictions(mode="test")

    # Stage 2: evaluate NBRKNN on the test split with the tuned KNN hyperparameters.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=test_user_emb
        )
    }

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    test_metrics["precision"].append(metrics["precision"])
    test_metrics["recall"].append(metrics["recall"])
    test_metrics["ndcg"].append(metrics["ndcg"])


___SEED___0
train dataset preparing...


100%|██████████| 10092/10092 [00:35<00:00, 287.32it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 34600.53it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 26224.21it/s]

Epoch 1:



Batch loss = 0.671844: 100%|██████████| 191/191 [00:03<00:00, 63.06it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:57<00:00, 52.44it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07300048930102894}
Epoch 2:



Batch loss = 0.671798: 100%|██████████| 191/191 [00:03<00:00, 54.04it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:52<00:00, 53.88it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.07296455313152723}
Epoch 3:



Batch loss = 0.671731: 100%|██████████| 191/191 [00:03<00:00, 59.80it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.74it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.07291945580809885}
Epoch 4:



Batch loss = 0.671712: 100%|██████████| 191/191 [00:03<00:00, 55.75it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:52<00:00, 53.89it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09877332473765266, 'ndcg': 0.07289013117963543}



100%|██████████| 9307/9307 [02:55<00:00, 53.13it/s]


train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 334.72it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 33374.02it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 30630.40it/s]
100%|██████████| 9307/9307 [16:27<00:00,  9.42it/s]



___SEED___1
train dataset preparing...


100%|██████████| 10092/10092 [00:31<00:00, 323.87it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 19915.60it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 15840.86it/s]


Epoch 1:


Batch loss = 0.671883: 100%|██████████| 191/191 [00:03<00:00, 56.47it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 54.79it/s]


 {'precision': 0.010465241216288815, 'recall': 0.09920310877117582, 'ndcg': 0.0734044488405234}
Epoch 2:



Batch loss = 0.671844: 100%|██████████| 191/191 [00:03<00:00, 60.98it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:48<00:00, 55.21it/s]


 {'precision': 0.010465241216288815, 'recall': 0.09920310877117582, 'ndcg': 0.07334945301938596}
Epoch 3:



Batch loss = 0.671808: 100%|██████████| 191/191 [00:03<00:00, 59.10it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.41it/s]


 {'precision': 0.010465241216288815, 'recall': 0.09920310877117582, 'ndcg': 0.07344369861600566}
Epoch 4:



Batch loss = 0.671758: 100%|██████████| 191/191 [00:03<00:00, 59.24it/s]



Evaluation (dev):


100%|██████████| 9307/9307 [02:51<00:00, 54.39it/s]


 {'precision': 0.010465241216288815, 'recall': 0.09920310877117582, 'ndcg': 0.07339878776500793}
Epoch 5:



Batch loss = 0.671729: 100%|██████████| 191/191 [00:03<00:00, 60.32it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 55.01it/s]


 {'precision': 0.010454496615450738, 'recall': 0.09909566276279502, 'ndcg': 0.07330910588535189}
Epoch 6:



Batch loss = 0.671676: 100%|██████████| 191/191 [00:03<00:00, 57.76it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.59it/s]


 {'precision': 0.010454496615450738, 'recall': 0.09909566276279502, 'ndcg': 0.07326640614399639}



100%|██████████| 9307/9307 [02:48<00:00, 55.29it/s]


train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 332.90it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 32970.34it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 25434.07it/s]
100%|██████████| 9307/9307 [16:16<00:00,  9.53it/s]



___SEED___2
train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 326.13it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 33778.11it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 16022.97it/s]

Epoch 1:



Batch loss = 0.671854: 100%|██████████| 191/191 [00:03<00:00, 48.32it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 54.98it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07298222158697909}
Epoch 2:



Batch loss = 0.671842: 100%|██████████| 191/191 [00:03<00:00, 55.27it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.22it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07303087620925015}
Epoch 3:



Batch loss = 0.671784: 100%|██████████| 191/191 [00:03<00:00, 59.82it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 54.75it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07306332070346207}
Epoch 4:



Batch loss = 0.671717: 100%|██████████| 191/191 [00:03<00:00, 59.05it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.56it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07309419630228346}
Epoch 5:



Batch loss = 0.671687: 100%|██████████| 191/191 [00:03<00:00, 60.20it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.50it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07313553776834017}
Epoch 6:



Batch loss = 0.671648: 100%|██████████| 191/191 [00:03<00:00, 63.04it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.70it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07316701077021953}
Epoch 7:



Batch loss = 0.671609: 100%|██████████| 191/191 [00:03<00:00, 53.95it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 54.77it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07310306323887776}
Epoch 8:



Batch loss = 0.671517: 100%|██████████| 191/191 [00:03<00:00, 52.42it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:48<00:00, 55.18it/s]


 {'precision': 0.010400773611260341, 'recall': 0.09861215572508147, 'ndcg': 0.07308100387460682}
Epoch 9:



Batch loss = 0.671517: 100%|██████████| 191/191 [00:03<00:00, 51.31it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:52<00:00, 53.86it/s]


 {'precision': 0.010411518212098422, 'recall': 0.09871960173346227, 'ndcg': 0.07311995343904591}



100%|██████████| 9307/9307 [02:46<00:00, 55.74it/s]


train dataset preparing...


100%|██████████| 10092/10092 [00:32<00:00, 308.09it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 32329.26it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 30063.44it/s]
100%|██████████| 9307/9307 [16:03<00:00,  9.66it/s]



___SEED___3
train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 330.67it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 32843.04it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 27564.99it/s]

Epoch 1:



Batch loss = 0.671816: 100%|██████████| 191/191 [00:03<00:00, 57.64it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:52<00:00, 53.89it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09898821675441424, 'ndcg': 0.07250471029556521}
Epoch 2:



Batch loss = 0.671804: 100%|██████████| 191/191 [00:03<00:00, 48.91it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:55<00:00, 53.11it/s]



 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07245213502729231}
Epoch 3:


Batch loss = 0.671757: 100%|██████████| 191/191 [00:03<00:00, 52.79it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:59<00:00, 51.96it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07246064326169149}
Epoch 4:



Batch loss = 0.671701: 100%|██████████| 191/191 [00:04<00:00, 43.40it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:55<00:00, 53.05it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09888077074603345, 'ndcg': 0.07247970196392889}



100%|██████████| 9307/9307 [02:56<00:00, 52.85it/s]


train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 326.48it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 28519.57it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 20213.49it/s]
100%|██████████| 9307/9307 [16:12<00:00,  9.57it/s]



___SEED___4
train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 328.52it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 29657.16it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 28766.30it/s]

Epoch 1:



Batch loss = 0.671825: 100%|██████████| 191/191 [00:03<00:00, 59.13it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.35it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09882704774184306, 'ndcg': 0.07251693470541855}
Epoch 2:



Batch loss = 0.671778: 100%|██████████| 191/191 [00:03<00:00, 50.41it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:54<00:00, 53.33it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09882704774184306, 'ndcg': 0.0725007522274374}
Epoch 3:



Batch loss = 0.671715: 100%|██████████| 191/191 [00:03<00:00, 59.84it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.17it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09882704774184306, 'ndcg': 0.07254511603932819}
Epoch 4:



Batch loss = 0.67168: 100%|██████████| 191/191 [00:03<00:00, 62.37it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:50<00:00, 54.45it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09888077074603345, 'ndcg': 0.07259953111472374}
Epoch 5:



Batch loss = 0.671653: 100%|██████████| 191/191 [00:03<00:00, 60.12it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:52<00:00, 53.94it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09888077074603345, 'ndcg': 0.072574361746052}
Epoch 6:



Batch loss = 0.671547: 100%|██████████| 191/191 [00:03<00:00, 53.20it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:49<00:00, 55.01it/s]


 {'precision': 0.0104222628129365, 'recall': 0.09882704774184306, 'ndcg': 0.0726587889249773}





Epoch 7:


Batch loss = 0.671561: 100%|██████████| 191/191 [00:03<00:00, 48.96it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:53<00:00, 53.79it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09893449375022384, 'ndcg': 0.07263141609543815}
Epoch 8:



Batch loss = 0.671459: 100%|██████████| 191/191 [00:03<00:00, 55.44it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.27it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09893449375022384, 'ndcg': 0.07260209661604644}
Epoch 9:



Batch loss = 0.671451: 100%|██████████| 191/191 [00:03<00:00, 60.70it/s]


Evaluation (dev):



100%|██████████| 9307/9307 [02:51<00:00, 54.38it/s]


 {'precision': 0.010433007413774578, 'recall': 0.09893449375022384, 'ndcg': 0.07252946776558236}



100%|██████████| 9307/9307 [02:51<00:00, 54.23it/s]


train dataset preparing...


100%|██████████| 10092/10092 [00:30<00:00, 334.18it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 33353.94it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 16873.09it/s]
100%|██████████| 9307/9307 [16:07<00:00,  9.62it/s]


In [None]:
# Average each test metric over the per-seed runs collected in `test_metrics`.
{
    metric: np.array(test_metrics[metric]).mean()
    for metric in ("precision", "recall", "ndcg")
}

{'precision': 0.01229397227892984,
 'recall': 0.11895705741198381,
 'ndcg': 0.08141059581578725}

# Dunnhumby

Fix seed:

In [None]:
# Seed torch, Python's `random` and NumPy with the same value; `seed` is also
# reused further down to seed the Optuna sampler.
seed = 10
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

Read the interaction data (filtering out users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions). Train dataset: all baskets except the last two; validation dataset: the second-to-last basket; test dataset: the last basket:

In [None]:
corpus_path = "./data/"
dataset_name = "dunnhumby"

# Filter the raw interactions and save the resulting split under `corpus_path`.
# NOTE(review): 5 and 10 are presumably the minimum-transaction thresholds for
# users and items described in the text above — confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 2500, #items = 92339, #clicks = 2595370 (#illegal records = 0)
After preprocessing: #users = 2358, #items = 26756, #clicks = 1976796
Saving dataset in ./data//data_dunnhumby/...


In [None]:
# Load the data for `dataset_name` from `corpus_path`; `corpus` later supplies
# the user/item/click counts for the models and is handed to every NBRTrainer.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train SLRC model with best hyperparams (see ./testing_slrc.ipynb):

In [None]:
# Trainer for the SLRC stage, capped at 20 epochs.
# NOTE(review): topk=10 presumably is the cutoff for the reported metrics, and
# early_stop_num=3 presumably stops after 3 dev evaluations without
# improvement — confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3
)

train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 220.61it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 219.72it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 199.37it/s]


In [None]:
# Best SLRC hyperparameters for this dataset (tuned in ./testing_slrc.ipynb,
# per the note above).
slrc_best_params = {
    "emb_size": 32,
    "batch_size": 256,
    "lr": 0.0006366285017414498,
    "l2_reg_coef": 0.09063752099202302,
}

# SLRC wraps a BPR base model; the dataset sizes and the average repeat
# interval are all read from `corpus`.
params = dict(
    model=SLRC(
        base_model_class=BPR,
        base_model_config=dict(
            emb_size=slrc_best_params["emb_size"],
            user_num=corpus.n_users,
            item_num=corpus.n_items,
            click_num=corpus.n_clicks,
        ),
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval,
    ),
    batch_size=slrc_best_params["batch_size"],
    lr=slrc_best_params["lr"],
    l2_reg_coef=slrc_best_params["l2_reg_coef"],
)

trainer.init_hyperparams(**params)

In [None]:
# Fit SLRC; the log below shows a dev-set evaluation after every epoch.
trainer.train()

Epoch 1:


Batch loss = 0.575686: 100%|██████████| 7530/7530 [02:11<00:00, 57.43it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:15<00:00, 31.39it/s]


 {'precision': 0.11904963937208317, 'recall': 0.182277700424532, 'ndcg': 0.1694145228041231}
Epoch 2:



Batch loss = 0.533279: 100%|██████████| 7530/7530 [02:14<00:00, 56.14it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:11<00:00, 33.18it/s]


 {'precision': 0.11714043275350022, 'recall': 0.18073043804384614, 'ndcg': 0.16877583293808682}
Epoch 3:



Batch loss = 0.498366: 100%|██████████| 7530/7530 [02:16<00:00, 55.17it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.48it/s]


 {'precision': 0.11709800593975392, 'recall': 0.18133129790695426, 'ndcg': 0.16794579661060294}
Epoch 4:



Batch loss = 0.459155: 100%|██████████| 7530/7530 [02:16<00:00, 55.09it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.59it/s]


 {'precision': 0.11646160373355961, 'recall': 0.1801680333166432, 'ndcg': 0.16614035622344148}





SLRC(
  (base_model): BPR(
    (user_emb): Embedding(2358, 32)
    (item_emb): Embedding(26756, 32)
  )
)

Save the users' embeddings for the validation step (a user's embedding at the validation step is their vector of recommendation scores over all items at validation time):

In [None]:
# Collect the trained model's predictions on the dev split; they are passed to
# NBRKNN as `user_emb` in the tuning objective below.
dev_user_emb = trainer.get_predictions(mode="dev")

100%|██████████| 2357/2357 [01:14<00:00, 31.70it/s]


Tune KNN hyperparams on validation dataset:

In [None]:
# Rebind `trainer` to a fresh NBRTrainer for the KNN stage. In the tuning cells
# only `init_hyperparams` and `evaluate` are called on it, which is presumably
# why the epoch and early-stopping settings are passed as None.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 221.38it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 223.89it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 195.26it/s]


In [None]:
def objective(trial):
    """Score one Optuna trial: build an NBRKNN and return its dev-set NDCG.

    Args:
        trial: Optuna trial from which `nearest_neighbors_num` (integer in
            1..200) and `alpha` (0.0..1.0 in steps of 0.05) are sampled.

    Returns:
        The "ndcg" value from `trainer.evaluate(mode="dev")`.
    """
    # Sample in the same order as before (neighbours first, then alpha).
    neighbors = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    mixing = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn_model = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors,
        alpha=mixing,
        user_emb=dev_user_emb,
    )

    # Install the candidate model on the shared trainer, then evaluate on dev.
    trainer.init_hyperparams(model=knn_model)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seeded TPE search that maximises the value returned by `objective`
# (dev-set NDCG) over 25 trials; the best parameters are read later from
# `study.best_params`.
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[32m[I 2023-04-23 09:44:10,283][0m A new study created in memory with name: no-name-dab2f14e-1955-4c2c-a7e3-0be41ae025c0[0m
100%|██████████| 2357/2357 [03:09<00:00, 12.42it/s]
[32m[I 2023-04-23 09:47:25,259][0m Trial 0 finished with value: 0.12505135057878455 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.12505135057878455.[0m
100%|██████████| 2357/2357 [02:48<00:00, 13.98it/s]
[32m[I 2023-04-23 09:50:17,249][0m Trial 1 finished with value: 0.17325260973753215 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.17325260973753215.[0m
100%|██████████| 2357/2357 [02:45<00:00, 14.27it/s]
[32m[I 2023-04-23 09:53:05,837][0m Trial 2 finished with value: 0.16500951841871095 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.17325260973753215.[0m
100%|██████████| 2357/2357 [02:35<00:00, 15.13it/s]
[32m[I 2023-04-23 09:55:47,644][0m Trial 3 finished with va

Test SLRCKNN (calculate scores for different seeds):

In [None]:
# One empty list per reported metric; the seed loop below appends one value per run.
test_metrics = {metric: [] for metric in ("precision", "recall", "ndcg")}

In [None]:
# Repeat the full pipeline (train SLRC -> predict on test -> evaluate NBRKNN)
# for five seeds and append each run's test metrics to `test_metrics`.
# The loop variable is deliberately not named `seed`: that notebook-level name
# seeds the Optuna sampler, and overwriting it here would make a later re-run
# of the tuning cell use the last loop value instead.
for run_seed in range(5):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Stage 1: train SLRC with the best hyperparameters found earlier.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": SLRC(
            base_model_class=BPR,
            base_model_config={
                "emb_size": slrc_best_params["emb_size"],
                "user_num": corpus.n_users,
                "item_num": corpus.n_items,
                "click_num": corpus.n_clicks
            },
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # The trained model's test-split predictions serve as the KNN user embeddings.
    test_user_emb = trainer.get_predictions(mode="test")

    # Stage 2: evaluate NBRKNN on the test split with the tuned KNN hyperparameters.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=test_user_emb
        )
    }

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    test_metrics["precision"].append(metrics["precision"])
    test_metrics["recall"].append(metrics["recall"])
    test_metrics["ndcg"].append(metrics["ndcg"])


___SEED___0
train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 224.96it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 207.57it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 210.88it/s]

Epoch 1:



Batch loss = 0.57551: 100%|██████████| 7530/7530 [02:17<00:00, 54.60it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.01it/s]


 {'precision': 0.118879932117098, 'recall': 0.18208650031932486, 'ndcg': 0.16937464958948809}
Epoch 2:



Batch loss = 0.532531: 100%|██████████| 7530/7530 [02:20<00:00, 53.64it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 31.93it/s]


 {'precision': 0.11726771319473907, 'recall': 0.182605322260548, 'ndcg': 0.16941483380978967}
Epoch 3:



Batch loss = 0.482401: 100%|██████████| 7530/7530 [02:17<00:00, 54.73it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:15<00:00, 31.03it/s]


 {'precision': 0.11697072549851507, 'recall': 0.18104452768462156, 'ndcg': 0.1678905745748544}
Epoch 4:



Batch loss = 0.478594: 100%|██████████| 7530/7530 [02:15<00:00, 55.39it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.53it/s]


 {'precision': 0.11646160373355961, 'recall': 0.18022523522667563, 'ndcg': 0.16613224587069464}
Epoch 5:



Batch loss = 0.449429: 100%|██████████| 7530/7530 [02:19<00:00, 54.11it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.20it/s]


 {'precision': 0.11641917691981331, 'recall': 0.17985273614361144, 'ndcg': 0.16548402706566653}



100%|██████████| 2357/2357 [01:12<00:00, 32.60it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 291.59it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 192.53it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:13<00:00, 172.61it/s]
100%|██████████| 2357/2357 [02:03<00:00, 19.06it/s]



___SEED___1
train dataset preparing...


100%|██████████| 2358/2358 [00:07<00:00, 296.34it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 193.46it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 193.99it/s]


Epoch 1:


Batch loss = 0.574728: 100%|██████████| 7530/7530 [02:17<00:00, 54.67it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:09<00:00, 33.73it/s]


 {'precision': 0.11909206618582946, 'recall': 0.1820022107620056, 'ndcg': 0.16958347360965687}
Epoch 2:



Batch loss = 0.52777: 100%|██████████| 7530/7530 [02:17<00:00, 54.66it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:14<00:00, 31.49it/s]


 {'precision': 0.11739499363597795, 'recall': 0.1820814317295862, 'ndcg': 0.1693973975085176}
Epoch 3:



Batch loss = 0.487147: 100%|██████████| 7530/7530 [02:16<00:00, 54.99it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.47it/s]


 {'precision': 0.11692829868476878, 'recall': 0.1802864774089352, 'ndcg': 0.167748478666449}
Epoch 4:



Batch loss = 0.467395: 100%|██████████| 7530/7530 [02:22<00:00, 52.91it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.45it/s]


 {'precision': 0.11629189647857446, 'recall': 0.17995990701576256, 'ndcg': 0.16594597091322003}



100%|██████████| 2357/2357 [01:11<00:00, 33.02it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:09<00:00, 244.32it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.14it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 190.50it/s]
100%|██████████| 2357/2357 [02:11<00:00, 17.94it/s]



___SEED___2
train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 274.63it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 191.22it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.20it/s]

Epoch 1:



Batch loss = 0.575429: 100%|██████████| 7530/7530 [02:22<00:00, 53.02it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:14<00:00, 31.66it/s]


 {'precision': 0.1189223589308443, 'recall': 0.18175430419669467, 'ndcg': 0.16961052771533622}
Epoch 2:



Batch loss = 0.535303: 100%|██████████| 7530/7530 [02:20<00:00, 53.73it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.51it/s]


 {'precision': 0.11714043275350022, 'recall': 0.18195169099661504, 'ndcg': 0.16933952001306773}
Epoch 3:



Batch loss = 0.484216: 100%|██████████| 7530/7530 [02:25<00:00, 51.91it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.16it/s]


 {'precision': 0.11667373780229105, 'recall': 0.18048165869995278, 'ndcg': 0.1673737234889153}
Epoch 4:



Batch loss = 0.46151: 100%|██████████| 7530/7530 [02:25<00:00, 51.74it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.06it/s]


 {'precision': 0.11646160373355961, 'recall': 0.18014316209181905, 'ndcg': 0.16597490172741558}



100%|██████████| 2357/2357 [01:13<00:00, 31.92it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 225.56it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.01it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 201.61it/s]
100%|██████████| 2357/2357 [02:12<00:00, 17.81it/s]



___SEED___3
train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 288.30it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.18it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.06it/s]

Epoch 1:



Batch loss = 0.575868: 100%|██████████| 7530/7530 [02:25<00:00, 51.92it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:15<00:00, 31.02it/s]


 {'precision': 0.1189647857445906, 'recall': 0.1820888415025857, 'ndcg': 0.1694849567757149}
Epoch 2:



Batch loss = 0.53325: 100%|██████████| 7530/7530 [02:22<00:00, 52.73it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.09it/s]


 {'precision': 0.11726771319473907, 'recall': 0.18168297205200976, 'ndcg': 0.16921977732694268}
Epoch 3:



Batch loss = 0.489312: 100%|██████████| 7530/7530 [02:27<00:00, 51.17it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 31.98it/s]


 {'precision': 0.11697072549851506, 'recall': 0.18080963720724855, 'ndcg': 0.16766592007451925}
Epoch 4:



Batch loss = 0.47149: 100%|██████████| 7530/7530 [02:25<00:00, 51.61it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.06it/s]


 {'precision': 0.11633432329232077, 'recall': 0.18054860062257394, 'ndcg': 0.1663662974681971}



100%|██████████| 2357/2357 [01:13<00:00, 32.00it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 280.50it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 190.99it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 190.10it/s]
100%|██████████| 2357/2357 [02:10<00:00, 18.08it/s]



___SEED___4
train dataset preparing...


100%|██████████| 2358/2358 [00:07<00:00, 296.45it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 192.20it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 188.59it/s]

Epoch 1:



Batch loss = 0.575093: 100%|██████████| 7530/7530 [02:25<00:00, 51.70it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:15<00:00, 31.06it/s]


 {'precision': 0.11904963937208317, 'recall': 0.1818893610416039, 'ndcg': 0.1694487064721814}
Epoch 2:



Batch loss = 0.52849: 100%|██████████| 7530/7530 [02:23<00:00, 52.34it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.49it/s]


 {'precision': 0.11743742044972423, 'recall': 0.1816663917449361, 'ndcg': 0.16937594774304177}
Epoch 3:



Batch loss = 0.492156: 100%|██████████| 7530/7530 [02:25<00:00, 51.64it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.39it/s]


 {'precision': 0.11701315231226135, 'recall': 0.18080669690286144, 'ndcg': 0.16788002873202354}
Epoch 4:



Batch loss = 0.449101: 100%|██████████| 7530/7530 [02:22<00:00, 52.66it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:14<00:00, 31.48it/s]


 {'precision': 0.11658888417479846, 'recall': 0.1798352159912159, 'ndcg': 0.16606146931135962}



100%|██████████| 2357/2357 [01:12<00:00, 32.47it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 221.23it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 214.23it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 198.08it/s]
100%|██████████| 2357/2357 [02:07<00:00, 18.55it/s]


In [None]:
# Average each test metric over the per-seed runs collected in `test_metrics`.
{
    metric: np.array(test_metrics[metric]).mean()
    for metric in ("precision", "recall", "ndcg")
}

{'precision': 0.12115400933389901,
 'recall': 0.17846917033774878,
 'ndcg': 0.1707951526383235}