In [1]:
import datetime as dt

import pandas as pd
import numpy as np
from lenskit.algorithms import item_knn, user_knn
from sklearn.model_selection import ParameterGrid

from recsys4daos.datasets import to_lenskit
from recsys4daos.model_selection import cvtt_open, explore_hparams
from recsys4daos.evaluation import test_with_hparams_lenskit

import paths

  from tqdm.autonotebook import tqdm


# Parameters

In [2]:
# Dataset config
ORG_NAME = 'Decentraland'
SPLITS_FREQ = 'W-THU'  # Split weekly
LAST_FOLDS = 20  # Use just last 10 splits
SPLITS_NORMALIZE = True

# Evaluation
K_RECOMMENDATIONS: list[int] = [1,3,5,10,15,100]
SPLITS_FREQ: str = 'W-THU' # split weekly
SPLITS_NORMALIZE = True # Wether or not to move everything to 00:00
LAST_FOLDS = 10 # Use just last 10 splits
LAST_FOLD_DATE_STR: str = None

# Search space config
WINDOW_SIZES = ['7d', '14d', '21d', '30d', '60d', '90d', '10YE']
ITEMKNN_Ks = [1,2,3,4,5,6,7,8,9,10,15]

OPTIM_METRIC = 'map@10'

In [3]:
# Parameters
EXECUTION_ID = "2024-09-04T10:00"
ORG_NAME = "HUWA-DAO"
SPLITS_FREQ = "2d"
LAST_FOLDS = 6
SPLITS_NORMALIZE = True
LAST_FOLD_DATE_STR = "2021-11-13"


# Load the dataset

In [4]:
dfp = paths.load_proposals(ORG_NAME)
dfv = paths.load_votes(ORG_NAME)

print(dfp.info())
print(dfv.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 631 non-null    object        
 1   author             631 non-null    object        
 2   date               631 non-null    datetime64[us]
 3   start              631 non-null    datetime64[us]
 4   end                631 non-null    datetime64[us]
 5   platform_proposal  631 non-null    object        
dtypes: datetime64[us](3), object(3)
memory usage: 29.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4151 entries, 0 to 4150
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        4151 non-null   object        
 1   proposal  4151 non-null   object        
 2   voter     4151 non-null   object        
 3   date      4151 non-null   datetime64[us]
dtypes: d

In [5]:
df = to_lenskit(dfv)
df

Unnamed: 0,user,item,timestamp,rating
0,0x7452e7d530078dbd6a2ed65007bea00a4d60f138,7f78eca6-a122-5074-ad86-268642177a4b,2021-07-22 11:44:39,1
1,0x2245be89fc8fab94ed982e859aa3212a4e4eb7e5,7f78eca6-a122-5074-ad86-268642177a4b,2021-07-22 11:48:19,1
2,0x5f527da3f5d3107423080165ed5452c94940c812,7f78eca6-a122-5074-ad86-268642177a4b,2021-07-22 11:48:38,1
3,0xd903839f391b169becc0a9d5aa98e26a06542cd0,7f78eca6-a122-5074-ad86-268642177a4b,2021-07-22 11:53:23,1
4,0xab7b49bacd43bd4cfa41433d477f690bb9e1fb26,7f78eca6-a122-5074-ad86-268642177a4b,2021-07-22 11:57:02,1
...,...,...,...,...
4146,0x20d801dbee0505f9a77cff40f5fed6ff0f0ee9d6,ccddcdaf-d162-54c1-9714-f10b876ff38b,2023-06-30 14:01:01,1
4147,0x20d801dbee0505f9a77cff40f5fed6ff0f0ee9d6,2ef1f8a7-ecd9-5064-b846-c1b2204d2faa,2023-06-30 14:01:29,1
4148,0x6404f1952d58a748d10e8747bd621714e873da0d,ccddcdaf-d162-54c1-9714-f10b876ff38b,2023-07-01 00:50:21,1
4149,0x6404f1952d58a748d10e8747bd621714e873da0d,7ada4be8-5590-58c0-b9b6-891cd28635ac,2023-07-01 00:50:56,1


## Split in folds

In [6]:
all_folds = { f.end:f for f in cvtt_open(
    df, SPLITS_FREQ, dfp.reset_index(), remove_not_in_train_col='item', col_item='item', last_fold=LAST_FOLD_DATE_STR,
)}
last_folds_idx = list(all_folds.keys())[-LAST_FOLDS:]
last_folds_idx

[Timestamp('2021-11-03 00:00:00'),
 Timestamp('2021-11-05 00:00:00'),
 Timestamp('2021-11-07 00:00:00'),
 Timestamp('2021-11-09 00:00:00'),
 Timestamp('2021-11-11 00:00:00'),
 Timestamp('2021-11-13 00:00:00')]

# Item-based KNN

In [7]:
def testHParamsItemKNN(fold, k: int, window_size=None):
    # Create algorithm
    algo = item_knn.ItemItem(
        feedback='implicit', # VERY IMPORTANT
        min_sim=0,
        # min_nbrs=0,
        nnbrs=k,  # the maximum number of neighbors for scoring each item (None for unlimited)
    )
    return test_with_hparams_lenskit(algo, all_folds[fold], K_RECOMMENDATIONS, window_size)

pd.Series(testHParamsItemKNN(last_folds_idx[-1],5,'14d'))

Numba is using threading layer omp - consider TBB


found 1 potential runtime problems - see https://boi.st/lkpy-perf


  b = blocks[bi]


fold_t             2021-11-13 00:00:00
time_train                    5.446426
time_rec                      0.258482
open_proposals                      80
min_recs                            79
avg_recs                     79.888889
precision@1                        0.0
precision@3                   0.037037
precision@5                   0.088889
precision@10                  0.066667
precision@15                  0.066667
precision@100                 0.011111
ndcg@1                             0.0
ndcg@3                        0.033333
ndcg@5                        0.119469
ndcg@10                       0.163284
ndcg@15                       0.210633
ndcg@100                      0.219548
map@1                              0.0
map@3                         0.037037
map@5                          0.12037
map@10                        0.151235
map@15                        0.174868
map@100                       0.179699
recall@1                           0.0
recall@3                 

## Exploring hparams

In [8]:
results = explore_hparams(
    testHParamsItemKNN, 
    ParameterGrid({
    'fold': last_folds_idx,
    'k': ITEMKNN_Ks, # ,20,25,30,35,40,45,50,60,70,80,90,100],
    'window_size': WINDOW_SIZES,
    }),
    paths.hparams_progress('itemknn', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE),
)
mdfi = pd.DataFrame(results)
mdfi

Restored checkpoint from ../.cache/HUWA-DAO/hparams-itemknn_2d_normalize.pkl with 462 results


  0%|          | 0/462 [00:00<?, ?it/s]

Unnamed: 0,fold,k,window_size,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
0,2021-11-03,1,7d,2021-11-03,0.001969,0.007703,39,36,37.666667,0.333333,...,0.088889,0.288889,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.132510
1,2021-11-03,1,14d,2021-11-03,0.001243,0.010819,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.132469
2,2021-11-03,1,21d,2021-11-03,0.001347,0.010705,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.130596
3,2021-11-03,1,30d,2021-11-03,0.015833,0.010744,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.130311
4,2021-11-03,1,60d,2021-11-03,0.001618,0.010741,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.130248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,2021-11-13,15,21d,2021-11-13,0.002456,0.020126,80,79,79.900000,0.000000,...,0.600000,0.850000,1.0,0.000000,,,,,,0.129825
458,2021-11-13,15,30d,2021-11-13,0.002419,0.020039,80,79,79.900000,0.000000,...,0.600000,0.850000,1.0,0.000000,,,,,,0.129452
459,2021-11-13,15,60d,2021-11-13,0.002513,0.020217,80,79,79.900000,0.000000,...,0.600000,0.850000,1.0,0.000000,,,,,,0.129717
460,2021-11-13,15,90d,2021-11-13,0.003073,0.020278,80,79,79.900000,0.000000,...,0.600000,0.850000,1.0,0.000000,,,,,,0.129587


### Best overall hparams

In [9]:
display_columns = ['time_train', 'avg_recs'] + [ c for c in mdfi.columns if c.endswith('@5') or c.endswith('@10') or c.endswith('@100') ]
overall_hparams = mdfi[mdfi['fold'] > last_folds_idx[0]].groupby(['window_size', 'k']).mean().sort_values(OPTIM_METRIC, ascending=False)
overall_hparams[display_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
window_size,k,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
7d,9,0.002740,133.161343,0.110691,0.078746,0.015825,0.097339,0.110118,0.142020,0.242850,0.254347,0.267679,0.419773,0.534907,0.799398,0.719298,0.771930,0.859649
7d,10,0.002154,133.161343,0.110691,0.078746,0.015825,0.097339,0.110118,0.142020,0.242850,0.254347,0.267679,0.419773,0.534907,0.799398,0.719298,0.771930,0.859649
7d,15,0.002666,133.161343,0.110691,0.078746,0.015825,0.097339,0.110118,0.142020,0.242850,0.254347,0.267679,0.419773,0.534907,0.799398,0.719298,0.771930,0.859649
7d,7,0.001831,133.161343,0.110691,0.078746,0.015825,0.097339,0.110118,0.141993,0.242850,0.254347,0.267649,0.419773,0.534907,0.799398,0.719298,0.771930,0.859649
7d,6,0.002841,133.161343,0.110691,0.078746,0.015825,0.097339,0.110118,0.141993,0.242850,0.254347,0.267649,0.419773,0.534907,0.799398,0.719298,0.771930,0.859649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90d,1,0.002429,133.180836,0.075109,0.060632,0.016738,0.057457,0.079984,0.145808,0.079585,0.090702,0.115031,0.193967,0.345101,0.844774,0.605263,0.657895,0.789474
30d,1,0.004540,133.180836,0.075109,0.060632,0.016338,0.057457,0.079984,0.144994,0.079585,0.090702,0.113948,0.193967,0.345101,0.839060,0.605263,0.657895,0.789474
60d,1,0.002276,133.180836,0.075109,0.060632,0.016338,0.057457,0.079984,0.144994,0.079585,0.090702,0.113948,0.193967,0.345101,0.839060,0.605263,0.657895,0.789474
21d,1,0.002859,133.180836,0.075109,0.060632,0.016338,0.057457,0.079984,0.144985,0.079585,0.090702,0.113866,0.193967,0.345101,0.839060,0.605263,0.657895,0.789474


Now let's see the behaviour in each fold

In [10]:
best_avg_hparams = mdfi.set_index(['window_size', 'k']).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', 'window_size', 'k'])
paths.save_model_results(best_avg_hparams, 'itemknn-best-avg', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_avg_hparams

Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/itemknn-best-avg_2d_normalize.parquet


  best_avg_hparams = mdfi.set_index(['window_size', 'k']).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', 'window_size', 'k'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
fold,window_size,k,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2021-11-03,7d,9,2021-11-03,0.002082,0.007001,39,36,37.666667,0.333333,0.333333,0.333333,0.266667,...,0.088889,0.288889,1.0,0.033333,0.1,0.166667,0.266667,0.366667,0.8,0.12981
2021-11-05,7d,9,2021-11-05,0.001626,0.013442,217,195,209.6,0.2,0.2,0.16,0.16,...,0.363158,0.394737,0.596992,0.052632,0.105263,0.157895,0.315789,0.473684,0.578947,0.140541
2021-11-07,7d,9,2021-11-07,0.002392,0.151459,167,156,166.388535,0.006369,0.004246,0.002548,0.001911,...,0.015924,0.199045,0.411359,1.0,1.0,1.0,1.0,1.0,1.0,0.484553
2021-11-09,7d,9,2021-11-09,0.002091,0.02918,118,110,116.181818,0.0,0.015152,0.027273,0.040909,...,0.386364,0.590909,0.988636,0.0,,,,,,0.151462
2021-11-11,7d,9,2021-11-11,0.001813,0.017928,94,93,93.636364,0.0,0.090909,0.163636,0.090909,...,0.909091,0.954545,1.0,0.0,,,,,,0.132014
2021-11-13,7d,9,2021-11-13,0.005779,0.009458,80,80,80.0,0.6,0.333333,0.2,0.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.130642


### Best hparams by fold

This are the parameters used to check which model is the best (validation)

In [11]:
print("Best hyperparams by fold", OPTIM_METRIC)
best_hparams = mdfi.sort_values(OPTIM_METRIC,ascending=False).drop_duplicates(['fold'], keep='first').sort_values('fold').set_index(['fold', 'k', 'window_size'])
paths.save_model_results(best_hparams, 'itemknn-best-val', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_hparams[display_columns]

Best hyperparams by fold map@10
Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/itemknn-best-val_2d_normalize.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
fold,k,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-11-03,10,7d,0.001634,37.666667,0.333333,0.266667,0.11,0.1,0.085798,0.14269,0.333333,0.254034,0.311671,0.055556,0.088889,1.0,0.166667,0.266667,0.8
2021-11-05,3,10YE,0.001938,209.6,0.2,0.2,0.044,0.045524,0.068018,0.077333,0.162,0.184722,0.179289,0.142105,0.573684,0.825564,0.210526,0.368421,0.578947
2021-11-07,4,10YE,0.002692,166.358025,0.003704,0.001852,0.005,0.009989,0.009989,0.080933,0.010494,0.010494,0.029668,0.018519,0.018519,0.407922,1.0,1.0,1.0
2021-11-09,7,7d,0.002153,116.181818,0.027273,0.040909,0.013182,0.031488,0.073655,0.144164,0.043182,0.082143,0.115418,0.136364,0.386364,0.988636,,,
2021-11-11,9,7d,0.001813,93.636364,0.163636,0.090909,0.010909,0.121103,0.130663,0.139851,0.218182,0.229545,0.237662,0.818182,0.909091,1.0,,,
2021-11-13,8,7d,0.015106,80.0,0.2,0.1,0.01,0.284124,0.284124,0.284124,0.8,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
best_hparams.tail(len(best_hparams)-1).describe()

Unnamed: 0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
count,5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,3.0,3.0,3.0,3.0,3.0,5.0
mean,2021-11-09 00:00:00,0.00474,0.045661,135.2,126.4,133.155241,0.161235,0.128702,0.118923,0.086734,...,0.577531,0.667236,0.844424,0.410526,0.701754,0.736842,0.789474,0.824561,0.859649,0.209302
min,2021-11-05 00:00:00,0.001813,0.009493,80.0,80.0,80.0,0.0,0.004115,0.003704,0.001852,...,0.018519,0.195988,0.407922,0.0,0.105263,0.210526,0.368421,0.473684,0.578947,0.13108
25%,2021-11-07 00:00:00,0.001938,0.014082,94.0,93.0,93.636364,0.0,0.015152,0.027273,0.040909,...,0.386364,0.590909,0.825564,0.0,0.552632,0.605263,0.684211,0.736842,0.789474,0.132014
50%,2021-11-09 00:00:00,0.002153,0.017928,118.0,110.0,116.181818,0.006173,0.090909,0.163636,0.090909,...,0.573684,0.594737,0.988636,0.052632,1.0,1.0,1.0,1.0,1.0,0.140395
75%,2021-11-11 00:00:00,0.002692,0.029209,167.0,154.0,166.358025,0.2,0.2,0.2,0.1,...,0.909091,0.954545,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.150769
max,2021-11-13 00:00:00,0.015106,0.157593,217.0,195.0,209.6,0.6,0.333333,0.2,0.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.492253
std,,0.005804,0.062996,56.442006,47.447866,53.901634,0.259834,0.138565,0.095945,0.074712,...,0.399208,0.32651,0.254997,0.538542,0.516577,0.455803,0.364642,0.303869,0.243095,0.158373


### Results of using best hparams in next fold

Kind of like the cvtt from [the LightGCN notebook](./11_microsoft_tuning.ipynb).

In [13]:
next_hparams_idx = best_hparams.index.to_frame(False)
next_hparams_idx['fold'] = next_hparams_idx['fold'].shift(-1)
next_hparams_idx = next_hparams_idx.dropna()
assert len(next_hparams_idx) == len(best_hparams)-1
next_hparams = mdfi.set_index(['fold', 'k', 'window_size']).loc[pd.MultiIndex.from_frame(next_hparams_idx)]
paths.save_model_results(next_hparams, 'itemknn-best-test', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
next_hparams[display_columns]

Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/itemknn-best-test_2d_normalize.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
fold,k,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-11-05,10,7d,0.001693,209.6,0.16,0.16,0.04,0.041908,0.053197,0.062838,0.143333,0.150175,0.155517,0.131579,0.363158,0.596992,0.157895,0.315789,0.578947
2021-11-07,3,10YE,0.003914,166.358025,0.003704,0.001852,0.005,0.009989,0.009989,0.080933,0.010494,0.010494,0.029668,0.018519,0.018519,0.407922,1.0,1.0,1.0
2021-11-09,4,10YE,0.002625,116.346154,0.015385,0.030769,0.012692,0.023078,0.064748,0.158099,0.028846,0.060897,0.096452,0.076923,0.288462,0.990385,,,
2021-11-11,7,7d,0.001862,93.636364,0.163636,0.090909,0.010909,0.121103,0.130663,0.139851,0.218182,0.229545,0.237662,0.818182,0.909091,1.0,,,
2021-11-13,9,7d,0.005779,80.0,0.2,0.1,0.01,0.284124,0.284124,0.284124,0.8,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
next_hparams.describe()

Unnamed: 0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
count,5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,3.0,3.0,3.0,3.0,3.0,5.0
mean,2021-11-09 00:00:00,0.003175,0.04695,135.2,126.4,133.188108,0.161235,0.128236,0.108545,0.076706,...,0.515846,0.589823,0.79906,0.410526,0.701754,0.719298,0.77193,0.824561,0.859649,0.21077
min,2021-11-05 00:00:00,0.001693,0.009458,80.0,80.0,80.0,0.0,0.004115,0.003704,0.001852,...,0.018519,0.195988,0.407922,0.0,0.105263,0.157895,0.315789,0.473684,0.578947,0.130642
25%,2021-11-07 00:00:00,0.001862,0.013392,94.0,93.0,93.636364,0.0,0.012821,0.015385,0.030769,...,0.288462,0.394737,0.596992,0.0,0.552632,0.578947,0.657895,0.736842,0.789474,0.132383
50%,2021-11-09 00:00:00,0.002625,0.018106,118.0,110.0,116.346154,0.006173,0.090909,0.16,0.090909,...,0.363158,0.403846,0.990385,0.052632,1.0,1.0,1.0,1.0,1.0,0.140972
75%,2021-11-11 00:00:00,0.003914,0.035536,167.0,154.0,166.358025,0.2,0.2,0.163636,0.1,...,0.909091,0.954545,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.156706
max,2021-11-13 00:00:00,0.005779,0.158259,217.0,195.0,209.6,0.6,0.333333,0.2,0.16,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.493147
std,,0.001699,0.063015,56.442006,47.447866,53.888745,0.259834,0.139046,0.091811,0.062039,...,0.421722,0.363669,0.278917,0.538542,0.516577,0.48619,0.395029,0.303869,0.243095,0.15819


# User-based KNN

In [15]:
def testHParamsUserKNN(fold, k: int, window_size=None):
    # Create algorithm
    algo = user_knn.UserUser(
        feedback='implicit', # VERY IMPORTANT
        min_sim=0,
        # min_nbrs=0,
        nnbrs=k,  # the maximum number of neighbors for scoring each item (None for unlimited)
    )
    return test_with_hparams_lenskit(algo, all_folds[fold], K_RECOMMENDATIONS, window_size)

pd.Series(testHParamsUserKNN(last_folds_idx[-1],5,'14d'))

fold_t             2021-11-13 00:00:00
time_train                    0.595836
time_rec                      0.841153
open_proposals                      80
min_recs                            79
avg_recs                     79.888889
precision@1                   0.777778
precision@3                   0.333333
precision@5                        0.2
precision@10                       0.1
precision@15                  0.066667
precision@100                 0.011111
ndcg@1                        0.466667
ndcg@3                        0.534519
ndcg@5                        0.534519
ndcg@10                       0.534519
ndcg@15                       0.534519
ndcg@100                      0.544322
map@1                         0.777778
map@3                         0.861111
map@5                         0.861111
map@10                        0.861111
map@15                        0.861111
map@100                       0.867647
recall@1                      0.777778
recall@3                 

## Exploring hparams

In [16]:
results = explore_hparams(
    testHParamsUserKNN, 
    ParameterGrid({
        'fold': last_folds_idx,
        'k': ITEMKNN_Ks, # ,20,25,30,35,40,45,50,60,70,80,90,100],
        'window_size': WINDOW_SIZES,
    }),
    paths.hparams_progress('userknn', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE),
)
mdfu = pd.DataFrame(results)
mdfu

Restored checkpoint from ../.cache/HUWA-DAO/hparams-userknn_2d_normalize.pkl with 462 results


  0%|          | 0/462 [00:00<?, ?it/s]

Unnamed: 0,fold,k,window_size,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
0,2021-11-03,1,7d,2021-11-03,0.000524,0.006353,39,36,37.666667,0.333333,...,0.088889,0.288889,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.800000,0.133342
1,2021-11-03,1,14d,2021-11-03,0.000498,0.007059,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.800000,0.132121
2,2021-11-03,1,21d,2021-11-03,0.000528,0.007061,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.800000,0.131906
3,2021-11-03,1,30d,2021-11-03,0.000541,0.007024,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.800000,0.131901
4,2021-11-03,1,60d,2021-11-03,0.000514,0.007022,39,36,37.500000,0.166667,...,0.044444,0.200000,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.800000,0.131494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,2021-11-13,15,21d,2021-11-13,0.000988,0.014947,80,79,79.900000,0.600000,...,0.950000,0.950000,1.0,1.000000,0.928571,0.928571,0.928571,0.928571,0.928571,0.135808
458,2021-11-13,15,30d,2021-11-13,0.000992,0.014852,80,79,79.900000,0.600000,...,0.950000,0.950000,1.0,1.000000,0.928571,0.928571,0.928571,0.928571,0.928571,0.136110
459,2021-11-13,15,60d,2021-11-13,0.000997,0.014765,80,79,79.900000,0.600000,...,0.950000,0.950000,1.0,1.000000,0.928571,0.928571,0.928571,0.928571,0.928571,0.135666
460,2021-11-13,15,90d,2021-11-13,0.001007,0.014853,80,79,79.900000,0.600000,...,0.950000,0.950000,1.0,1.000000,0.928571,0.928571,0.928571,0.928571,0.928571,0.135547


### Best overall hparams

In [17]:
display_columns = ['time_train', 'avg_recs'] + [ c for c in mdfu.columns if c.endswith('@5') or c.endswith('@10') or c.endswith('@100') ]
overall_hparams = mdfu[mdfu['fold'] > last_folds_idx[0]].groupby(['window_size', 'k']).mean().sort_values(OPTIM_METRIC, ascending=False)
overall_hparams[display_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
window_size,k,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
7d,4,0.000733,133.161343,0.144146,0.080746,0.015825,0.175491,0.179624,0.206713,0.478552,0.474353,0.489499,0.527939,0.560226,0.799398,0.773465,0.786623,0.826096
7d,10,0.000731,133.161343,0.144146,0.080746,0.015825,0.175491,0.179624,0.206713,0.478552,0.474353,0.489499,0.527939,0.560226,0.799398,0.773465,0.786623,0.826096
7d,15,0.000725,133.161343,0.144146,0.080746,0.015825,0.175491,0.179624,0.206713,0.478552,0.474353,0.489499,0.527939,0.560226,0.799398,0.773465,0.786623,0.826096
7d,5,0.000759,133.161343,0.144146,0.080746,0.015825,0.175491,0.179624,0.206713,0.478552,0.474353,0.489499,0.527939,0.560226,0.799398,0.773465,0.786623,0.826096
7d,7,0.000729,133.161343,0.144146,0.080746,0.015825,0.175491,0.179624,0.206713,0.478552,0.474353,0.489499,0.527939,0.560226,0.799398,0.773465,0.786623,0.826096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14d,1,0.000966,133.188613,0.114719,0.072197,0.016411,0.166733,0.178445,0.216309,0.267745,0.272808,0.290849,0.437073,0.501967,0.839060,0.678830,0.699883,0.731462
10YE,1,0.001072,133.180836,0.107664,0.075678,0.016738,0.182671,0.200255,0.238129,0.259414,0.269492,0.287643,0.422273,0.542253,0.844774,0.723538,0.749854,0.789327
30d,1,0.000972,133.180836,0.107664,0.072447,0.016338,0.183723,0.200092,0.237445,0.261722,0.268462,0.285068,0.421656,0.509328,0.839060,0.678830,0.699883,0.731462
60d,1,0.000986,133.180836,0.107664,0.072447,0.016338,0.183723,0.200092,0.237445,0.261722,0.268462,0.285068,0.421656,0.509328,0.839060,0.678830,0.699883,0.731462


In [18]:
best_avg_hparams = mdfu.set_index(['window_size', 'k']).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', 'window_size', 'k'])
paths.save_model_results(best_avg_hparams, 'userknn-best-avg', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_avg_hparams

Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/userknn-best-avg_2d_normalize.parquet


  best_avg_hparams = mdfu.set_index(['window_size', 'k']).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', 'window_size', 'k'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
fold,window_size,k,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2021-11-03,7d,4,2021-11-03,0.000507,0.004733,39,36,37.666667,0.333333,0.222222,0.266667,0.266667,...,0.088889,0.288889,1.0,0.033333,0.066667,0.133333,0.266667,0.366667,0.8,0.129997
2021-11-05,7d,4,2021-11-05,0.000698,0.009074,217,195,209.6,0.0,0.133333,0.2,0.12,...,0.152632,0.384211,0.596992,0.0,0.105263,0.210526,0.263158,0.421053,0.421053,0.134273
2021-11-07,7d,4,2021-11-07,0.000937,0.120352,167,156,166.388535,0.0,0.004246,0.002548,0.001911,...,0.015924,0.199045,0.411359,0.0,,,,,,0.491431
2021-11-09,7d,4,2021-11-09,0.000896,0.02221,118,110,116.181818,0.227273,0.181818,0.118182,0.081818,...,0.67803,0.768939,0.988636,0.866667,0.933333,0.933333,0.933333,0.933333,0.933333,0.159277
2021-11-11,7d,4,2021-11-11,0.000778,0.012896,94,93,93.636364,0.909091,0.333333,0.2,0.1,...,0.954545,0.954545,1.0,0.95,0.95,0.95,0.95,0.95,0.95,0.139514
2021-11-13,7d,4,2021-11-13,0.000357,0.006062,80,80,80.0,1.0,0.333333,0.2,0.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.12955


### Best hparams by fold

In [19]:
print("Best hyperparams by fold", OPTIM_METRIC)
best_hparams = mdfu.sort_values(OPTIM_METRIC,ascending=False).drop_duplicates(['fold'], keep='first').sort_values('fold').set_index(['fold', 'k', 'window_size'])
paths.save_model_results(best_hparams, 'userknn-best-val', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_hparams[display_columns]

Best hyperparams by fold map@10
Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/userknn-best-val_2d_normalize.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
fold,k,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-11-03,1,7d,0.000524,37.666667,0.266667,0.266667,0.11,0.083042,0.08237,0.14099,0.236667,0.230516,0.303832,0.044444,0.088889,1.0,0.133333,0.266667,0.8
2021-11-05,4,90d,0.000739,209.6,0.2,0.14,0.044,0.05936,0.05753,0.081889,0.201667,0.186667,0.210049,0.331579,0.352632,0.825564,0.157895,0.263158,0.421053
2021-11-07,3,90d,0.000987,166.358025,0.003704,0.001852,0.005,0.008722,0.008722,0.07973,0.00823,0.00823,0.027448,0.018519,0.018519,0.407922,,,
2021-11-09,5,7d,0.000899,116.181818,0.118182,0.081818,0.013182,0.18956,0.216172,0.252543,0.344697,0.366522,0.384314,0.530303,0.67803,0.988636,0.933333,0.933333,0.933333
2021-11-11,15,7d,0.000729,93.636364,0.2,0.1,0.010909,0.310427,0.310427,0.314972,0.909091,0.909091,0.914773,0.954545,0.954545,1.0,0.95,0.95,0.95
2021-11-13,2,7d,0.00036,80.0,0.2,0.1,0.01,0.333333,0.333333,0.333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
best_hparams.tail(len(best_hparams)-1).describe()[display_columns]

Unnamed: 0,time_train,avg_recs,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0
mean,0.000743,133.155241,0.144377,0.084734,0.016618,0.18028,0.185237,0.212493,0.492737,0.494102,0.507317,0.566989,0.600745,0.844424,0.760307,0.786623,0.826096
min,0.00036,80.0,0.003704,0.001852,0.005,0.008722,0.008722,0.07973,0.00823,0.00823,0.027448,0.018519,0.018519,0.407922,0.157895,0.263158,0.421053
25%,0.000729,93.636364,0.118182,0.081818,0.01,0.05936,0.05753,0.081889,0.201667,0.186667,0.210049,0.331579,0.352632,0.825564,0.739474,0.765789,0.805263
50%,0.000739,116.181818,0.2,0.1,0.010909,0.18956,0.216172,0.252543,0.344697,0.366522,0.384314,0.530303,0.67803,0.988636,0.941667,0.941667,0.941667
75%,0.000899,166.358025,0.2,0.1,0.013182,0.310427,0.310427,0.314972,0.909091,0.909091,0.914773,0.954545,0.954545,1.0,0.9625,0.9625,0.9625
max,0.000987,209.6,0.2,0.14,0.044,0.333333,0.333333,0.333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,0.00024,53.901634,0.086251,0.050986,0.015596,0.145347,0.146653,0.123887,0.439332,0.440174,0.43085,0.416918,0.415662,0.254997,0.402606,0.350124,0.271511


### Results of using best hparams in next fold

Kind of like the cvtt from [the LightGCN notebook](./11_microsoft_tuning.ipynb)

In [21]:
next_hparams_idx = best_hparams.index.to_frame(False)
next_hparams_idx['fold'] = next_hparams_idx['fold'].shift(-1)
next_hparams_idx = next_hparams_idx.dropna()
assert len(next_hparams_idx) == len(best_hparams)-1
next_hparams = mdfu.set_index(['fold', 'k', 'window_size']).loc[pd.MultiIndex.from_frame(next_hparams_idx)]
paths.save_model_results(next_hparams, 'userknn-best-test', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
next_hparams

Saved dataframe into /home/daviddavo/recsys4daos/data/output/HUWA-DAO/models/userknn-best-test_2d_normalize.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
fold,k,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2021-11-05,1,7d,2021-11-05,0.000686,0.009076,217,195,209.6,0.2,0.133333,0.16,0.14,...,0.163158,0.373684,0.596992,0.052632,0.105263,0.157895,0.315789,0.368421,0.368421,0.141221
2021-11-07,4,90d,2021-11-07,0.001043,0.12364,167,154,166.358025,0.0,0.006173,0.003704,0.001852,...,0.018519,0.195988,0.407922,0.0,,,,,,0.487545
2021-11-09,3,90d,2021-11-09,0.00104,0.026774,118,110,116.346154,0.153846,0.141026,0.115385,0.065385,...,0.564103,0.727564,0.990385,0.833333,0.916667,0.916667,0.916667,0.916667,0.916667,0.164971
2021-11-11,5,7d,2021-11-11,0.000748,0.012614,94,93,93.636364,0.909091,0.333333,0.2,0.1,...,0.954545,0.954545,1.0,0.95,0.95,0.95,0.95,0.95,0.95,0.137531
2021-11-13,15,7d,2021-11-13,0.000363,0.006062,80,80,80.0,1.0,0.333333,0.2,0.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.130174


In [22]:
next_hparams.describe()

Unnamed: 0,fold_t,time_train,time_rec,open_proposals,min_recs,avg_recs,precision@1,precision@3,precision@5,precision@10,...,recall@10,recall@15,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@15,r-precision@100,time_eval
count,5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0
mean,2021-11-09 00:00:00,0.000776,0.035633,135.2,126.4,133.188108,0.452587,0.18944,0.135818,0.081447,...,0.540065,0.650356,0.79906,0.567193,0.742982,0.75614,0.795614,0.808772,0.808772,0.212288
min,2021-11-05 00:00:00,0.000363,0.006062,80.0,80.0,80.0,0.0,0.006173,0.003704,0.001852,...,0.018519,0.195988,0.407922,0.0,0.105263,0.157895,0.315789,0.368421,0.368421,0.130174
25%,2021-11-07 00:00:00,0.000686,0.009076,94.0,93.0,93.636364,0.153846,0.133333,0.115385,0.065385,...,0.163158,0.373684,0.596992,0.052632,0.713816,0.726974,0.766447,0.779605,0.779605,0.137531
50%,2021-11-09 00:00:00,0.000748,0.012614,118.0,110.0,116.346154,0.2,0.141026,0.16,0.1,...,0.564103,0.727564,0.990385,0.833333,0.933333,0.933333,0.933333,0.933333,0.933333,0.141221
75%,2021-11-11 00:00:00,0.00104,0.026774,167.0,154.0,166.358025,0.909091,0.333333,0.2,0.1,...,0.954545,0.954545,1.0,0.95,0.9625,0.9625,0.9625,0.9625,0.9625,0.164971
max,2021-11-13 00:00:00,0.001043,0.12364,217.0,195.0,209.6,1.0,0.333333,0.2,0.14,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.487545
std,,0.000283,0.049833,56.442006,47.447866,53.888745,0.465279,0.141853,0.081693,0.051745,...,0.446649,0.354874,0.278917,0.497789,0.426523,0.400298,0.321711,0.295558,0.295558,0.154424
