In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid

from recsys4daos.models import NLPSimilarity
from recsys4daos.utils import Timer
from recsys4daos.datasets import to_microsoft
from recsys4daos.evaluation import calculate_all_metrics
from recsys4daos.model_selection import cvtt_open, explore_hparams

import paths

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from tqdm.autonotebook import tqdm, trange


# Parameters

In [2]:
# Dataset config
ORG_NAME = 'Decentraland'

# Evaluation
K_RECOMMENDATIONS: list[int] = [1,3,5,10,100]
SPLITS_FREQ: str = 'W-THU' # split weekly
SPLITS_NORMALIZE = True # Wether or not to move everything to 00:00
LAST_FOLDS = 10 # Use just last 10 splits
LAST_FOLD_DATE_STR: str = None

# Search space config
WINDOW_SIZES = ['7d', '14d', '21d', '30d', '60d', '90d', '10YE']
MODEL_NAMES = [
    'all-mpnet-base-v2', # The one used in sbert's examples
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'all-MiniLM-L6-v2',
]

OPTIM_METRIC = 'map@10'

In [3]:
# Parameters
EXECUTION_ID = "2024-07-03"
ORG_NAME = "Plaza"
SPLITS_FREQ = "3d"
LAST_FOLDS = 20
SPLITS_NORMALIZE = True
LAST_FOLD_DATE_STR = "2022-06-29"


In [4]:
EMBEDDINGS_CACHE = Path(paths.DEFAULT_CACHE_PATH) / 'pln-embeddings'

# Load the dataset

In [5]:
dfp = paths.load_proposals(ORG_NAME, text=True)
dfv = paths.load_votes(ORG_NAME)

print(dfp.info())
print(dfv.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419 entries, 0 to 418
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 419 non-null    object        
 1   author             419 non-null    object        
 2   date               419 non-null    datetime64[us]
 3   start              419 non-null    datetime64[us]
 4   end                416 non-null    datetime64[us]
 5   platform_proposal  419 non-null    object        
 6   title              419 non-null    object        
 7   description        357 non-null    object        
dtypes: datetime64[us](3), object(5)
memory usage: 26.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        534 non-null    object        
 1   proposal  534 non-null    object  

In [6]:
df = to_microsoft(dfv)
df

Unnamed: 0,userID,itemID,timestamp,rating
0,0xd1629474d25a63b1018fcc965e1d218a00f6cbd3,93673fce-71b4-5182-b6ec-e178735412c6,2021-08-06 00:09:50,1
1,0xd1629474d25a63b1018fcc965e1d218a00f6cbd3,28d70b46-cb6e-523d-8a61-61e9ef4ba177,2021-08-07 13:33:45,1
2,0xd1629474d25a63b1018fcc965e1d218a00f6cbd3,3978518b-82b8-5391-b005-f901adc496e5,2021-08-08 12:20:05,1
3,0xd1629474d25a63b1018fcc965e1d218a00f6cbd3,5e18612e-cb1f-5d5c-b919-ae7b22b61913,2021-08-08 20:04:05,1
4,0xd1629474d25a63b1018fcc965e1d218a00f6cbd3,07b44ec6-c87b-5fb7-9661-0a0204b69f58,2021-08-08 20:04:15,1
...,...,...,...,...
529,0x1e9c89aff77215f3ad26bffe0c50d4fdeba6a352,fbfb0022-9f70-5ced-8d91-e907dd141a21,2023-04-25 17:07:30,1
530,0x1e9c89aff77215f3ad26bffe0c50d4fdeba6a352,d761260c-ff6b-568c-b1b7-d0715469b834,2023-04-25 17:08:15,1
531,0x1e9c89aff77215f3ad26bffe0c50d4fdeba6a352,b354250b-af5b-5caf-ba86-914269d2a25d,2023-06-21 10:10:30,1
532,0x1e9c89aff77215f3ad26bffe0c50d4fdeba6a352,b6a2a953-6beb-511f-8c73-8fe1385282a5,2023-07-19 01:34:15,1


## Split in folds

In [7]:
all_folds = { f.end:f for f in cvtt_open(
    df, SPLITS_FREQ, dfp.reset_index(), remove_not_in_train_col='userID', last_fold=LAST_FOLD_DATE_STR,
)}
last_folds_idx = list(all_folds.keys())[-LAST_FOLDS:]
last_folds_idx

[Timestamp('2022-05-03 00:00:00'),
 Timestamp('2022-05-06 00:00:00'),
 Timestamp('2022-05-09 00:00:00'),
 Timestamp('2022-05-12 00:00:00'),
 Timestamp('2022-05-15 00:00:00'),
 Timestamp('2022-05-18 00:00:00'),
 Timestamp('2022-05-21 00:00:00'),
 Timestamp('2022-05-24 00:00:00'),
 Timestamp('2022-05-27 00:00:00'),
 Timestamp('2022-05-30 00:00:00'),
 Timestamp('2022-06-02 00:00:00'),
 Timestamp('2022-06-05 00:00:00'),
 Timestamp('2022-06-08 00:00:00'),
 Timestamp('2022-06-11 00:00:00'),
 Timestamp('2022-06-14 00:00:00'),
 Timestamp('2022-06-17 00:00:00'),
 Timestamp('2022-06-20 00:00:00'),
 Timestamp('2022-06-23 00:00:00'),
 Timestamp('2022-06-26 00:00:00'),
 Timestamp('2022-06-29 00:00:00')]

# Similarity PLN

In [8]:
def testHParamsPLNSimilarity(fold, window_size, model_name):
    f = all_folds[fold]
    
    model = NLPSimilarity(dfp, EMBEDDINGS_CACHE, model_name=model_name)

    offset = pd.tseries.frequencies.to_offset(window_size)
    train = f.train[f.train['timestamp'] > (f.end - offset)]

    with Timer() as t_fit:
        model.fit(train)

    with Timer() as t_rec:
        # Truncate the top_k
        top_k = max(K_RECOMMENDATIONS)
        users = np.intersect1d(f.test['userID'].unique(), train['userID'].unique())
        recs = model.recommend_k_items(users, top_k=min(len(f.open_proposals), top_k), recommend_from=f.open_proposals)
    
    return {
        'time_train': t_fit.time,
        'time_rec': t_rec.time,
        'rec_users': len(users),
        **calculate_all_metrics(f.test, recs, K_RECOMMENDATIONS),
    }

testHParamsPLNSimilarity(last_folds_idx[0], '14D', MODEL_NAMES[0])

Some embeddings need to be calculated


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

{'time_train': 1.0364137890283018,
 'time_rec': 0.0007212619530037045,
 'rec_users': 0,
 'precision@1': 0.0,
 'precision@3': 0.0,
 'precision@5': 0.0,
 'precision@10': 0.0,
 'precision@100': 0.0,
 'ndcg@1': 0.0,
 'ndcg@3': 0.0,
 'ndcg@5': 0.0,
 'ndcg@10': 0.0,
 'ndcg@100': 0.0,
 'map@1': 0.0,
 'map@3': 0.0,
 'map@5': 0.0,
 'map@10': 0.0,
 'map@100': 0.0,
 'recall@1': 0.0,
 'recall@3': 0.0,
 'recall@5': 0.0,
 'recall@10': 0.0,
 'recall@100': 0.0,
 'r-precision@1': 0.0,
 'r-precision@3': 0.0,
 'r-precision@5': 0.0,
 'r-precision@10': 0.0,
 'r-precision@100': 0.0,
 'time_eval': 0.05688643001485616}

In [9]:
# Populate the cache of embeddings for each model
from recsys4daos.models.nlp import NLPModel

for m in MODEL_NAMES:
    NLPModel(dfp, EMBEDDINGS_CACHE, model_name=m).fit()

Some embeddings need to be calculated


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Some embeddings need to be calculated


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Some embeddings need to be calculated


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [10]:
results = explore_hparams(
    testHParamsPLNSimilarity,
    ParameterGrid({
        'fold': last_folds_idx,
        'model_name': MODEL_NAMES,
        'window_size': WINDOW_SIZES,
    }),
    paths.hparams_progress('pln-similarity', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE),
)
mdf = pd.DataFrame(results)
mdf

  0%|          | 0/560 [00:00<?, ?it/s]

[2024-07-04T08:36:05.741791] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:37:07.141318] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:38:08.448474] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:39:09.805902] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:40:10.906968] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:41:11.825074] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:42:12.849878] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:43:14.355133] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:44:15.081355] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:45:15.671266] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:46:16.482330] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:47:19.851447] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


[2024-07-04T08:48:20.922195] Saving checkpoint at ../.cache/Plaza/hparams-pln-similarity_3d_normalize.pkl


Unnamed: 0,fold,model_name,window_size,time_train,time_rec,rec_users,precision@1,precision@3,precision@5,precision@10,...,recall@3,recall@5,recall@10,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@100,time_eval
0,2022-05-03,all-mpnet-base-v2,7d,0.226868,0.000334,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057165
1,2022-05-03,all-mpnet-base-v2,14d,0.070915,0.000320,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056403
2,2022-05-03,all-mpnet-base-v2,21d,0.224365,0.000303,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055794
3,2022-05-03,all-mpnet-base-v2,30d,0.228002,0.000302,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055136
4,2022-05-03,all-mpnet-base-v2,60d,0.070849,0.000314,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,2022-06-29,all-MiniLM-L6-v2,21d,0.060009,0.000291,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056390
556,2022-06-29,all-MiniLM-L6-v2,30d,0.221873,0.000288,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055298
557,2022-06-29,all-MiniLM-L6-v2,60d,0.059907,0.000304,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055377
558,2022-06-29,all-MiniLM-L6-v2,90d,0.227342,0.000293,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055334


## Best overall hparams

In [11]:
display_columns = ['time_train'] + [ c for c in mdf.columns if c.endswith('@5') or c.endswith('@10') or c.endswith('@100') ]
hparam_cols = ['window_size', 'model_name']
overall_hparams = mdf[mdf['fold'] > last_folds_idx[0]].groupby(hparam_cols).mean().sort_values(OPTIM_METRIC, ascending=False)
overall_hparams[display_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,time_train,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
window_size,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
14d,all-distilroberta-v1,0.174815,0.142105,0.071053,0.007105,0.439501,0.439501,0.439501,0.404971,0.404971,0.404971,0.526316,0.526316,0.526316,0.444444,0.444444,0.444444
21d,all-distilroberta-v1,0.141096,0.142105,0.071053,0.007105,0.416427,0.416427,0.416427,0.374269,0.374269,0.374269,0.526316,0.526316,0.526316,0.411765,0.411765,0.411765
30d,all-mpnet-base-v2,0.158319,0.142105,0.071053,0.007105,0.410967,0.410967,0.410967,0.366813,0.366813,0.366813,0.526316,0.526316,0.526316,0.375,0.375,0.375
30d,all-MiniLM-L12-v2,0.136584,0.142105,0.071053,0.007105,0.406564,0.406564,0.406564,0.365497,0.365497,0.365497,0.526316,0.526316,0.526316,0.383333,0.383333,0.383333
30d,all-distilroberta-v1,0.149438,0.142105,0.071053,0.007105,0.407655,0.407655,0.407655,0.362573,0.362573,0.362573,0.526316,0.526316,0.526316,0.375,0.375,0.375
14d,all-mpnet-base-v2,0.165469,0.142105,0.071053,0.007105,0.407206,0.407206,0.407206,0.362427,0.362427,0.362427,0.526316,0.526316,0.526316,0.397059,0.397059,0.397059
21d,all-MiniLM-L12-v2,0.145324,0.142105,0.071053,0.007105,0.403465,0.403465,0.403465,0.361111,0.361111,0.361111,0.526316,0.526316,0.526316,0.40625,0.40625,0.40625
21d,all-mpnet-base-v2,0.147974,0.142105,0.071053,0.007105,0.40684,0.40684,0.40684,0.361111,0.361111,0.361111,0.526316,0.526316,0.526316,0.382353,0.382353,0.382353
14d,all-MiniLM-L12-v2,0.13646,0.142105,0.071053,0.007105,0.404517,0.404517,0.404517,0.361111,0.361111,0.361111,0.526316,0.526316,0.526316,0.411765,0.411765,0.411765
7d,all-distilroberta-v1,0.157637,0.126316,0.071053,0.007105,0.379827,0.405045,0.405045,0.348538,0.358772,0.358772,0.447368,0.526316,0.526316,0.352941,0.375,0.375


In [12]:
best_avg_hparams = mdf.set_index(hparam_cols).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', *hparam_cols])
paths.save_model_results(best_avg_hparams, 'plnsim-best-avg', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_avg_hparams

Saved dataframe into /home/daviddavo/recsys4daos/data/output/Plaza/models/plnsim-best-avg_3d_normalize.pq


  best_avg_hparams = mdf.set_index(hparam_cols).loc[overall_hparams.iloc[0].name].reset_index().set_index(['fold', *hparam_cols])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,time_rec,rec_users,precision@1,precision@3,precision@5,precision@10,precision@100,ndcg@1,ndcg@3,...,recall@3,recall@5,recall@10,recall@100,r-precision@1,r-precision@3,r-precision@5,r-precision@10,r-precision@100,time_eval
fold,window_size,model_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2022-05-03,14d,all-distilroberta-v1,0.074501,0.000296,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055611
2022-05-06,14d,all-distilroberta-v1,0.232552,0.001161,1,1.0,0.333333,0.4,0.2,0.02,1.0,0.613147,...,0.5,1.0,1.0,1.0,0.5,0.5,0.5,0.5,0.5,0.098018
2022-05-09,14d,all-distilroberta-v1,0.072033,0.001178,1,1.0,0.333333,0.2,0.1,0.01,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.097469
2022-05-12,14d,all-distilroberta-v1,0.232606,0.001051,1,1.0,0.333333,0.2,0.1,0.01,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.095885
2022-05-15,14d,all-distilroberta-v1,0.07263,0.000285,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055578
2022-05-18,14d,all-distilroberta-v1,0.238801,0.000287,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05581
2022-05-21,14d,all-distilroberta-v1,0.073365,0.001193,1,1.0,0.333333,0.2,0.1,0.01,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.09765
2022-05-24,14d,all-distilroberta-v1,0.233062,0.000303,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055627
2022-05-27,14d,all-distilroberta-v1,0.232101,0.0011,1,0.0,0.333333,0.2,0.1,0.01,0.0,0.5,...,1.0,1.0,1.0,1.0,0.0,,,,,0.091662
2022-05-30,14d,all-distilroberta-v1,0.231946,0.000297,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056146


## best hparams by fold

In [13]:
print("Best hyperparams by fold", OPTIM_METRIC)
best_hparams = mdf.sort_values(OPTIM_METRIC,ascending=False).drop_duplicates(['fold'], keep='first').sort_values('fold').set_index(['fold', 'model_name', 'window_size'])
paths.save_model_results(best_hparams, 'plnsim-best-test', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
best_hparams[display_columns]

Best hyperparams by fold map@10
Saved dataframe into /home/daviddavo/recsys4daos/data/output/Plaza/models/plnsim-best-test_3d_normalize.pq


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
fold,model_name,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2022-05-03,all-mpnet-base-v2,7d,0.226868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-06,all-MiniLM-L12-v2,21d,0.217225,0.4,0.2,0.02,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-09,all-MiniLM-L12-v2,10YE,0.060547,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-12,all-distilroberta-v1,30d,0.232212,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-15,all-MiniLM-L6-v2,60d,0.059592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-18,all-mpnet-base-v2,90d,0.072862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-21,all-mpnet-base-v2,14d,0.234636,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-24,all-mpnet-base-v2,7d,0.072487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-27,all-distilroberta-v1,10YE,0.232818,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-30,all-MiniLM-L6-v2,10YE,0.062666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
best_hparams.describe()[display_columns]

Unnamed: 0,time_train,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.148728,0.135,0.0675,0.00675,0.466812,0.466812,0.466812,0.451389,0.451389,0.451389,0.5,0.5,0.5,0.475,0.475,0.475
std,0.083048,0.153125,0.076563,0.007656,0.486144,0.486144,0.486144,0.477301,0.477301,0.477301,0.512989,0.512989,0.512989,0.499342,0.499342,0.499342
min,0.059592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.072446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.145536,0.1,0.05,0.005,0.321779,0.321779,0.321779,0.263889,0.263889,0.263889,0.5,0.5,0.5,0.25,0.25,0.25
75%,0.232363,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,0.240009,0.4,0.2,0.02,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Results of using best hparams on next fold

In [15]:
next_hparams_idx = best_hparams.index.to_frame(False)
next_hparams_idx['fold'] = next_hparams_idx['fold'].shift()
next_hparams_idx = next_hparams_idx.dropna()
assert len(next_hparams_idx) == len(best_hparams)-1
next_hparams = mdf.set_index(['fold', 'model_name', 'window_size']).loc[pd.MultiIndex.from_frame(next_hparams_idx)]
paths.save_model_results(next_hparams, 'plnsim-best-valid', ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, K_RECOMMENDATIONS)
next_hparams[display_columns]

Saved dataframe into /home/daviddavo/recsys4daos/data/output/Plaza/models/plnsim-best-valid_3d_normalize.pq


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_train,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
fold,model_name,window_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2022-05-03,all-MiniLM-L12-v2,21d,0.059419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-06,all-MiniLM-L12-v2,10YE,0.217379,0.4,0.2,0.02,0.650921,0.650921,0.650921,0.5,0.5,0.5,1.0,1.0,1.0,0.5,0.5,0.5
2022-05-09,all-distilroberta-v1,30d,0.071882,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-12,all-MiniLM-L6-v2,60d,0.216826,0.2,0.1,0.01,0.63093,0.63093,0.63093,0.5,0.5,0.5,1.0,1.0,1.0,,,
2022-05-15,all-mpnet-base-v2,90d,0.232561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-18,all-mpnet-base-v2,14d,0.071827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-21,all-mpnet-base-v2,7d,0.072569,0.0,0.1,0.01,0.0,0.356207,0.356207,0.0,0.166667,0.166667,0.0,1.0,1.0,0.0,,
2022-05-24,all-distilroberta-v1,10YE,0.234404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-05-27,all-MiniLM-L6-v2,10YE,0.219773,0.2,0.1,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2022-05-30,all-MiniLM-L6-v2,60d,0.214206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
next_hparams.describe()[display_columns]

Unnamed: 0,time_train,precision@5,precision@10,precision@100,ndcg@5,ndcg@10,ndcg@100,map@5,map@10,map@100,recall@5,recall@10,recall@100,r-precision@5,r-precision@10,r-precision@100
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,17.0,16.0,16.0
mean,0.143477,0.126316,0.071053,0.007105,0.365096,0.393217,0.393217,0.330994,0.344152,0.344152,0.447368,0.526316,0.526316,0.352941,0.375,0.375
std,0.079803,0.148482,0.076948,0.007695,0.420804,0.41626,0.41626,0.399783,0.396242,0.396242,0.497067,0.512989,0.512989,0.459779,0.465475,0.465475
min,0.059419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.071991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.073691,0.0,0.1,0.01,0.0,0.356207,0.356207,0.0,0.166667,0.166667,0.0,1.0,1.0,0.0,0.0,0.0
75%,0.220821,0.2,0.1,0.01,0.733193,0.746784,0.746784,0.625,0.666667,0.666667,1.0,1.0,1.0,1.0,1.0,1.0
max,0.235298,0.4,0.2,0.02,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
