In [1]:
import datetime as dt
import itertools as it

import pandas as pd
import numpy as np

from pathlib import Path

import lenskit

from src import paths
from src.datasets import daocensus_text
from src.model_selection import timeFreqSplitCurrent
from src.utils import Timer

In [2]:
# TODO: Deleteme
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Parameters

In [3]:
# Which organization to load; these three values are forwarded to daocensus_text.get().
ORG_NAME: str = 'Decentraland'
ORG_PLATFORM: str = 'snapshot'
USE_ORG_NAMES: bool = True

# Evaluation
# Cut-offs `k` at which every ranking metric is computed (reported as '<metric>@k').
K_RECOMMENDATIONS: list[int] = [1,3,5,10]
SPLITS_FREQ: str = 'W-THU' # split weekly (presumably a pandas offset alias anchored on Thursday - TODO confirm)
# NOTE(review): in the cells visible here this flag only feeds the metrics file name - confirm the splitter honours it.
SPLITS_NORMALIZE: bool = True # Whether or not to move everything to 00:00
LAST_SPLITS: int = 10 # Use just last 10 splits
# ISO-format date parsed with datetime.fromisoformat(); a falsy value leaves CUTOFF_DATE as None.
CUTOFF_DATE_STR: str = '2023-07-29'

# Where the metrics dataframe is stored; when falsy the path is derived with paths.pln_mdf().
MDF_FILE: str = None

In [4]:
# Turn the textual cutoff into a datetime; no cutoff string means no cutoff date.
if CUTOFF_DATE_STR:
    CUTOFF_DATE = dt.datetime.fromisoformat(CUTOFF_DATE_STR)
else:
    CUTOFF_DATE = None

print('CUTOFF_DATE', CUTOFF_DATE, type(CUTOFF_DATE))
# Metrics Dataframe
# An explicit MDF_FILE wins; otherwise the location is derived from the run parameters.
if MDF_FILE:
    MDF_FILE = Path(MDF_FILE)
else:
    MDF_FILE = paths.pln_mdf(ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, cutoff_date=CUTOFF_DATE)
print(MDF_FILE)

CUTOFF_DATE 2023-07-29 00:00:00 <class 'datetime.datetime'>
data/pln/mdf_Decentraland_W-THU_normalize-cutoff_date=2023-07-29 00:00:00.pkl


## Getting the dataset

In [5]:
print(ORG_NAME, ORG_PLATFORM, USE_ORG_NAMES, CUTOFF_DATE)
# dfv holds the votes and dfp the proposals of the selected organization.
dfv, dfp = daocensus_text.get("./data/daos-census-text", ORG_NAME, ORG_PLATFORM, use_org_names=USE_ORG_NAMES, cutoff_date=CUTOFF_DATE)
# Identifiers are handled as plain strings from here on.
dfv[['voter', 'proposal']] = dfv[['voter', 'proposal']].astype(str)
# Proposals are indexed by their (string) id.
dfp = dfp.astype({'id': str}).set_index('id')
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line after each report).
dfv.info()
dfp.info()

Decentraland snapshot True 2023-07-29 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116560 non-null  object        
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  object        
 7   date           116560 non-null  datetime64[us]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: datetime64[us](1), float64(1), object(8)
memory usage: 8.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1942 entries, 5301d20a-7943-500f-b69c-50944cf6b919 to 5c62cef0-2102-5dcf-8083-41a045bb0f68
Data c

In [6]:
def to_lenskit(dfv):
    """Reshape the votes frame into a LensKit-style implicit-feedback frame.

    Keeps the ``voter``, ``proposal`` and ``date`` columns of ``dfv`` (renamed
    to ``user``, ``item`` and ``timestamp``), casts both identifiers to ``str``
    and adds a constant ``rating`` column equal to 1.

    Args:
        dfv: DataFrame with at least ``voter``, ``proposal`` and ``date`` columns.

    Returns:
        A new DataFrame with columns ``user``, ``item``, ``timestamp``, ``rating``;
        ``dfv`` itself is left untouched.
    """
    renames = {'voter': 'user', 'proposal': 'item', 'date': 'timestamp'}
    votes = dfv[list(renames)].rename(columns=renames)
    for id_col in ('user', 'item'):
        votes[id_col] = votes[id_col].astype('str')
    # Every recorded vote counts as one positive interaction.
    votes['rating'] = 1

    return votes

## Using Item-based KNN

In [7]:
from lenskit.algorithms import item_knn
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, r_precision_at_k

In [8]:
# Build every evaluation fold from the LensKit-style votes frame.
# NOTE(review): SPLITS_NORMALIZE is not forwarded to the splitter here - confirm
# that timeFreqSplitCurrent's default matches the value encoded in MDF_FILE.
df = to_lenskit(dfv)
all_folds = list(timeFreqSplitCurrent(
    df, SPLITS_FREQ, dfp.reset_index(), return_open=True, remove_not_in_train_col='user',
    item_col='item',
    user_col='user',
))
# Keep only the most recent LAST_SPLITS folds. A falsy value (None or 0) keeps
# them all: `all_folds[-0:]` already did so implicitly, while None would raise
# a TypeError on the unary minus.
folds = all_folds[-LAST_SPLITS:] if LAST_SPLITS else list(all_folds)

In [9]:
# Ranking metrics to report, keyed by the label used in the '<label>@k' result names.
metrics_f = dict([
    ('precision', precision_at_k),
    ('ndcg', ndcg_at_k),
    ('map', map_at_k),
    ('recall', recall_at_k),
    ('r-precision', r_precision_at_k),
])

In [10]:
# Sanity check on the first retained fold: how its open proposals overlap with train and test.
print(f"nunique train: {folds[0].train['item'].nunique()}")
print(f"open props: {len(folds[0].open_proposals)}")
print(f"open props in train: {np.intersect1d(folds[0].open_proposals, folds[0].train['item'].unique()).size}")
print(f"nunique test: {folds[0].test['item'].nunique()}")
print(f"open props in test: {np.intersect1d(folds[0].open_proposals, folds[0].test['item'].unique()).size}")
# Displayed as the cell result: is this particular proposal id among the training items?
'360848f6-b447-5540-ba76-a035cff69734' in folds[0].train['item'].unique()

nunique train: 1838
open props: 18
open props in train: 18
nunique test: 18
open props in test: 18


True

In [11]:
# Training rows of the first retained fold that belong to one specific proposal id.
folds[0].train.loc[folds[0].train['item'].eq('360848f6-b447-5540-ba76-a035cff69734')]

Unnamed: 0,user,item,timestamp,rating
107380,0xc375b0a133f49dbf3c6418895d6d024be2d8f84a,360848f6-b447-5540-ba76-a035cff69734,2023-05-10 22:50:35,1
107383,0x6a43fcce7dec946dc463d4e3cefe436c651f2466,360848f6-b447-5540-ba76-a035cff69734,2023-05-10 17:26:35,1
107390,0x8660c0133a9705ab8ab370c3b4e63b60381bf4b3,360848f6-b447-5540-ba76-a035cff69734,2023-05-10 10:39:54,1
107406,0xdd9a5ca4710fe4b08672fabc2d5774e0926a4afb,360848f6-b447-5540-ba76-a035cff69734,2023-05-09 18:17:01,1
107410,0x70753f5847071ec1c96cb9ebed2b2dbe2846da3a,360848f6-b447-5540-ba76-a035cff69734,2023-05-09 16:25:40,1
...,...,...,...,...
107529,0x4e9f5aa62c28f40bd772fbf7826a0595ec2dbad0,360848f6-b447-5540-ba76-a035cff69734,2023-05-03 03:37:30,1
107530,0xfec447014eb1b96279d7fef62ea3f8848f99d09d,360848f6-b447-5540-ba76-a035cff69734,2023-05-03 03:30:57,1
107531,0xed0e0cb94f60f72ec94bef848f5df4cbd365af1d,360848f6-b447-5540-ba76-a035cff69734,2023-05-03 02:29:10,1
107532,0x88f659b4b6d5614b991c6404b34f821e10390ec0,360848f6-b447-5540-ba76-a035cff69734,2023-05-03 00:14:08,1


In [12]:
def testHParamsItemKNN(fold, k: int, window_size=None):
    """Fit an implicit-feedback item-item KNN on one fold and score its recommendations.

    Relies on the notebook-level ``folds``, ``metrics_f`` and ``K_RECOMMENDATIONS``.

    Args:
        fold: Index into ``folds`` selecting the fold to evaluate.
        k: Maximum number of neighbours used to score each item (``nnbrs``).
        window_size: Optional offset alias (e.g. ``'14d'``). When given, only
            training rows whose ``timestamp`` is later than the fold's ``end``
            minus this offset are used.

    Returns:
        dict with ``'time'`` (the ``Timer`` reading around ``algo.fit``),
        ``'min_recs'`` / ``'avg_recs'`` (minimum and mean number of scored
        proposals per user) and one ``'<metric>@<k>'`` entry for every
        combination of ``metrics_f`` and ``K_RECOMMENDATIONS``.

    Raises:
        ValueError: If no user of the fold's test set also appears in the
            (windowed) training data, so nothing can be recommended.
    """
    # Get and filter train data
    f = folds[fold]
    train = f.train

    if window_size:
        offset = pd.tseries.frequencies.to_offset(window_size)
        train = train[train['timestamp'] > (f.end - offset)]

    # Only users present in both the test set and the (windowed) train set are
    # evaluated. Sorted so the rows of `recs` come out in a reproducible order.
    users = sorted(set(f.test['user'].unique()).intersection(train['user'].unique()))
    if not users:
        # Fail before fitting: pd.concat would otherwise raise an opaque
        # "No objects to concatenate" ValueError further down.
        raise ValueError(
            f'Fold {fold}: no test user appears in the training data '
            f'(window_size={window_size!r}), nothing to recommend'
        )

    # Create algorithm
    algo = item_knn.ItemItem(
        feedback='implicit', # VERY IMPORTANT
        min_sim=0,
        nnbrs=k,  # the maximum number of neighbors for scoring each item (None for unlimited)
    )
    with Timer() as t:
        algo.fit(train)

    # NOTE(review): votes older than the window are not in `train`, so proposals a
    # user voted on before the window are not excluded below - confirm this is intended.
    voted_props = train.groupby('user')['item'].unique()

    def _recu(u):
        """Score the open proposals user `u` has not voted on as (user, item, prediction) rows."""
        # Remove proposals the user voted in
        ps = np.setdiff1d(f.open_proposals, voted_props.loc[u])
        # TODO: WHY DOES IT RETURN SO MANY NAs?
        # (presumably proposals the model cannot score for this user - to be confirmed)
        # Missing scores are kept and ranked last with a score of 0 instead of being dropped.
        return (algo
            .predict_for_user(u, ps)
            .rename('prediction')
            .reset_index()
            .fillna(0.00)
            .assign(user=u)[['user', 'item', 'prediction']]
        )

    recs = pd.concat([_recu(u) for u in users])

    # Number of scored proposals per user, computed once for both summary values
    recs_per_user = recs.groupby('user').size()
    metrics = {
        'time': t.time,
        'min_recs': recs_per_user.min(),
        'avg_recs': recs_per_user.mean(),
    }
    for (m, e), k_recs in it.product(metrics_f.items(), K_RECOMMENDATIONS):
        metrics[f'{m}@{k_recs}'] = e(f.test, recs, k=k_recs, col_user='user', col_item='item')

    return metrics
    
# Smoke test: first retained fold, 5 neighbours, 14-day training window.
testHParamsItemKNN(fold=0, k=5, window_size='14d')

Numba is using threading layer omp - consider TBB
found 1 potential runtime problems - see https://boi.st/lkpy-perf
  b = blocks[bi]


{'time': 5.886575273936614,
 'min_recs': 1,
 'avg_recs': 9.795918367346939,
 'precision@1': 0.41836734693877553,
 'precision@3': 0.3537414965986395,
 'precision@5': 0.2816326530612245,
 'precision@10': 0.2010204081632653,
 'ndcg@1': 0.2949640287769784,
 'ndcg@3': 0.3355170230723114,
 'ndcg@5': 0.36015071902018586,
 'ndcg@10': 0.4009555318439043,
 'map@1': 0.41836734693877553,
 'map@3': 0.4433106575963719,
 'map@5': 0.4638265306122449,
 'map@10': 0.49638325369830466,
 'recall@1': 0.19142371234207967,
 'recall@3': 0.45538143828960154,
 'recall@5': 0.558357628765792,
 'recall@10': 0.7184766763848396,
 'r-precision@1': 0.4575493612078977,
 'r-precision@3': 0.7181839402427638,
 'r-precision@5': 0.7572776280323449,
 'r-precision@10': 0.7934411500449234}