In [None]:
from typing import List
import sys
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

from recsys24_daos.models import OpenPop
from recsys24_daos.datasets import to_microsoft
from recsys24_daos.model_selection import time_freq_split_current

In [None]:
# Fold construction: the frequency string and the normalisation flag are both
# forwarded to time_freq_split_current ("W-THU" reads like a pandas weekly
# alias anchored on Thursday — TODO confirm the splitter interprets it so).
SPLITS_NORMALIZE = True
SPLITS_FREQ = "W-THU"

# Cut-offs k at which recommendations are generated and metrics are computed.
K_RECOMMENDATIONS: List[int] = [
    5,
    10,
    15,
]

# Number of most recent folds kept in the final summary table.
LAST_SPLITS = 10

## Obtain dataset

In [None]:
# Show the notebook's working directory: the relative "../data/..." paths used below resolve against it.
!pwd

In [None]:
# Load the raw Decentraland exports; the listed columns are parsed as datetimes.
dfp = pd.read_csv("../data/decentraland/proposals.csv", parse_dates=['date', 'start', 'end'])
dfv = pd.read_csv("../data/decentraland/votes.csv", parse_dates=['date'])

# Votes converted by to_microsoft (presumably into the userID/itemID layout
# that the cells below index by — TODO confirm against the helper).
df = to_microsoft(dfv)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" line after
# every report.
dfp.info()
dfv.info()
df.info()

## Summary statistics of the folds

In [None]:
# Per-fold statistics: every key becomes a column, every fold appends one value.
sdd = defaultdict(list)

fold_iter = time_freq_split_current(
    to_microsoft(dfv),
    SPLITS_FREQ,
    dfp.reset_index(),
    remove_not_in_train_col='userID',
    normalize=SPLITS_NORMALIZE,
)
for dftrain, dftest, t, openproposals in fold_iter:
    # Training votes restricted to the proposals that are open in this fold.
    dftrain_filter = dftrain[dftrain['itemID'].isin(openproposals)]

    sdd['t'].append(t)
    sdd['open proposals'].append(len(openproposals))

    sdd['proposals in train'].append(dftrain['itemID'].nunique())
    sdd['votes in train'].append(len(dftrain))
    sdd['votes in open proposals (train)'].append(len(dftrain_filter))
    sdd['users in open proposals (train)'].append(dftrain_filter['userID'].nunique())
    sdd['votes in test'].append(len(dftest))
    sdd['users in train'].append(dftrain['userID'].nunique())
    sdd['users in test'].append(dftest['userID'].nunique())

sdf_all = pd.DataFrame(sdd).set_index('t')

# Derived ratios: "vpp" = votes per open proposal, "vpu" = votes per user.
sdf_all['vpp in open proposals (train)'] = sdf_all['votes in open proposals (train)'] / sdf_all['open proposals']
sdf_all['vpu in open proposals (train)'] = sdf_all['votes in open proposals (train)'] / sdf_all['users in open proposals (train)']
sdf_all['vpp test'] = sdf_all['votes in test'] / sdf_all['open proposals']
sdf_all['vpu test'] = sdf_all['votes in test'] / sdf_all['users in test']

# Keep only the most recent folds. The count comes from the LAST_SPLITS
# constant (previously a hard-coded 10) so this table stays in sync with the
# final summary, which also slices by LAST_SPLITS.
sdf = sdf_all.tail(LAST_SPLITS)
print(sdf['votes in train'])

displayed_columns = [
    'open proposals',
    'votes in open proposals (train)',
    'users in open proposals (train)',
    'vpp in open proposals (train)',
    'vpu in open proposals (train)',
    'votes in test',
    'users in test',
    'vpp test',
    'vpu test',
]
# Index labels are rendered as ISO year-week (e.g. "2023-W07"); values use two decimals.
_style = sdf[displayed_columns].style
_style = _style.format_index('{:%G-W%V}').format(precision=2)
_style

In [None]:
# Emit the fold-summary table as LaTeX, wrapping each (LaTeX-escaped) column header in \textbf{...}.
latex_table = _style.format_index("\\textbf{{{}}}", escape="latex", axis=1).to_latex()
print(latex_table)

## Running the OpenPop baseline

In [None]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

In [None]:
# Materialise all folds as a list so tqdm has a known length to report progress against.
folds = list(time_freq_split_current(df, SPLITS_FREQ, dfp, remove_not_in_train_col='userID', normalize=SPLITS_NORMALIZE))

# Ranking metrics to evaluate; each result is stored under "<name>@<k>".
metrics_f = {
    'precision': precision_at_k,
    'ndcg': ndcg_at_k,
    'map': map_at_k,
    'recall': recall_at_k,
}
# One list per column, one entry per fold: `metrics` holds the OpenPop scores,
# `perfectmetrics` the scores obtained when the test set itself is used as the
# prediction (an upper bound for comparison).
metrics = defaultdict(list)
perfectmetrics = defaultdict(list)

for i, (train, test, t, open_proposals) in enumerate(tqdm(folds)):
    assert not train.empty, f"Train should not be empty on fold {i}"
    if test.empty:
        print(f"Warning, empty test fold {i}", file=sys.stderr)

    metrics['t'].append(t)
    perfectmetrics['t'].append(t)

    # Neither the fitted model nor the "perfect" predictions depend on k, so
    # they are built once per fold instead of once per cut-off.
    # (Assumes OpenPop is deterministic and recommend_k_items does not mutate
    # the model — TODO confirm.)
    model = OpenPop(train)
    test_users = test['userID'].unique()
    perfect_recs = test.copy()
    perfect_recs['prediction'] = 1

    for k_recs in K_RECOMMENDATIONS:
        recs = model.recommend_k_items(test_users, k_recs, recommend_from=open_proposals)

        for m, f in metrics_f.items():
            metrics[f'{m}@{k_recs}'].append(f(test, recs, k=k_recs))
            # NOTE(review): an earlier comment here said relevancy_method=None
            # was needed because these rows are unsorted, yet the call has
            # always relied on the metric's default; confirm which is intended.
            perfectmetrics[f'{m}@{k_recs}'].append(f(test, perfect_recs, k=k_recs))

### Caching these results

In [None]:
# Baseline scores as a table indexed by the fold's `t`, written to the cache directory.
bdf = pd.DataFrame(metrics)
bdf = bdf.set_index("t")
bdf.to_csv("../data/cache/baseline.csv")

# Summary statistics across folds (displayed as the cell output).
bdf.describe()

In [None]:
# Same caching step for the "perfect" scores, followed by a range sanity check.
pdf = pd.DataFrame(perfectmetrics)
pdf = pdf.set_index("t")
pdf.to_csv("../data/cache/perfect.csv")
display(pdf.describe())

# Every column's minimum and maximum must lie inside [0, 1].
column_min = pdf.min()
column_max = pdf.max()
within_unit_range = (column_min >= 0) & (column_max <= 1)
assert all(within_unit_range), "There are metrics with wrong range"

## Plotting some graphs

In [None]:
# Baseline scores with the matching "perfect" precision columns appended for comparison.
mdf = pd.DataFrame(metrics).assign(**{
    'perfect precision@5': perfectmetrics['precision@5'],
    'perfect precision@10': perfectmetrics['precision@10'],
})

# Baseline vs. perfect precision at k=5 and k=10, one line per column.
plotted_columns = ['precision@5', 'perfect precision@5', 'precision@10', 'perfect precision@10']
mdf[plotted_columns].plot(title='Evaluación modelo baseline MP')

mdf.describe()

In [None]:
# Summary statistics restricted to the last LAST_SPLITS rows (positional slice).
recent_folds = mdf.iloc[-LAST_SPLITS:]
recent_folds.describe()