#### Vamos a hacer un sistema de recomendación _mixed_, en el que cada uno de los dos recomendadores genera $k$ recomendaciones, y las dos listas se combinan para obtener las $k$ recomendaciones finales.

- Si un item está en las dos listas, se escoge sí o sí (sin importar su posición en la lista)
- Los huecos restantes se rellenan con un item de cada lista alternativamente

Es decir, si R1 recomienda {A,B,C,D,E} y R2 recomienda {A,D,F,G,H,I}, el recomendador mixto (con $k=5$) recomendaría {A,D,B,F,C}. A y D se recomiendan porque están en ambas listas, y los tres huecos restantes se rellenan alternando entre las dos listas: B (de R1), F (de R2) y C (de R1).

In [1]:
from pathlib import Path
import itertools as it

import pandas as pd
import numpy as np

from tqdm.autonotebook import tqdm
from matplotlib import pyplot as plt

from src.datasets import daocensus, to_microsoft
from src.models.nlp import NLPModel, NLPSimilarity

%load_ext autoreload
%autoreload 2

  from tqdm.autonotebook import tqdm
2024-01-16 15:56:24.762704: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-16 15:56:24.762735: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-16 15:56:24.762750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
# Dataset selection and fold layout.
ORG_NAME = 'Decentraland'
SPLITS_FREQ = 'W-THU'
SPLITS_NORMALIZE = True  # Whether or not to move everything to 00:00
SEED: int = 42
K_RECOMMENDATIONS = [5, 10]

# To evaluate the hybrid recommender system
MERGE_FUNC = 'naive'  # one of 'avg', 'naive', 'prioritize'

# These two files are generated by 09_analyze_results.ipynb.
# Both share the same directory and the same "<org>-<freq>[-normalize]" stem.
_HPARAMS_DIR = Path('./data/baseline')
_HPARAMS_STEM = f'{ORG_NAME}-{SPLITS_FREQ}{"-normalize" if SPLITS_NORMALIZE else ""}'
BEST_HPARAMS_FILE = _HPARAMS_DIR / f'best-{_HPARAMS_STEM}.csv'
REALISTIC_HPARAMS_FILE = _HPARAMS_DIR / f'realistic-{_HPARAMS_STEM}.csv'

# Reading the dataset

In [3]:
# Proposal text and voting window, keyed by the platform's proposal id.
dfptext = pd.read_csv('./snapshot_proposals.csv')[['proposal_id', 'title', 'description', 'start', 'end']]

# Votes (dfv) and proposals (dfp) of the selected organization.
dfv, dfp = daocensus.get("./data/daos-census", ORG_NAME, 'snapshot')
dfv['voter'] = dfv['voter'].astype('str')

# Attach the text columns to each proposal; the left join keeps proposals without text.
dfp = dfp.merge(dfptext, how='left', left_on='platform_proposal', right_on='proposal_id')

# An explicit unit is required: pandas >= 2 rejects the unit-less 'datetime64',
# while older versions treated it as 'datetime64[ns]' anyway.
dfp[['start', 'end']] = dfp[['start', 'end']].astype('datetime64[ns]')
dfp = dfp.set_index('id')

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
dfv.info()
dfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116560 non-null  category      
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  object        
 7   date           116560 non-null  datetime64[ns]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(7)
memory usage: 8.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 1942 entries, 1e570406-6963-530d-8f67-0a7686449f64 to 19a58960-27e4-54d2-b0ae-87e258c741c3
Data columns (total 13 column

In [4]:
from src.model_selection import timeFreqSplitCurrent

# Convert the votes with to_microsoft (the displayed result has the columns
# userID/itemID/timestamp/rating) and split them into folds at SPLITS_FREQ.
# Each fold is unpacked later as (train, test, t, open_proposals).
df = to_microsoft(dfv)
folds = [
    *timeFreqSplitCurrent(
        df,
        SPLITS_FREQ,
        dfp.reset_index(),
        return_open=True,
        remove_not_in_train_col='userID',
    )
]

# Show the training split of the first fold.
f = folds[0][0]
f

Unnamed: 0,userID,itemID,timestamp,rating
7695,0xe161cc33f5b430be52aa69520d32cd3f39fa2be6,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-26 23:49:47,1
7696,0x338ad1918362025f34b3701dac1e9648e8e8268f,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-26 23:49:34,1
7697,0xcf10cd8b5dc2323b1eb6de6164647756bad4de4d,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-26 23:28:08,1
7698,0xffac7fd045303112fdb28e9dace8e1334ad324c0,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-26 23:21:51,1
7699,0xd210dc1dd26751503cbf1b8c9154224707820da8,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-26 21:58:01,1
...,...,...,...,...
108178,0x8cff6832174091dae86f0244e3fd92d4ced2fe07,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:41:41,1
108179,0xec6e6c0841a2ba474e92bf42baf76bfe80e8657c,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:40:41,1
108180,0xe2b6024873d218b2e83b462d3658d8d7c3f55a18,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:40:40,1
108181,0xd210dc1dd26751503cbf1b8c9154224707820da8,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:37:43,1


# Creating the model

In [5]:
from src.models.hybrid import HybridRecommendation

# Hand-picked LightGCN settings for this demo model (only 2 epochs).
lightgcn_config = {
    'n_layers': 3,
    'batch_size': 512,
    'embed_size': 64,
    'epochs': 2,
    'learning_rate': 0.001,
    'decay': 0.00001,
    'metrics': [],
    'eval_epoch': 2,
    'top_k': 5,
    'save_model': False,
    'MODEL_DIR': '/tmp/hybrid-model',
    'cf_seed': SEED,
}

# No overrides for the NLP side.
nlp_config = {
    # 'filter_window': '14d',
}

# Build the hybrid recommender from the first fold's train and test splits.
hr = HybridRecommendation(
    folds[0][0],
    folds[0][1],
    dfp,
    lightgcn_config=lightgcn_config,
    nlp_config=nlp_config,
)
hr

  df = train if test is None else train.append(test)


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


<src.models.hybrid.HybridRecommendation at 0x7f4955a0b190>

In [6]:
# Fit the hybrid recommender built from the first fold.
hr.fit()

Epoch 1 (train)0.1s: train loss = 0.67460 = (mf)0.67458 + (embed)0.00002
Epoch 2 (train)0.0s + (eval)0.7s: train loss = 0.67271 = (mf)0.67269 + (embed)0.00002, 
All embeddings are already calculated


In [7]:
# hr.recommend_k_items(['0x29d5cea7d511810f3ff754886b898fce16a6d8fd', '0x30b1f4bd5476906f38385b891f2c09973196b742'], top_k=3)

In [8]:
# Sanity check on the first fold: every user in the test split must also
# appear in the train split.
train_users = {*folds[0][0]['userID']}
test_users = {*folds[0][1]['userID']}

in_test_not_in_train = test_users - train_users
assert not in_test_not_in_train

In [9]:
# Top-3 recommendations for every distinct user of the first fold's test split.
hr.recommend_k_items(folds[0][1]['userID'].unique(), top_k=3)

Unnamed: 0,userID,itemID,prediction,rec
0,0x1177ba1e2fa6dbf1c9753c4e3405410173af1e83,45a12f85-4bab-51a7-982a-06705ad469c9,0,nlp
1,0x1177ba1e2fa6dbf1c9753c4e3405410173af1e83,71b8f41a-96ae-58f9-87dc-450960c28e5f,0,gnn
2,0x1177ba1e2fa6dbf1c9753c4e3405410173af1e83,0a85880b-e091-58b4-88d3-cf30112dee14,1,nlp
3,0x1177ba1e2fa6dbf1c9753c4e3405410173af1e83,312c5654-0806-5473-8243-dbd6311208ed,1,gnn
4,0x1177ba1e2fa6dbf1c9753c4e3405410173af1e83,e00cd697-dd7b-5836-91b4-73cb5c8f9cc9,2,nlp
...,...,...,...,...
200,0xf7e158bd2b6e79ef2f2ab72ac6cb2fea239c2a9b,0a179f47-918b-5116-a453-bba13ecc477f,0,nlp
201,0xf7e158bd2b6e79ef2f2ab72ac6cb2fea239c2a9b,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,0,gnn
202,0xf7e158bd2b6e79ef2f2ab72ac6cb2fea239c2a9b,566eaf46-1c59-5907-8800-8eadb863f851,1,nlp
203,0xf7e158bd2b6e79ef2f2ab72ac6cb2fea239c2a9b,25281510-41ac-5a12-9200-33d99fcaffee,1,gnn


# Evaluating the model

In [10]:
# Load the best hyperparameters; BEST_HPARAMS_FILE is generated by
# 09_analyze_results.ipynb and must exist before running this cell.
# len(hparams_df) is later used as the number of folds to evaluate.
hparams_df = pd.read_csv(BEST_HPARAMS_FILE, index_col=0)
hparams_df

FileNotFoundError: [Errno 2] No such file or directory: 'data/baseline/best-Decentraland-W-THU.csv'

In [None]:
# Settings forced to the same value for every row of hparams_df (written in place).
_forced_settings = {
    'config/eval_epoch': -1,
    'config/save_epoch': -1,
    'config/top_k': 0,
    'config/metrics': [[] for _ in range(len(hparams_df))],  # one fresh list per row
    'config/MODEL_DIR': '',
    'config/save_model': False,
}
for _column, _value in _forced_settings.items():
    hparams_df[_column] = _value
# hparams_df['config/iteration'] = 2

# Names used in the hyperparameter file -> names passed in the LightGCN config.
_LIGHTGCN_ALIASES = {
    'embedding_dim': 'embed_size',
    'conv_layers': 'n_layers',
    'l2': 'decay',
    'iteration': 'epochs',
}


def _to_lightgcn_key(column):
    """Strip 'config/' from a column name and apply the LightGCN alias, if any."""
    stripped = column.replace('config/', '')
    return _LIGHTGCN_ALIASES.get(stripped, stripped)


# Every 'config/*' column except the fold identifier becomes a model setting.
config_cols = [
    column
    for column in hparams_df.columns
    if column.startswith('config/') and column != 'config/fold'
]
lightgcn_configs = (
    hparams_df[config_cols]
    .rename(columns=_to_lightgcn_key)
    .to_dict(orient='records')
)

# nlp_configs = [ {'filter_window': '14d'} for _ in lightgcn_configs ]
nlp_configs = [dict() for _ in range(len(lightgcn_configs))]

# Keep only the last len(hparams_df) folds, one per hyperparameter row.
folds = [
    *timeFreqSplitCurrent(
        df,
        SPLITS_FREQ,
        dfp.reset_index(),
        return_open=True,
        remove_not_in_train_col='userID',
    )
][-len(hparams_df):]

assert len(lightgcn_configs) == len(nlp_configs) == len(folds)

In [None]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from collections import defaultdict

# Ranking metrics computed for every fold and every k.
# (The original literal listed the 'ndcg' key twice; the duplicate was a no-op.)
metrics_f = { 'precision': precision_at_k, 'ndcg': ndcg_at_k, 'map': map_at_k, } # 'recall': recall_at_k, 

# One list per metric name, with one entry per fold.
metrics = defaultdict(list)      # merged (hybrid) recommendations
metrics_nlp = defaultdict(list)  # model.nlp_recs only
metrics_gnn = defaultdict(list)  # model.gnn_recs only

# Keyed by (k, rec): share of the merged recommendations labelled 'both'/'gnn'/'nlp'.
pct_metrics = defaultdict(list)
# Keyed by (k, 'pctDupes'): mean per-user overlap between the two recommenders.
debug_metrics = defaultdict(list)

for (train, test, t, open_proposals), lightgcn_config, nlp_config in zip(tqdm(folds), lightgcn_configs, nlp_configs):
    #### SET UP MODEL
    model = HybridRecommendation(train, test, dfp, merge_func=MERGE_FUNC, seed=SEED, lightgcn_config=lightgcn_config, nlp_config=nlp_config)

    #### FIT MODEL
    model.fit()

    #### EVALUATE MODEL
    metrics['t'].append(t)
    metrics_nlp['t'].append(t)
    metrics_gnn['t'].append(t)

    # Loop-invariant: the users to recommend to do not depend on k.
    _fold_users = test['userID'].unique()

    for k_recs in K_RECOMMENDATIONS:
        recs = model.recommend_k_items(
            to_users=_fold_users,
            top_k=k_recs,
            recommend_from=open_proposals,
        )

        # Fraction of merged recommendations per source; sources that are absent get 0.
        _rec_counts = recs.groupby('rec').size()
        _pct = (_rec_counts / _rec_counts.sum()) \
            .reindex(['both', 'gnn', 'nlp'], fill_value=0.0)

        for rec, v in _pct.to_dict().items():
            pct_metrics[(k_recs, rec)].append(v)

        # NOTE(review): model.nlp_recs / model.gnn_recs are presumably refreshed by the
        # recommend_k_items call above - confirm in src.models.hybrid.
        # (rows - distinct items) per user = items proposed by both recommenders.
        _gby = pd.concat((model.nlp_recs, model.gnn_recs)).groupby('userID')['itemID']
        _pctDupes = ((_gby.size() - _gby.nunique())/k_recs).mean()
        debug_metrics[(k_recs, 'pctDupes')].append(_pctDupes)

        for metric_name, metric_fn in metrics_f.items():
            _key = f'{metric_name}@{k_recs}'
            metrics[_key].append(metric_fn(test, recs, k=k_recs))
            metrics_nlp[_key].append(metric_fn(test, model.nlp_recs, k=k_recs))
            metrics_gnn[_key].append(metric_fn(test, model.gnn_recs, k=k_recs))

In [None]:
# Inspect the gnn_recs attribute of the model left over from the last evaluated fold.
model.gnn_recs

In [None]:
# Number of collected values per (k, rec) key; the evaluation loop appends one per fold.
{key: len(values) for key, values in pct_metrics.items()}

In [None]:
# Turn the (k, rec) -> list mapping into a DataFrame with one column per key.
# NOTE: this rebinds pct_metrics from a defaultdict to a DataFrame; later cells
# index it as pct_metrics[k].
pct_metrics = pd.DataFrame(pct_metrics)
pct_metrics

In [None]:
# Turn the (k, 'pctDupes') -> list mapping into a DataFrame and summarize it.
# NOTE: this rebinds debug_metrics from a defaultdict to a DataFrame.
debug_metrics = pd.DataFrame(debug_metrics)
debug_metrics.describe()

In [None]:
# Hard-coded user whose recommendations are inspected by hand.
user = "0x338571a641d8c43f9e5a306300c5d89e0cb2cfaf"

# Stack the NLP and GNN recommendation lists of the last evaluated model.
all_recs = pd.concat((model.nlp_recs, model.gnn_recs)).reset_index()
group = all_recs[all_recs["userID"] == user].set_index('userID').copy()

# display(group)
# The sort key replaces each 'rec' value with the row's position inside its own
# recommender, so rows are ordered by per-recommender position rather than by name.
# The default sort is not stable, so the order among equal positions is not guaranteed.
group.sort_values('rec', key=lambda s: group.groupby(s).cumcount())

In [None]:
def _merge_apply_mean(group, top_k = 5):
    group['hyb_score'] = group.groupby('rec').cumcount()

    common = pd.DataFrame(index=group['itemID'][group['itemID'].duplicated(keep='first')])
    common['hyb_score'] = group.groupby('itemID')['hyb_score'].mean()
    common['prediction'] = common['hyb_score']
    common['rec'] = 'both'
    
    notcommon = group.drop_duplicates('itemID', keep=False).set_index('itemID')

    both = pd.concat((common, notcommon)).sort_values('hyb_score')
    both['prediction'] = top_k - both['hyb_score']

    return both.head(top_k)[['prediction', 'rec']]

# Merge the stacked NLP+GNN lists user by user with the function's default top_k,
# then turn the resulting index levels back into regular columns.
merged_recs = all_recs.groupby('userID').apply(_merge_apply_mean).reset_index()
merged_recs

In [None]:
# How many merged recommendations carry each 'rec' label.
merged_recs.value_counts('rec')

In [None]:
# One metrics frame per recommender, tagged with the model it belongs to.
dfm_hyb = pd.DataFrame(metrics).assign(model='hybrid')
dfm_nlp = pd.DataFrame(metrics_nlp).assign(model='nlp')
dfm_gnn = pd.DataFrame(metrics_gnn).assign(model='gnn')

# Stack them and index by the fold identifier `t`.
dfm_all = pd.concat((dfm_hyb, dfm_nlp, dfm_gnn)).set_index('t')

# to_csv does not create missing directories, so make sure the target one exists.
_results_file = Path(f'./data/results/hybrid/{ORG_NAME}-{SPLITS_FREQ}.csv')
_results_file.parent.mkdir(parents=True, exist_ok=True)
dfm_all.to_csv(_results_file)

# Summary statistics per model, with the columns in a fixed order.
dfm_all.groupby('model').describe().T[['gnn', 'nlp', 'hybrid']]

In [None]:
# Mean of every metric per model, one column per model in a fixed order.
dfm_all.groupby('model').mean().T[['gnn', 'nlp', 'hybrid']]

In [None]:
# ndcg@5 over the folds (index `t`), one line per model.
dfm_all.groupby('model')['ndcg@5'].plot(legend=True)
plt.title(f'ndcg@5 con merge={MERGE_FUNC}')

In [None]:
# ndcg@10 over the folds (index `t`), one line per model.
dfm_all.groupby('model')['ndcg@10'].plot(legend=True)
plt.title(f'ndcg@10 con merge={MERGE_FUNC}')

In [None]:
# One area chart per evaluated k, showing the share of merged recommendations that
# come from each source. The panels follow K_RECOMMENDATIONS instead of a hard-coded
# [5, 10], so the figure stays in sync with the configuration cell.
# squeeze=False keeps `axes` a 2-D array even when there is a single k.
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(K_RECOMMENDATIONS),
    figsize=(6 * len(K_RECOMMENDATIONS), 4),
    squeeze=False,
)
fig.suptitle(f'Porcentaje de propuestas de cada recomendador merge={MERGE_FUNC}')
for ax, k in zip(axes.flat, K_RECOMMENDATIONS):
    # pct_metrics has (k, rec) columns, so pct_metrics[k] selects the rec columns of that k.
    pct_metrics[k].plot.area(ax=ax)
    ax.set_title(f'k={k}')

In [None]:
# TODO: Print the percentage of duplicated/exclusive proposals
# for k in debug_metrics.columns.get_level_values(0):
#     debug_metrics[k, 'pctExclusive'] = 1 - debug_metrics[k, 'pctDupes']
# debug_metrics = debug_metrics.sort_index(axis=1)

# One line per (k, 'pctDupes') column, with one point per evaluated fold.
debug_metrics.plot()
plt.title("Porcentaje de elementos comunes entre los dos recsys")

In [None]:
# Query the NLP sub-model directly, using the model, test split and open proposals
# left over from the last iteration of the evaluation loop.
model.nlp.recommend_k_items(
    to_users=test['userID'].unique(),
    top_k=5,
    recommend_from=open_proposals,
)