Vamos a hacer un sistema de recomendación _mixto_ (_mixed_): cada uno de los dos recomendadores base (NLP y GNN) genera $k$ recomendaciones, y las dos listas se combinan para obtener las $k$ recomendaciones finales.

- Si un item está en las dos listas, se escoge sí o sí (sin importar su posición en la lista)
- Los huecos restantes se rellenan con un item de cada lista alternativamente

Por ejemplo, con $k=5$, si R1 recomienda {A,B,C,D,E} y R2 recomienda {A,D,F,G,H}, el recomendador mixto recomendaría {A,D,B,F,C}: A y D se escogen porque están en ambas listas, y los tres huecos restantes se rellenan alternando entre las listas (B de R1, F de R2, C de R1).

In [1]:
import pandas as pd
import numpy as np

from src.datasets import daocensus, to_microsoft
from src.models.nlp import NLPModel, NLPSimilarity

%load_ext autoreload
%autoreload 2

2023-11-29 19:13:56.604277: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-29 19:13:56.604326: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-29 19:13:56.605219: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Organisation name handed to `daocensus.get` when loading votes and proposals.
ORG_NAME: str = 'Decentraland'
# Seed forwarded to the LightGCN configuration as `cf_seed`.
SEED: int = 42

# Reading the dataset

In [3]:
# Proposal texts and voting windows, keyed by the platform's proposal id.
dfptext = pd.read_csv('./snapshot_proposals.csv')[['proposal_id', 'title', 'description', 'start', 'end']]

# Votes (dfv) and proposals (dfp) of the selected organisation on Snapshot.
dfv, dfp = daocensus.get("./data/daos-census", ORG_NAME, 'snapshot')
dfv['voter'] = dfv['voter'].astype('str')

# Left join keeps every census proposal even when no text was exported for it.
dfp = dfp.merge(dfptext, how='left', left_on='platform_proposal', right_on='proposal_id')
# Explicit unit: the unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
dfp[['start', 'end']] = dfp[['start', 'end']].astype('datetime64[ns]')
dfp = dfp.set_index('id')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
dfv.info()
dfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116560 non-null  category      
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  object        
 7   date           116560 non-null  datetime64[ns]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(7)
memory usage: 8.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 1942 entries, 1e570406-6963-530d-8f67-0a7686449f64 to 19a58960-27e4-54d2-b0ae-87e258c741c3
Data columns (total 13 column

In [4]:
from src.model_selection import timeFreqSplitCurrent

# Re-shape the votes with the project's `to_microsoft` helper; the result is
# what gets split into folds.
df = to_microsoft(dfv)

# Materialise the splitter's output as a list so folds can be indexed.
folds = list(
    timeFreqSplitCurrent(
        df,
        '1M',
        dfp.reset_index(),
        return_open=True,
        remove_not_in_train_col='userID',
    )
)

# First element of the first fold, displayed as a quick sanity check.
f = folds[0][0]
f

Unnamed: 0,userID,itemID,timestamp,rating
7634,0x30b1f4bd5476906f38385b891f2c09973196b742,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-31 14:01:44,1
7635,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-31 06:08:00,1
7636,0x361b9fbf20ed8de4b62cd5b0ccdf36face38bdc4,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-30 11:35:12,1
7637,0xd4a08cf067c83d1b2cc1d26831569b7850804be7,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-30 09:35:44,1
7638,0x4d29b7f953ba471fb650fc5842127b05e35949b5,bc3ec30e-b7f7-5de5-aa8e-8acf9d430887,2021-05-30 08:26:26,1
...,...,...,...,...
108180,0xe2b6024873d218b2e83b462d3658d8d7c3f55a18,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:40:40,1
108181,0xd210dc1dd26751503cbf1b8c9154224707820da8,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:37:43,1
108182,0x9982b469910c2ee2ea566dcfcc250cdd34056397,954eac50-670b-5b29-b2a9-f97bfbfab26b,2021-05-24 17:05:54,1
110957,0xd210dc1dd26751503cbf1b8c9154224707820da8,91b9847d-4873-5ca7-b997-72f668c27a98,2021-05-31 00:43:58,1


# Creating the model

In [30]:
from src.models.hybrid import HybridRecommendation

# Settings forwarded to the hybrid model as `lightgcn_config`.
lightgcn_config = {
    'n_layers': 3,
    'batch_size': 512,
    'embed_size': 64,
    'epochs': 2,
    'learning_rate': 0.001,
    'decay': 0.00001,
    'metrics': [],
    'eval_epoch': 2,
    'top_k': 5,
    'save_model': False,
    'MODEL_DIR': '',
    'cf_seed': SEED,
}

# Settings forwarded to the hybrid model as `nlp_config`.
nlp_config = {'filter_window': '12M'}

# Build the hybrid recommender from the first two elements of the first fold
# plus the proposals table.
hr = HybridRecommendation(
    folds[0][0],
    folds[0][1],
    dfp,
    lightgcn_config=lightgcn_config,
    nlp_config=nlp_config,
)
hr

  df = train if test is None else train.append(test)


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


<src.models.hybrid.HybridRecommendation at 0x7f2fc4561040>

In [19]:
# Fit the hybrid recommender built above; the saved output shows the
# per-epoch training loss followed by an embeddings status message.
hr.fit()

Epoch 1 (train)0.1s: train loss = 0.68022 = (mf)0.68021 + (embed)0.00002
Epoch 2 (train)0.0s + (eval)0.0s: train loss = 0.67887 = (mf)0.67885 + (embed)0.00002, 
All embeddings are already calculated


In [20]:
# Request recommendations for two voters taken from the first fold.
# NOTE(review): the saved output lists five rows per voter for top_k=3 —
# confirm whether top_k applies per recommender or to the merged list.
hr.recommend_k_items(
    [
        '0x29d5cea7d511810f3ff754886b898fce16a6d8fd',
        '0x30b1f4bd5476906f38385b891f2c09973196b742',
    ],
    top_k=3,
)

Unnamed: 0,userID,itemID,prediction,rec
0,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,8a1ceac1-bde7-5953-8810-9d92f14bc767,2.167689,nlp
1,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,2a032d97-1c20-5307-af60-a2a43f1b1215,0.018846,gnn
2,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,12e24fcd-65a0-5065-b2f0-7907d54b32a9,2.147544,nlp
3,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,325bd14a-ef4c-5168-89b8-bf23aabd6d77,0.017913,gnn
4,0x29d5cea7d511810f3ff754886b898fce16a6d8fd,eb1556de-de48-5830-98ea-9f44f63e5fdd,2.141963,nlp
5,0x30b1f4bd5476906f38385b891f2c09973196b742,0a179f47-918b-5116-a453-bba13ecc477f,10.449469,nlp
6,0x30b1f4bd5476906f38385b891f2c09973196b742,e0367d20-22bc-5967-ad36-beb91a7d6525,0.01897,gnn
7,0x30b1f4bd5476906f38385b891f2c09973196b742,1c4da6f2-8c41-5850-9f8d-95d55da14651,10.322516,nlp
8,0x30b1f4bd5476906f38385b891f2c09973196b742,8bc92601-5cd5-5a1f-94c7-e0ede6574b4b,0.018682,gnn
9,0x30b1f4bd5476906f38385b891f2c09973196b742,afd31758-ab40-59cc-9c51-b3b9b31a541f,10.308725,nlp


# Tuning the model

In [None]:
class TrainerHybrid(tune.Trainable):
    def setup(
        self,
        config,
        data,
    ):
        self.config = config

        train, test, self.t, self.open_proposals = data

    