Based on the official PyTorch Geometric example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py

In [2]:
import os
from pathlib import Path
import random

import datetime as dt
import itertools as it
import functools as ft

from collections import namedtuple

from tqdm.notebook import tqdm # Progress bars
from tqdm.autonotebook import tqdm, trange

# https://import-as.github.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn as sk
from sklearn import preprocessing as pp

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.nn.conv import MessagePassing
from ray import tune
from ray.air import Checkpoint, session

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import src
from src.data import get_df, filter_df

RANDOM_SEED = 1701

  from tqdm.autonotebook import tqdm, trange


Hyperparameters table in [Google Drive](https://docs.google.com/spreadsheets/d/1riafpWt1563w9pbqdt1g2QZVkc7TfRWGzFaCG5rudDI/edit?usp=sharing)

In [3]:
# Dataset settings. Users with fewer than `min_votes_per_user` votes are
# removed from the dataset before splitting.
DatasetConfig = namedtuple(
    'DatasetConfig',
    ['min_votes_per_user', 'allowed_dao_names', 'num_folds'],
)
datasetConfig = DatasetConfig(
    num_folds=5,
    allowed_dao_names={'xDXdao', 'dxDAO'},
    min_votes_per_user=6,
)

# Baseline training hyperparameters (used for the smoke test of the training
# function; the hyperparameter search defines its own search space).
ModelConfig = namedtuple(
    'ModelConfig',
    ['max_epochs', 'batch_size', 'learning_rate', 'embedding_dim', 'conv_layers', 'l2'],
)
modelConfig = ModelConfig(
    l2=1e-4,
    conv_layers=3,
    embedding_dim=32,
    learning_rate=1e-4,
    batch_size=16,
    max_epochs=50,
)

# Reading data

In [4]:
import torch
from torch_geometric.data import InMemoryDataset, HeteroData, Data
from src.datasets import Daostack

def print_graph_stats(g: HeteroData):
    """Print the number of edges and the edge density of a graph.

    The density is ``num_edges / (num_nodes * (num_nodes - 1))``, i.e. the
    edge count relative to the number of ordered node pairs without
    self-loops. A graph with fewer than two nodes has no such pair, so its
    density is reported as 0 instead of raising ``ZeroDivisionError``.

    Args:
        g: Graph object exposing ``num_edges`` and ``num_nodes``.
    """
    possible_edges = g.num_nodes * (g.num_nodes - 1)
    density = g.num_edges / possible_edges if possible_edges > 0 else 0.0
    print(f'Edges:   {g.num_edges:12}')
    print(f'Density: {density*100:12.4f}%')

# Take the first (index 0) graph of the Daostack dataset stored under
# ./data/dao-analyzer/, forwarding the vote threshold and the DAO allow-list
# from datasetConfig.
data = Daostack("./data/dao-analyzer/", min_vpu=datasetConfig.min_votes_per_user, allowed_daos=datasetConfig.allowed_dao_names)[0]
print_graph_stats(data)
# Bare expression: the notebook shows the graph summary as the cell output.
data

Edges:          16606
Density:       0.3087%


HeteroData(
  voter={ num_nodes=104 },
  proposal={ num_nodes=2216 },
  (voter, votes, proposal)={ edge_index=[2, 8303] },
  (proposal, voted, voter)={ edge_index=[2, 8303] }
)

At first, I thought the RandomLinkSplit function was not working properly, but it turns out that I did not understand it very well. The tutorial I used for [01_mvp](./01_mvp.ipynb) is not very good either: it was written by students, and it was implemented before PyTorch Geometric bundled the LightGCN model.

> I think this is totally correct. It seems like you are looking at the shapes of edge_index, while you may want to look at the shapes of edge_label and edge_label_index (which correctly model a 80/10/10 split ratio). Here, edge_index is solely used for message passing, i.e.,
> 
> * for training, we exchange messages on all training edges
> * for validation, we exchange messages on all training edges
> * for testing, we exchange messages on all training and validation edges
> Let me know if this resolves your concerns :)
>
> -- [Split Error in RandomLinkSplit · Issue #3668 · pyg-team/pytorch_geometric · GitHub](https://github.com/pyg-team/pytorch_geometric/issues/3668)

In [4]:
def get_train_val_test(g: Data | HeteroData, train_ratio=0.75):
    """Randomly split the edges of ``g`` into train/validation/test graphs.

    Thin wrapper around ``PyG.transforms.RandomLinkSplit`` configured with
    ``num_val = 1 - train_ratio`` and ``num_test = 0``; negative samples are
    added to the training split.

    Args:
        g: Graph to split. For a ``HeteroData`` graph the first edge type is
            the one being split and, when a second edge type exists, it is
            passed as its reverse edge type.
        train_ratio: Fraction of the edges kept for training.

    Returns:
        Whatever the configured ``RandomLinkSplit`` transform returns for
        ``g`` (the notebook unpacks it as train, validation, test).

    Raises:
        TypeError: If ``g`` is neither a ``Data`` nor a ``HeteroData`` object.
    """
    split_kwargs = dict(
        is_undirected=True,
        num_val=1-train_ratio,
        # split_labels=True,
        add_negative_train_samples=True,
        num_test=0,
    )

    if isinstance(g, HeteroData):
        split_kwargs.update(
            edge_types=[g.edge_types[0]],
            rev_edge_types=[g.edge_types[1]] if len(g.edge_types) > 1 else None,
        )
    elif not isinstance(g, Data):
        # Fail with a clear message instead of forwarding an unsupported
        # object to RandomLinkSplit as a positional argument.
        raise TypeError(
            f'Expected a Data or HeteroData graph, got {type(g).__name__}'
        )

    return PyG.transforms.RandomLinkSplit(**split_kwargs)(g)

# Example split: train_ratio=7/8, so 1/8 of the edges go to validation
# (num_test is 0 inside get_train_val_test).
tr, val, ts = get_train_val_test(data, train_ratio=7/8)
# Bare expression: display the three resulting graphs.
tr, val, ts

(HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[14532],
     edge_label_index=[2, 14532],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[2074],
     edge_label_index=[2, 2074],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 8303],
     edge_label=[0],
     edge_label_index=[2, 0],
   },
   (proposal, voted, voter)={ edge_index=[2, 8303] }
 ))

In [5]:
from sklearn.model_selection import StratifiedKFold

def graph_k_fold(g: Data | HeteroData, folds, edge_type=None, rev_edge_type=None):
    """Build stratified K-fold (train, validation) edge splits of a graph.

    The edges of ``edge_type`` are split with ``StratifiedKFold`` using the
    source node of every edge (row 0 of ``edge_index``) as the stratification
    label. Each split is turned into two edge subgraphs of ``g``; the same
    edge positions are selected for ``edge_type`` and ``rev_edge_type``.
    Every training subgraph also receives a ``negative_samples`` attribute
    on ``edge_type``.

    Args:
        g: Graph to split. It must expose ``edge_types`` and accept a dict in
            ``edge_subgraph``.
        folds: Number of folds.
        edge_type: Edge type to split. Defaults to ``g.edge_types[0]``.
        rev_edge_type: Reverse of ``edge_type``. When ``edge_type`` is not
            given it defaults to ``g.edge_types[1]``; when ``edge_type`` is
            given it is looked up as the only other edge type whose
            endpoints are swapped.

    Returns:
        A list with ``folds`` tuples ``(train_graph, validation_graph)``.

    Raises:
        ValueError: If ``rev_edge_type`` has to be inferred and there is not
            exactly one candidate.
        AssertionError: If a subgraph is not undirected, or if some source
            node is missing from the training or validation edges of a fold.
    """
    if edge_type is None:
        edge_type = g.edge_types[0]
        if rev_edge_type is None:
            rev_edge_type = g.edge_types[1]
    elif rev_edge_type is None:
        # Previously this path left rev_edge_type unbound (NameError).
        src, dst = edge_type[0], edge_type[-1]
        candidates = [
            et for et in g.edge_types
            if et != edge_type and et[0] == dst and et[-1] == src
        ]
        if len(candidates) != 1:
            raise ValueError(
                f'Could not infer the reverse edge type of {edge_type}; '
                'pass rev_edge_type explicitly'
            )
        rev_edge_type = candidates[0]

    skf = StratifiedKFold(folds, shuffle=True, random_state=RANDOM_SEED)

    edge_index = g[edge_type].edge_index
    # Stratify by voter. sklearn needs host memory, so move the labels to the
    # CPU first (the graph may already live on the GPU).
    strat_labels = edge_index[0].cpu().numpy()
    # Only the length of X matters to StratifiedKFold.split
    placeholder = np.zeros(len(strat_labels))
    n_sources = len(edge_index[0].unique())

    def _fold_subgraph(idx):
        # Edge subgraph holding the selected edges in both directions
        subset = torch.as_tensor(idx, device=edge_index.device)
        sub = g.edge_subgraph({
            edge_type: subset,
            rev_edge_type: subset,
        })
        assert sub.is_undirected()
        # Every source node must keep at least one edge in the subgraph
        assert len(sub[edge_type].edge_index[0].unique()) == n_sources
        return sub

    fold_pairs = []
    for train_idx, val_idx in skf.split(placeholder, strat_labels):
        gtrain = _fold_subgraph(train_idx)
        # NOTE(review): negative sampling draws from the global torch RNG, so
        # it is not covered by RANDOM_SEED — confirm whether that is intended.
        gtrain[edge_type].negative_samples = PyG.utils.negative_sampling(gtrain[edge_type].edge_index, num_nodes=gtrain.num_nodes)
        gval = _fold_subgraph(val_idx)
        assert (gtrain[edge_type].edge_index[0].unique() == gval[edge_type].edge_index[0].unique()).all()

        fold_pairs.append((gtrain, gval))

    return fold_pairs

# Build the cross-validation folds once; later cells index into graph_folds.
graph_folds = graph_k_fold(data, datasetConfig.num_folds)
# Bare expression: display the list of (train, validation) graphs.
graph_folds

[(HeteroData(
    voter={ num_nodes=104 },
    proposal={ num_nodes=2216 },
    (voter, votes, proposal)={
      edge_index=[2, 6642],
      negative_samples=[2, 6642],
    },
    (proposal, voted, voter)={ edge_index=[2, 6642] }
  ),
  HeteroData(
    voter={ num_nodes=104 },
    proposal={ num_nodes=2216 },
    (voter, votes, proposal)={ edge_index=[2, 1661] },
    (proposal, voted, voter)={ edge_index=[2, 1661] }
  )),
 (HeteroData(
    voter={ num_nodes=104 },
    proposal={ num_nodes=2216 },
    (voter, votes, proposal)={
      edge_index=[2, 6642],
      negative_samples=[2, 6642],
    },
    (proposal, voted, voter)={ edge_index=[2, 6642] }
  ),
  HeteroData(
    voter={ num_nodes=104 },
    proposal={ num_nodes=2216 },
    (voter, votes, proposal)={ edge_index=[2, 1661] },
    (proposal, voted, voter)={ edge_index=[2, 1661] }
  )),
 (HeteroData(
    voter={ num_nodes=104 },
    proposal={ num_nodes=2216 },
    (voter, votes, proposal)={
      edge_index=[2, 6642],
      negativ

In [6]:
# Trying to make sense of all of this: check what happens to the
# RandomLinkSplit labels when the training graph is made homogeneous.

th = tr.to_homogeneous()
print(th)
print(np.unique(th.edge_label))
# After to_homogeneous(), edge_label is longer than edge_label_index; only its
# first edge_label_index.size(1) entries are compared here (np.unique above
# shows that NaN values are present in edge_label).
pos = th.edge_label_index[:, th.edge_label[:th.edge_label_index.size(1)] == 1]
# The edges labelled 1 must be exactly the edges of the first edge type
assert (pos == th.edge_index[:, th.edge_type==0]).all()
# Not displayed (not the last expression); kept for interactive inspection
pos.size(), th.edge_index.size()
th.node_type

Data(edge_index=[2, 14532], edge_label=[21798], edge_label_index=[2, 14532], node_type=[2320], edge_type=[14532])
[ 0.  1. nan]


tensor([0, 0, 0,  ..., 1, 1, 1])

In [7]:
def ensure_homogeneous(*args):
    """Convert every ``HeteroData`` argument to a homogeneous graph.

    Arguments that are not ``HeteroData`` are passed through untouched. When
    the converted graph has an ``edge_label`` attribute, the entries beyond
    ``edge_label_index.size(1)`` (asserted to be all NaN) are dropped and the
    remaining labels are cast to bool.

    Returns:
        The single converted graph when exactly one argument is given,
        otherwise a tuple with one entry per argument.
    """
    converted = []
    for graph in args:
        if not isinstance(graph, HeteroData):
            converted.append(graph)
            continue

        homo = graph.to_homogeneous()
        if hasattr(homo, 'edge_label'):
            n_labels = homo.edge_label_index.size(1)
            # Remove the trailing NaN labels
            assert homo.edge_label[n_labels:].isnan().all()
            homo.edge_label = homo.edge_label[:n_labels].bool()
        converted.append(homo)

    if len(converted) == 1:
        return converted[0]
    return tuple(converted)

# Sanity check: after the conversion edge_label is boolean, so it can be used
# directly as a column mask to select the edges labelled True.
_aux = ensure_homogeneous(val)
_aux.edge_label_index[:, _aux.edge_label]

tensor([[  75,   79,   99,  ...,   66,    8,   54],
        [1145,  908, 1585,  ...,  578, 1602, 1890]])

## Using the LightGCN

In [8]:
# Pick the training device: CUDA when available, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [9]:
from torch_geometric.nn import LightGCN

# Based on:
# - https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# - https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
# - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
def train_daostack(train: HeteroData, validation: HeteroData, original: HeteroData, modelConfig: ModelConfig, disable_tqdm=False):
    """Train a LightGCN model on one fold and report metrics through Ray.

    After every epoch the function calls ``session.report`` with the mean
    training loss, R-precision, precision@5 and recall@5, together with a
    checkpoint holding the epoch number and the model/optimizer state. When
    ``session.get_checkpoint()`` returns a checkpoint, training resumes from
    the epoch after the one stored in it.

    Args:
        train: Training graph. Its ('voter', 'votes', 'proposal') edges are
            used for message passing and as positive examples; its
            ``negative_samples`` attribute on that edge type provides the
            negative examples (same column index as the positives).
        validation: Validation graph; its edges are the message-passing graph
            of the "test" metrics.
        original: Full graph. Its edges are the ground truth of every metric
            and its ``num_nodes`` sizes the embedding table.
        modelConfig: A ``ModelConfig`` or a mapping with the same fields.
        disable_tqdm: Disable the progress bars.

    Returns:
        None; results are published with ``session.report``.

    NOTE(review): the metric code subtracts ``n_users`` from the proposal
    ids, so it presumably expects proposal ids in ``edge_index`` to be offset
    by the number of voters — confirm against the dataset class. All graphs
    are expected to be on the notebook-level ``device`` already.
    """
    if not isinstance(modelConfig, ModelConfig):
        modelConfig = ModelConfig(**modelConfig)

    edge_type = ('voter', 'votes', 'proposal')

    # Size the embedding table from the graph passed as argument instead of
    # relying on the notebook-global `data`.
    model = LightGCN(
        num_nodes=original.num_nodes,
        embedding_dim=modelConfig.embedding_dim,
        num_layers=modelConfig.conv_layers,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=modelConfig.learning_rate)

    checkpoint = session.get_checkpoint()

    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        # The checkpoint stores the last *completed* epoch: resume after it
        # instead of training that epoch a second time.
        start_epoch = checkpoint_state["epoch"] + 1
        model.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    # Use all message passing edges as training labels
    assert train.is_undirected()
    assert validation.is_undirected()

    n_users = train['voter'].num_nodes
    n_items = train['proposal'].num_nodes
    users = torch.arange(0, n_users, device=device)
    items = torch.arange(n_users, n_items+n_users, device=device)

    # In message passing, bidirectional edges may cause duplicate information
    # to be passed between nodes, so only the voter -> proposal direction is
    # given to the model. The official LightGCN example does the same (for
    # homogeneous graphs):
    # - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
    pos_edge_label_index = train[edge_type].edge_index
    neg_edge_label_index = train[edge_type].negative_samples

    # TODO: Use LinkLoader instead (i don't know how)
    # Waiting for pyg-team/pytorch_geometric#7817
    # The loader yields batches of column indices into the edge tensors above.
    train_loader = torch.utils.data.DataLoader(
        range(pos_edge_label_index.size(1)), # dataset
        batch_size=modelConfig.batch_size,
        shuffle=True,
    )

    @torch.no_grad()
    def _prec_rec(k: int, remove_training=False):
        """Return (precision@k, recall@k, R-precision) averaged over users.

        The ground truth is always the full edge set of ``original``. The
        recommendations are computed by propagating over the validation
        edges when ``remove_training`` is true, otherwise over all edges.
        """
        # gt: ground truth (all edges)
        gt_index = original[edge_type].edge_index
        if remove_training:
            edge_index = validation[edge_type].edge_index
        else:
            # All edges
            edge_index = original[edge_type].edge_index

        # R: number of ground-truth items of every user
        R = item_count = PyG.utils.degree(gt_index[0], num_nodes=n_users)
        # Top max(R) recommendations per user; the top-k is a prefix of it
        topr = model.recommend(edge_index, src_index=users, dst_index=items, k=int(R.max()))

        n_samples = len(users)

        # Boolean user x item matrix of ground-truth interactions
        ground_truth = torch.full((n_users, n_items), False, dtype=torch.bool, device=device)
        ground_truth[gt_index[0], gt_index[1] - n_users] = True

        # isin_rmat[u, j]: is the j-th recommendation of user u a true item?
        isin_rmat = ground_truth.gather(1, topr - n_users)
        isin_mat = isin_rmat[:, :k]

        # prec/rec must be computed BEFORE the in-place masking below, as
        # isin_mat is a view of isin_rmat
        prec = (isin_mat.sum(dim=-1) / k).sum() / n_samples
        rec = (isin_mat.sum(dim=-1) / item_count).sum() / n_samples

        # Now mask isin_rmat to get only up to :R elements
        msk = torch.arange(1, R.max()+1, device=device) > R.unsqueeze(1)
        isin_rmat[msk] = 0
        rprec = (isin_rmat.sum(dim=-1) / R).sum() / n_samples

        return float(prec), float(rec), float(rprec)

    for epoch in trange(start_epoch, modelConfig.max_epochs, disable=disable_tqdm):
        # index is an array of batch_size that indicates which edges from
        # the training edge tensors we should use
        acc_loss = n_samples = 0
        for index in tqdm(train_loader, leave=False, delay=1, disable=disable_tqdm):
            pos_edge_index = pos_edge_label_index[:, index]
            neg_edge_index = neg_edge_label_index[:, index]
            # Positives first, negatives second: .chunk(2) below relies on it
            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            optimizer.zero_grad()
            pos_rank, neg_rank = model(train[edge_type].edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
                lambda_reg=modelConfig.l2,
            )
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size to report a per-sample mean
            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        checkpoint = Checkpoint.from_dict({
            'epoch': epoch,
            'net_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        })

        # "train" metrics propagate over all edges, "test" metrics only over
        # the validation edges (see _prec_rec)
        prec5, rec5, rprec = _prec_rec(5, remove_training=False)
        prec5t, rec5t, rprect = _prec_rec(5, remove_training=True)
        session.report({
            'loss': acc_loss/n_samples,
            'rprec train': rprec, 'rprec test': rprect,
            'p@5 train': prec5, 'p@5 test': prec5t,
            'r@5 train': rec5, 'r@5 test': rec5t,
        }, checkpoint=checkpoint)

# Smoke test: train for two epochs on the first fold to catch syntax/runtime
# errors before launching the hyperparameter search. The call returns None,
# so the cell displays nothing.
train_daostack(
    graph_folds[0][0].to(device),
    graph_folds[0][1].to(device),
    data.to(device),
    modelConfig._replace(max_epochs=2),
)



  0%|          | 0/2 [00:00<?, ?it/s]



(None,)

In [10]:
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import Repeater
from ray.tune.search.hyperopt import HyperOptSearch

def _aux_train_daostack(config):
    # TODO: Is bad practice to pass a dataset trainable
    # config['embedding_dim'] = 2**config['embedding_dim']
    config['batch_size'] = 2**config['batch_size']
    n_fold = config.pop('__trial_index__')
    train, validation = graph_folds[n_fold]
    return train_daostack(train.to(device), validation.to(device), data.to(device), config, disable_tqdm=True)

# Hyperparameter search space, expressed with the same fields as ModelConfig.
tryConfigs = ModelConfig(
    max_epochs=50,
    # Sampled as an exponent; _aux_train_daostack turns it into 2**batch_size.
    # The upper bound of tune.randint is exclusive: 2**4 .. 2**9 (16..512)
    batch_size=tune.randint(4,10),
    learning_rate=tune.qloguniform(1e-5, 1, 1e-5),
    # embedding_dim=tune.randint(4,8), # 16..128
    embedding_dim=tune.qlograndint(10, 500, 5),
    conv_layers=tune.randint(2,6),
    l2=tune.loguniform(1e-9, 1e-1),
)

# It is recommended to not use Repeater with a TrialScheduler. Early termination can negatively affect the average reported metric.
asha_scheduler = None
# asha_scheduler = ASHAScheduler(
#     time_attr='training_iteration',
#     max_t=50,
#     grace_period=5,
#     reduction_factor=3,
#     brackets=1,
# )

# Every configuration proposed by HyperOpt is repeated once per fold
search_alg = Repeater(HyperOptSearch(), datasetConfig.num_folds)

# Every run takes approx half a gig of vram (no optimizations)
# The RTX 4090 has 24GB so we can run the model about 48 times
_has_cuda = torch.cuda.is_available()
resources_per_trial = dict(
    cpu=1,
    memory=0 if _has_cuda else 2e9,
    gpu=1/32 if _has_cuda else 0,
)

tune_config = tune.TuneConfig(
    # time_budget_s=60,
    num_samples=datasetConfig.num_folds*500,
    scheduler=asha_scheduler,
    search_alg=search_alg,
    metric='rprec test',
    mode='max',
)
tuner = tune.Tuner(
    tune.with_resources(_aux_train_daostack, resources_per_trial),
    param_space=tryConfigs._asdict(),
    tune_config=tune_config,
)
exp = tuner.fit()

0,1
Current time:,2023-08-04 19:24:48
Running for:,01:14:12.61
Memory:,8.3/125.6 GiB

Trial name,status,loc,__trial_index__,batch_size,conv_layers,embedding_dim,l2,learning_rate,max_epochs,iter,total time (s),loss,rprec train,rprec test
_aux_train_daostack_be19c589,TERMINATED,147.96.81.131:174822,0,8,5,90,2.77341e-09,0.00594,50,50,2.45824,0.00618729,0.239733,0.0694089
_aux_train_daostack_f84a6ea6,TERMINATED,147.96.81.131:174872,1,8,5,90,2.77341e-09,0.00594,50,50,3.18823,0.00491508,0.236278,0.0997582
_aux_train_daostack_dcf20589,TERMINATED,147.96.81.131:174822,2,8,5,90,2.77341e-09,0.00594,50,50,2.98848,0.0054812,0.193913,0.0845103
_aux_train_daostack_338bf529,TERMINATED,147.96.81.131:174963,3,8,5,90,2.77341e-09,0.00594,50,50,4.2739,0.0049411,0.210671,0.097345
_aux_train_daostack_ebffa50f,TERMINATED,147.96.81.131:174872,4,8,5,90,2.77341e-09,0.00594,50,50,3.62039,0.00697652,0.236285,0.0993206
_aux_train_daostack_31a7dc52,TERMINATED,147.96.81.131:174822,0,8,3,40,2.01295e-08,6e-05,50,50,3.49953,0.576786,0.193067,0.0523402
_aux_train_daostack_108d5e96,TERMINATED,147.96.81.131:175078,1,8,3,40,2.01295e-08,6e-05,50,50,4.55111,0.566462,0.190192,0.0939734
_aux_train_daostack_5dab6520,TERMINATED,147.96.81.131:174872,2,8,3,40,2.01295e-08,6e-05,50,50,4.12585,0.613595,0.19287,0.0919849
_aux_train_daostack_de11200e,TERMINATED,147.96.81.131:174963,3,8,3,40,2.01295e-08,6e-05,50,50,4.17974,0.582421,0.190275,0.0839503
_aux_train_daostack_b59d0a27,TERMINATED,147.96.81.131:174822,4,8,3,40,2.01295e-08,6e-05,50,50,4.34139,0.57276,0.194017,0.0701842


[2m[36m(bundle_reservation_check_func pid=187223)[0m 2023-08-04 19:22:39,788	ERROR worker.py:779 -- Worker exits with an exit code 1.
[2m[36m(bundle_reservation_check_func pid=187223)[0m Traceback (most recent call last):
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 1787, in ray._raylet.task_execution_handler
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 1684, in ray._raylet.execute_task_with_cancellation_handler
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 1366, in ray._raylet.execute_task
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 1367, in ray._raylet.execute_task
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 1370, in ray._raylet.execute_task
[2m[36m(bundle_reservation_check_func pid=187223)[0m   File "python/ray/_raylet.pyx", line 13

In [11]:
# One row per trial, without Ray's bookkeeping columns
exp_df = exp.get_dataframe()
exp_df = exp_df.drop(columns=['hostname', 'node_ip', 'logdir', 'should_checkpoint', 'pid'])
# Best precision@5 on the test metric first
exp_df.sort_values(by='p@5 test', ascending=False)

Unnamed: 0,loss,rprec train,rprec test,p@5 train,p@5 test,r@5 train,r@5 test,time_this_iter_s,done,training_iteration,...,time_total_s,time_since_restore,iterations_since_restore,config/__trial_index__,config/batch_size,config/conv_layers,config/embedding_dim,config/l2,config/learning_rate,config/max_epochs
1899,0.257128,0.424381,0.237711,0.569231,0.530769,0.106530,0.115097,0.632266,False,50,...,34.181666,34.181666,50,4,8,5,390,1.468841e-09,0.43554,50
1239,0.262197,0.477308,0.243353,0.628846,0.523077,0.114746,0.108436,0.971889,False,50,...,56.063737,56.063737,50,4,7,4,380,1.730752e-09,0.48846,50
2019,0.178992,0.403897,0.237665,0.548077,0.521154,0.095035,0.107429,0.374413,False,50,...,18.718153,18.718153,50,4,9,5,465,1.000882e-09,0.39469,50
1419,0.364647,0.446062,0.249872,0.576923,0.517308,0.115157,0.118423,0.302377,False,50,...,15.469065,15.469065,50,4,9,4,255,1.930908e-09,0.99838,50
2169,0.226340,0.399207,0.232635,0.540385,0.511539,0.093875,0.107283,0.259620,False,50,...,17.434805,17.434805,50,4,9,5,330,1.782130e-07,0.47550,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.285403,0.187560,0.043266,0.286538,0.055769,0.031405,0.002722,0.220665,False,50,...,11.232218,11.232218,50,0,9,3,10,1.555773e-05,0.00184,50
70,0.474831,0.191620,0.046120,0.294231,0.055769,0.033519,0.003258,0.440742,False,50,...,22.156305,22.156305,50,0,8,5,15,2.025833e-06,0.00021,50
1155,0.446818,0.191526,0.048986,0.294231,0.055769,0.033519,0.007200,0.315804,False,50,...,16.052947,16.052947,50,0,9,5,210,1.632579e-04,0.00014,50
1885,0.456367,0.191386,0.048974,0.294231,0.051923,0.033519,0.005323,0.340755,False,50,...,18.164279,18.164279,50,0,9,5,320,3.336936e-09,0.00010,50


# Using all of this

I will create a function that receives a user's address and returns k proposals that may interest them.

In [12]:
def recommend(user: str, K: int = 12, ignore_train: bool=False):
    """Return the ``K`` proposals with the highest relevance score for a user.

    Relies on notebook-level objects: ``model``, ``edge_index``,
    ``encoder_user``, ``encoder_prop``, ``vpu``, ``train_df``, ``dfp`` and
    ``dfv``.
    NOTE(review): none of them is defined in the visible cells of this
    notebook — confirm where they come from before running this cell.

    Args:
        user: Address of the user, as known to ``encoder_user``.
        K: Number of proposals to return.
        ignore_train: When true, the (uid, pid) pairs of ``train_df`` are
            excluded from the ranking by setting their score to ``-inf``.
            They can only appear in the result if ``K`` is larger than the
            number of remaining proposals.

    Returns:
        A copy of the selected rows of ``dfp`` with an extra ``userVoted``
        column telling whether ``user`` appears among the voters of each
        proposal in ``dfv``.
    """
    uid = encoder_user.transform([user])[0]
    print(f"Recommending {K} proposals for user {user} (uid:{uid}) with {vpu.at[user]} votes")

    # Getting embedding (inference only, no gradients needed)
    with torch.no_grad():
        out = model(edge_index)
    user_embed, item_embed = torch.split(out, (model.n_users, model.n_items))
    relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
    if ignore_train:
        # Mask out training user-item interactions: we are only interested in
        # novel items, as a user won't be interested in "voting again".
        # Scores can be negative, so multiplying by 0 would rank the masked
        # items ABOVE negative ones; -inf always ranks last. A boolean mask
        # is also insensitive to duplicated (uid, pid) pairs.
        seen = torch.zeros(
            (model.n_users, model.n_items),
            dtype=torch.bool,
            device=relevance_score.device,
        )
        train_uids = torch.as_tensor(train_df['uid'].values, dtype=torch.long, device=relevance_score.device)
        train_pids = torch.as_tensor(train_df['pid'].values, dtype=torch.long, device=relevance_score.device)
        seen[train_uids, train_pids] = True
        relevance_score = relevance_score.masked_fill(seen, float('-inf'))

    topk_relevance_indices = torch.topk(relevance_score, K).indices

    pids = topk_relevance_indices[uid].tolist()
    # Copy so that adding a column does not write into a slice of dfp
    proposals = dfp.loc[encoder_prop.inverse_transform(pids)].copy()

    proposals['userVoted'] = dfv.groupby('proposal')['voter'].apply(lambda x: user in set(x))

    print(f"precision@{K}={sum(proposals['userVoted'])/len(proposals)*100:.2f}%")

    return proposals

# Fixed example address; the trailing comment shows how a random user with
# more than 12 and fewer than 38 votes could be sampled from vpu instead.
user = "0x334f12afb7d8740868be04719639616533075234" # vpu[(12 < vpu) & (vpu < 38)].sample().index[0]
# Display only a subset of the columns of the recommended proposals.
recommend(user, ignore_train=True)[['network', 'createdAt', 'title', 'description', 'userVoted']]

NameError: name 'encoder_user' is not defined

In [None]:
# Show every row of dfv that belongs to one specific proposal id.
# NOTE(review): dfv is not defined in the visible cells of this notebook — confirm where it is loaded.
dfv[dfv['proposal'] == '0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e']