Based on Pytorch Geometric official example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py

In [1]:
import os
from pathlib import Path
import random

import datetime as dt
import itertools as it
import functools as ft

from collections import namedtuple

from tqdm.notebook import tqdm # Progress bars
from tqdm.autonotebook import tqdm, trange

# https://import-as.github.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn as sk
from sklearn import preprocessing as pp

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.nn.conv import MessagePassing

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import src
from src.data import get_df, filter_df

POSSIBLE_PATHS = ["./datawarehouse", "/kaggle/input/dao-analyzer"]

DW = None
for p in POSSIBLE_PATHS:
    DW = Path(p)
    if DW.is_dir():
        break
else:
    print("No se ha encontrado el DW")

src.data.DEFAULT_PATH = DW

  from tqdm.autonotebook import tqdm, trange


Hyperparameters table in [Google Drive](https://docs.google.com/spreadsheets/d/1riafpWt1563w9pbqdt1g2QZVkc7TfRWGzFaCG5rudDI/edit?usp=sharing)

In [2]:
# Remove users with less than 6 votes from the dataset before splitting
DatasetConfig = namedtuple('DatasetConfig', ('min_votes_per_user', 'allowed_dao_names', 'train_split'))
datasetConfig = DatasetConfig(
    min_votes_per_user=6,
    allowed_dao_names={'dxDAO', 'xDXdao'},
    train_split=1/5,
)

ModelConfig = namedtuple('ModelConfig', 'max_epochs batch_size learning_rate embedding_dim conv_layers')
modelConfig = ModelConfig(
    max_epochs=50,
    batch_size=16,
    learning_rate=0.0001,
    embedding_dim=32,
    conv_layers=3,
)

# Reading data

In [3]:
import torch
from torch_geometric.data import InMemoryDataset, HeteroData, Data

class Daostack(InMemoryDataset):
    """ Creates a heterogeneus graph with two kinds of nodes: voters and proposals """
    def __init__(self, root: str, min_vpu=6, allowed_daos=None):
        self._min_vpu = min_vpu
        self._allowed_daos = allowed_daos
        
        super().__init__(root)

        self.data = torch.load(self.processed_paths[0])

    def process(self):
        import pandas as pd

        df = pd.read_csv(self.raw_paths[0])

        if self._allowed_daos:
            dfd = pd.read_csv(self.raw_paths[1]).set_index('id')
            allowed_dao_ids = set(dfd[dfd['name'].isin(self._allowed_daos)].index)
            df = df[df['dao'].isin(allowed_dao_ids)]
            assert not df.empty, "Dataframe is empty"
            
        if self._min_vpu:
            vpu = df.groupby('voter').size()
            allowed_voters = vpu[vpu >= self._min_vpu].index
            df = df[df['voter'].isin(allowed_voters)]
        
        data = HeteroData()
        node_types = ['voter', 'proposal']
        for nt in node_types:
            df[nt] = df[nt].astype('category')
            data[nt].num_nodes = df[nt].nunique()

        u_t = torch.LongTensor(df['voter'].cat.codes)
        p_t = torch.LongTensor(df['proposal'].cat.codes)

        edge_index = torch.stack([
            torch.cat([u_t, p_t]),
            torch.cat([p_t, u_t]),
        ])

        data['voter', 'votes', 'proposal']['edge_index'] = torch.stack([u_t, p_t])
        data['proposal', 'voted', 'voter']['edge_index'] = torch.stack([p_t, u_t])

        data.validate()
        assert not data.is_directed(), "The created graph shouldn't be directed"

        torch.save(data, self.processed_paths[0])

    @property
    def raw_dir(self) -> str:
        return self.root / 'daostack'

    @property
    def processed_dir(self) -> str:
        return self.raw_dir / 'processed'

    @property
    def raw_file_names(self) -> str:
        return ['votes.csv', 'daos.csv']
    
    @property
    def processed_file_names(self) -> str:
        return f"daostack_votes_{self._min_vpu}_{'-'.join(self._allowed_daos)}.pt"

def print_graph_stats(g: HeteroData):
    density = (g.num_edges) / (g.num_nodes*(g.num_nodes-1))
    print(f'Edges:   {g.num_edges:12}')
    print(f'Density: {density*100:12.4f}%')

data = Daostack(DW, min_vpu=datasetConfig.min_votes_per_user, allowed_daos=datasetConfig.allowed_dao_names)[0]
print_graph_stats(data)
data

Edges:          16606
Density:       0.3087%


HeteroData(
  voter={ num_nodes=104 },
  proposal={ num_nodes=2216 },
  (voter, votes, proposal)={ edge_index=[2, 8303] },
  (proposal, voted, voter)={ edge_index=[2, 8303] }
)

At first, I thought the RandomLinkSplit function was not working properly, but it turns out that I wasn't understanding it very well. The tutorial I used for [01_mvp](./01_mvp.ipynb) is not very good either, it was written by students, and implemented before PyTorch Geometric bundled the LightGCN model with it.

> I think this is totally correct. It seems like you are looking at the shapes of edge_index, while you may want to look at the shapes of edge_label and edge_label_index (which correctly model a 80/10/10 split ratio). Here, edge_index is solely used for message passing, i.e.,
> 
> * for training, we exchange messages on all training edges
> * for validation, we exchange messages on all training edges
> * for testing, we exchange messages on all training and validation edges
> Let me know if this resolves your concerns :)
>
> -- [Split Error in RandomLinkSplit · Issue #3668 · pyg-team/pytorch_geometric · GitHub](https://github.com/pyg-team/pytorch_geometric/issues/3668)

In [4]:
def get_train_val_test(g: Data | HeteroData, train_ratio=0.75):
    t = ft.partial(PyG.transforms.RandomLinkSplit, 
        is_undirected=True,
        num_val=1-train_ratio,
        # split_labels=True,
        add_negative_train_samples=True,
        num_test=0,
    )
    
    if isinstance(g, HeteroData):
        t = t(
            edge_types=[g.edge_types[0]],
            rev_edge_types=[g.edge_types[1]] if len(g.edge_types) > 1 else None,
        )
    elif isinstance(g, Data):
        t = t()
            
    return t(g)

tr, val, ts = get_train_val_test(data, train_ratio=7/8)
tr, val, ts

(HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[14532],
     edge_label_index=[2, 14532],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[2074],
     edge_label_index=[2, 2074],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 8303],
     edge_label=[0],
     edge_label_index=[2, 0],
   },
   (proposal, voted, voter)={ edge_index=[2, 8303] }
 ))

In [5]:
# Trying to make sense of all of this

th = tr.to_homogeneous()
print(th)
print(np.unique(th.edge_label))
pos = th.edge_label_index[:, th.edge_label[:14532] == 1]
assert (pos == th.edge_index[:, th.edge_type==0]).all()
pos.size(), th.edge_index.size()
th.node_type

Data(edge_index=[2, 14532], edge_label=[21798], edge_label_index=[2, 14532], node_type=[2320], edge_type=[14532])
[ 0.  1. nan]


tensor([0, 0, 0,  ..., 1, 1, 1])

In [7]:
def ensure_homogeneous(*args):
    def _apply(g):
        if isinstance(g, HeteroData):
            hg = g.to_homogeneous()
            # Removing final na
            if hasattr(hg, 'edge_label'):
                assert hg.edge_label[hg.edge_label_index.size(1):].isnan().all()
                hg.edge_label = hg.edge_label[:hg.edge_label_index.size(1)].bool()
            return hg
        else:
            return g

    ret = tuple(_apply(g) for g in args)
    if len(ret) == 1:
        return ret[0]
    else:
        return ret

_aux = ensure_homogeneous(val)
_aux.edge_label_index[:, _aux.edge_label]

tensor([[  73,    7,   74,  ...,   29,   41,   57],
        [ 931,  498,  784,  ...,  212, 1458,  955]])

## Using the LightGCN

In [None]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

In [None]:
from torch_geometric.nn import LightGCN
from ray import tune
from ray.air import Checkpoint, session

# Based on:
# - https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# - https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
# - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
def train_daostack(train: HeteroData, validation: HeteroData, test: HeteroData, modelConfig: ModelConfig, disable_tqdm=False):
    if not isinstance(modelConfig, ModelConfig):
        modelConfig = ModelConfig(**modelConfig)
    
    model = LightGCN(
        num_nodes=data.num_nodes,
        embedding_dim=modelConfig.embedding_dim,
        num_layers=modelConfig.conv_layers,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=modelConfig.learning_rate)

    # checkpoint = session.get_checkpoint()
    checkpoint = None

    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"]
        model.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    # Use all message passing edges as training labels
    # TODO: The train/test data should also be saved in a checkpoint?
    # TODO: Transform the graph to homogeneous
    assert train.is_undirected()
    assert validation.is_undirected()

    train, validation, test = ensure_homogeneous(train, validation, test)

    users = torch.nonzero(train.node_type == 0).squeeze()
    items = torch.nonzero(train.node_type == 1).squeeze()
    n_users = len(users)
    n_items = len(items)

    # In message passing, bidirectional edges may cause duplicate information to
    # be passed between nodes.
    # The official LightGCN usage also uses this line of code (well, for homo graphs)
    # - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
    
    # train_edge_label_index = train.edge_index[:, train.edge_type == 0]
    # train.edge_label = train.edge_label[:train.edge_label_index.size(1)] # Now this is done inside ensure_homogeneous
    pos_edge_label_index = train.edge_label_index[:, train.edge_label == 1]
    neg_edge_label_index = train.edge_label_index[:, train.edge_label == 0]

    # TODO: Use LinkLoader instead (i don't know how)
    # Waiting for pyg-team/pytorch_geometric#7817
    # train_loader = PyG.loader.LinkLoader(
    train_loader = torch.utils.data.DataLoader(
        range(pos_edge_label_index.size(1)), # dataset
        batch_size=modelConfig.batch_size,
        shuffle=True,
    )

    @torch.no_grad()
    def _prec_rec(k: int, remove_training=False):
        # [104, 5]
        gt_index = test.edge_index
        if remove_training:
            edge_index = validation.edge_label_index[:, validation.edge_label]
        else:
            # All edges
            edge_index = test.edge_index

        topk = model.recommend(edge_index, src_index=users, dst_index=items, k=k)
        n_samples = len(users)

        # [104, 2216]
        ground_truth = torch.full((n_users, n_items), False, dtype=torch.bool, device=device)
        # print('ground_truth.size()', ground_truth.size())
        ground_truth[gt_index[0], gt_index[1] - n_users] = True

        # print('topk.size()', topk.size())
        # print('topk.min(), max():', topk.min(), topk.max())
        isin_mat = ground_truth.gather(1, topk - n_users)
        item_count = PyG.utils.degree(pos_edge_label_index[0], num_nodes=n_users)

        prec = (isin_mat.sum(dim=-1) / k).sum() / n_samples
        rec = (isin_mat.sum(dim=-1) / item_count).sum() / n_samples

        # print('prec, rec:', (prec, rec))
        
        return float(prec), float(rec)

    for epoch in trange(start_epoch, modelConfig.max_epochs, disable=disable_tqdm):
        # index is an array of batch_size that indicates which edges from 
        # train.edge_index we should use
        acc_loss = n_samples = 0
        for index in tqdm(train_loader, leave=False, delay=1, disable=disable_tqdm):
            pos_edge_index = pos_edge_label_index[:, index]
            # neg_edge_index = torch.stack([
            #     pos_edge_index[0],
            #     # TODO: Use generated negative samples instead
            #     torch.randint(n_users, n_users+n_items, index.size(),device=device),
            # ])
            neg_edge_index = neg_edge_label_index[:, index]
            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ])

            optimizer.zero_grad()

            pos_rank, neg_rank = model(train.edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
            )
            loss.backward()
            optimizer.step()

            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        checkpoint = Checkpoint.from_dict({
            'epoch': epoch,
            'net_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        })

        # Todo: Add val accuracy (pr@5, rec@5, etc.)
        prec5, rec5 = _prec_rec(5, remove_training=False)
        prec5t, rec5t = _prec_rec(5, remove_training=True)
        session.report({
            'loss': acc_loss/n_samples,
            'p@5 train': prec5, 'p@5 test': prec5t,
            'r@5 train': rec5, 'r@5 test': rec5t,
        }, checkpoint=checkpoint)

# Testing just syntax errors
train_daostack(tr.to(device), val.to(device), ts.to(device), ModelConfig(**(modelConfig._asdict() | {'max_epochs':2}))),

In [15]:
def _aux_train_daostack(config):
    return train_daostack(tr.to(device), val.to(device), ts.to(device), config, disable_tqdm=True)

# Every run takes approx half a gig of vram (no optimizations)
# The RTX 4090 has 24GB so we can run the model about 48 times
exp = tune.run(
    _aux_train_daostack,
    resources_per_trial={
        'cpu': 1,
        'gpu': 1/32,
    },
    num_samples=5,
    config=ModelConfig(
        max_epochs=10,
        embedding_dim=tune.choice([16,32,64,128]),
        conv_layers=tune.choice([2,3,4,5]),
        learning_rate=tune.loguniform(1e-5, 1e-1),
        batch_size=tune.choice([16,32,64,128,256,512,1024]),
    )._asdict(),
)

2023-08-01 11:46:57,295	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-01 11:47:23
Running for:,00:00:26.51
Memory:,7.1/125.6 GiB

Trial name,# failures,error file
_aux_train_daostack_1e494_00000,1,"/home/daviddavo/ray_results/_aux_train_daostack_2023-08-01_11-46-57/_aux_train_daostack_1e494_00000_0_batch_size=1024,conv_layers=3,embedding_dim=32,learning_rate=0.0001_2023-08-01_11-46-57/error.txt"
_aux_train_daostack_1e494_00001,1,"/home/daviddavo/ray_results/_aux_train_daostack_2023-08-01_11-46-57/_aux_train_daostack_1e494_00001_1_batch_size=32,conv_layers=2,embedding_dim=16,learning_rate=0.0191_2023-08-01_11-46-57/error.txt"
_aux_train_daostack_1e494_00003,1,"/home/daviddavo/ray_results/_aux_train_daostack_2023-08-01_11-46-57/_aux_train_daostack_1e494_00003_3_batch_size=128,conv_layers=2,embedding_dim=64,learning_rate=0.0165_2023-08-01_11-46-57/error.txt"
_aux_train_daostack_1e494_00004,1,"/home/daviddavo/ray_results/_aux_train_daostack_2023-08-01_11-46-57/_aux_train_daostack_1e494_00004_4_batch_size=32,conv_layers=3,embedding_dim=64,learning_rate=0.0024_2023-08-01_11-46-57/error.txt"

Trial name,status,loc,batch_size,conv_layers,embedding_dim,learning_rate
_aux_train_daostack_1e494_00002,RUNNING,147.96.81.131:33741,16,5,32,1.7001e-05
_aux_train_daostack_1e494_00000,ERROR,147.96.81.131:33739,1024,3,32,0.000146309
_aux_train_daostack_1e494_00001,ERROR,147.96.81.131:33740,32,2,16,0.0191353
_aux_train_daostack_1e494_00003,ERROR,147.96.81.131:33742,128,2,64,0.0165211
_aux_train_daostack_1e494_00004,ERROR,147.96.81.131:33743,32,3,64,0.00244475


[2m[36m(_aux_train_daostack pid=33739)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [25,0,0], thread: [96,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
[2m[36m(_aux_train_daostack pid=33739)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [25,0,0], thread: [97,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
[2m[36m(_aux_train_daostack pid=33739)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [25,0,0], thread: [98,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
[2m[36m(_aux_train_daostack pid=33739)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [25,0,0], thread: [99,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
[2m[36m(_aux_train_daostack pid=33739)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): blo

Trial name
_aux_train_daostack_1e494_00000
_aux_train_daostack_1e494_00001
_aux_train_daostack_1e494_00003
_aux_train_daostack_1e494_00004


[2m[36m(_aux_train_daostack pid=33742)[0m ../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [23,0,0], thread: [31,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.[32m [repeated 640x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
2023-08-01 11:47:06,180	ERROR tune_controller.py:911 -- Trial task failed for trial _aux_train_daostack_1e494_00003
Traceback (most recent call last):
  File "/home/daviddavo/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/daviddavo/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/daviddavo/.local/lib/python3.10/site-packages/ray/_private/

KeyboardInterrupt: 

# Using all of this

Crearé una función que reciba una dirección de un usuario y retorne k propuestas que puedan interesarle

In [11]:
def recommend(user: str, K: int = 12, ignore_train: bool=False):
    uid = encoder_user.transform([user])[0]
    print(f"Recommending {K} proposals for user {user} (uid:{uid}) with {vpu.at[user]} votes")
    
    # Getting embedding
    out = model(edge_index)
    user_embed, item_embed = torch.split(out, (model.n_users, model.n_items))
    relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
    if ignore_train:
        i = torch.stack([
            torch.LongTensor(train_df['uid'].values),
            torch.LongTensor(train_df['pid'].values),
        ])
        v = torch.ones(len(train_df), dtype=torch.float64)
        t_interactions = torch.sparse.FloatTensor(i, v, (model.n_users, model.n_items)).to_dense().to(device)
        # mask out training user-item interactions from metric computation
        # We are only interested in novel items, as a user won't be interested
        # in "voting again"
        relevance_score = torch.mul(relevance_score, (1 - t_interactions))
    
    topk_relevance_indices = torch.topk(relevance_score, K).indices
    
    pids = topk_relevance_indices[uid].tolist()
    proposals = dfp.loc[encoder_prop.inverse_transform(pids)]
    
    proposals['userVoted'] = dfv.groupby('proposal')['voter'].apply(lambda x: user in set(x))
    
    print(f"precision@{K}={sum(proposals['userVoted'])/len(proposals)*100:.2f}%")
    
    return proposals

user = "0x334f12afb7d8740868be04719639616533075234" # vpu[(12 < vpu) & (vpu < 38)].sample().index[0]
recommend(user, ignore_train=True)[['network', 'createdAt', 'title', 'description', 'userVoted']]

NameError: name 'encoder_user' is not defined

In [None]:
dfv[dfv['proposal'] == '0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e']