Based on the official PyTorch Geometric example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py

In [None]:
import os
from pathlib import Path
import random

import datetime as dt
import itertools as it
import functools as ft

from collections import namedtuple

from tqdm.notebook import tqdm # Progress bars
from tqdm.autonotebook import tqdm, trange

# https://import-as.github.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn as sk
from sklearn import preprocessing as pp

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.nn.conv import MessagePassing
from ray import tune
from ray.air import Checkpoint, session

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import src
from src.data import get_df, filter_df
from src.graph_utils import shift_edge_indices, unshift_edge_indices
from src.neg_sampling import structured_negative_sampling

# Seed passed as `random_state` to the shuffled StratifiedKFold in graph_k_fold
RANDOM_SEED = 1701

Hyperparameters table in [Google Drive](https://docs.google.com/spreadsheets/d/1riafpWt1563w9pbqdt1g2QZVkc7TfRWGzFaCG5rudDI/edit?usp=sharing)

In [None]:
# Dataset settings. Users with less than `min_votes_per_user` votes (6) are
# removed from the dataset before splitting it into `num_folds` folds.
DatasetConfig = namedtuple(
    'DatasetConfig',
    ['min_votes_per_user', 'allowed_dao_names', 'num_folds'],
)
datasetConfig = DatasetConfig(
    num_folds=5,
    allowed_dao_names={'dxDAO', 'xDXdao'},
    min_votes_per_user=6,
)

# Baseline hyperparameters of the model and of its training loop.
ModelConfig = namedtuple(
    'ModelConfig',
    ['max_epochs', 'batch_size', 'learning_rate', 'embedding_dim', 'conv_layers', 'l2'],
)
modelConfig = ModelConfig(
    # Training loop
    max_epochs=50,
    batch_size=64,
    # Optimisation
    learning_rate=0.001,
    l2=1e-4,
    # Architecture
    embedding_dim=32,
    conv_layers=3,
)

# Reading data

In [None]:
import torch
from torch_geometric.data import InMemoryDataset, HeteroData, Data
from src.datasets import Daostack

def print_graph_stats(g: HeteroData):
    """Print the number of edges and the edge density of ``g``.

    The density is ``num_edges / (num_nodes * (num_nodes - 1))``, i.e. the
    edge count relative to the number of ordered node pairs, printed as a
    percentage. A graph with fewer than two nodes has no node pairs; its
    density is reported as 0 instead of raising ``ZeroDivisionError``.

    Args:
        g: Any object exposing ``num_edges`` and ``num_nodes``.
    """
    possible_edges = g.num_nodes * (g.num_nodes - 1)
    density = g.num_edges / possible_edges if possible_edges else 0.0
    print(f'Edges:   {g.num_edges:12}')
    print(f'Density: {density*100:12.4f}%')

# Build the dataset with the filters from `datasetConfig` and keep its first
# element, which is the graph used by the following cells.
data = Daostack(
    "./data/dao-analyzer/",
    min_vpu=datasetConfig.min_votes_per_user,
    allowed_daos=datasetConfig.allowed_dao_names,
)[0]
print_graph_stats(data)
data

At first, I thought the RandomLinkSplit function was not working properly, but it turned out that I did not understand it well. The tutorial I used for [01_mvp](./01_mvp.ipynb) is not very good either: it was written by students and implemented before PyTorch Geometric bundled the LightGCN model.

> I think this is totally correct. It seems like you are looking at the shapes of edge_index, while you may want to look at the shapes of edge_label and edge_label_index (which correctly model a 80/10/10 split ratio). Here, edge_index is solely used for message passing, i.e.,
> 
> * for training, we exchange messages on all training edges
> * for validation, we exchange messages on all training edges
> * for testing, we exchange messages on all training and validation edges
>
> Let me know if this resolves your concerns :)
>
> -- [Split Error in RandomLinkSplit · Issue #3668 · pyg-team/pytorch_geometric · GitHub](https://github.com/pyg-team/pytorch_geometric/issues/3668)

In [None]:
from sklearn.model_selection import StratifiedKFold

def graph_k_fold(g: Data | HeteroData, folds, edge_type=None):
    """Split the edges of ``g`` into stratified (train, validation) graphs.

    The edges of ``edge_type`` are split with a shuffled ``StratifiedKFold``
    that uses the source node of every edge (the voter) as the class label,
    so the edges of each voter are spread over all folds. The reverse edge
    type is always sliced with the same edge positions.

    Args:
        g: Graph to split. The body indexes ``g`` by edge type and reads
            ``g.edge_types``, so a heterogeneous graph is expected.
        folds: Number of folds.
        edge_type: ``(src, relation, dst)`` edge type to split. Defaults to
            ``g.edge_types[0]``, with ``g.edge_types[1]`` taken as its reverse.
            When given, the reverse is the other edge type of ``g`` that goes
            from ``dst`` to ``src``.

    Returns:
        A list with one ``(train_graph, validation_graph)`` tuple per fold.

    Raises:
        ValueError: If ``edge_type`` is given and ``g`` has no reverse edge
            type for it.
    """
    skf = StratifiedKFold(folds, shuffle=True, random_state=RANDOM_SEED)

    if edge_type is None:
        edge_type, rev_edge_type = g.edge_types[0], g.edge_types[1]
    else:
        # Previously `rev_edge_type` was left unbound on this path
        edge_type = tuple(edge_type)
        src, dst = edge_type[0], edge_type[-1]
        reverse_candidates = [
            et for et in g.edge_types
            if et != edge_type and et[0] == dst and et[-1] == src
        ]
        if not reverse_candidates:
            raise ValueError(f'No reverse edge type found for {edge_type!r}')
        rev_edge_type = reverse_candidates[0]

    edge_index = g[edge_type].edge_index
    voters = edge_index[0]
    n_voters = len(voters.unique())

    def _subgraph(idx):
        # Keep only the edges at positions `idx`, in both directions.
        # NOTE(review): assumes the reverse edges are stored in the same order
        # as the forward ones — confirm against the dataset construction
        idx = torch.tensor(idx)
        sub = g.edge_subgraph({edge_type: idx, rev_edge_type: idx})
        assert sub.is_undirected()
        # Every voter must keep at least one edge in every split
        assert len(sub[edge_type].edge_index[0].unique()) == n_voters
        return sub

    splits = []
    # Stratify by voter
    for train_idx, val_idx in skf.split(torch.zeros(edge_index.size(1)), voters):
        # Negative samples are not stored here: they should be different each epoch
        gtrain = _subgraph(train_idx)
        gval = _subgraph(val_idx)
        assert (gtrain[edge_type].edge_index[0].unique() == gval[edge_type].edge_index[0].unique()).all()

        splits.append((gtrain, gval))

    return splits

# One (train, validation) pair of graphs per fold
graph_folds = graph_k_fold(g=data, folds=datasetConfig.num_folds)
graph_folds

## Using the LightGCN

In [None]:
# Choose the training device: CUDA if present, otherwise MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
from torch_geometric.nn import LightGCN

# Based on:
# - https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# - https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
# - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
def train_daostack(train: HeteroData, validation: HeteroData, original: HeteroData, modelConfig: ModelConfig, disable_tqdm=False):
    """Train a LightGCN model on one fold and report metrics after every epoch.

    Args:
        train: Graph whose edges are used for message passing and whose
            voter -> proposal edges are the positive training examples.
        validation: Graph with the held-out edges of the fold.
        original: Graph with all the edges; its ``num_nodes`` sizes the
            embedding table.
        modelConfig: A ``ModelConfig`` or a mapping with the same fields.
        disable_tqdm: Disable the progress bars.

    Returns:
        The trained ``LightGCN`` model.

    After each epoch a checkpoint (epoch, model state, optimizer state) and
    the metrics ``loss``, ``p@5``, ``r@5`` and ``rprec`` are passed to
    ``session.report``. The ``* train`` metrics rank every proposal and use
    all the edges of ``original`` as ground truth. The ``* test`` metrics use
    the edges of ``validation`` as ground truth and rank only the proposals
    that the voter is not linked to in ``train``.

    The three graphs are assumed to share the node layout of ``train`` (same
    ``shift`` of each node type after ``shift_edge_indices``).
    """
    if not isinstance(modelConfig, ModelConfig):
        modelConfig = ModelConfig(**modelConfig)

    model = LightGCN(
        num_nodes=original.num_nodes,
        embedding_dim=modelConfig.embedding_dim,
        num_layers=modelConfig.conv_layers,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=modelConfig.learning_rate)

    # Restore model and optimizer if the session provides a checkpoint
    checkpoint = session.get_checkpoint()

    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        # NOTE(review): the checkpoint stores the epoch that had just finished,
        # so resuming here trains that epoch again — confirm this is intended
        start_epoch = checkpoint_state["epoch"]
        model.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    assert train.is_undirected()
    assert validation.is_undirected()

    # We need to convert the edge indices to homogeneous
    # In hetero data the numbers are shared between the node types
    # while in homo data they are shifted
    original, train, validation = map(shift_edge_indices, [original, train, validation])

    vote_edge = ('voter', 'votes', 'proposal')
    voter_shift = train['voter'].shift
    proposal_shift = train['proposal'].shift

    users = torch.arange(voter_shift, train['voter'].end, device=device)
    items = torch.arange(proposal_shift, train['proposal'].end, device=device)
    n_users = train['voter'].num_nodes
    n_items = train['proposal'].num_nodes

    # Messages are exchanged over every edge type of the training graph
    message_passing_edge_index = torch.concat([s.edge_index for s in train.edge_stores], dim=1)

    # The official LightGCN example also builds the supervision edges this way
    # (for homogeneous graphs):
    # - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
    # In our case, we will use just voter ---> proposal
    train_edge_label_index = train[vote_edge].edge_index
    assert (train_edge_label_index[0] < train['voter'].end).all()
    assert (proposal_shift <= train_edge_label_index[1]).all()

    # TODO: Use LinkLoader instead (i don't know how)
    # Waiting for pyg-team/pytorch_geometric#7817
    # The loader yields batches of column positions of train_edge_label_index
    train_loader = torch.utils.data.DataLoader(
        range(train_edge_label_index.size(1)), # dataset
        batch_size=modelConfig.batch_size,
        shuffle=True,
    )

    @torch.no_grad()
    def _prec_rec(k: int, remove_training=False):
        """Return (precision@k, recall@k, R-precision) averaged over the voters.

        With ``remove_training`` the ground truth is the validation edges and
        the proposals linked to the voter in ``train`` are not candidates;
        otherwise the ground truth is every edge of ``original``.
        """
        gt_graph = validation if remove_training else original
        gt_index = gt_graph[vote_edge].edge_index
        # Local (per node type) row / column positions of the ground truth
        gt_rows = gt_index[0] - voter_shift
        gt_cols = gt_index[1] - proposal_shift

        # R[u]: number of relevant proposals of voter u
        R = PyG.utils.degree(gt_rows, num_nodes=n_users)
        # Voters without ground truth have 0 hits; avoid 0/0 for them
        relevant = R.clamp(min=1)

        # [n_users, n_items]
        ground_truth = torch.zeros((n_users, n_items), dtype=torch.bool, device=device)
        ground_truth[gt_rows, gt_cols] = True

        # Scores are computed by hand from the embeddings so that training
        # items can be discarded before ranking
        emb = model.get_embedding(message_passing_edge_index)
        scores = emb[users] @ emb[items].t()
        if remove_training:
            scores[
                train_edge_label_index[0] - voter_shift,
                train_edge_label_index[1] - proposal_shift,
            ] = float('-inf')

        # Rank enough items for both @k and R-precision
        width = min(n_items, max(k, int(R.max())))
        topr = scores.topk(width, dim=-1, sorted=True).indices

        n_samples = len(users)

        # isin_rmat[u, j]: whether the j-th recommendation of u is relevant
        isin_rmat = ground_truth.gather(1, topr)
        hits_at_k = isin_rmat[:, :k].sum(dim=-1)

        prec = (hits_at_k / k).sum() / n_samples
        rec = (hits_at_k / relevant).sum() / n_samples

        # For R-precision only the first R[u] recommendations of u count
        beyond_r = torch.arange(1, width + 1, device=device) > R.unsqueeze(1)
        hits_at_r = (isin_rmat & ~beyond_r).sum(dim=-1)
        rprec = (hits_at_r / relevant).sum() / n_samples

        return float(prec), float(rec), float(rprec)

    for epoch in trange(start_epoch, modelConfig.max_epochs, disable=disable_tqdm):
        acc_loss = n_samples = 0

        # index is an array of batch_size that indicates which edges from
        # train_edge_label_index we should use
        for index in tqdm(train_loader, leave=False, delay=1, disable=disable_tqdm):
            pos_edge_index = train_edge_label_index[:, index]
            # TODO: Change to negative structured sampling like in original LightGCN implementation
            # For now every positive edge gets a uniformly random proposal as negative
            neg_edge_index = torch.stack([
                pos_edge_index[0],
                torch.randint(proposal_shift, train['proposal'].end,
                          (pos_edge_index.size(1), ), device=device)
            ], dim=0)

            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            optimizer.zero_grad()
            # First half of the output ranks the positives, second half the negatives
            pos_rank, neg_rank = model(message_passing_edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
                lambda_reg=modelConfig.l2,
            )
            loss.backward()
            optimizer.step()

            # Accumulate the loss weighted by the batch size
            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        checkpoint = Checkpoint.from_dict({
            'epoch': epoch,
            'net_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        })

        prec5, rec5, rprec = _prec_rec(5, remove_training=False)
        prec5t, rec5t, rprect = _prec_rec(5, remove_training=True)
        session.report({
            # max(..., 1) avoids a ZeroDivisionError when there are no training edges
            'loss': acc_loss/max(n_samples, 1),
            'rprec train': rprec, 'rprec test': rprect,
            'p@5 train': prec5, 'p@5 test': prec5t,
            'r@5 train': rec5, 'r@5 test': rec5t,
        }, checkpoint=checkpoint)

    return model

# Smoke test: a two-epoch run on the first fold, only to catch syntax errors
model = train_daostack(
    train=graph_folds[0][0].to(device),
    validation=graph_folds[0][1].to(device),
    original=data.to(device),
    modelConfig=modelConfig._replace(max_epochs=2),
)
print(PyG.nn.summary(model, data['voter', 'votes', 'proposal'].edge_index))

In [None]:
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import Repeater
from ray.tune.search.hyperopt import HyperOptSearch

def _aux_train_daostack(config):
    """Trainable for the tuner: train on the fold chosen by the trial index.

    Args:
        config: Mapping with the ``ModelConfig`` fields plus
            ``__trial_index__``, which selects the fold in ``graph_folds``
            (presumably injected by the ``Repeater`` searcher — confirm).
            ``batch_size`` is an exponent: the real batch size is
            ``2 ** batch_size``.

    Returns:
        The model returned by ``train_daostack``.
    """
    # TODO: capturing the datasets (graph_folds, data) from the enclosing scope
    # is bad practice for a trainable
    # Work on a copy so the dict owned by the caller is left untouched
    config = dict(config)
    # config['embedding_dim'] = 2**config['embedding_dim']
    config['batch_size'] = 2**config['batch_size']
    n_fold = config.pop('__trial_index__')
    train, validation = graph_folds[n_fold]
    return train_daostack(train.to(device), validation.to(device), data.to(device), config, disable_tqdm=True)

# Search space, with one entry per ModelConfig field
tryConfigs = ModelConfig(
    max_epochs=50,
    # Exponent of the batch size: _aux_train_daostack trains with 2**batch_size.
    # The upper bound of tune.randint is exclusive, so this gives 16..512
    batch_size=tune.randint(4, 10),
    learning_rate=tune.qloguniform(1e-5, 1, 1e-5),
    # Sampled directly (not as a power of two)
    # embedding_dim=tune.randint(4,8), # 16..128
    embedding_dim=tune.qlograndint(10, 500, 5),
    conv_layers=tune.randint(2, 6),
    l2=tune.loguniform(1e-9, 1e-1),
)

# It is recommended to not use Repeater with a TrialScheduler. Early termination can negatively affect the average reported metric.
asha_scheduler = None
# asha_scheduler = ASHAScheduler(
#     time_attr='training_iteration',
#     max_t=50,
#     grace_period=5,
#     reduction_factor=3,
#     brackets=1,
# )

# Wrap the searcher in a Repeater with `num_folds` repetitions
search_alg = Repeater(HyperOptSearch(), datasetConfig.num_folds)

# Every run takes approx half a gig of vram (no optimizations)
# The RTX 4090 has 24GB so we can run the model about 48 times
resources_per_trial = dict(
    cpu=1,
    memory=0 if torch.cuda.is_available() else 2e9,
    # gpu=1/32 if torch.cuda.is_available() else 0,
    gpu=torch.cuda.is_available(),
)

tuner = tune.Tuner(
    tune.with_resources(_aux_train_daostack, resources_per_trial),
    param_space=tryConfigs._asdict(),
    tune_config=tune.TuneConfig(
        metric='rprec test',
        mode='max',
        search_alg=search_alg,
        scheduler=asha_scheduler,
        num_samples=datasetConfig.num_folds*1,
        # time_budget_s=60,
    ),
)
exp = tuner.fit()

In [None]:
# Results table without the bookkeeping columns, best p@5 on test first
exp_df = exp.get_dataframe().drop(
    columns=['hostname', 'node_ip', 'logdir', 'should_checkpoint', 'pid'],
)
exp_df.sort_values(by='p@5 test', ascending=False)

# Using all of this

I will create a function that receives a user's address and returns k proposals that may interest them.

In [None]:
def recommend(user: str, K: int = 12, ignore_train: bool=False):
    """Return the ``K`` proposals with the highest predicted score for ``user``.

    Args:
        user: Address of the user, as known by ``encoder_user``.
        K: Number of proposals to return.
        ignore_train: Exclude the (user, proposal) pairs listed in
            ``train_df`` from the ranking, so that only novel proposals are
            recommended.

    Returns:
        The rows of ``dfp`` for the recommended proposals, best first, with an
        extra boolean ``userVoted`` column telling whether ``user`` has a vote
        on the proposal in ``dfv``.

    NOTE(review): this function reads ``encoder_user``, ``encoder_prop``,
    ``vpu``, ``edge_index``, ``train_df``, ``dfp``, ``dfv`` and
    ``model.n_users`` / ``model.n_items``, none of which are defined in the
    visible part of this notebook — confirm they exist before running it.
    """
    uid = encoder_user.transform([user])[0]
    print(f"Recommending {K} proposals for user {user} (uid:{uid}) with {vpu.at[user]} votes")

    # Inference only: no gradients are needed
    with torch.no_grad():
        # Getting embedding
        out = model(edge_index)
        user_embed, item_embed = torch.split(out, (model.n_users, model.n_items))
        relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
        if ignore_train:
            # Mask out training user-item interactions: we are only interested
            # in novel items, as a user won't be interested in "voting again".
            # The masked scores are set to -inf (multiplying them by 0 would
            # still rank them above every negative score).
            seen = torch.zeros_like(relevance_score, dtype=torch.bool)
            seen[
                torch.as_tensor(train_df['uid'].values, dtype=torch.long, device=seen.device),
                torch.as_tensor(train_df['pid'].values, dtype=torch.long, device=seen.device),
            ] = True
            relevance_score = relevance_score.masked_fill(seen, float('-inf'))

        pids = torch.topk(relevance_score[uid], K).indices.tolist()

    # Copy, so that adding a column does not write into a slice of dfp
    proposals = dfp.loc[encoder_prop.inverse_transform(pids)].copy()

    # Proposals that nobody voted on count as "not voted" instead of NaN
    voted = set(dfv.loc[dfv['voter'] == user, 'proposal'])
    proposals['userVoted'] = proposals.index.isin(voted)

    print(f"precision@{K}={sum(proposals['userVoted'])/len(proposals)*100:.2f}%")

    return proposals

# Fixed example user. A random one with between 12 and 38 votes can be drawn
# with: vpu[(12 < vpu) & (vpu < 38)].sample().index[0]
user = "0x334f12afb7d8740868be04719639616533075234"
recommend(user, ignore_train=True)[
    ['network', 'createdAt', 'title', 'description', 'userVoted']
]

In [None]:
# All the rows of dfv that belong to this proposal
dfv.loc[dfv['proposal'] == '0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e']