Based on Pytorch Geometric official example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py

In [1]:
import os
from pathlib import Path
import random

import datetime as dt
import itertools as it
import functools as ft

from collections import namedtuple

from tqdm.notebook import tqdm # Progress bars
from tqdm.autonotebook import tqdm, trange

# https://import-as.github.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn as sk
from sklearn import preprocessing as pp

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.nn.conv import MessagePassing
from ray import tune
from ray.air import Checkpoint, session

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import src
from src.data import get_df, filter_df

POSSIBLE_PATHS = ["./datawarehouse", "/kaggle/input/dao-analyzer"]

DW = None
for p in POSSIBLE_PATHS:
    DW = Path(p)
    if DW.is_dir():
        break
else:
    print("No se ha encontrado el DW")

src.data.DEFAULT_PATH = DW

  from tqdm.autonotebook import tqdm, trange


Hyperparameters table in [Google Drive](https://docs.google.com/spreadsheets/d/1riafpWt1563w9pbqdt1g2QZVkc7TfRWGzFaCG5rudDI/edit?usp=sharing)

In [2]:
# Remove users with less than 6 votes from the dataset before splitting
DatasetConfig = namedtuple('DatasetConfig', ('min_votes_per_user', 'allowed_dao_names', 'train_split'))
datasetConfig = DatasetConfig(
    min_votes_per_user=6,
    allowed_dao_names={'dxDAO', 'xDXdao'},
    train_split=1/5,
)

ModelConfig = namedtuple('ModelConfig', 'max_epochs batch_size learning_rate embedding_dim conv_layers')
modelConfig = ModelConfig(
    max_epochs=50,
    batch_size=16,
    learning_rate=0.0001,
    embedding_dim=32,
    conv_layers=3,
)

# Reading data

In [3]:
import torch
from torch_geometric.data import InMemoryDataset, HeteroData, Data

class Daostack(InMemoryDataset):
    """ Creates a heterogeneus graph with two kinds of nodes: voters and proposals """
    def __init__(self, root: str, min_vpu=6, allowed_daos=None):
        self._min_vpu = min_vpu
        self._allowed_daos = allowed_daos
        
        super().__init__(root)

        self.data = torch.load(self.processed_paths[0])

    def process(self):
        import pandas as pd

        df = pd.read_csv(self.raw_paths[0])

        if self._allowed_daos:
            dfd = pd.read_csv(self.raw_paths[1]).set_index('id')
            allowed_dao_ids = set(dfd[dfd['name'].isin(self._allowed_daos)].index)
            df = df[df['dao'].isin(allowed_dao_ids)]
            assert not df.empty, "Dataframe is empty"
            
        if self._min_vpu:
            vpu = df.groupby('voter').size()
            allowed_voters = vpu[vpu >= self._min_vpu].index
            df = df[df['voter'].isin(allowed_voters)]
        
        data = HeteroData()
        node_types = ['voter', 'proposal']
        for nt in node_types:
            df[nt] = df[nt].astype('category')
            data[nt].num_nodes = df[nt].nunique()

        u_t = torch.LongTensor(df['voter'].cat.codes)
        p_t = torch.LongTensor(df['proposal'].cat.codes)

        edge_index = torch.stack([
            torch.cat([u_t, p_t]),
            torch.cat([p_t, u_t]),
        ])

        data['voter', 'votes', 'proposal']['edge_index'] = torch.stack([u_t, p_t])
        data['proposal', 'voted', 'voter']['edge_index'] = torch.stack([p_t, u_t])

        data.validate()
        assert not data.is_directed(), "The created graph shouldn't be directed"

        torch.save(data, self.processed_paths[0])

    @property
    def raw_dir(self) -> str:
        return self.root / 'daostack'

    @property
    def processed_dir(self) -> str:
        return self.raw_dir / 'processed'

    @property
    def raw_file_names(self) -> str:
        return ['votes.csv', 'daos.csv']
    
    @property
    def processed_file_names(self) -> str:
        return f"daostack_votes_{self._min_vpu}_{'-'.join(self._allowed_daos)}.pt"

def print_graph_stats(g: HeteroData):
    density = (g.num_edges) / (g.num_nodes*(g.num_nodes-1))
    print(f'Edges:   {g.num_edges:12}')
    print(f'Density: {density*100:12.4f}%')

data = Daostack(DW, min_vpu=datasetConfig.min_votes_per_user, allowed_daos=datasetConfig.allowed_dao_names)[0]
print_graph_stats(data)
data

Edges:          16606
Density:       0.3087%


HeteroData(
  voter={ num_nodes=104 },
  proposal={ num_nodes=2216 },
  (voter, votes, proposal)={ edge_index=[2, 8303] },
  (proposal, voted, voter)={ edge_index=[2, 8303] }
)

At first, I thought the RandomLinkSplit function was not working properly, but it turns out that I wasn't understanding it very well. The tutorial I used for [01_mvp](./01_mvp.ipynb) is not very good either, it was written by students, and implemented before PyTorch Geometric bundled the LightGCN model with it.

> I think this is totally correct. It seems like you are looking at the shapes of edge_index, while you may want to look at the shapes of edge_label and edge_label_index (which correctly model a 80/10/10 split ratio). Here, edge_index is solely used for message passing, i.e.,
> 
> * for training, we exchange messages on all training edges
> * for validation, we exchange messages on all training edges
> * for testing, we exchange messages on all training and validation edges
> Let me know if this resolves your concerns :)
>
> -- [Split Error in RandomLinkSplit · Issue #3668 · pyg-team/pytorch_geometric · GitHub](https://github.com/pyg-team/pytorch_geometric/issues/3668)

In [4]:
def get_train_val_test(g: Data | HeteroData, train_ratio=0.75):
    t = ft.partial(PyG.transforms.RandomLinkSplit, 
        is_undirected=True,
        num_val=1-train_ratio,
        # split_labels=True,
        add_negative_train_samples=True,
        num_test=0,
    )
    
    if isinstance(g, HeteroData):
        t = t(
            edge_types=[g.edge_types[0]],
            rev_edge_types=[g.edge_types[1]] if len(g.edge_types) > 1 else None,
        )
    elif isinstance(g, Data):
        t = t()
            
    return t(g)

tr, val, ts = get_train_val_test(data, train_ratio=7/8)
tr, val, ts

(HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[14532],
     edge_label_index=[2, 14532],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 7266],
     edge_label=[2074],
     edge_label_index=[2, 2074],
   },
   (proposal, voted, voter)={ edge_index=[2, 7266] }
 ),
 HeteroData(
   voter={ num_nodes=104 },
   proposal={ num_nodes=2216 },
   (voter, votes, proposal)={
     edge_index=[2, 8303],
     edge_label=[0],
     edge_label_index=[2, 0],
   },
   (proposal, voted, voter)={ edge_index=[2, 8303] }
 ))

In [5]:
# Trying to make sense of all of this

th = tr.to_homogeneous()
print(th)
print(np.unique(th.edge_label))
pos = th.edge_label_index[:, th.edge_label[:14532] == 1]
assert (pos == th.edge_index[:, th.edge_type==0]).all()
pos.size(), th.edge_index.size()
th.node_type

Data(edge_index=[2, 14532], edge_label=[21798], edge_label_index=[2, 14532], node_type=[2320], edge_type=[14532])
[ 0.  1. nan]


tensor([0, 0, 0,  ..., 1, 1, 1])

In [6]:
def ensure_homogeneous(*args):
    def _apply(g):
        if isinstance(g, HeteroData):
            hg = g.to_homogeneous()
            # Removing final na
            if hasattr(hg, 'edge_label'):
                assert hg.edge_label[hg.edge_label_index.size(1):].isnan().all()
                hg.edge_label = hg.edge_label[:hg.edge_label_index.size(1)].bool()
            return hg
        else:
            return g

    ret = tuple(_apply(g) for g in args)
    if len(ret) == 1:
        return ret[0]
    else:
        return ret

_aux = ensure_homogeneous(val)
_aux.edge_label_index[:, _aux.edge_label]

tensor([[  98,   52,   43,  ...,   75,   85,    3],
        [ 493,  168, 1738,  ..., 1276, 1412,  184]])

## Using the LightGCN

In [7]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [8]:
from torch_geometric.nn import LightGCN

# Based on:
# - https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# - https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
# - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
def train_daostack(train: HeteroData, validation: HeteroData, test: HeteroData, modelConfig: ModelConfig, disable_tqdm=False):
    if not isinstance(modelConfig, ModelConfig):
        modelConfig = ModelConfig(**modelConfig)
    
    model = LightGCN(
        num_nodes=data.num_nodes,
        embedding_dim=modelConfig.embedding_dim,
        num_layers=modelConfig.conv_layers,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=modelConfig.learning_rate)

    checkpoint = session.get_checkpoint()

    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"]
        model.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    # Use all message passing edges as training labels
    # TODO: The train/test data should also be saved in a checkpoint?
    # TODO: Transform the graph to homogeneous
    assert train.is_undirected()
    assert validation.is_undirected()

    train, validation, test = ensure_homogeneous(train, validation, test)

    users = torch.nonzero(train.node_type == 0).squeeze()
    items = torch.nonzero(train.node_type == 1).squeeze()
    n_users = len(users)
    n_items = len(items)

    # In message passing, bidirectional edges may cause duplicate information to
    # be passed between nodes.
    # The official LightGCN usage also uses this line of code (well, for homo graphs)
    # - https://github.com/pyg-team/pytorch_geometric/blob/master/examples/lightgcn.py
    
    # train_edge_label_index = train.edge_index[:, train.edge_type == 0]
    # train.edge_label = train.edge_label[:train.edge_label_index.size(1)] # Now this is done inside ensure_homogeneous
    pos_edge_label_index = train.edge_label_index[:, train.edge_label == 1]
    neg_edge_label_index = train.edge_label_index[:, train.edge_label == 0]

    # TODO: Use LinkLoader instead (i don't know how)
    # Waiting for pyg-team/pytorch_geometric#7817
    # train_loader = PyG.loader.LinkLoader(
    train_loader = torch.utils.data.DataLoader(
        range(pos_edge_label_index.size(1)), # dataset
        batch_size=modelConfig.batch_size,
        shuffle=True,
    )

    @torch.no_grad()
    def _prec_rec(k: int, remove_training=False):
        # [104, 5]
        gt_index = test.edge_index[:, test.edge_type == 0]
        if remove_training:
            edge_index = validation.edge_label_index[:, validation.edge_label]
        else:
            # All edges
            edge_index = test.edge_index

        topk = model.recommend(edge_index, src_index=users, dst_index=items, k=k)
        n_samples = len(users)

        # [104, 2216]
        ground_truth = torch.full((n_users, n_items), False, dtype=torch.bool, device=device)
        # print('ground_truth.size()', ground_truth.size())
        ground_truth[gt_index[0], gt_index[1] - n_users] = True

        # print('topk.size()', topk.size())
        # print('topk.min(), max():', topk.min(), topk.max())
        isin_mat = ground_truth.gather(1, topk - n_users)
        item_count = PyG.utils.degree(pos_edge_label_index[0], num_nodes=n_users)

        prec = (isin_mat.sum(dim=-1) / k).sum() / n_samples
        rec = (isin_mat.sum(dim=-1) / item_count).sum() / n_samples

        # print('prec, rec:', (prec, rec))
        
        return float(prec), float(rec)

    for epoch in trange(start_epoch, modelConfig.max_epochs, disable=disable_tqdm):
        # index is an array of batch_size that indicates which edges from 
        # train.edge_index we should use
        acc_loss = n_samples = 0
        for index in tqdm(train_loader, leave=False, delay=1, disable=disable_tqdm):
            pos_edge_index = pos_edge_label_index[:, index]
            # neg_edge_index = torch.stack([
            #     pos_edge_index[0],
            #     # TODO: Use generated negative samples instead
            #     torch.randint(n_users, n_users+n_items, index.size(),device=device),
            # ])
            neg_edge_index = neg_edge_label_index[:, index]
            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            optimizer.zero_grad()
            pos_rank, neg_rank = model(train.edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
            )
            loss.backward()
            optimizer.step()

            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        checkpoint = Checkpoint.from_dict({
            'epoch': epoch,
            'net_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        })

        # Todo: Add val accuracy (pr@5, rec@5, etc.)
        prec5, rec5 = _prec_rec(5, remove_training=False)
        prec5t, rec5t = _prec_rec(5, remove_training=True)
        session.report({
            'loss': acc_loss/n_samples,
            'p@5 train': prec5, 'p@5 test': prec5t,
            'r@5 train': rec5, 'r@5 test': rec5t,
        }, checkpoint=checkpoint)

    return model

# Testing just syntax errors
train_daostack(tr.to(device), val.to(device), ts.to(device), ModelConfig(**(modelConfig._asdict() | {'max_epochs':2}))),



  0%|          | 0/2 [00:00<?, ?it/s]



(LightGCN(2320, 32, num_layers=3),)

In [25]:
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

def _aux_train_daostack(config):
    # TODO: Is bad practice to pass a dataset trainable
    config['embedding_dim'] = 2**config['embedding_dim']
    config['batch_size'] = 2**config['batch_size']
    return train_daostack(tr.to(device), val.to(device), ts.to(device), config, disable_tqdm=True)

tryConfigs = ModelConfig(
    max_epochs=50,
    conv_layers=tune.randint(2,6),
    learning_rate=tune.qloguniform(1e-5, 1, 1e-5),
    # These will be 2 to the power
    batch_size=tune.randint(4,10), # 16..1024
    embedding_dim=tune.randint(4,8), # 16..256
)

asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    max_t=50,
    grace_period=5,
    reduction_factor=3,
    brackets=1,
)

search_alg = HyperOptSearch()

# Every run takes approx half a gig of vram (no optimizations)
# The RTX 4090 has 24GB so we can run the model about 48 times
resources_per_trial={
    'cpu': 1,
    'memory': 0 if torch.cuda.is_available() else 2e9,
    'gpu': 1/32 if torch.cuda.is_available() else 0,
}

tuner = tune.Tuner(
    tune.with_resources(_aux_train_daostack, resources_per_trial),
    param_space=tryConfigs._asdict(),
    tune_config=tune.TuneConfig(
        time_budget_s=60,
        num_samples=-1,
        scheduler=asha_scheduler,
        search_alg=search_alg,
        metric='loss',
        mode='min',
    )
)
exp = tuner.fit()

0,1
Current time:,2023-08-02 16:03:36
Running for:,00:01:00.22
Memory:,22.8/125.6 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,learning_rate,max_epochs,iter,total time (s),loss,p@5 train,p@5 test
_aux_train_daostack_e2da9a8a,TERMINATED,147.96.81.131:98641,5,4,6,0.02169,50,50.0,38.4003,0.00504007,0.630769,0.419231
_aux_train_daostack_ba965b5f,TERMINATED,147.96.81.131:98685,9,5,5,0.00331,50,5.0,0.548037,0.225845,0.440385,0.296154
_aux_train_daostack_56b7b250,TERMINATED,147.96.81.131:98685,4,4,7,1e-05,50,5.0,4.56163,0.686882,0.636538,0.132692
_aux_train_daostack_d061f2b9,TERMINATED,147.96.81.131:98755,8,2,4,0.00069,50,5.0,0.827316,0.594725,0.428846,0.253846
_aux_train_daostack_fcd2cce2,TERMINATED,147.96.81.131:98755,4,2,7,0.15457,50,5.0,4.85991,1.9251,0.880769,0.851923
_aux_train_daostack_15d7e4be,TERMINATED,147.96.81.131:98685,7,3,6,3e-05,50,5.0,0.650614,0.691751,0.715385,0.0788462
_aux_train_daostack_adebcc16,TERMINATED,147.96.81.131:98685,6,4,5,0.07761,50,50.0,22.2906,0.0404303,0.761539,0.653846
_aux_train_daostack_b47f2338,TERMINATED,147.96.81.131:98947,7,2,4,0.59141,50,5.0,1.34602,0.648045,0.757692,0.634615
_aux_train_daostack_a6a61551,TERMINATED,147.96.81.131:98755,7,3,5,3e-05,50,5.0,0.759533,0.692317,0.519231,0.0576923
_aux_train_daostack_99c7e7fa,TERMINATED,147.96.81.131:98755,7,2,6,0.17505,50,15.0,2.5339,0.266458,0.853846,0.794231


2023-08-02 16:03:36,007	INFO timeout.py:54 -- Reached timeout of 60 seconds. Stopping all trials.
2023-08-02 16:03:37,547	INFO tune.py:1148 -- Total run time: 61.74 seconds (60.19 seconds for the tuning loop).
- /home/daviddavo/ray_results/_aux_train_daostack_2023-08-02_16-02-35/_aux_train_daostack_7b840430_62_batch_size=6,conv_layers=4,embedding_dim=5,learning_rate=0.0001,max_epochs=50_2023-08-02_16-03-34
- /home/daviddavo/ray_results/_aux_train_daostack_2023-08-02_16-02-35/_aux_train_daostack_83f1f7ca_63_batch_size=5,conv_layers=3,embedding_dim=4,learning_rate=0.0077,max_epochs=50_2023-08-02_16-03-35
- /home/daviddavo/ray_results/_aux_train_daostack_2023-08-02_16-02-35/_aux_train_daostack_3071016b_64_batch_size=8,conv_layers=4,embedding_dim=5,learning_rate=0.0131,max_epochs=50_2023-08-02_16-03-35


In [23]:
exp_df = exp.get_dataframe().drop(columns=['hostname', 'node_ip', 'logdir', 'should_checkpoint', 'pid'])
exp_df.sort_values('p@5 test', ascending=False)

- /home/daviddavo/ray_results/_aux_train_daostack_2023-08-02_15-52-56/_aux_train_daostack_9ba2bbbb_66_batch_size=8,conv_layers=5,embedding_dim=6,learning_rate=0.9851,max_epochs=50_2023-08-02_15-53-56


Unnamed: 0,loss,p@5 train,p@5 test,r@5 train,r@5 test,time_this_iter_s,done,training_iteration,trial_id,date,timestamp,time_total_s,time_since_restore,iterations_since_restore,config/batch_size,config/conv_layers,config/embedding_dim,config/learning_rate,config/max_epochs
62,1.360198,0.842308,0.800000,0.231115,0.189667,0.235776,False,18,8545afd8,2023-08-02_15-53-56,1690991636,4.243093,4.243093,18,7,2,5,0.63793,50
3,1.815670,0.803846,0.726923,0.217099,0.172195,0.105648,True,15,94969f3b,2023-08-02_15-53-03,1690991583,1.601382,1.601382,15,7,4,5,0.83618,50
33,0.226437,0.805769,0.709615,0.213328,0.159748,0.147412,True,50,6eab0a6e,2023-08-02_15-53-33,1690991613,6.636717,6.636717,50,7,3,6,0.18221,50
21,1.467784,0.769231,0.688462,0.201313,0.147167,0.067959,True,15,29154650,2023-08-02_15-53-17,1690991597,0.947414,0.947414,15,8,5,6,0.89547,50
61,0.864354,0.840385,0.686539,0.226512,0.157390,0.124850,False,40,f8b911c2,2023-08-02_15-53-56,1690991636,5.477503,5.477503,40,8,3,7,0.33586,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,0.689345,0.438462,0.086538,0.099213,0.006887,0.097300,True,5,01107862,2023-08-02_15-53-12,1690991592,0.537367,0.537367,5,7,4,5,0.00005,50
5,0.691613,0.707692,0.059615,0.218286,0.002830,0.194479,True,5,362338af,2023-08-02_15-53-05,1690991585,0.916141,0.916141,5,6,3,6,0.00002,50
18,0.692199,0.551923,0.057692,0.140729,0.004069,0.124343,True,5,b05f3d84,2023-08-02_15-53-14,1690991594,0.700850,0.700850,5,7,4,5,0.00003,50
43,0.693034,0.184615,0.040385,0.045946,0.002027,0.128587,True,5,12ebb57a,2023-08-02_15-53-34,1690991614,0.690873,0.690873,5,7,3,4,0.00001,50


# Using all of this

Crearé una función que reciba una dirección de un usuario y retorne k propuestas que puedan interesarle

In [11]:
def recommend(user: str, K: int = 12, ignore_train: bool=False):
    uid = encoder_user.transform([user])[0]
    print(f"Recommending {K} proposals for user {user} (uid:{uid}) with {vpu.at[user]} votes")
    
    # Getting embedding
    out = model(edge_index)
    user_embed, item_embed = torch.split(out, (model.n_users, model.n_items))
    relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
    if ignore_train:
        i = torch.stack([
            torch.LongTensor(train_df['uid'].values),
            torch.LongTensor(train_df['pid'].values),
        ])
        v = torch.ones(len(train_df), dtype=torch.float64)
        t_interactions = torch.sparse.FloatTensor(i, v, (model.n_users, model.n_items)).to_dense().to(device)
        # mask out training user-item interactions from metric computation
        # We are only interested in novel items, as a user won't be interested
        # in "voting again"
        relevance_score = torch.mul(relevance_score, (1 - t_interactions))
    
    topk_relevance_indices = torch.topk(relevance_score, K).indices
    
    pids = topk_relevance_indices[uid].tolist()
    proposals = dfp.loc[encoder_prop.inverse_transform(pids)]
    
    proposals['userVoted'] = dfv.groupby('proposal')['voter'].apply(lambda x: user in set(x))
    
    print(f"precision@{K}={sum(proposals['userVoted'])/len(proposals)*100:.2f}%")
    
    return proposals

user = "0x334f12afb7d8740868be04719639616533075234" # vpu[(12 < vpu) & (vpu < 38)].sample().index[0]
recommend(user, ignore_train=True)[['network', 'createdAt', 'title', 'description', 'userVoted']]

NameError: name 'encoder_user' is not defined

In [None]:
dfv[dfv['proposal'] == '0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e']