In [1]:
from typing import List, Dict, Tuple, Any

import pandas as pd
import numpy as np
from tqdm import tqdm

from ray import train, tune
import ray.train.torch # Seems unused, but is used via train.torch

import torch
import torch_geometric as PyG

from src import datasets
from src import graph_utils as gu

In [2]:
NUM_FOLDS: int = 5
NUM_SAMPLES: int = 1
MAX_ITERS: int = 5

## Reading and splitting data

In [3]:
from torch_geometric.data import InMemoryDataset, HeteroData, Data

In [4]:
original = datasets.DAOCensus("./data/daos-census", 'Decentraland', 'snapshot', min_vpu=6)[0]
original

HeteroData(
  user={
    num_nodes=2902,
    voters=[2900],
    authors=[2],
  },
  proposal={ num_nodes=1962 },
  (user, vote, proposal)={ edge_index=[2, 107735] },
  (proposal, rev_vote, user)={ edge_index=[2, 107735] },
  (user, creates, proposal)={ edge_index=[2, 1962] },
  (proposal, rev_creates, user)={ edge_index=[2, 1962] }
)

In [5]:
original.to_homogeneous()

Data(edge_index=[2, 219394], node_type=[4864], edge_type=[219394])

In [6]:
folds = gu.k_fold(original, NUM_FOLDS)
folds[0]

(HeteroData(
   user={
     num_nodes=2902,
     voters=[2900],
     authors=[2],
   },
   proposal={ num_nodes=1962 },
   (user, vote, proposal)={ edge_index=[2, 86188] },
   (proposal, rev_vote, user)={ edge_index=[2, 86188] },
   (user, creates, proposal)={ edge_index=[2, 1570] },
   (proposal, rev_creates, user)={ edge_index=[2, 1570] }
 ),
 HeteroData(
   user={
     num_nodes=2902,
     voters=[2900],
     authors=[2],
   },
   proposal={ num_nodes=1962 },
   (user, vote, proposal)={ edge_index=[2, 21547] },
   (proposal, rev_vote, user)={ edge_index=[2, 21547] },
   (user, creates, proposal)={ edge_index=[2, 392] },
   (proposal, rev_creates, user)={ edge_index=[2, 392] }
 ))

## Defining training

In [7]:
# import ray.train.torch
train.torch.get_device()

device(type='cpu')

In [8]:
from torch_geometric.nn import LightGCN
from src.loader import BPRLoader

class TrainLightGCN(tune.Trainable):
    def setup(
        self,
        config: Dict[str, Any],
        folds: List[Tuple[HeteroData, HeteroData]], 
        original: HeteroData,
        disable_tqdm: bool = True,
    ):
        self.config = config
        self.fold = config['__trial_index__']
        self.disable_tqdm = disable_tqdm
        
        self.model = LightGCN(
            num_nodes=original.num_nodes,
            embedding_dim=config['embedding_dim'],
            num_layers=config['conv_layers'],
        ).to(train.torch.get_device())
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config['learning_rate'])
        
        self.data_train, self.data_validation = folds[self.fold]
        self.data_original = original

        self.train_loader = BPRLoader(
            self.data_train,
            ('user', 'vote', 'proposal'),
            batch_size = self.config['batch_size'],
            input_nodes=('user', original['user'].voters),
            subgraph_type='bidirectional',
            transform_global=True,
        )

    def step(self):
        """
        As a rule of thumb, the execution time of step should be large enough to avoid overheads 
        (i.e. more than a few seconds), but short enough to report progress periodically 
        (i.e. at most a few minutes).
        """
        acc_loss = n_samples = 0

        # sg: subgraph
        # every iteration, a subgraph is returned (see PyG's NeighborLoader)
        for sg in tqdm(self.train_loader, leave=False, delay=1, disable=self.disable_tqdm):            
            pos_edge_index = train_edge_label_index[:, index]
            # TODO: Change to negative structured sampling like in original LightGCN implementation
            neg_edge_index = sg['user', 'vote', 'proposal'].edge_index
            
            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            optimizer.zero_grad()
            pos_rank, neg_rank = model(message_passing_edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
                lambda_reg=self.config['l2'],
            )
            loss.backward()
            optimizer.step()

            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        return {
            'loss': acc_loss/n_samples,
        }

## Tune hyperparameters

In [9]:
from ray.tune.search.hyperopt import HyperOptSearch

search_alg = HyperOptSearch()
search_alg = tune.search.Repeater(search_alg, NUM_FOLDS)

tuner = tune.Tuner(
    tune.with_parameters(TrainLightGCN, folds=folds, original=original),
    run_config=train.RunConfig(
        stop={'training_iteration': MAX_ITERS}
    ),
    param_space=dict(
        batch_size=64,
        learning_rate=0.001,
        embedding_dim=32,
        conv_layers=tune.randint(2,6),
        l2=1e-4,
    ),
    tune_config=tune.TuneConfig(
        search_alg=search_alg,
        num_samples=NUM_FOLDS*NUM_SAMPLES,
        metric='loss',
        mode='min',
    )
)
tuner.fit()

0,1
Current time:,2023-10-10 16:33:04
Running for:,00:00:18.82
Memory:,12.9/15.3 GiB

Trial name,# failures,error file
TrainLightGCN_56bd2099,1,"/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_56bd2099_1_trial_index=0,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-45/error.txt"
TrainLightGCN_31160e51,1,"/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_31160e51_2_trial_index=1,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-50/error.txt"
TrainLightGCN_b3225bfb,1,"/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_b3225bfb_3_trial_index=2,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-54/error.txt"
TrainLightGCN_daf5738b,1,"/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_daf5738b_4_trial_index=3,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-58/error.txt"
TrainLightGCN_ab536924,1,"/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_ab536924_5_trial_index=4,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-33-01/error.txt"

Trial name,status,loc,__trial_index__,batch_size,conv_layers,embedding_dim,l2,learning_rate
TrainLightGCN_56bd2099,ERROR,10.8.33.181:25159,0,64,5,32,0.0001,0.001
TrainLightGCN_31160e51,ERROR,,1,64,5,32,0.0001,0.001
TrainLightGCN_b3225bfb,ERROR,,2,64,5,32,0.0001,0.001
TrainLightGCN_daf5738b,ERROR,,3,64,5,32,0.0001,0.001
TrainLightGCN_ab536924,ERROR,,4,64,5,32,0.0001,0.001


2023-10-10 16:32:50,957	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_56bd2099
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_

[2m[36m(TrainLightGCN pid=25159)[0m HeteroData(
[2m[36m(TrainLightGCN pid=25159)[0m   user={
[2m[36m(TrainLightGCN pid=25159)[0m     num_nodes=64,
[2m[36m(TrainLightGCN pid=25159)[0m     voters=[2900],
[2m[36m(TrainLightGCN pid=25159)[0m     authors=[2],
[2m[36m(TrainLightGCN pid=25159)[0m     n_id=[64],
[2m[36m(TrainLightGCN pid=25159)[0m     input_id=[64],
[2m[36m(TrainLightGCN pid=25159)[0m     batch_size=64,
[2m[36m(TrainLightGCN pid=25159)[0m     dst_pos_index=[64],
[2m[36m(TrainLightGCN pid=25159)[0m     dst_neg_index=[64],
[2m[36m(TrainLightGCN pid=25159)[0m   },
[2m[36m(TrainLightGCN pid=25159)[0m   proposal={
[2m[36m(TrainLightGCN pid=25159)[0m     num_nodes=118,
[2m[36m(TrainLightGCN pid=25159)[0m     n_id=[118],
[2m[36m(TrainLightGCN pid=25159)[0m     src_index=[64],
[2m[36m(TrainLightGCN pid=25159)[0m   },
[2m[36m(TrainLightGCN pid=25159)[0m   (user, vote, proposal)={ edge_index=[2, 64] },
[2m[36m(TrainLightGCN pid=25159)

2023-10-10 16:32:54,553	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_31160e51
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2549, in get
    raise value
ray.exceptions

ResultGrid<[
  Result(
    error='RayTaskError(NameError)',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_56bd2099_1_trial_index=0,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-45',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_31160e51_2_trial_index=1,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-50',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-10-10_16-32-42/TrainLightGCN_b3225bfb_3_trial_index=2,batch_size=64,conv_layers=5,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-10-10_16-32-54',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/da