In [1]:
from typing import List, Dict, Tuple

import pandas as pd
import numpy as np
from tqdm import tqdm

from ray import train, tune
import ray.train.torch # Seems unused, but is used via train.torch

import torch
import torch_geometric as PyG

from src import datasets
from src import graph_utils as gu

In [2]:
NUM_FOLDS: int = 5
NUM_SAMPLES: int = 1
MAX_ITERS: int = 5

## Reading and splitting data

In [3]:
from torch_geometric.data import InMemoryDataset, HeteroData, Data

In [4]:
original = datasets.DAOCensus("./data/daos-census", 'Decentraland', 'snapshot', min_vpu=6)[0]
original

HeteroData(
  user={
    num_nodes=2902,
    voters=[2900],
    authors=[2],
  },
  proposal={ num_nodes=1962 },
  (user, vote, proposal)={ edge_index=[2, 107735] },
  (proposal, vote, user)={ edge_index=[2, 107735] },
  (user, creates, proposal)={ edge_index=[2, 1962] },
  (proposal, creates, user)={ edge_index=[2, 1962] }
)

In [5]:
folds = gu.k_fold(original, NUM_FOLDS)

## Defining training

In [6]:
# import ray.train.torch
train.torch.get_device()

device(type='cpu')

In [7]:
from torch_geometric.nn import LightGCN

class TrainLightGCN(tune.Trainable):
    def setup(self, config: Dict, folds: List[Tuple[HeteroData, HeteroData]], original: HeteroData):
        self.config = config
        self.fold = config['__trial_index__']
        print(config)
        
        self.model = LightGCN(
            num_nodes=original.num_nodes,
            embedding_dim=config['embedding_dim'],
            num_layers=config['conv_layers'],
        ).to(train.torch.get_device())
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config['learning_rate'])
        
        self.data_train, self.data_validation = folds[self.fold]
        self.data_original = original

    def step(self):
        """
        As a rule of thumb, the execution time of step should be large enough to avoid overheads 
        (i.e. more than a few seconds), but short enough to report progress periodically 
        (i.e. at most a few minutes).
        """
        
        acc_loss = n_samples = 0
        for index in tqdm(train_loader, leave=False, delay=1, disable=disable_tqdm):
            pos_edge_index = train_edge_label_index[:, index]
            # TODO: Change to negative structured sampling like in original LightGCN implementation
            neg_edge_index = torch.stack([
                pos_edge_index[0],
                torch.randint(train['proposal'].shift, train['proposal'].end,
                          (pos_edge_index.size(1), ), device=device)
            ], dim=0)
            
            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            optimizer.zero_grad()
            pos_rank, neg_rank = model(message_passing_edge_index, edge_label_index).chunk(2)

            # Learning
            loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
                lambda_reg=self.config['l2'],
            )
            loss.backward()
            optimizer.step()

            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        return {
            'loss': acc_loss/n_samples,
        }

## Tune hyperparameters

In [8]:
from ray.tune.search.hyperopt import HyperOptSearch

search_alg = HyperOptSearch()
search_alg = tune.search.Repeater(search_alg, NUM_FOLDS)

tuner = tune.Tuner(
    tune.with_parameters(TrainLightGCN, folds=folds, original=original),
    run_config=train.RunConfig(
        stop={'training_iteration': MAX_ITERS}
    ),
    param_space=dict(
        batch_size=64,
        learning_rate=0.001,
        embedding_dim=32,
        conv_layers=tune.randint(2,6),
        l2=1e-4,
    ),
    tune_config=tune.TuneConfig(
        search_alg=search_alg,
        num_samples=NUM_FOLDS*NUM_SAMPLES,
        metric='loss',
        mode='min',
    )
)
tuner.fit()

0,1
Current time:,2023-09-26 15:31:09
Running for:,00:00:10.02
Memory:,11.0/15.3 GiB

Trial name,# failures,error file
TrainLightGCN_39614ea3,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_39614ea3_1_trial_index=0,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-30-59/error.txt"
TrainLightGCN_622da3b9,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_622da3b9_2_trial_index=1,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-01/error.txt"
TrainLightGCN_0123ae2a,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_0123ae2a_3_trial_index=2,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-03/error.txt"
TrainLightGCN_17633c09,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_17633c09_4_trial_index=3,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-05/error.txt"
TrainLightGCN_5e4f57da,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_5e4f57da_5_trial_index=4,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-07/error.txt"

Trial name,status,loc,__trial_index__,batch_size,conv_layers,embedding_dim,l2,learning_rate
TrainLightGCN_39614ea3,ERROR,147.96.25.138:35920,0,64,4,32,0.0001,0.001
TrainLightGCN_622da3b9,ERROR,147.96.25.138:35956,1,64,4,32,0.0001,0.001
TrainLightGCN_0123ae2a,ERROR,147.96.25.138:35987,2,64,4,32,0.0001,0.001
TrainLightGCN_17633c09,ERROR,147.96.25.138:36021,3,64,4,32,0.0001,0.001
TrainLightGCN_5e4f57da,ERROR,147.96.25.138:36056,4,64,4,32,0.0001,0.001


2023-09-26 15:31:01,805	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_39614ea3
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_

[2m[36m(TrainLightGCN pid=35920)[0m {'batch_size': 64, 'learning_rate': 0.001, 'embedding_dim': 32, 'conv_layers': 4, 'l2': 0.0001, '__trial_index__': 0}


2023-09-26 15:31:03,736	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_622da3b9
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_

[2m[36m(TrainLightGCN pid=36021)[0m {'batch_size': 64, 'learning_rate': 0.001, 'embedding_dim': 32, 'conv_layers': 4, 'l2': 0.0001, '__trial_index__': 3}[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


2023-09-26 15:31:09,780	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_5e4f57da
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_

ResultGrid<[
  Result(
    error='RayTaskError(NameError)',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_39614ea3_1_trial_index=0,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-30-59',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='RayTaskError(NameError)',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_622da3b9_2_trial_index=1,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-01',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='RayTaskError(NameError)',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-30-56/TrainLightGCN_0123ae2a_3_trial_index=2,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-31-03',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='RayTaskError(NameE