In [1]:
import pandas as pd
import numpy as np

from src import datasets
from src import graph_utils as gu

from ray import train, tune
import ray.train.torch # Seems unused, but is used via train.torch

import torch
import torch_geometric as PyG

from typing import List, Dict, Tuple

In [2]:
# Number of cross-validation folds: passed to gu.k_fold when splitting the data and
# used as the repeat count of the tune.search.Repeater wrapper.
NUM_FOLDS: int = 5
# Multiplier for the Tune trial budget (num_samples = NUM_FOLDS * NUM_SAMPLES).
NUM_SAMPLES: int = 1

## Reading and splitting data

In [3]:
from torch_geometric.data import InMemoryDataset, HeteroData, Data

In [4]:
# Build the DAO census dataset object, then keep its element 0 as the reference graph
# used by the rest of the notebook (displayed below as the cell output).
census = datasets.DAOCensus("./data/daos-census", 'Decentraland', 'snapshot', min_vpu=6)
original = census[0]
original

HeteroData(
  user={
    num_nodes=2902,
    voters=[2900],
    authors=[2],
  },
  proposal={ num_nodes=1962 },
  (user, vote, proposal)={ edge_index=[2, 107735] },
  (proposal, vote, user)={ edge_index=[2, 107735] },
  (user, creates, proposal)={ edge_index=[2, 1962] },
  (proposal, creates, user)={ edge_index=[2, 1962] }
)

In [5]:
# Split the reference graph into NUM_FOLDS folds. TrainLightGCN.setup indexes this
# object by trial index and unpacks each entry as a (train, validation) pair --
# presumably HeteroData objects; verify against gu.k_fold.
folds = gu.k_fold(original, NUM_FOLDS)

## Defining training

In [6]:
# Sanity check of the device Ray resolves in the notebook process.
# `train.torch` is reachable because the imports cell runs `import ray.train.torch`.
train.torch.get_device()

device(type='cpu')

In [7]:
from torch_geometric.nn import LightGCN

class TrainLightGCN(tune.Trainable):
    """Ray Tune trainable that builds a PyG ``LightGCN`` model for one cross-validation fold.

    The fold handled by a trial is selected through the ``__trial_index__`` key of the
    trial config (visible in the logged trial configs of this notebook).

    Status: ``setup`` builds the model, optimizer and fold data; ``step`` is not wired
    up yet and raises ``NotImplementedError``. The per-epoch optimisation logic lives
    in ``_train_epoch`` with every input passed explicitly.
    """

    def setup(self, config: Dict, folds: List[Tuple[HeteroData, HeteroData]], original: HeteroData):
        """Create the model, the optimizer and select this trial's fold.

        Args:
            config: Trial configuration. Keys read here: ``__trial_index__``,
                ``embedding_dim``, ``conv_layers`` and ``learning_rate``.
            folds: Sequence indexed by fold number; each entry is unpacked as a
                ``(train, validation)`` pair.
            original: Full graph; its ``num_nodes`` sizes the embedding table.
        """
        self.config = config
        self.fold = config['__trial_index__']
        print(config)

        # Resolve the device once so the training code can reuse it.
        self.device = train.torch.get_device()

        self.model = LightGCN(
            num_nodes=original.num_nodes,
            embedding_dim=config['embedding_dim'],
            num_layers=config['conv_layers'],
        ).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config['learning_rate'])

        self.data_train, self.data_validation = folds[self.fold]
        self.data_original = original

    def step(self):
        """Run one training iteration and return its metrics.

        As a rule of thumb, the execution time of step should be large enough to avoid overheads
        (i.e. more than a few seconds), but short enough to report progress periodically
        (i.e. at most a few minutes).

        Raises:
            NotImplementedError: always, until ``setup`` prepares the batch loader and
                edge-index tensors that ``_train_epoch`` requires.
        """
        # TODO: build the loader / edge indices from self.data_train in setup() and
        # return self._train_epoch(...) here.
        raise NotImplementedError(
            "TrainLightGCN.step is not wired up yet: setup() does not build the "
            "inputs required by _train_epoch()."
        )

    def _train_epoch(
        self,
        train_loader,
        train_edge_label_index: torch.Tensor,
        message_passing_edge_index: torch.Tensor,
        proposal_start: int,
        proposal_end: int,
    ) -> Dict[str, float]:
        """Optimise the model over every batch of ``train_loader`` once.

        Args:
            train_loader: Iterable yielding index batches used to select columns of
                ``train_edge_label_index``.
            train_edge_label_index: Positive edges; indexed as ``[:, batch]`` and
                ``[0]``, so a 2-row edge-index tensor is expected.
            message_passing_edge_index: Edge index handed to the model as its first
                argument.
            proposal_start: Lower bound (inclusive) of the node ids sampled as
                negative destinations.
            proposal_end: Upper bound (exclusive) of the node ids sampled as negative
                destinations.

        Returns:
            ``{'loss': mean_loss}`` where the mean is weighted by the number of
            positive edges in each batch.

        Raises:
            ValueError: if ``train_loader`` yields no samples (the mean would
                otherwise be a division by zero).
        """
        acc_loss = 0.0
        n_samples = 0

        for index in train_loader:
            pos_edge_index = train_edge_label_index[:, index]
            # TODO: Change to negative structured sampling like in original LightGCN implementation
            # Negatives keep the source node of each positive edge and draw a random
            # destination; created on the same device as the positives so that
            # torch.stack does not mix devices.
            neg_edge_index = torch.stack([
                pos_edge_index[0],
                torch.randint(proposal_start, proposal_end,
                              (pos_edge_index.size(1), ), device=pos_edge_index.device)
            ], dim=0)

            edge_label_index = torch.cat([
                pos_edge_index,
                neg_edge_index,
            ], dim=1)

            self.optimizer.zero_grad()
            # First half of the scores belongs to the positives, second half to the negatives.
            pos_rank, neg_rank = self.model(message_passing_edge_index, edge_label_index).chunk(2)

            # Learning
            loss = self.model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
                lambda_reg=self.config['l2'],
            )
            loss.backward()
            self.optimizer.step()

            acc_loss += float(loss) * pos_rank.numel()
            n_samples += pos_rank.numel()

        if n_samples == 0:
            raise ValueError("train_loader produced no samples")

        return {
            'loss': acc_loss / n_samples,
        }

## Tune hyperparameters

In [11]:
from ray.tune.search.hyperopt import HyperOptSearch

# Fixed seed so the hyperparameters sampled by HyperOpt are repeatable across runs.
SEARCH_SEED: int = 42

# Repeater re-issues every suggested configuration NUM_FOLDS times; each repetition
# receives its index under the `__trial_index__` config key, which TrainLightGCN
# uses to select its fold.
base_search = HyperOptSearch(random_state_seed=SEARCH_SEED)
search_alg = tune.search.Repeater(base_search, repeat=NUM_FOLDS)

tuner = tune.Tuner(
    tune.with_parameters(TrainLightGCN, folds=folds, original=original),
    run_config=train.RunConfig(
        stop={'training_iteration': 5}
    ),
    param_space=dict(
        batch_size=64,
        learning_rate=0.001,
        embedding_dim=32,
        conv_layers=tune.randint(2,6),
        l2=1e-4,
    ),
    tune_config=tune.TuneConfig(
        search_alg=search_alg,
        # One trial per (configuration, fold) pair.
        num_samples=NUM_FOLDS*NUM_SAMPLES,
        metric='loss',
        mode='min',
    )
)
# Keep the ResultGrid so failed trials and metrics can be inspected in later cells.
results = tuner.fit()
results

0,1
Current time:,2023-09-26 15:19:31
Running for:,00:00:10.07
Memory:,10.7/15.3 GiB

Trial name,# failures,error file
TrainLightGCN_5008c07d,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_5008c07d_1_trial_index=0,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-21/error.txt"
TrainLightGCN_b9e3e1a2,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_b9e3e1a2_2_trial_index=1,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-23/error.txt"
TrainLightGCN_460fe231,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_460fe231_3_trial_index=2,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-25/error.txt"
TrainLightGCN_35f78854,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_35f78854_4_trial_index=3,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-27/error.txt"
TrainLightGCN_264bb339,1,"/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_264bb339_5_trial_index=4,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-29/error.txt"

Trial name,status,loc,__trial_index__,batch_size,conv_layers,embedding_dim,l2,learning_rate
TrainLightGCN_5008c07d,ERROR,,0,64,4,32,0.0001,0.001
TrainLightGCN_b9e3e1a2,ERROR,,1,64,4,32,0.0001,0.001
TrainLightGCN_460fe231,ERROR,,2,64,4,32,0.0001,0.001
TrainLightGCN_35f78854,ERROR,,3,64,4,32,0.0001,0.001
TrainLightGCN_264bb339,ERROR,,4,64,4,32,0.0001,0.001


2023-09-26 15:19:23,279	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_5008c07d
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2549, in get
    raise value
ray.exceptions

[2m[36m(TrainLightGCN pid=33236)[0m {'batch_size': 64, 'learning_rate': 0.001, 'embedding_dim': 32, 'conv_layers': 4, 'l2': 0.0001, '__trial_index__': 0}


2023-09-26 15:19:25,312	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_b9e3e1a2
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2549, in get
    raise value
ray.exceptions

[2m[36m(TrainLightGCN pid=33337)[0m {'batch_size': 64, 'learning_rate': 0.001, 'embedding_dim': 32, 'conv_layers': 4, 'l2': 0.0001, '__trial_index__': 3}[32m [repeated 3x across cluster][0m


2023-09-26 15:19:31,234	ERROR tune_controller.py:1502 -- Trial task failed for trial TrainLightGCN_264bb339
Traceback (most recent call last):
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/_private/worker.py", line 2549, in get
    raise value
ray.exceptions

ResultGrid<[
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_5008c07d_1_trial_index=0,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-21',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_b9e3e1a2_2_trial_index=1,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-23',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results/TrainLightGCN_2023-09-26_15-19-21/TrainLightGCN_460fe231_3_trial_index=2,batch_size=64,conv_layers=4,embedding_dim=32,l2=0.0001,learning_rate=0.0010_2023-09-26_15-19-25',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    error='TuneError',
    metrics={},
    path='/home/davo/ray_results

[2m[36m(TrainLightGCN pid=33370)[0m Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::_Inner.__init__()[39m (pid=33370, ip=147.96.25.138, actor_id=61b13cd1dfea13b918214afb01000000, repr=<ray.tune.trainable.util.TrainLightGCN object at 0x7f3518528710>)
[2m[36m(TrainLightGCN pid=33370)[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[2m[36m(TrainLightGCN pid=33370)[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[2m[36m(TrainLightGCN pid=33370)[0m   File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 185, in __init__
[2m[36m(TrainLightGCN pid=33370)[0m     self.setup(copy.deepcopy(self.config))
[2m[36m(TrainLightGCN pid=33370)[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[2m[36m(TrainLightGCN pid=33370)[0m   File "/home/davo/Documents/MUIA/upm-tfm-notebooks/.direnv/python-3.11/lib/python3.11/site-packages/ray/tune/trainab

[2m[36m(TrainLightGCN pid=33370)[0m {'batch_size': 64, 'learning_rate': 0.001, 'embedding_dim': 32, 'conv_layers': 4, 'l2': 0.0001, '__trial_index__': 4}
