In [1]:
from typing import Dict, List, Tuple, Union, Any, Optional

import os
import sys
import time
print("Python version", sys.version)

# Ignore pandas warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path

import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm.autonotebook import tqdm

import ray
from ray import train, tune

from src.datasets import daocensus

%load_ext autoreload
%autoreload 2

# Silence TensorFlow's Python-side INFO logs.
tf.get_logger().setLevel('WARNING')

# NOTE: the returned device list is discarded here (it is neither printed nor the
# cell's last expression), so this line only queries the GPUs.
tf.config.list_physical_devices("GPU")
# Report the CUDA / cuDNN versions recorded in TensorFlow's build info
# (-1 is printed when a key is missing), followed by the Ray version.
sys_details = tf.sysconfig.get_build_info()
cuda = sys_details.get("cuda_version", -1)
cudnn = sys_details.get("cudnn_version", -1)
print(cuda, cudnn)
print('Ray version:', ray.__version__)

Python version 3.9.18 (main, Oct 24 2023, 09:18:18) 
[GCC 11.4.0]


2023-11-27 11:31:32.724675: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-27 11:31:32.724699: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-27 11:31:32.724709: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


11.8 8
Ray version: 2.7.1


In [2]:
# Other config
# SEED is passed to the data loader, the model and the hyper-parameter search.
SEED: int = 57
# Directory where Ray Tune experiments are stored and searched for when restoring.
RAY_RESULTS_PATH: Path = Path('~/ray_results').expanduser()

# Dataset config
# Name of the DAO whose votes/proposals are loaded; also part of the experiment name.
DAO_NAME: str = 'Decentraland'

# Dataset splits config
SPLIT_FREQ: str = '1W' # Split weekly (only referenced by the commented-out timeFreqSplitCurrent call)
LAST_SPLITS: int = 10 # Use just last 10 splits (also the number of tuners built below)

# Training config
# Trials stop after MAX_EPOCHS / EPOCHS_PER_ITER reported iterations;
# each Tune step trains EPOCHS_PER_ITER epochs before evaluating.
MAX_EPOCHS: int = 200
EPOCHS_PER_ITER: int = 5
# Number of hyper-parameter samples per fold on the main training machine.
SAMPLES_PER_SPLIT: int = 10

# Eval config
# Cut-offs for the ranking metrics; TOP_K[0] is also the model's built-in eval cut-off.
TOP_K: List[int] = [5, 10]
METRICS: List[str] = ["recall", "ndcg", "precision", "map"]

# Load data

In [3]:
# Proposal metadata (title, description, start/end) read from the local CSV export.
dfptext = pd.read_csv('./snapshot_proposals.csv')[['proposal_id', 'title', 'description', 'start', 'end']]
# Votes (dfv) and proposals (dfp) of DAO_NAME on the 'snapshot' platform.
dfv, dfp = daocensus.get("./data/daos-census", DAO_NAME, 'snapshot')
dfv['voter'] = dfv['voter'].astype('str')
# Left merge keeps every proposal even when the CSV has no metadata for it.
dfp = dfp.merge(dfptext, how='left', left_on='platform_proposal', right_on='proposal_id')
# Use an explicit unit: the unit-less 'datetime64' dtype is rejected by pandas >= 2.0,
# while 'datetime64[ns]' is what pandas 1.x resolved it to anyway.
dfp[['start', 'end']] = dfp[['start', 'end']].astype('datetime64[ns]')
# DataFrame.info() prints its report itself and returns None, so it is not wrapped
# in print() (that only appended a stray "None" line to the output).
dfv.info()
dfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116560 non-null  category      
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  object        
 7   date           116560 non-null  datetime64[ns]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(7)
memory usage: 8.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1942 entries, 0 to 1941
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------ 

## Transform data

In [4]:
def to_microsoft(dfv):
    """Reshape the votes table into a user/item/timestamp/rating interactions table.

    Args:
        dfv: DataFrame with at least the columns ``voter``, ``proposal`` and ``date``.

    Returns:
        A new DataFrame with columns ``userID``, ``itemID`` (cast to ``str``),
        ``timestamp`` and a constant ``rating`` of 1 (every vote is a positive
        implicit interaction). ``dfv`` itself is left untouched.
    """
    column_map = {
        'voter': 'userID',
        'proposal': 'itemID',
        'date': 'timestamp',
    }
    interactions = dfv[list(column_map)].rename(columns=column_map)
    return interactions.assign(
        itemID=interactions['itemID'].astype('str'),
        rating=1,
    )

# Interactions table (userID, itemID, timestamp, rating) used by the split and
# training cells below; preview the first rows.
df = to_microsoft(dfv)
df.head()

Unnamed: 0,userID,itemID,timestamp,rating
0,0xe7af1c70f8f089c4c3bd71999692c6c5a15d9e2a,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 12:28:01,1
1,0xc54a6c3778016b06cbd126ccc3b5bc06c5f666fb,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 02:16:23,1
2,0xd82d005e8f8d5385db40ba23884a5c967bb1e8af,b86aa059-3d31-5d41-a472-70962816f779,2021-12-17 00:38:22,1
3,0xf4c64db66ffb301985f5ecd85c8f3f9c02f2659d,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:47:08,1
4,0xd5e9ef1cedad0d135d543d286a2c190b16cbb89e,b86aa059-3d31-5d41-a472-70962816f779,2021-12-16 18:32:15,1


# Split data

Each proposal remains open for only a few days, so our setting differs from that of a movie recommender system. For this reason, we use a time-series split instead of K-fold to cross-validate the model.

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png)

scikit-learn's TimeSeriesSplit does not work for us because every split contains the same number of elements, while the time interval each split covers varies. Since we want to simulate realistic behaviour, we split the data into intervals of equal length instead.

In [5]:
from recommenders.evaluation.python_evaluation import metrics as metrics_dict

In [6]:
from src.model_selection import timeFreqSplitCurrent, timeIntervalSplitCurrent

# max_train_prev = df['timestamp'].min().date()
N_SPLITS = 10
SKIP_SPLIT = 0
# Each fold is a (train, test, t, open_proposals) tuple.
folds = list(timeIntervalSplitCurrent(df, N_SPLITS, dfp, skip=SKIP_SPLIT, remove_not_in_train_col='userID', return_open=True))
# folds = list(timeFreqSplitCurrent(df, SPLIT_FREQ, dfp, return_open=True, remove_not_in_train_col='userID'))[-LAST_SPLITS:]
for i, (dftrain, dftest, t, open_proposals) in enumerate(folds):
    min_train = dftrain['timestamp'].min().date()
    max_train = dftrain['timestamp'].max().date()
    min_test  = dftest['timestamp'].min().date()
    max_test  = dftest['timestamp'].max().date()

    train_users = len(set(dftrain['userID']))
    test_users = len(set(dftest['userID']))

    print(f"Split {i}, train from: {min_train} to {max_train}, test from: {min_test} to {max_test}")
    print(f"  t: {t}")
    print(f"  len(train): {len(dftrain)}, len(test): {len(dftest)}")
    print(f"  users(train): {train_users}, users(test): {test_users}")

    print()
    # Upper bound of each metric: score the test interactions against themselves.
    # The 'prediction' column is added to a copy so the frames stored in `folds`
    # (later handed to the data loader and to Ray) are not modified by this cell.
    perfect_recs = dftest.assign(prediction=1)
    for m in METRICS:
        f = metrics_dict[f'{m}_at_k']
        print(f"  highest possible {m}@{TOP_K[0]}:\t{f(dftest, perfect_recs, k=TOP_K[0], relevancy_method='top_k'):.4f}")

    print("-"*30)

Split 0, train from: 2021-05-24 to 2021-08-04, test from: 2021-08-04 to 2021-08-10
  t: 2021-08-04 00:57:07
  len(train): 3453, len(test): 48
  users(train): 581, users(test): 27

  highest possible recall@5:	0.9894
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3407
  highest possible map@5:	0.9894
------------------------------
Split 1, train from: 2021-05-24 to 2021-10-14, test from: 2021-10-15 to 2021-10-19
  t: 2021-10-14 16:08:30
  len(train): 6397, len(test): 132
  users(train): 930, users(test): 121

  highest possible recall@5:	0.9976
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.2149
  highest possible map@5:	0.9976
------------------------------
Split 2, train from: 2021-05-24 to 2021-12-25, test from: 2021-12-25 to 2021-12-31
  t: 2021-12-25 02:40:07
  len(train): 15187, len(test): 99
  users(train): 2388, users(test): 47

  highest possible recall@5:	0.9721
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3660
 

# Defining training

In [7]:
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.utils.python_utils import get_top_k_scored_items

class LightGCNCustom(LightGCN):
    """LightGCN that can be trained one epoch at a time and can restrict its recommendations.

    ``fit_epoch`` is adapted from ``LightGCN.fit`` but runs a single epoch and
    returns the losses; ``recommend_k_items`` adds the ``recommend_from`` filter.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Number of epochs completed through fit_epoch (only fit_epoch increments it).
        self.epochs_done = 0

    def fit_epoch(self):
        """Run one training epoch on ``self.data.train``.

        Returns:
            tuple: ``(loss, mf_loss, emb_loss)``, each averaged over the batches
            of this epoch.

        Raises:
            RuntimeError: if the averaged loss is NaN. An exception is raised
                instead of calling ``sys.exit()`` so that the caller (e.g. a
                Ray Tune trial) fails with a clear error rather than having
                its whole process terminated.
        """
        loss, mf_loss, emb_loss = 0.0, 0.0, 0.0
        n_batch = self.data.train.shape[0] // self.batch_size + 1
        for _ in range(n_batch):
            users, pos_items, neg_items = self.data.train_loader(self.batch_size)
            _, batch_loss, batch_mf_loss, batch_emb_loss = self.sess.run(
                [self.opt, self.loss, self.mf_loss, self.emb_loss],
                feed_dict={
                    self.users: users,
                    self.pos_items: pos_items,
                    self.neg_items: neg_items,
                },
            )
            loss += batch_loss / n_batch
            mf_loss += batch_mf_loss / n_batch
            emb_loss += batch_emb_loss / n_batch

        if np.isnan(loss):
            raise RuntimeError("ERROR: loss is nan.")

        self.epochs_done += 1

        return loss, mf_loss, emb_loss

    def recommend_k_items(
        self, test, top_k=10, sort_top_k=True, remove_seen=True, use_id=False, recommend_from=None,
    ):
        """Recommend the top-k items for every user in ``test``.

        Adapted from ``LightGCN.recommend_k_items`` with the extra
        ``recommend_from`` argument.

        Args:
            test: DataFrame whose ``data.col_user`` column lists the users to score.
            top_k: number of items to request per user.
            sort_top_k: forwarded to ``get_top_k_scored_items``.
            remove_seen: forwarded to ``self.score``.
            use_id: if True, the user column of ``test`` already holds internal
                ids and the returned item column holds internal item ids;
                otherwise users are mapped through ``data.user2id`` and items
                back through ``data.id2item``.
            recommend_from: optional iterable of item identifiers (keys of
                ``data.item2id``). When given, every other item is excluded
                from the recommendations. An empty iterable excludes all items.

        Returns:
            DataFrame with the user, item and prediction columns. Rows whose
            score is ``-inf`` are dropped, so a user may get fewer than
            ``top_k`` rows (or none at all).

        Raises:
            KeyError: if ``recommend_from`` contains an item that is not in
                ``data.item2id``.
        """
        data = self.data
        if not use_id:
            user_ids = np.array([data.user2id[x] for x in test[data.col_user].unique()])
        else:
            user_ids = np.array(test[data.col_user].unique())

        test_scores = self.score(user_ids, remove_seen=remove_seen)

        ### START NEW BEHAVIOUR
        if recommend_from is not None:
            # Explicit integer dtype: an empty selection would otherwise become a
            # float64 array, which numpy refuses to use as an index.
            from_idx = np.array(
                [data.item2id[x] for x in set(recommend_from)], dtype=int
            )
            # True marks the items that must NOT be recommended.
            msk = np.ones(test_scores.shape[1], bool)
            msk[from_idx] = False

            # Items outside `recommend_from` get -inf for every user, so they
            # rank last and are dropped from the result below.
            test_scores[:, msk] = -np.inf
        ### END NEW BEHAVIOUR

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                data.col_user: np.repeat(
                    test[data.col_user].drop_duplicates().values, top_items.shape[1]
                ),
                data.col_item: top_items.flatten()
                if use_id
                else [data.id2item[item] for item in top_items.flatten()],
                data.col_prediction: top_scores.flatten(),
            }
        )

        # Discard the masked-out entries (-inf scores).
        return df.replace(-np.inf, np.nan).dropna()

## Small test of `LightGCNCustom`

In [8]:
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

In [9]:
# Minimal hyper-parameters for a quick smoke test: two epochs, evaluating on the second.
hparams = prepare_hparams(
    model_type='lightgcn',
    n_layers=3,
    batch_size=512,
    embed_size=64,
    epochs=2,
    learning_rate=0.001,
    decay=0.001,
    metrics=["recall", "ndcg", "precision", "map"],
    eval_epoch=2,
    top_k=TOP_K[0],
    save_model=False,
    MODEL_DIR='./data/model/lightgcn/',
)
# Use the train/test frames of the last fold (elements 0 and 1 of the fold tuple).
dataloader = ImplicitCF(train=folds[-1][0], test=folds[-1][1], seed=SEED)
print("items:", dataloader.n_items, "user:", dataloader.n_users)
model = LightGCNCustom(data=dataloader, hparams=hparams)

items: 1836 user: 6857
Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [10]:
model.fit()

Epoch 1 (train)0.7s: train loss = 0.59989 = (mf)0.59914 + (embed)0.00075
Epoch 2 (train)0.5s + (eval)0.5s: train loss = 0.35067 = (mf)0.34676 + (embed)0.00391, recall = 0.00400, ndcg = 0.00394, precision = 0.00320, map = 0.00200


In [11]:
model.run_eval()

[0.004, 0.00394083821985168, 0.0032, 0.002]

In [12]:
# Restrict the recommendations to three proposals. `recommend_from` takes item
# identifiers that are keys of the data loader's item2id mapping, while
# use_id=True makes the user and item columns of the result internal ids.
model.recommend_k_items(dataloader.test, top_k=3, use_id=True, remove_seen=True, recommend_from={'b86aa059-3d31-5d41-a472-70962816f779', '56b4d333-4138-5aa3-9890-3502b9478079', 'd083109e-4819-54b9-a01c-67bd5a770f65' })

Unnamed: 0,userID,itemID,prediction
0,293,1175,6.576969
1,293,1835,1.817568
2,293,0,-4.204648
3,466,1835,1.430669
4,466,0,-0.766504
...,...,...,...
369,245,1835,1.773020
370,245,0,-5.174218
372,578,1175,3.910652
373,578,1835,0.921735


# Defining trainer

In [13]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

class TrainLightGCN(tune.Trainable):
    """Ray Tune trainable that trains and evaluates a ``LightGCNCustom`` on one fold."""

    def setup(
        self,
        config: Dict[str, Any],
        fold,
        data,
    ):
        """Build the data loader and the model for one trial.

        Args:
            config: sampled hyper-parameters. Reads ``batch_size`` (an exponent:
                the real batch size is ``2**batch_size``), ``conv_layers``,
                ``embedding_dim``, ``learning_rate`` and ``l2``.
            fold: identifier of the fold being trained; only stored on the instance.
            data: ``(train, test, t, open_proposals)`` tuple for that fold.
        """
        self.config = config
        self.fold = fold
        # The search space samples the exponent; from here on config holds the real size.
        config['batch_size'] = 2**config['batch_size']

        self.hparams = prepare_hparams(
            model_type='lightgcn',
            n_layers=config['conv_layers'],
            batch_size=config['batch_size'],
            embed_size=config['embedding_dim'],
            epochs=EPOCHS_PER_ITER,
            learning_rate=config['learning_rate'],
            decay=config['l2'],
            metrics=METRICS,
            eval_epoch=-1,
            top_k=TOP_K[0],
            save_model=False,
            MODEL_DIR='./data/model/lightgcn/',
        )

        train, test, self.t, self.open_proposals = data
        self.dataloader = ImplicitCF(train=train, test=test, seed=SEED)
        self.model = LightGCNCustom(self.hparams, self.dataloader, seed=SEED)
        # Accumulated wall-clock seconds spent training / evaluating across steps.
        self.total_train = 0
        self.total_eval = 0

    @property
    def iteration(self):
        """Number of epochs trained so far by the wrapped model."""
        return self.model.epochs_done

    @property
    def training_iteration(self):
        """Same value as ``iteration``: epochs trained so far by the wrapped model."""
        return self.model.epochs_done

    def step(self):
        """Train ``EPOCHS_PER_ITER`` epochs, then evaluate at every cut-off in ``TOP_K``.

        As a rule of thumb, the execution time of step should be large enough to avoid overheads
        (i.e. more than a few seconds), but short enough to report progress periodically
        (i.e. at most a few minutes).

        Returns:
            dict with the losses of the last epoch of this step, the model's own
            metrics (prefixed with ``model_``), precision/ndcg/recall/map at each
            k computed on recommendations restricted to the open proposals, and
            timing information.
        """
        assert EPOCHS_PER_ITER > 0

        train_start = time.time()
        for _ in range(EPOCHS_PER_ITER):
            ret = self.model.fit_epoch()
        eval_start = train_end = time.time()

        eval_dict = {'model_'+k:v for k,v in zip(self.model.metrics, self.model.run_eval())}
        for k in TOP_K:
            recs = self.model.recommend_k_items(
                self.dataloader.test,
                top_k=k,
                use_id=True,
                remove_seen=True,
                recommend_from=self.open_proposals,
            )

            eval_dict[f'precision@{k}'] = precision_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'ndcg@{k}'] = ndcg_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'recall@{k}'] = recall_at_k(self.dataloader.test, recs, k=k)
            eval_dict[f'map@{k}'] = map_at_k(self.dataloader.test, recs, k=k)

        eval_end = time.time()

        self.total_train += train_end - train_start
        self.total_eval += eval_end - eval_start

        return {
            'iteration': self.iteration,
            'loss': ret[0],
            'mf_loss': ret[1],
            'emb_loss': ret[2],
            **eval_dict,
            'time_train': train_end-train_start,
            'time_test': eval_end-eval_start,
            'time_total_train': self.total_train,
            'time_total_test': self.total_eval,
        }

    def save_checkpoint(self, checkpoint_dir):
        """Save the TensorFlow session under ``<checkpoint_dir>/model``.

        Returns:
            ``checkpoint_dir`` (the directory, not the file prefix).
        """
        checkpoint_path = os.path.join(checkpoint_dir, "model")
        self.model.saver.save(
            sess=self.model.sess,
            save_path=checkpoint_path,
        )
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        """Restore the model weights written by ``save_checkpoint``.

        Args:
            checkpoint_path: either the checkpoint directory returned by
                ``save_checkpoint`` or the ``<dir>/model`` prefix itself.
        """
        # save_checkpoint stores the files under the "model" prefix inside the
        # directory it returns, so a directory has to be mapped to that prefix
        # before handing it to the saver.
        if os.path.isdir(checkpoint_path):
            checkpoint_path = os.path.join(checkpoint_path, "model")
        # NOTE(review): `epochs_done` and the accumulated timings are not part of
        # the checkpoint, so they restart from zero after a restore.
        self.model.load(checkpoint_path)

# Big experiment

In [14]:
os.uname().nodename

'lamarck'

In [15]:
RAY_RESULTS_PATH

PosixPath('/home/daviddavo/ray_results')

In [16]:
import ray
from ray.tune.search.hyperopt import HyperOptSearch
import datetime as dt

def getTunerOnFold(f):
    """Create, or restore, the Ray Tune ``Tuner`` for fold ``f``.

    The most recently created experiment directory in ``RAY_RESULTS_PATH`` whose
    name starts with this DAO/fold prefix is restored when Ray reports it as
    restorable; otherwise a new experiment is configured.

    Args:
        f: index into the module-level ``folds`` list.

    Returns:
        A ``tune.Tuner`` ready to ``fit()``.

    Raises:
        IndexError: if ``f`` is not a valid index of ``folds``.
    """
    name = f'LightGCN_dao={DAO_NAME},fold={f}'
    paths = list(RAY_RESULTS_PATH.glob(f'{name}_*'))
    last_experiment = max(paths, key=lambda x: x.stat().st_ctime) if paths else None

    ### SET TRAINING RESOURCES
    if os.uname().nodename == 'lamarck':
        num_samples = SAMPLES_PER_SPLIT
        resources_per_trial = {
            'cpu': 1,
            # Fractional GPU share: up to 8 trials run concurrently on one GPU.
            # The original notes estimated up to ~2GiB of VRAM per run on a
            # ~24GiB card, so this leaves headroom for other users of the machine.
            'gpu': 1/8,
        }
    else:
        num_samples = 1
        resources_per_trial = {
            'cpu': 1,
            # It takes about 1.5 GiB with full training data, but I put a bit more because
            # this notebook also takes a bit of memory
            'memory': 2e9,
        }

    param_space = dict(
        batch_size=tune.randint(4,10),  # exponent: the trainable uses 2**batch_size
        embedding_dim=tune.qlograndint(1, 1000, 5),
        conv_layers=tune.randint(1,6),
        learning_rate=tune.qloguniform(1e-4, 1, 1e-4),
        l2=tune.loguniform(1e-7, 1e-2),
    )

    # Same trainable for both branches. The keyword names must match
    # TrainLightGCN.setup(config, fold, data); the restore branch previously
    # passed `folds=folds`, which setup() does not accept.
    trainable = tune.with_resources(
        tune.with_parameters(TrainLightGCN, fold=f, data=folds[f]),
        resources_per_trial,
    )

    ### RESTORE EXPERIMENT OR CREATE A NEW ONE
    if last_experiment and tune.Tuner.can_restore(last_experiment):
        print(f"Restoring last experiment: {last_experiment}")
        tuner = tune.Tuner.restore(
            str(last_experiment),
            trainable=trainable,
            restart_errored=True,
            param_space=param_space,
        )
    else:
        print(f"No experiment found for fold {f}, creating new tuner")
        # Seed the search with one hand-picked configuration.
        search_alg = HyperOptSearch(
            points_to_evaluate=[{
                'batch_size': 8, # 2**8 = 256
                'learning_rate': 10e-2,
                'l2': 10e-6,
                'embedding_dim': 100,
                'conv_layers': 3,
            }],
            random_state_seed=SEED,
        )
        # search_alg = tune.search.Repeater(search_alg, N_SPLITS-SKIP_SPLIT)

        tuner = tune.Tuner(
            trainable,
            run_config=train.RunConfig(
                # Stop on whichever comes first: the epoch budget or 600s of training time.
                stop={'training_iteration': MAX_EPOCHS/EPOCHS_PER_ITER, 'time_total_train': 600},
                name=name + f'_{dt.datetime.now().isoformat()}',
                storage_path=RAY_RESULTS_PATH,
            ),
            param_space=param_space,
            tune_config=tune.TuneConfig(
                search_alg=search_alg,
                num_samples=num_samples,
                metric='ndcg@10',
                mode='max',
            )
        )

    return tuner

# One tuner per fold index 0 .. LAST_SPLITS-1.
# NOTE(review): getTunerOnFold indexes `folds` by position, so this assumes
# len(folds) >= LAST_SPLITS — confirm if N_SPLITS / SKIP_SPLIT are changed.
tuners = [ getTunerOnFold(f) for f in range(LAST_SPLITS) ]

No experiment found for fold 0, creating new tuner
No experiment found for fold 1, creating new tuner
No experiment found for fold 2, creating new tuner
No experiment found for fold 3, creating new tuner
No experiment found for fold 4, creating new tuner
No experiment found for fold 5, creating new tuner
No experiment found for fold 6, creating new tuner
No experiment found for fold 7, creating new tuner
No experiment found for fold 8, creating new tuner
No experiment found for fold 9, creating new tuner


In [17]:
tuners[0].fit()

0,1
Current time:,2023-11-27 11:33:03
Running for:,00:01:15.67
Memory:,7.4/125.6 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_0c8f33b3,TERMINATED,147.96.81.131:3228932,8,3,100,1e-05,0.1,40,6.38276,200,0.0421986,0.0187669
TrainLightGCN_eaf512e7,TERMINATED,147.96.81.131:3229124,7,5,30,2.8606e-05,0.2347,40,10.94,200,0.178844,0.0840955
TrainLightGCN_e0d2668a,TERMINATED,147.96.81.131:3229260,8,3,5,5.24282e-05,0.22,40,7.27317,200,0.108728,0.104674
TrainLightGCN_2154c6f0,TERMINATED,147.96.81.131:3229393,6,4,300,8.92312e-07,0.0065,40,28.1228,200,0.000303895,8.61521e-05
TrainLightGCN_712b1820,TERMINATED,147.96.81.131:3229590,5,3,215,2.79193e-05,0.0005,40,48.4452,200,0.0106943,0.00893954
TrainLightGCN_ea8116f1,TERMINATED,147.96.81.131:3229724,5,4,5,0.00397996,0.0038,40,49.1856,200,0.172879,0.112907
TrainLightGCN_abfa9f33,TERMINATED,147.96.81.131:3229863,5,3,5,0.0011209,0.0002,40,50.0777,200,0.260389,0.25047
TrainLightGCN_9d7cec86,TERMINATED,147.96.81.131:3230001,7,4,405,0.00406073,0.0014,40,19.227,200,0.12467,0.0477474
TrainLightGCN_6d9d5af3,TERMINATED,147.96.81.131:3230190,5,1,205,6.80694e-06,0.0085,40,51.058,200,0.00467396,0.00322871
TrainLightGCN_aa9755fb,TERMINATED,147.96.81.131:3230324,9,4,110,0.000527247,0.0001,40,10.2457,200,0.386654,0.384152


[2m[36m(pid=3228932)[0m 2023-11-27 11:31:49.255099: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=3228932)[0m 2023-11-27 11:31:49.255125: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=3228932)[0m 2023-11-27 11:31:49.255137: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2m[36m(TrainLightGCN pid=3228932)[0m   df = train if test is None else train.append(test)


[2m[36m(TrainLightGCN pid=3228932)[0m Already create adjacency matrix.
[2m[36m(TrainLightGCN pid=3228932)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=3228932)[0m Using xavier initialization.


[2m[36m(pid=3229260)[0m 2023-11-27 11:31:54.368370: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(pid=3229260)[0m 2023-11-27 11:31:54.368402: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3229260)[0m 2023-11-27 11:31:54.368414: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repe

[2m[36m(TrainLightGCN pid=3229260)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229260)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229124)[0m Using xavier initialization.
[2m[36m(TrainLightGCN pid=3229260)[0m Using xavier initialization.


[2m[36m(TrainLightGCN pid=3228932)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_0c8f33b3_1_batch_size=8,conv_layers=3,embedding_dim=100,l2=0.0000,learning_rate=0.1000_2023-11-27_11-31-48/checkpoint_000000)
[2m[36m(pid=3229590)[0m 2023-11-27 11:31:59.708204: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3229590)[0m 2023-11-27 11:31:59.708240: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3229590)[0m 2023-11-27 11:31:59.708253: E tensorflow/compiler/xla/stream_executor/cuda/cuda_b

[2m[36m(TrainLightGCN pid=3229590)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229590)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229393)[0m Using xavier initialization.
[2m[36m(TrainLightGCN pid=3229590)[0m Using xavier initialization.


[2m[36m(TrainLightGCN pid=3229260)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_e0d2668a_3_batch_size=8,conv_layers=3,embedding_dim=5,l2=0.0001,learning_rate=0.2200_2023-11-27_11-31-53/checkpoint_000000)
[2m[36m(pid=3229863)[0m 2023-11-27 11:32:05.181483: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3229863)[0m 2023-11-27 11:32:05.181513: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3229863)[0m 2023-11-27 11:32:05.181527: E tensorflow/compiler/xla/stream_executor/cuda/cuda_bla

[2m[36m(TrainLightGCN pid=3229863)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229863)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3229863)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(TrainLightGCN pid=3229124)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_eaf512e7_2_batch_size=7,conv_layers=5,embedding_dim=30,l2=0.0000,learning_rate=0.2347_2023-11-27_11-31-50/checkpoint_000000)
[2m[36m(pid=3230190)[0m 2023-11-27 11:32:10.844224: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3230190)[0m 2023-11-27 11:32:10.844262: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3230190)[0m 2023-11-27 11:32:10.844275: E tensorflow/compiler/xla/stream_executor/cuda/cuda_bl

[2m[36m(TrainLightGCN pid=3230190)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230190)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230190)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(TrainLightGCN pid=3230324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_aa9755fb_10_batch_size=9,conv_layers=4,embedding_dim=110,l2=0.0005,learning_rate=0.0001_2023-11-27_11-32-12/checkpoint_000000)
[2m[36m(pid=3230324)[0m 2023-11-27 11:32:13.841549: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=3230324)[0m 2023-11-27 11:32:13.841584: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=3230324)[0m 2023-11-27 11:32:13.841599: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register fact

ResultGrid<[
  Result(
    metrics={'iteration': 200, 'loss': 0.04219864987369095, 'mf_loss': 0.01876692866393819, 'emb_loss': 0.023431720877332345, 'model_recall': 0.0, 'model_ndcg': 0.0, 'model_precision': 0.0, 'model_map': 0.0, 'precision@5': 0.19259259259259262, 'ndcg@5': 0.32814707703587825, 'recall@5': 0.5288359788359789, 'map@5': 0.22530717225161667, 'precision@10': 0.17777777777777778, 'ndcg@10': 0.5042029036420986, 'recall@10': 1.0, 'map@10': 0.3384486016628873, 'time_train': 0.12309741973876953, 'time_test': 0.029708147048950195, 'time_total_train': 4.546944856643677, 'time_total_test': 1.835418939590454},
    path='/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_0c8f33b3_1_batch_size=8,conv_layers=3,embedding_dim=100,l2=0.0000,learning_rate=0.1000_2023-11-27_11-31-48',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.3

[2m[36m(TrainLightGCN pid=3230190)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=0_2023-11-27T11:31:45.345890/TrainLightGCN_6d9d5af3_9_batch_size=5,conv_layers=1,embedding_dim=205,l2=0.0000,learning_rate=0.0085_2023-11-27_11-32-09/checkpoint_000000)


In [18]:
tuners[1].fit()

0,1
Current time:,2023-11-27 11:35:08
Running for:,00:02:04.27
Memory:,7.4/125.6 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_b4d5ffd3,TERMINATED,147.96.81.131:3230663,8,3,100,1e-05,0.1,40,11.4349,200,0.154042,0.117106
TrainLightGCN_4c591499,TERMINATED,147.96.81.131:3230770,7,5,30,2.8606e-05,0.2347,40,27.5396,200,0.267354,0.12911
TrainLightGCN_1c5aa8d2,TERMINATED,147.96.81.131:3230900,8,3,5,5.24282e-05,0.22,40,15.6308,200,0.0976949,0.0929654
TrainLightGCN_2cacb1ad,TERMINATED,147.96.81.131:3231032,6,4,300,8.92312e-07,0.0065,40,62.3174,200,0.000994242,0.000709648
TrainLightGCN_3152e061,TERMINATED,147.96.81.131:3231165,5,3,215,2.79193e-05,0.0005,40,101.871,200,0.00608602,0.00425104
TrainLightGCN_76c7f30d,TERMINATED,147.96.81.131:3231299,5,4,5,0.00397996,0.0038,40,101.209,200,0.143354,0.0900171
TrainLightGCN_11266433,TERMINATED,147.96.81.131:3231439,5,3,5,0.0011209,0.0002,40,102.977,200,0.199206,0.188503
TrainLightGCN_1b8392ce,TERMINATED,147.96.81.131:3231569,7,4,405,0.00406073,0.0014,40,40.702,200,0.106335,0.0394764
TrainLightGCN_443a692a,TERMINATED,147.96.81.131:3231702,5,1,205,6.80694e-06,0.0085,40,99.0603,200,0.00687999,0.00483418
TrainLightGCN_14cef882,TERMINATED,147.96.81.131:3231843,9,4,110,0.000527247,0.0001,40,18.2314,200,0.240458,0.236538


[2m[36m(pid=3230663)[0m 2023-11-27 11:33:04.890277: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=3230663)[0m 2023-11-27 11:33:04.890308: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=3230663)[0m 2023-11-27 11:33:04.890321: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2m[36m(TrainLightGCN pid=3230663)[0m   df = train if test is None else train.append(test)


[2m[36m(TrainLightGCN pid=3230663)[0m Already create adjacency matrix.
[2m[36m(TrainLightGCN pid=3230663)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=3230663)[0m Using xavier initialization.


[2m[36m(pid=3230900)[0m 2023-11-27 11:33:10.003333: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3230900)[0m 2023-11-27 11:33:10.003365: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3230900)[0m 2023-11-27 11:33:10.003377: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230900)[0m   df = train if test is None else train.append(test)[32m [repeated 2x across cluster][0m


[2m[36m(TrainLightGCN pid=3230900)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230900)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230900)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(pid=3231165)[0m 2023-11-27 11:33:15.350001: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231165)[0m 2023-11-27 11:33:15.350036: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231165)[0m 2023-11-27 11:33:15.350051: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231165)[0m   df = train if test is None else train.append(test)[32m [repeated 2x across cluster][0m


[2m[36m(TrainLightGCN pid=3231165)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231165)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231165)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(TrainLightGCN pid=3230663)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=1_2023-11-27T11:31:45.350771/TrainLightGCN_b4d5ffd3_1_batch_size=8,conv_layers=3,embedding_dim=100,l2=0.0000,learning_rate=0.1000_2023-11-27_11-33-03/checkpoint_000000)
[2m[36m(pid=3231439)[0m 2023-11-27 11:33:21.028755: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231439)[0m 2023-11-27 11:33:21.028790: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231439)[0m 2023-11-27 11:33:21.028803: E tensorflow/compiler/xla/stream_executor/cuda/cuda_b

[2m[36m(TrainLightGCN pid=3231439)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231439)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231439)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(pid=3231702)[0m 2023-11-27 11:33:27.024618: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231702)[0m 2023-11-27 11:33:27.024656: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3231702)[0m 2023-11-27 11:33:27.024667: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3230900)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fo

[2m[36m(TrainLightGCN pid=3231702)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231702)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3231702)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(pid=3231843)[0m 2023-11-27 11:33:30.141335: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=3231843)[0m 2023-11-27 11:33:30.141371: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=3231843)[0m 2023-11-27 11:33:30.141384: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2m[36m(TrainLightGCN pid=3230770)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=1_2023-11-27T11:31:45.350771/TrainLightGCN_4c591499_2_batch_size=7,conv_layers=5,embedding_dim=30,l2=0.0000,lea

ResultGrid<[
  Result(
    metrics={'iteration': 200, 'loss': 0.1540416529774666, 'mf_loss': 0.11710555890473452, 'emb_loss': 0.03693609341979027, 'model_recall': 0.8099173553719008, 'model_ndcg': 0.40266699365531883, 'model_precision': 0.16198347107438016, 'model_map': 0.2672176308539945, 'precision@5': 0.18842975206611567, 'ndcg@5': 0.5667554888817862, 'recall@5': 0.903482880755608, 'map@5': 0.44366883116883127, 'precision@10': 0.10247933884297519, 'ndcg@10': 0.5831849356243723, 'recall@10': 0.95543093270366, 'map@10': 0.45190869736324285, 'time_train': 0.339479923248291, 'time_test': 0.08572196960449219, 'time_total_train': 9.175868511199951, 'time_total_test': 2.2585694789886475},
    path='/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=1_2023-11-27T11:31:45.350771/TrainLightGCN_b4d5ffd3_1_batch_size=8,conv_layers=3,embedding_dim=100,l2=0.0000,learning_rate=0.1000_2023-11-27_11-33-03',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/home/davidd

In [19]:
# Run the Ray Tune search for the last tuner in `tuners`; the ResultGrid returned
# by `fit()` is the cell's displayed value.
# NOTE(review): the captured output for this cell shows trials still RUNNING/PENDING,
# a total run time of ~39 s and a "could not fetch metrics" error for one trial, so
# this run looks like it was stopped early — confirm before relying on these results
# (Ray suggests resuming via `Tuner.restore(...)` in the log).
tuners[-1].fit()

0,1
Current time:,2023-11-27 11:35:36
Running for:,00:00:28.73
Memory:,15.9/125.6 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_aa592f54,RUNNING,147.96.81.131:3232073,8,3,100,1e-05,0.1,3.0,25.5241,15.0,0.296893,0.248759
TrainLightGCN_4fd6baff,RUNNING,147.96.81.131:3232180,7,5,30,2.8606e-05,0.2347,1.0,14.8364,5.0,0.683147,0.562744
TrainLightGCN_4bd30b54,RUNNING,147.96.81.131:3232313,8,3,5,5.24282e-05,0.22,1.0,9.28025,5.0,0.289777,0.281659
TrainLightGCN_7e5fbfe6,RUNNING,147.96.81.131:3232445,6,4,300,8.92312e-07,0.0065,,,,,
TrainLightGCN_b0b80685,RUNNING,147.96.81.131:3232576,5,3,215,2.79193e-05,0.0005,,,,,
TrainLightGCN_edfbbcd5,RUNNING,147.96.81.131:3232709,5,4,5,0.00397996,0.0038,,,,,
TrainLightGCN_5b6c29b8,RUNNING,147.96.81.131:3232842,5,3,5,0.0011209,0.0002,,,,,
TrainLightGCN_9ba719d7,RUNNING,147.96.81.131:3232976,7,4,405,0.00406073,0.0014,,,,,
TrainLightGCN_5aa7ffb2,PENDING,,5,1,205,6.80694e-06,0.0085,,,,,


[2m[36m(pid=3232073)[0m 2023-11-27 11:35:09.224570: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[2m[36m(pid=3232073)[0m 2023-11-27 11:35:09.224601: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[2m[36m(pid=3232073)[0m 2023-11-27 11:35:09.224615: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2m[36m(TrainLightGCN pid=3232073)[0m   df = train if test is None else train.append(test)


[2m[36m(TrainLightGCN pid=3232073)[0m Already create adjacency matrix.
[2m[36m(TrainLightGCN pid=3232073)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=3232073)[0m Using xavier initialization.


[2m[36m(pid=3232313)[0m 2023-11-27 11:35:15.530804: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232313)[0m 2023-11-27 11:35:15.530837: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232313)[0m 2023-11-27 11:35:15.530849: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232180)[0m   df = train if test is None else train.append(test)
[2m[36m(TrainLightGCN pid=3232313)[0m   df = train if test is None els

[2m[36m(TrainLightGCN pid=3232313)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232313)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232313)[0m Using xavier initialization.[32m [repeated 2x across cluster][0m


[2m[36m(pid=3232576)[0m 2023-11-27 11:35:22.211792: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232576)[0m 2023-11-27 11:35:22.211827: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232576)[0m 2023-11-27 11:35:22.211839: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232445)[0m   df = train if test is None else train.append(test)
[2m[36m(TrainLightGCN pid=3232576)[0m   df = train if test is None els

[2m[36m(TrainLightGCN pid=3232576)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232576)[0m Already normalize adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232445)[0m Using xavier initialization.
[2m[36m(TrainLightGCN pid=3232576)[0m Using xavier initialization.


[2m[36m(pid=3232842)[0m 2023-11-27 11:35:29.526511: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232842)[0m 2023-11-27 11:35:29.526546: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(pid=3232842)[0m 2023-11-27 11:35:29.526559: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232709)[0m   df = train if test is None else train.append(test)
[2m[36m(TrainLightGCN pid=3232842)[0m   df = train if test is None els

[2m[36m(TrainLightGCN pid=3232842)[0m Already create adjacency matrix.[32m [repeated 2x across cluster][0m
[2m[36m(TrainLightGCN pid=3232709)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=3232709)[0m Using xavier initialization.
[2m[36m(TrainLightGCN pid=3232842)[0m Already normalize adjacency matrix.
[2m[36m(TrainLightGCN pid=3232842)[0m Using xavier initialization.


2023-11-27 11:35:47,012	INFO tune.py:1143 -- Total run time: 38.79 seconds (28.71 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=9_2023-11-27T11:31:45.417618", trainable=...)
- TrainLightGCN_5aa7ffb2: FileNotFoundError('Could not fetch metrics for TrainLightGCN_5aa7ffb2: both result.json and progress.csv were not found at /home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=9_2023-11-27T11:31:45.417618/TrainLightGCN_5aa7ffb2_9_batch_size=5,conv_layers=1,embedding_dim=205,l2=0.0000,learning_rate=0.0085_2023-11-27_11-35-36')


ResultGrid<[
  Result(
    metrics={'iteration': 15, 'loss': 0.2968927699848113, 'mf_loss': 0.24875853340543255, 'emb_loss': 0.04813423667506998, 'model_recall': 0.05086666666666666, 'model_ndcg': 0.039404803619016916, 'model_precision': 0.033600000000000005, 'model_map': 0.021538888888888892, 'precision@5': 0.2368, 'ndcg@5': 0.3662519629136005, 'recall@5': 0.41640404040404044, 'map@5': 0.28596230880230883, 'precision@10': 0.1928000000000001, 'ndcg@10': 0.45274130243190497, 'recall@10': 0.6465419913419912, 'map@10': 0.351538647472457, 'time_train': 14.600561618804932, 'time_test': 0.08750391006469727, 'time_total_train': 24.69786787033081, 'time_total_test': 0.8261804580688477},
    path='/home/daviddavo/ray_results/LightGCN_dao=Decentraland,fold=9_2023-11-27T11:31:45.417618/TrainLightGCN_aa592f54_1_batch_size=8,conv_layers=3,embedding_dim=100,l2=0.0000,learning_rate=0.1000_2023-11-27_11-35-08',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'iteration': 5, 'lo