In [1]:
from typing import Dict, List, Tuple, Union, Any, Optional

import os
import sys
import time
print("Python version", sys.version)

# Ignore pandas warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path

import pandas as pd
import numpy as np
import tensorflow as tf
import datetime as dt

from tqdm.autonotebook import tqdm

import ray
from ray import train, tune

from src.datasets import daocensus_text

%load_ext autoreload
%autoreload 2

tf.get_logger().setLevel('WARNING')

tf.config.list_physical_devices("GPU")
sys_details = tf.sysconfig.get_build_info()
cuda = sys_details.get("cuda_version", -1)
cudnn = sys_details.get("cudnn_version", -1)
print(cuda, cudnn)
print('Ray version:', ray.__version__)

Python version 3.9.18 (main, Oct 24 2023, 09:18:18) 
[GCC 11.4.0]


2024-03-09 07:48:01.576021: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-09 07:48:01.576045: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-09 07:48:01.576058: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


11.8 8
Ray version: 2.9.1


In [2]:
# Notebook configuration. The values below are defaults; a later "Parameters"
# cell may override any of them.

# Others config
SEED: int = 57
RAY_RESULTS_PATH: Path = Path('~/ray_results').expanduser()

# Dataset config
ORG_NAME: str = 'Decentraland'
ORG_PLATFORM: str = 'snapshot'
USE_ORG_NAMES: bool = False
# ISO-8601 date string (parsed later with `datetime.fromisoformat`); None means no cutoff
CUTOFF_DATE_STR: Optional[str] = None

# Dataset splits config
SPLITS_FREQ: str = 'W-THU' # Split weekly
LAST_SPLITS: int = 10 # Use just last 10 splits
SPLITS_NORMALIZE: bool = True

# Training config
MAX_EPOCHS: int = 200
EPOCHS_PER_ITER: int = 5
SAMPLES_PER_SPLIT: int = 100
OPTIM_METRIC: str = 'map@10'

# Eval config
TOP_K: List[int] = [5, 10]
METRICS: List[str] = ["recall", "ndcg", "precision", "map"]

In [3]:
# Parameters
ORG_NAME = "DEAD FoundationsDAO"
ORG_PLATFORM = "daohaus"
SPLITS_FREQ = "2d"
SPLITS_NORMALIZE = True
LAST_SPLITS = 10
USE_ORG_NAMES = True
EXECUTION_ID = "2024-03-08 10:00"
CUTOFF_DATE_STR = "2021-11-28"


In [4]:
CUTOFF_DATE = dt.datetime.fromisoformat(CUTOFF_DATE_STR) if CUTOFF_DATE_STR else None

# Load data

In [5]:
# dfptext = pd.read_csv('./snapshot_proposals.csv')[['proposal_id', 'title', 'description', 'start', 'end']]
# dfv, dfp = daocensus.get("./data/daos-census", ORG_NAME, 'snapshot')
# dfv['voter'] = dfv['voter'].astype('str')
# dfp = dfp.merge(dfptext, how='left', left_on='platform_proposal', right_on='proposal_id')
# dfp[['start', 'end']] = dfp[['start', 'end']].astype('datetime64')

# Load the votes (dfv) and proposals (dfp) of the selected organization.
dfv, dfp = daocensus_text.get("./data/daos-census-text", ORG_NAME, ORG_PLATFORM, use_org_names=USE_ORG_NAMES, cutoff_date=CUTOFF_DATE)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line after each summary.
dfv.info()
dfp.info()

Unnamed: 0,platform,name,id,proposal,deployment,platform_vote,voter,date,choice,weight
0,daohaus,DEAD FoundationsDAO,eff6e807-bff7-518c-82e4-6c388182a5fc,3447c17e-1b31-5f05-afb7-bf4f7b93292a,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0x0000000813b34008a225de08a6a61835508c71f9,2021-11-17 19:14:10,1,1.0
1,daohaus,DEAD FoundationsDAO,1d9222d9-598c-5beb-92e2-8896a0142bdb,58a6ac3d-5b72-5db9-b3af-84118704d120,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0x0000000813b34008a225de08a6a61835508c71f9,2021-11-17 19:14:02,1,1.0
2,daohaus,DEAD FoundationsDAO,d1fd6dad-5d3e-5178-8007-6e1e9e380a3a,d607ecf5-04f8-53b9-8518-33c2bb09e990,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0x0000000813b34008a225de08a6a61835508c71f9,2021-11-17 19:15:20,1,1.0
3,daohaus,DEAD FoundationsDAO,92e59584-ff6c-5e1b-8dd7-c9aabf55abbc,94dc6060-d66d-52fe-9c5d-8c7e43979569,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0x001be549fa377710b9e59d57bbdf593ce1e379ca,2021-11-03 21:24:45,1,1.0
4,daohaus,DEAD FoundationsDAO,28f9d7b6-0656-5038-a99b-9db7201acddd,470cf9e2-b994-5279-8096-87a475e28c44,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0x001be549fa377710b9e59d57bbdf593ce1e379ca,2021-11-03 21:26:15,1,1.0
...,...,...,...,...,...,...,...,...,...,...
16596,daohaus,DEAD FoundationsDAO,3e2965f7-21e0-59a7-aa3f-92eca216d8b2,58d065d0-4a9f-53ff-82ed-89d793f9ab1e,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0xffefdcfff613c9bbb9928f6ff44f07c7b562bfdf,2021-07-16 18:53:50,1,1.0
16597,daohaus,DEAD FoundationsDAO,162eccbd-8397-59b4-95cb-26e997a66a52,02078479-8fb8-5ce7-99cd-8af351f02331,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0xffefdcfff613c9bbb9928f6ff44f07c7b562bfdf,2021-07-17 02:19:45,1,1.0
16598,daohaus,DEAD FoundationsDAO,d41c87a7-f6cb-5bd8-8f60-cc98535ff7e5,af846563-cee6-5f41-9e33-8f0bdab8ddb2,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0xffefdcfff613c9bbb9928f6ff44f07c7b562bfdf,2021-07-19 01:23:10,1,1.0
16599,daohaus,DEAD FoundationsDAO,61cc5a24-1a11-537c-869b-19545560a363,313f146b-6064-538c-904f-a791a8293cd0,c37abcdd-a36b-51fc-8fa6-2c3ecb780df0,0x1b975a9daf25e7b01e0a6c72d657ff74925327a8-mem...,0xffefdcfff613c9bbb9928f6ff44f07c7b562bfdf,2021-07-23 01:51:35,1,1.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16601 entries, 0 to 16600
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   platform       16601 non-null  object        
 1   name           16601 non-null  object        
 2   id             16601 non-null  object        
 3   proposal       16601 non-null  category      
 4   deployment     16601 non-null  object        
 5   platform_vote  16601 non-null  object        
 6   voter          16601 non-null  category      
 7   date           16601 non-null  datetime64[ns]
 8   choice         16601 non-null  object        
 9   weight         16601 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(1), object(6)
memory usage: 1.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5034 entries, 0 to 5033
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               

## Transform data

In [6]:
def to_microsoft(dfv):
    """Convert a votes frame to the userID/itemID/timestamp/rating layout.

    Keeps only the ``voter``, ``proposal`` and ``date`` columns of ``dfv``,
    renames them to ``userID``, ``itemID`` and ``timestamp``, casts ``itemID``
    to ``str`` and adds a constant ``rating`` column equal to 1.

    The input frame is not modified; a new DataFrame is returned.
    """
    column_map = {
        'voter': 'userID',
        'proposal': 'itemID',
        'date': 'timestamp',
    }
    interactions = dfv.loc[:, list(column_map)].rename(columns=column_map)
    return interactions.assign(
        itemID=interactions['itemID'].astype('str'),
        rating=1,
    )

df = to_microsoft(dfv)
df.head()

Unnamed: 0,userID,itemID,timestamp,rating
0,0x0000000813b34008a225de08a6a61835508c71f9,3447c17e-1b31-5f05-afb7-bf4f7b93292a,2021-11-17 19:14:10,1
1,0x0000000813b34008a225de08a6a61835508c71f9,58a6ac3d-5b72-5db9-b3af-84118704d120,2021-11-17 19:14:02,1
2,0x0000000813b34008a225de08a6a61835508c71f9,d607ecf5-04f8-53b9-8518-33c2bb09e990,2021-11-17 19:15:20,1
3,0x001be549fa377710b9e59d57bbdf593ce1e379ca,94dc6060-d66d-52fe-9c5d-8c7e43979569,2021-11-03 21:24:45,1
4,0x001be549fa377710b9e59d57bbdf593ce1e379ca,470cf9e2-b994-5279-8096-87a475e28c44,2021-11-03 21:26:15,1


# Split data

Because each proposal remains open for only a few days, our setting differs from that of a movie recommender system. For this reason, we use a time-series split instead of K-Fold to cross-validate the model.

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png)

scikit-learn's TimeSeriesSplit does not suit our needs because every split contains the same number of elements, while the length of the time interval each split covers varies. Since we want to simulate realistic behaviour, we split the data into intervals of equal length instead.

In [7]:
from recommenders.evaluation.python_evaluation import metrics as metrics_dict

In [8]:
from src.model_selection import timeFreqSplitCurrent, timeIntervalSplitCurrent

# max_train_prev = df['timestamp'].min().date()
# N_SPLITS = 10; SKIP_SPLIT = 0
# folds = list(timeIntervalSplitCurrent(df, N_SPLITS, dfp, skip=SKIP_SPLIT, remove_not_in_train_col='userID', return_open=True))
# Build the time-based folds and keep only the last LAST_SPLITS of them.
# Each fold is a tuple (train frame, test frame, split time, open proposals).
folds = list(timeFreqSplitCurrent(df, SPLITS_FREQ, dfp, return_open=True, remove_not_in_train_col='userID'))[-LAST_SPLITS:]
for i, (dftrain, dftest, t, open_proposals) in enumerate(folds):
    min_train = dftrain['timestamp'].min().date()
    max_train = dftrain['timestamp'].max().date()
    min_test  = dftest['timestamp'].min().date()
    max_test  = dftest['timestamp'].max().date()

    train_users = len(set(dftrain['userID']))
    test_users = len(set(dftest['userID']))

    print(f"Split {i}, train from: {min_train} to {max_train}, test from: {min_test} to {max_test}")
    print(f"  t: {t}")
    print(f"  len(train): {len(dftrain)}, len(test): {len(dftest)}")
    print(f"  users(train): {train_users}, users(test): {test_users}")

    print()
    # Upper bound of each metric: score the test set against itself.
    # The constant 'prediction' column goes on a copy so that the frames stored
    # in `folds` (which are later handed to the trainers) are left untouched.
    perfect_pred = dftest.assign(prediction=1)
    for m in METRICS:
        metric_fn = metrics_dict[f'{m}_at_k']
        print(f"  highest possible {m}@{TOP_K[0]}:\t{metric_fn(perfect_pred, perfect_pred, k=TOP_K[0], relevancy_method='top_k'):.4f}")

    print("-"*30)

Split 0, train from: 2020-11-25 to 2021-11-05, test from: 2021-11-06 to 2021-11-06
  t: 2021-11-06 00:00:00
  len(train): 9194, len(test): 11
  users(train): 1631, users(test): 7

  highest possible recall@5:	1.0000
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3143
  highest possible map@5:	1.0000
------------------------------
Split 1, train from: 2020-11-25 to 2021-11-07, test from: 2021-11-08 to 2021-11-09
  t: 2021-11-08 00:00:00
  len(train): 9485, len(test): 65
  users(train): 1676, users(test): 17

  highest possible recall@5:	0.9398
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.4000
  highest possible map@5:	0.9398
------------------------------
Split 2, train from: 2020-11-25 to 2021-11-09, test from: 2021-11-10 to 2021-11-11
  t: 2021-11-10 00:00:00
  len(train): 9763, len(test): 66
  users(train): 1726, users(test): 13

  highest possible recall@5:	0.9028
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3538
  h

  highest possible recall@5:	0.9115
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3630
  highest possible map@5:	0.9115
------------------------------
Split 5, train from: 2020-11-25 to 2021-11-15, test from: 2021-11-16 to 2021-11-17
  t: 2021-11-16 00:00:00
  len(train): 12013, len(test): 93
  users(train): 2113, users(test): 26

  highest possible recall@5:	0.9647
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.2923
  highest possible map@5:	0.9647
------------------------------
Split 6, train from: 2020-11-25 to 2021-11-17, test from: 2021-11-18 to 2021-11-19
  t: 2021-11-18 00:00:00
  len(train): 12705, len(test): 118
  users(train): 2251, users(test): 32

  highest possible recall@5:	0.9337
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3812
  highest possible map@5:	0.9337
------------------------------
Split 7, train from: 2020-11-25 to 2021-11-19, test from: 2021-11-20 to 2021-11-21
  t: 2021-11-20 00:00:00
  len(tr

  highest possible recall@5:	0.9693
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.2889
  highest possible map@5:	0.9693
------------------------------
Split 9, train from: 2020-11-25 to 2021-11-23, test from: 2021-11-24 to 2021-11-25
  t: 2021-11-24 00:00:00
  len(train): 15538, len(test): 27
  users(train): 2888, users(test): 18

  highest possible recall@5:	1.0000
  highest possible ndcg@5:	1.0000
  highest possible precision@5:	0.3000
  highest possible map@5:	1.0000
------------------------------


# Defining training

In [9]:
# from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from src.models import LightGCNCustom
from recommenders.utils.python_utils import get_top_k_scored_items

## Small test of `LightGCNCustom`

In [10]:
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

In [11]:
# Smoke test of LightGCNCustom on the last fold: two epochs with fixed
# hyper-parameters, evaluated once at the end (eval_epoch=2).
hparams = prepare_hparams(
    model_type='lightgcn',
    n_layers=3,
    batch_size=512,
    embed_size=64,
    epochs=2,
    learning_rate=0.001,
    decay=0.001,
    # Same metric list as the rest of the notebook instead of a duplicated literal
    metrics=METRICS,
    eval_epoch=2,
    top_k=TOP_K[0],
    save_model=False,
    MODEL_DIR='./data/model/lightgcn/',
)
dataloader = ImplicitCF(train=folds[-1][0], test=folds[-1][1], seed=SEED)
print("items:", dataloader.n_items, "user:", dataloader.n_users)
# Seed the model as TrainLightGCN does, so the smoke-test numbers are reproducible
model = LightGCNCustom(data=dataloader, hparams=hparams, seed=SEED)

items: 4942 user: 2888


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [12]:
model.fit()

Epoch 1 (train)0.2s: train loss = 0.68901 = (mf)0.68895 + (embed)0.00006


Epoch 2 (train)0.1s + (eval)0.5s: train loss = 0.66749 = (mf)0.66736 + (embed)0.00013, recall = 0.05556, ndcg = 0.05556, precision = 0.01111, map = 0.05556


In [13]:
model.run_eval()

[0.05555555555555555,
 0.05555555555555555,
 0.011111111111111112,
 0.05555555555555555]

In [14]:
model.recommend_k_items(
    dataloader.test, 
    top_k=3, 
    use_id=True, 
    remove_seen=True, 
    recommend_from=folds[-1][3]
)

Unnamed: 0,userID,itemID,prediction
0,74,1355,0.119852
1,74,86,0.1093
2,74,1220,0.095787
3,921,201,0.434024
4,921,1272,0.279984
5,921,125,0.210201
6,1061,65,0.264007
7,1061,201,0.263679
8,1061,64,0.238756
9,1165,201,0.356522


In [15]:
# problem_prop = 'b926a2eb-e2c7-5d0d-bbec-015efe30edde'

# tr, te, t, open_proposals = folds[-1]
# print("t:", t)
# print("In open?: ", problem_prop in set(open_proposals))
# print("In Train?:", any(tr['itemID'] == problem_prop))
# print("In Test?: ", any(te['itemID'] == problem_prop))
# print("In dfv?:  ", in_dfv := any(dfv['proposal'] == problem_prop))
# if in_dfv:
#     display(dfv[dfv['proposal'] == problem_prop])

# dfp.set_index('id').loc[problem_prop]

In [16]:
# Free the resources
del model
del dataloader

# Defining trainer

In [17]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

class TrainLightGCN(tune.Trainable):
    """Ray Tune trainable that fits and evaluates ``LightGCNCustom`` on one fold.

    Every call to :meth:`step` trains ``EPOCHS_PER_ITER`` epochs and then
    reports the model's own metrics plus precision/ndcg/recall/map at every
    cutoff listed in ``TOP_K``.
    """

    def setup(
        self,
        config: Dict[str, Any],
        data,
    ):
        """Build the data loader and the model for this trial.

        Args:
            config: sampled hyper-parameters. Reads ``conv_layers``,
                ``batch_size`` (used as a power-of-two exponent),
                ``embedding_dim``, ``learning_rate`` and ``l2``.
            data: a fold, unpacked as ``(train, test, t, open_proposals)``.
        """
        self.config = config

        hparams_kwargs = dict(
            model_type='lightgcn',
            n_layers=config['conv_layers'],
            batch_size=2**config['batch_size'],
            embed_size=config['embedding_dim'],
            epochs=EPOCHS_PER_ITER,
            learning_rate=config['learning_rate'],
            decay=config['l2'],
            metrics=METRICS,
            # presumably disables the model's built-in periodic evaluation — TODO confirm
            eval_epoch=-1,
            top_k=TOP_K[0],
            save_model=False,
            MODEL_DIR='./data/model/lightgcn/',
        )
        self.hparams = prepare_hparams(**hparams_kwargs)

        train_df, test_df, self.t, self.open_proposals = data
        self.dataloader = ImplicitCF(train=train_df, test=test_df, seed=SEED)
        self.model = LightGCNCustom(self.hparams, self.dataloader, seed=SEED)

        # Accumulated wall-clock seconds spent training / evaluating
        self.total_train = 0
        self.total_eval = 0

    @property
    def iteration(self):
        """Number of epochs the wrapped model reports as done."""
        return self.model.epochs_done

    @property
    def training_iteration(self):
        """Same value as :attr:`iteration`."""
        return self.model.epochs_done

    def step(self):
        """Train ``EPOCHS_PER_ITER`` epochs, evaluate, and return the metrics.

        Several epochs are grouped into one step so that a step lasts long
        enough to amortise the reporting overhead while still reporting
        progress regularly.

        Returns:
            dict with the losses of the last epoch trained, the model's own
            metrics (prefixed with ``model_``), ``<metric>@<k>`` for every
            ``k`` in ``TOP_K``, and timing information.
        """
        assert EPOCHS_PER_ITER > 0

        t_train_begin = time.time()
        for _epoch in range(EPOCHS_PER_ITER):
            last_losses = self.model.fit_epoch()
        t_train_finish = time.time()
        t_eval_begin = t_train_finish

        report = {}
        for metric_name, metric_value in zip(self.model.metrics, self.model.run_eval()):
            report['model_' + metric_name] = metric_value

        scorers = (
            ('precision', precision_at_k),
            ('ndcg', ndcg_at_k),
            ('recall', recall_at_k),
            ('map', map_at_k),
        )
        for k in TOP_K:
            # Recommend only among the proposals that are open for this fold
            recs = self.model.recommend_k_items(
                self.dataloader.test,
                top_k=k,
                use_id=True,
                remove_seen=True,
                recommend_from=self.open_proposals,
            )
            for label, scorer in scorers:
                report[f'{label}@{k}'] = scorer(self.dataloader.test, recs, k=k)

        t_eval_finish = time.time()

        train_elapsed = t_train_finish - t_train_begin
        eval_elapsed = t_eval_finish - t_eval_begin
        self.total_train += train_elapsed
        self.total_eval += eval_elapsed

        result = {
            'iteration': self.iteration,
            'loss': last_losses[0],
            'mf_loss': last_losses[1],
            'emb_loss': last_losses[2],
        }
        result.update(report)
        result.update({
            'time_train': train_elapsed,
            'time_test': eval_elapsed,
            'time_total_train': self.total_train,
            'time_total_test': self.total_eval,
        })
        return result

    def save_checkpoint(self, checkpoint_dir):
        """Save the model session under ``<checkpoint_dir>/model`` and return the directory."""
        target = os.path.join(checkpoint_dir, "model")
        self.model.saver.save(
            sess=self.model.sess,
            save_path=target,
        )
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        """Restore the model from ``checkpoint_path``."""
        # NOTE(review): save_checkpoint writes to "<dir>/model" while the path
        # received here is forwarded unchanged — confirm that
        # LightGCNCustom.load resolves the "model" prefix itself.
        self.model.load(checkpoint_path)

# Big experiment

In [18]:
RAY_RESULTS_PATH

PosixPath('/home/daviddavo/ray_results')

In [19]:
hostname = os.uname().nodename
print(hostname)

### SET TRAINING RESOURCES
# Fraction of the GPU reserved per trial for organizations that need a
# non-default share; everything else falls back to 1/16.
GPU_SHARE_BY_ORG = {
    # It seems that this model takes more resources, so fewer instances run at once
    'PancakeSwap': 1/6,
    'DEAD FoundationsDAO': 1/20,
}

if hostname == 'lamarck':
    # Author's sizing note: each run might take up to ~2GiB of a ~24GiB GPU
    # (about 1/12th); 1/16th is used by default so that the machine is not
    # saturated.
    gpu_resources = GPU_SHARE_BY_ORG.get(ORG_NAME, 1/16)

    NUM_SAMPLES = SAMPLES_PER_SPLIT
    resources_per_trial = dict(cpu=1, gpu=gpu_resources)
else:
    NUM_SAMPLES = 1
    # A trial takes about 1.5 GiB with full training data; a bit more is
    # requested because this notebook also uses some memory.
    resources_per_trial = dict(cpu=1, memory=2e9)
print(resources_per_trial)

lamarck
{'cpu': 1, 'gpu': 0.05}


In [20]:
from ray.tune.search.hyperopt import HyperOptSearch
import datetime as dt
from src import paths

def getTunerOnFold(f, points_to_evaluate = None):
    """Return a Ray Tune ``Tuner`` for fold ``f``, restoring a previous run if possible.

    The experiment name encodes the optimization metric, organization, split
    settings, cutoff date and fold. If a directory with that name prefix exists
    under ``RAY_RESULTS_PATH`` and can be restored, the most recently changed
    one is resumed (errored trials are restarted). Otherwise a new tuner backed
    by ``HyperOptSearch`` is created.

    Args:
        f: index into ``folds`` of the fold to tune on.
        points_to_evaluate: optional list of configurations handed to
            ``HyperOptSearch`` to be evaluated first. Only used when a new
            experiment is created.

    Returns:
        A ``tune.Tuner`` ready for ``.fit()``.
    """
    # CUTOFF_DATE is None when no cutoff is configured; calling .isoformat()
    # unconditionally would raise AttributeError with the default config.
    cutoff_label = CUTOFF_DATE.isoformat() if CUTOFF_DATE is not None else None
    name = f'LightGCN_optim={OPTIM_METRIC},dao={ORG_NAME},freq={SPLITS_FREQ},normalize={SPLITS_NORMALIZE},cutoff_date={cutoff_label},fold={f}'
    # Named `previous_runs` so it does not shadow the imported `paths` module
    previous_runs = list(RAY_RESULTS_PATH.glob(f'{name}_*'))
    last_experiment = max(previous_runs, key=lambda p: p.stat().st_ctime) if previous_runs else None

    param_space = dict(
        fold=f,
        # Exponent of the batch size (2**batch_size). tune.randint excludes the
        # upper bound, so this samples 6..9, i.e. batch sizes 64 - 512.
        batch_size=tune.randint(6,10),
        embedding_dim=tune.lograndint(1, 1024, base=2),
        conv_layers=tune.randint(1,6),
        learning_rate=tune.qloguniform(1e-4, 1, 1e-4),
        # The third positional argument of tune.loguniform is the log *base*,
        # not a quantization step, so it is omitted here.
        l2=tune.loguniform(1e-7, 1e-2),
    )

    # The same trainable is used whether the experiment is restored or created
    trainable = tune.with_resources(
        tune.with_parameters(TrainLightGCN, data=folds[f]),
        resources_per_trial,
    )

    ### RESTORE EXPERIMENT OR CREATE A NEW ONE
    if last_experiment and tune.Tuner.can_restore(last_experiment):
        print(f"Restoring last experiment: {last_experiment}")
        return tune.Tuner.restore(
            str(last_experiment),
            trainable=trainable,
            restart_errored=True,
            param_space=param_space,
        )

    print(f"No experiment found for fold {f}, creating new tuner with {NUM_SAMPLES} samples")
    search_alg = HyperOptSearch(
        points_to_evaluate=points_to_evaluate,
        random_state_seed=SEED,
    )

    return tune.Tuner(
        trainable,
        run_config=train.RunConfig(
            # Stop a trial after MAX_EPOCHS epochs or 300 s of accumulated training time
            stop={'training_iteration': MAX_EPOCHS/EPOCHS_PER_ITER, 'time_total_train': 300},
            name=name + f'_{dt.datetime.now().isoformat()}',
            storage_path=str(RAY_RESULTS_PATH),
            failure_config=train.FailureConfig(fail_fast='raise'),
        ),
        param_space=param_space,
        tune_config=tune.TuneConfig(
            search_alg=search_alg,
            num_samples=NUM_SAMPLES,
            metric=OPTIM_METRIC,
            mode='max',
        )
    )

In [21]:
import logging

def findConfig(rg, reference_config=None):
    """Find the first result whose config matches a reference configuration.

    The ``'fold'`` key is ignored in the comparison. Results with an empty
    config are skipped. Results matching at least three keys (but not all)
    are printed together with their per-key match mask, as a debugging aid.

    Args:
        rg: iterable of results, each exposing a ``config`` mapping.
        reference_config: configuration to look for. Defaults to the config of
            the module-level ``last_best_result``.

    Returns:
        The first matching result, or ``None`` if there is none.
    """
    if reference_config is None:
        reference_config = last_best_result.config

    expected = {k: v for k, v in reference_config.items() if k != 'fold'}
    # Sentinel for absent keys: a missing key counts as a mismatch instead of
    # raising KeyError.
    missing = object()

    for r in rg:
        if not r.config:
            continue

        msk = [r.config.get(k, missing) == v for k, v in expected.items()]
        if all(msk):
            return r
        if sum(msk) >= 3:
            print(r.config, msk)

    return None

# Tune every fold in order. From the second fold on, the best configuration of
# the previous fold is passed as a point to evaluate, and we check afterwards
# that it was actually tried.
tuners = []
results = []
last_best_result = None
# Iterate over the folds that exist: `folds` may hold fewer than LAST_SPLITS entries
for f in range(len(folds)):
    best_prev_config = None
    if last_best_result is not None:
        best_prev_config = last_best_result.config.copy()
        best_prev_config['fold'] += 1
        best_prev_config = [best_prev_config]

    t = getTunerOnFold(f, best_prev_config)
    tuners.append(t)

    rg = t.fit()
    assert rg.num_errors == 0, f"There are {rg.num_errors} errors"
    assert rg.num_terminated == NUM_SAMPLES, f'Some samples are not terminated ({rg.num_terminated} != {NUM_SAMPLES})'
    results.append(rg)

    # Assert that the prev config has been tried
    if last_best_result is not None:
        # Look it up once: findConfig prints diagnostics for near matches, so
        # calling it twice duplicated that output.
        matching = findConfig(rg)
        if matching is None:
            print("Best config:", last_best_result.config)
            raise AssertionError(f"The best config from previous fold has not been tested in fold {f}")
        logging.info(f'Fold {f}. Best prev result was {last_best_result.path} and config has been found {matching.path}')

    last_best_result = rg.get_best_result()

0,1
Current time:,2024-03-09 08:02:00
Running for:,00:13:42.35
Memory:,8.5/125.6 GiB

Trial name,status,loc,batch_size,conv_layers,embedding_dim,fold,l2,learning_rate,iter,total time (s),iteration,loss,mf_loss
TrainLightGCN_0f936309,TERMINATED,147.96.81.131:3360311,6,3,2,9,5.75975e-07,0.0126,13,313.37,65,0.0712053,0.0711965
TrainLightGCN_427ef446,TERMINATED,147.96.81.131:3185094,8,5,11,9,3.87278e-07,0.9885,40,242.63,200,2.04702,2.02384
TrainLightGCN_ac5b0c32,TERMINATED,147.96.81.131:3360312,6,1,146,9,1.05111e-06,0.0008,13,316.83,65,0.00161877,0.00159832
TrainLightGCN_0b93c740,TERMINATED,147.96.81.131:3360316,9,4,34,9,0.000326489,0.0022,40,158.454,200,0.0103552,0.00414979
TrainLightGCN_4433bbee,TERMINATED,147.96.81.131:3360317,9,5,641,9,2.94385e-05,0.0384,40,158.335,200,0.0845272,0.0437971
TrainLightGCN_c7404e88,TERMINATED,147.96.81.131:3360363,7,5,252,9,2.46418e-05,0.0027,25,311.647,125,0.0015838,0.000554821
TrainLightGCN_5b8ec183,TERMINATED,147.96.81.131:3360410,8,4,61,9,0.000111582,0.0096,40,267.248,200,0.00370531,0.000616603
TrainLightGCN_35ba720b,TERMINATED,147.96.81.131:3360469,8,5,999,9,0.000901688,0.1191,40,300.198,200,1.721,0.213212
TrainLightGCN_b2a79698,TERMINATED,147.96.81.131:3183322,6,5,372,9,9.46725e-07,0.0043,13,304.982,65,0.000609461,0.000527356
TrainLightGCN_ff926038,TERMINATED,147.96.81.131:3360528,7,1,7,9,1.32226e-07,0.0001,24,305.606,120,0.329298,0.329298


2024-03-09 07:48:17,858	INFO experiment_state.py:404 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.




2024-03-09 07:48:17,859	INFO tune_controller.py:404 -- Using the newest experiment state file found within the experiment directory: experiment_state-2024-03-08_15-49-07.json


[36m(pid=3360311)[0m 2024-03-09 07:48:19.236620: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3360311)[0m 2024-03-09 07:48:19.236652: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3360311)[0m 2024-03-09 07:48:19.236670: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360311)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3360311)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3360311)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3360311)[0m Using xavier initialization.


[36m(pid=3361026)[0m 2024-03-09 07:48:21.788496: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[32m [repeated 18x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(pid=3361026)[0m 2024-03-09 07:48:21.788530: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered[32m [repeated 18x across cluster][0m
[36m(pid=3361026)[0m 2024-03-09 07:48:21.788548: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered[32m [repeated 18x a

[36m(TrainLightGCN pid=3360888)[0m   df = train if test is None else train.append(test)[32m [repeated 18x across cluster][0m


[36m(pid=3363134)[0m 2024-03-09 07:48:29.250645: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3363134)[0m 2024-03-09 07:48:29.250679: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3363134)[0m 2024-03-09 07:48:29.250692: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3363134)[0m Already create adjacency matrix.[32m [repeated 19x across cluster][0m
[36m(TrainLightGCN pid=3363134)[0m Already normalize adjacency matrix.[32m [repeated 19x across cluster][0m
[36m(TrainLightGCN pid=3363134)[0m Using xavier initialization.[32m [repeated 19x across cluster][0m


[36m(TrainLightGCN pid=3360316)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_0b93c740_54_batch_size=9,conv_layers=4,embedding_dim=34,fold=9,l2=0.0003,learning_rate=0.0022_2024-03-08_15-57-36/checkpoint_000000)
[36m(TrainLightGCN pid=3363134)[0m   df = train if test is None else train.append(test)


[36m(pid=3363829)[0m 2024-03-09 07:51:04.721883: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3363829)[0m 2024-03-09 07:51:04.721921: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3363829)[0m 2024-03-09 07:51:04.721937: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3363829)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3363829)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3363829)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3363829)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3361028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c3690edb_62_batch_size=9,conv_layers=4,embedding_dim=602,fold=9,l2=0.0001,learning_rate=0.0175_2024-03-08_15-59-27/checkpoint_000000)[32m [repeated 2x across cluster][0m


[36m(pid=3363978)[0m 2024-03-09 07:51:10.713791: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3363978)[0m 2024-03-09 07:51:10.713837: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3363978)[0m 2024-03-09 07:51:10.713857: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3363978)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3363978)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3363978)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3363978)[0m Using xavier initialization.


[36m(pid=3364144)[0m 2024-03-09 07:51:16.958690: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3364144)[0m 2024-03-09 07:51:16.958745: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3364144)[0m 2024-03-09 07:51:16.958767: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3364144)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3364144)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3364144)[0m Already normalize adjacency matrix.


[36m(TrainLightGCN pid=3364144)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3361026)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c05a126a_61_batch_size=9,conv_layers=5,embedding_dim=910,fold=9,l2=0.0007,learning_rate=0.0258_2024-03-08_15-59-21/checkpoint_000000)


[36m(pid=3364382)[0m 2024-03-09 07:51:34.775929: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3364382)[0m 2024-03-09 07:51:34.775972: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3364382)[0m 2024-03-09 07:51:34.775994: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3364382)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3364382)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3364382)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3364382)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3360410)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_5b8ec183_49_batch_size=8,conv_layers=4,embedding_dim=61,fold=9,l2=0.0001,learning_rate=0.0096_2024-03-08_15-56-08/checkpoint_000000)


[36m(pid=3364794)[0m 2024-03-09 07:52:53.918196: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3364794)[0m 2024-03-09 07:52:53.918335: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3364794)[0m 2024-03-09 07:52:53.918355: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360576)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_8bfca41c_55_batch_size=8,conv_layers=2,embedding_dim=504,fold=9,l2=0.0001,learning_rate=0.0002_2024-03-08_15-57-43/checkpoint_000000)


[36m(TrainLightGCN pid=3364794)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3364794)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3364794)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3364794)[0m Already normalize adjacency matrix.


[36m(pid=3364948)[0m 2024-03-09 07:52:59.685201: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3364948)[0m 2024-03-09 07:52:59.685239: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3364948)[0m 2024-03-09 07:52:59.685255: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360648)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_2d989d71_44_batch_size=8,conv_layers=5,embedding_dim=14,fold=9,l2=0.0000,learning_rate=0.1804_2024-03-08_15-55-36/checkpoint_000000)
[36m(TrainLightGCN pid=3364948)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3364948)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3364948)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3364948)[0m Using xavier initialization.


[36m(pid=3365113)[0m 2024-03-09 07:53:05.529478: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3365113)[0m 2024-03-09 07:53:05.529516: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3365113)[0m 2024-03-09 07:53:05.529535: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3365113)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3365113)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3365113)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3365113)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3364144)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_f08da354_66_batch_size=9,conv_layers=5,embedding_dim=16,fold=9,l2=0.0000,learning_rate=0.0820_2024-03-09_07-51-15/checkpoint_000000)


[36m(pid=3365355)[0m 2024-03-09 07:53:22.749720: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3365355)[0m 2024-03-09 07:53:22.749850: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3365355)[0m 2024-03-09 07:53:22.749871: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3365355)[0m   df = train if test is None else train.append(test)
[36m(TrainLightGCN pid=3360469)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_35ba720b_52_batch_size=8,conv_layers=5,embedding_dim=999,fold=9,l2=0.0009,learning_rate=0.1191_2024-03-08_15-57-23/checkpoint_000000)


[36m(TrainLightGCN pid=3365355)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3365355)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3365355)[0m Using xavier initialization.


[36m(pid=3365504)[0m 2024-03-09 07:53:29.374096: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3365504)[0m 2024-03-09 07:53:29.374138: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3365504)[0m 2024-03-09 07:53:29.374157: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360528)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_ff926038_51_batch_size=7,conv_layers=1,embedding_dim=7,fold=9,l2=0.0000,learning_rate=0.0001_2024-03-08_15-57-15/checkpoint_000000)


[36m(TrainLightGCN pid=3365504)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3365504)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3365504)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3365504)[0m Using xavier initialization.


[36m(pid=3365693)[0m 2024-03-09 07:53:35.410739: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3365693)[0m 2024-03-09 07:53:35.410775: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3365693)[0m 2024-03-09 07:53:35.410793: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360363)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c7404e88_48_batch_size=7,conv_layers=5,embedding_dim=252,fold=9,l2=0.0000,learning_rate=0.0027_2024-03-08_15-56-01/checkpoint_000000)[32m [repeated 3x across cluster][0m


[36m(TrainLightGCN pid=3365693)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3365693)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3365693)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3365693)[0m Using xavier initialization.


[36m(pid=3365894)[0m 2024-03-09 07:53:40.688046: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3365894)[0m 2024-03-09 07:53:40.688084: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3365894)[0m 2024-03-09 07:53:40.688100: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360312)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_ac5b0c32_53_batch_size=6,conv_layers=1,embedding_dim=146,fold=9,l2=0.0000,learning_rate=0.0008_2024-03-08_15-57-29/checkpoint_000000)[32m [repeated 5x across cluster][0m


[36m(TrainLightGCN pid=3365894)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3365894)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3365894)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3365894)[0m Using xavier initialization.


[36m(pid=3366090)[0m 2024-03-09 07:53:45.496400: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366090)[0m 2024-03-09 07:53:45.496436: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366090)[0m 2024-03-09 07:53:45.496452: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3360888)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_8adc274b_41_batch_size=6,conv_layers=1,embedding_dim=204,fold=9,l2=0.0000,learning_rate=0.0133_2024-03-08_15-55-17/checkpoint_000000)[32m [repeated 4x across cluster][0m


[36m(TrainLightGCN pid=3366090)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3366090)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366090)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3366090)[0m   df = train if test is None else train.append(test)
[36m(pid=3366261)[0m 2024-03-09 07:53:50.511372: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366261)[0m 2024-03-09 07:53:50.511412: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366261)[0m 2024-03-09 07:53:50.511427: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3366261)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366261)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3366261)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366261)[0m Using xavier initialization.


[36m(pid=3366426)[0m 2024-03-09 07:53:55.532835: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366426)[0m 2024-03-09 07:53:55.532874: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366426)[0m 2024-03-09 07:53:55.532891: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3366426)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366426)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3366426)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366426)[0m Using xavier initialization.


[36m(pid=3366587)[0m 2024-03-09 07:54:00.680030: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366587)[0m 2024-03-09 07:54:00.680070: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366587)[0m 2024-03-09 07:54:00.680088: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3366587)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366587)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3366587)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366587)[0m Using xavier initialization.


[36m(pid=3366749)[0m 2024-03-09 07:54:05.708313: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366749)[0m 2024-03-09 07:54:05.708351: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366749)[0m 2024-03-09 07:54:05.708369: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3366749)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366749)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3366749)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366749)[0m Using xavier initialization.


[36m(pid=3366919)[0m 2024-03-09 07:54:11.230624: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3366919)[0m 2024-03-09 07:54:11.230660: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3366919)[0m 2024-03-09 07:54:11.230677: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3366919)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366919)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3366919)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3366919)[0m Already create adjacency matrix.


[36m(pid=3367083)[0m 2024-03-09 07:54:16.622465: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367083)[0m 2024-03-09 07:54:16.622508: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367083)[0m 2024-03-09 07:54:16.622524: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3367083)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367083)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367083)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3367083)[0m Using xavier initialization.


[36m(pid=3367258)[0m 2024-03-09 07:54:22.100046: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367258)[0m 2024-03-09 07:54:22.100093: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367258)[0m 2024-03-09 07:54:22.100114: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3367258)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367258)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367258)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3367258)[0m Using xavier initialization.


[36m(pid=3367422)[0m 2024-03-09 07:54:27.564143: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367422)[0m 2024-03-09 07:54:27.564184: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367422)[0m 2024-03-09 07:54:27.564198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3367422)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367422)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367422)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3367422)[0m Using xavier initialization.


[36m(pid=3367590)[0m 2024-03-09 07:54:33.627894: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367590)[0m 2024-03-09 07:54:33.627935: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367590)[0m 2024-03-09 07:54:33.627949: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3367590)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367590)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3367590)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367590)[0m Already normalize adjacency matrix.


[36m(pid=3367762)[0m 2024-03-09 07:54:39.933867: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367762)[0m 2024-03-09 07:54:39.933905: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367762)[0m 2024-03-09 07:54:39.933922: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3365355)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_192bb632_71_batch_size=9,conv_layers=1,embedding_dim=496,fold=9,l2=0.0000,learning_rate=0.0003_2024-03-09_07-53-10/checkpoint_000000)


[36m(TrainLightGCN pid=3367762)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367762)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367762)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3367762)[0m Using xavier initialization.


[36m(pid=3367935)[0m 2024-03-09 07:54:45.530548: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3367935)[0m 2024-03-09 07:54:45.530583: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3367935)[0m 2024-03-09 07:54:45.530600: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3367935)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3367935)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3367935)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3367935)[0m Already normalize adjacency matrix.


[36m(TrainLightGCN pid=3366090)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_1576debe_75_batch_size=9,conv_layers=1,embedding_dim=3,fold=9,l2=0.0000,learning_rate=0.8676_2024-03-09_07-53-44/checkpoint_000000)


[36m(pid=3368206)[0m 2024-03-09 07:55:12.838205: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3368206)[0m 2024-03-09 07:55:12.838250: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3368206)[0m 2024-03-09 07:55:12.838273: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3368206)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3368206)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3368206)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3368206)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3365113)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c4b5bff5_70_batch_size=8,conv_layers=2,embedding_dim=2,fold=9,l2=0.0001,learning_rate=0.0018_2024-03-09_07-53-04/checkpoint_000000)


[36m(pid=3368490)[0m 2024-03-09 07:55:52.153188: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3368490)[0m 2024-03-09 07:55:52.153243: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3368490)[0m 2024-03-09 07:55:52.153280: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3368490)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3368490)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3368490)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3368490)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3365504)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_cef2999b_72_batch_size=8,conv_layers=2,embedding_dim=248,fold=9,l2=0.0000,learning_rate=0.0123_2024-03-09_07-53-27/checkpoint_000000)


[36m(pid=3368715)[0m 2024-03-09 07:56:15.062424: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3368715)[0m 2024-03-09 07:56:15.062469: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3368715)[0m 2024-03-09 07:56:15.062492: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3368715)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3368715)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3368715)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3368715)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3365693)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_d0839242_73_batch_size=8,conv_layers=3,embedding_dim=581,fold=9,l2=0.0000,learning_rate=0.0001_2024-03-09_07-53-33/checkpoint_000000)


[36m(pid=3368899)[0m 2024-03-09 07:56:24.150111: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3368899)[0m 2024-03-09 07:56:24.150149: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3368899)[0m 2024-03-09 07:56:24.150163: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3363829)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_3f0e8a66_64_batch_size=6,conv_layers=3,embedding_dim=109,fold=9,l2=0.0000,learning_rate=0.0048_2024-03-09_07-48-33/checkpoint_000000)


[36m(TrainLightGCN pid=3368899)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3368899)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3368899)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3368899)[0m Already normalize adjacency matrix.


[36m(pid=3369063)[0m 2024-03-09 07:56:30.113026: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3369063)[0m 2024-03-09 07:56:30.113069: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3369063)[0m 2024-03-09 07:56:30.113086: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(TrainLightGCN pid=3363978)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c7218dbd_

[36m(TrainLightGCN pid=3369063)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3369063)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3369063)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3369063)[0m Using xavier initialization.


[36m(pid=3369231)[0m 2024-03-09 07:56:35.767994: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3369231)[0m 2024-03-09 07:56:35.768025: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3369231)[0m 2024-03-09 07:56:35.768038: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3369231)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3369231)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3369231)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3369231)[0m Using xavier initialization.


[36m(pid=3369400)[0m 2024-03-09 07:56:41.445750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3369400)[0m 2024-03-09 07:56:41.445797: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3369400)[0m 2024-03-09 07:56:41.445812: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3369400)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3366261)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_65c68914_76_batch_size=8,conv_layers=1,embedding_dim=26,fold=9,l2=0.0000,learning_rate=0.0008_2024-03-09_07-53-49/checkpoint_000000)


[36m(TrainLightGCN pid=3369400)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3369400)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3369400)[0m Using xavier initialization.


[36m(pid=3369581)[0m 2024-03-09 07:56:47.690238: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3369581)[0m 2024-03-09 07:56:47.690278: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3369581)[0m 2024-03-09 07:56:47.690293: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3369581)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3369581)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3369581)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3369581)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3367590)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_1088e88f_84_batch_size=8,conv_layers=5,embedding_dim=14,fold=9,l2=0.0000,learning_rate=0.1670_2024-03-09_07-54-32/checkpoint_000000)


[36m(pid=3369982)[0m 2024-03-09 07:57:54.910767: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3369982)[0m 2024-03-09 07:57:54.910815: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3369982)[0m 2024-03-09 07:57:54.910840: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3369982)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3369982)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3369982)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3369982)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3367762)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_29fbe4f8_85_batch_size=8,conv_layers=4,embedding_dim=10,fold=9,l2=0.0000,learning_rate=0.2050_2024-03-09_07-54-38/checkpoint_000000)


[36m(pid=3370163)[0m 2024-03-09 07:58:02.959034: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3370163)[0m 2024-03-09 07:58:02.959083: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3370163)[0m 2024-03-09 07:58:02.959106: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3370163)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3370163)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3370163)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3370163)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3367935)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_05a1c1e5_86_batch_size=8,conv_layers=1,embedding_dim=6,fold=9,l2=0.0000,learning_rate=0.0003_2024-03-09_07-54-44/checkpoint_000000)


[36m(pid=3370312)[0m 2024-03-09 07:58:09.897396: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3370312)[0m 2024-03-09 07:58:09.897444: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3370312)[0m 2024-03-09 07:58:09.897466: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3370312)[0m   df = train if test is None else train.append(test)
[36m(TrainLightGCN pid=3364794)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_14088999_68_batch_size=6,conv_layers=4,embedding_dim=22,fold=9,l2=0.0000,learning_rate=0.0046_2024-03-09_07-51-39/checkpoint_000000)


[36m(TrainLightGCN pid=3370312)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3370312)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3370312)[0m Using xavier initialization.


[36m(pid=3370488)[0m 2024-03-09 07:58:16.581524: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3370488)[0m 2024-03-09 07:58:16.581571: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3370488)[0m 2024-03-09 07:58:16.581588: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3364948)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_4b0f4bf7_69_batch_size=7,conv_layers=2,embedding_dim=442,fold=9,l2=0.0001,learning_rate=0.0002_2024-03-09_07-52-58/checkpoint_000000)


[36m(TrainLightGCN pid=3370488)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3370488)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3370488)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3370488)[0m Using xavier initialization.


[36m(pid=3370663)[0m 2024-03-09 07:58:22.398538: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3370663)[0m 2024-03-09 07:58:22.398577: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3370663)[0m 2024-03-09 07:58:22.398591: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3370663)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3370663)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3370663)[0m Already normalize adjacency matrix.
[36m(TrainLightGCN pid=3370663)[0m Using xavier initialization.


[36m(TrainLightGCN pid=3368206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_0c83cdd8_87_batch_size=8,conv_layers=2,embedding_dim=1,fold=9,l2=0.0000,learning_rate=0.0002_2024-03-09_07-54-49/checkpoint_000000)


[36m(pid=3370911)[0m 2024-03-09 07:58:43.747055: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3370911)[0m 2024-03-09 07:58:43.747233: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3370911)[0m 2024-03-09 07:58:43.747260: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[36m(TrainLightGCN pid=3370911)[0m   df = train if test is None else train.append(test)


[36m(TrainLightGCN pid=3370911)[0m Using xavier initialization.
[36m(TrainLightGCN pid=3370911)[0m Already create adjacency matrix.
[36m(TrainLightGCN pid=3370911)[0m Already normalize adjacency matrix.


[36m(TrainLightGCN pid=3366426)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_0fca4491_77_batch_size=7,conv_layers=1,embedding_dim=84,fold=9,l2=0.0000,learning_rate=0.0010_2024-03-09_07-53-54/checkpoint_000000)


[36m(TrainLightGCN pid=3366587)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_990f40da_78_batch_size=6,conv_layers=3,embedding_dim=36,fold=9,l2=0.0003,learning_rate=0.0003_2024-03-09_07-53-59/checkpoint_000000)


[36m(TrainLightGCN pid=3366749)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_d21046f2_79_batch_size=6,conv_layers=3,embedding_dim=130,fold=9,l2=0.0015,learning_rate=0.0074_2024-03-09_07-54-04/checkpoint_000000)


[36m(TrainLightGCN pid=3366919)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_44c5e2cd_80_batch_size=7,conv_layers=2,embedding_dim=198,fold=9,l2=0.0007,learning_rate=0.0032_2024-03-09_07-54-09/checkpoint_000000)


[36m(TrainLightGCN pid=3368490)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_aef2927e_88_batch_size=8,conv_layers=1,embedding_dim=3,fold=9,l2=0.0000,learning_rate=0.0001_2024-03-09_07-55-17/checkpoint_000000)


[36m(TrainLightGCN pid=3367422)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_cad4d469_83_batch_size=6,conv_layers=1,embedding_dim=356,fold=9,l2=0.0000,learning_rate=0.0107_2024-03-09_07-54-26/checkpoint_000000)


[36m(TrainLightGCN pid=3367083)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_ddccd067_81_batch_size=6,conv_layers=2,embedding_dim=304,fold=9,l2=0.0029,learning_rate=0.0055_2024-03-09_07-54-15/checkpoint_000000)


[36m(TrainLightGCN pid=3367258)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_1814965f_82_batch_size=6,conv_layers=1,embedding_dim=416,fold=9,l2=0.0000,learning_rate=0.0014_2024-03-09_07-54-20/checkpoint_000000)


[36m(TrainLightGCN pid=3368715)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_5a6aa49f_89_batch_size=8,conv_layers=1,embedding_dim=534,fold=9,l2=0.0000,learning_rate=0.0004_2024-03-09_07-55-57/checkpoint_000000)


[36m(TrainLightGCN pid=3368899)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_788cc57f_90_batch_size=8,conv_layers=2,embedding_dim=923,fold=9,l2=0.0011,learning_rate=0.0001_2024-03-09_07-56-19/checkpoint_000000)


[36m(TrainLightGCN pid=3370312)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_f5c17ecd_97_batch_size=9,conv_layers=1,embedding_dim=468,fold=9,l2=0.0000,learning_rate=0.0304_2024-03-09_07-58-08/checkpoint_000000)


[36m(TrainLightGCN pid=3370488)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_f7106607_98_batch_size=9,conv_layers=1,embedding_dim=777,fold=9,l2=0.0001,learning_rate=0.0001_2024-03-09_07-58-15/checkpoint_000000)


[36m(TrainLightGCN pid=3370911)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_4428c7ee_100_batch_size=9,conv_layers=1,embedding_dim=1010,fold=9,l2=0.0000,learning_rate=0.0654_2024-03-09_07-58-27/checkpoint_000000)


[36m(TrainLightGCN pid=3370663)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_be16ce21_99_batch_size=8,conv_layers=4,embedding_dim=66,fold=9,l2=0.0001,learning_rate=0.0001_2024-03-09_07-58-21/checkpoint_000000)


[36m(TrainLightGCN pid=3369063)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_06468eb6_91_batch_size=7,conv_layers=1,embedding_dim=158,fold=9,l2=0.0000,learning_rate=0.0006_2024-03-09_07-56-28/checkpoint_000000)


[36m(TrainLightGCN pid=3369400)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_c7307fc1_93_batch_size=7,conv_layers=2,embedding_dim=833,fold=9,l2=0.0000,learning_rate=0.0003_2024-03-09_07-56-40/checkpoint_000000)


[36m(TrainLightGCN pid=3369982)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=9_2024-03-08T15:49:07.362912/TrainLightGCN_e0905e1e_95_batch_size=7,conv_layers=3,embedding_dim=97,fold=9,l2=0.0000,learning_rate=0.0023_2024-03-09_07-56-52/checkpoint_000000)[32m [repeated 3x across cluster][0m


2024-03-09 08:02:00,292	INFO tune.py:1042 -- Total run time: 822.44 seconds (822.34 seconds for the tuning loop).


{'fold': 9, 'batch_size': 7, 'embedding_dim': 4, 'conv_layers': 3, 'learning_rate': 0.0002, 'l2': 0.001120904693873806} [True, True, True, False, False]
{'fold': 9, 'batch_size': 7, 'embedding_dim': 4, 'conv_layers': 3, 'learning_rate': 0.0002, 'l2': 0.001120904693873806} [True, True, True, False, False]


In [22]:
# Final sentinel cell: prints a completion message once every cell above has run.
# Per the original author's note, this cell is needed so that papermill runs the
# whole notebook instead of stopping at the tuning cell above, because Ray Tune
# catches the exception raised there.
# NOTE(review): the exact papermill / Ray Tune interaction is not visible in this
# part of the notebook -- confirm before removing or moving this cell.

print("All finished!")

All finished!
