<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [23]</a>'.</span>

In [1]:
%cd ../../notebooks
from typing import List, Dict, Any

from pathlib import Path
import os, sys
import time
import datetime as dt
import itertools as it

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import ray
from ray import train, tune
import ipywidgets as widgets

from recsys4daos.model_selection import cvtt_open
from recsys4daos.datasets import to_microsoft, filter_window_size
from recsys4daos.models import LightGCNCustom
from recsys4daos.utils import Timer
import recsys4daos.utils.notebooks as nbu

import recommenders
if recommenders.__version__ == '1.2.0':
    print("Ignoring warnings")
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

import paths

nbu.print_versions('ray', 'tensorflow')

%load_ext autoreload
%autoreload 2

/home/daviddavo/recsys4daos/notebooks


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
  from tqdm.autonotebook import tqdm


3.11.6 (main, Jun 24 2024, 07:45:26) [GCC 11.4.0]
recommenders: 1.2.0
ray: 2.30.0
tensorflow: 2.16.1


In [2]:
# Others config
SEED: int = 57
RAY_RESULTS_PATH: Path = '~/ray_results3.11'

# Dataset splits config
ORG_NAME = 'Decentraland'
SPLITS_FREQ = 'W-THU'  # Split weekly
LAST_FOLDS = 10  # Use just last 10 splits
SPLITS_NORMALIZE = True
LAST_FOLD_DATE_STR: str = '2023-07-13'

# Training config
MAX_EPOCHS: int = 200
EPOCHS_PER_ITER: int = 5
SAMPLES_PER_SPLIT: int = 100
MAX_TIME_TOTAL_TRAIN: int = 300
OPTIM_METRIC: str = 'map@10'

# Search space config
MAX_EMBEDDING_DIM = 1024
MIN_BATCH_SIZE = 6
MAX_BATCH_SIZE = 10 # 2**10
MIN_LR = 1e-4
# WINDOW_SIZES = ['7d', '14d', '21d', '30d', '60d', '90d', '10YE']
WINDOW_SIZES = ['21d', '30d', '60d', '90d', '10YE']
GPUS = 16

# Eval config
K_RECOMMENDATIONS: List[int] = [1,3,5,10,15,100]
METRICS: List[str] = ["recall", "ndcg", "precision", "map"]

In [3]:
# Parameters
EXECUTION_ID = "2024-09-04T10:00"
MIN_BATCH_SIZE = 4
ORG_NAME = "Magic Square"
SPLITS_FREQ = "7d"
LAST_FOLDS = 10
SPLITS_NORMALIZE = True
LAST_FOLD_DATE_STR = "2023-07-17"


In [4]:
RAY_RESULTS_PATH = Path(RAY_RESULTS_PATH).expanduser()

## Obtain dataset

In [5]:
!pwd

/home/daviddavo/recsys4daos/notebooks


In [6]:
dfp = paths.load_proposals(ORG_NAME)
dfv = paths.load_votes(ORG_NAME)

print(dfp.info())
print(dfv.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 316 non-null    object        
 1   author             316 non-null    object        
 2   date               316 non-null    datetime64[us]
 3   start              316 non-null    datetime64[us]
 4   end                316 non-null    datetime64[us]
 5   platform_proposal  316 non-null    object        
dtypes: datetime64[us](3), object(3)
memory usage: 14.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 899961 entries, 0 to 899960
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        899961 non-null  object        
 1   proposal  899961 non-null  object        
 2   voter     899961 non-null  object        
 3   date      899961 non-null  datetime64[us]

## Split data

In [7]:
df = to_microsoft(dfv)
df

Unnamed: 0,userID,itemID,timestamp,rating
0,0x2212d5f1e531d9786f12df2af0661cfd58cbae71,5611034c-c172-5e23-9c3b-1268fde09c85,2023-05-08 11:56:03,1
1,0x5f8ba595ede4554929a04a0ce9c0d7b31aa3661f,5611034c-c172-5e23-9c3b-1268fde09c85,2023-05-08 11:57:28,1
2,0xef6ad4fcaf7be57f244e617394b5968d3db9ffa0,5611034c-c172-5e23-9c3b-1268fde09c85,2023-05-08 11:57:48,1
3,0xa7db4f21096c45d9392affb9f9ed65110c2838d1,5611034c-c172-5e23-9c3b-1268fde09c85,2023-05-08 12:02:28,1
4,0xe39a6e7d1cfa126b2dcdba9bf8124ba403f37a68,5611034c-c172-5e23-9c3b-1268fde09c85,2023-05-08 12:02:33,1
...,...,...,...,...
899956,0xa607a4b265dd82fc99169faeb29470e2bf3566ce,cdcb5a6f-c0c6-5b1a-840b-85715450033b,2023-07-20 07:53:03,1
899957,0xa607a4b265dd82fc99169faeb29470e2bf3566ce,6b685db4-bf17-5c9a-9214-3ffa835699a3,2023-07-20 07:53:58,1
899958,0xa3d904e8cd2366d27724e0454cb5641cf1380be8,ceb1134c-c268-5d54-9ed6-4edbf129cd87,2023-07-20 08:15:16,1
899959,0xa3d904e8cd2366d27724e0454cb5641cf1380be8,6b685db4-bf17-5c9a-9214-3ffa835699a3,2023-07-20 08:15:59,1


In [8]:
# USE INTEGER INDEX
# folds_dict = list(cvtt_open(df, SPLITS_FREQ, dfp, remove_not_in_train_col='userID'))
# use_folds_idx = range(len(folds_dict))[-LAST_FOLDS:]

# USE TIMESTAMP INDEX
# Note: NO need to used OrderedDict, dict is ordered since Python 3.6
folds_dict = { f.end.isoformat():f for f in cvtt_open(df, SPLITS_FREQ, dfp, remove_not_in_train_col='userID', last_fold=LAST_FOLD_DATE_STR) }
use_folds_idx = list(folds_dict.keys())[-LAST_FOLDS:]

print(len(folds_dict), "folds")
print("Using", len(use_folds_idx), "folds, from", use_folds_idx[0], "to", use_folds_idx[-1])

11 folds
Using 10 folds, from 2023-05-15T00:00:00 to 2023-07-17T00:00:00


## Making some checks

### Checking that all the folds have open proposals

In [9]:
_nok_folds = []
for idx in use_folds_idx:
    if len(folds_dict[idx].open_proposals) == 0:
        _nok_folds.append(idx)

if _nok_folds:
    raise ValueError(f"Folds {', '.join(_nok_folds)} don't have any open proposals")

### Checking correct hparams

In [10]:
now = dt.datetime.now()
max_bs = 2**MAX_BATCH_SIZE
min_bs = 2**MIN_BATCH_SIZE
_sorted_ws = sorted(WINDOW_SIZES, key=lambda ws: now + pd.tseries.frequencies.to_offset(ws))

# Checking that the max_bs is doable with every window size
for idx in use_folds_idx:
    for ws in _sorted_ws:
        nusers = filter_window_size(folds_dict[idx].train, folds_dict[idx].end, ws)['userID'].nunique()
        if nusers > max_bs:
            print(f'On fold {idx} WINDOW_SIZE should be at least {ws}: {nusers} > {max_bs}', file=sys.stderr)
            break

# Checking that the min_bs is doable with every window size
for idx in use_folds_idx:
    for ws in reversed(_sorted_ws):
        nusers = filter_window_size(folds_dict[idx].train, folds_dict[idx].end, ws)['userID'].nunique()
        if nusers < min_bs:
            print(f'On fold {idx} WINDOW_SIZE should be more than {ws}: {nusers} < {min_bs}', file=sys.stderr)
            break

# Checking that the max_ws is doable with every batch size
max_ws = _sorted_ws[-1]
for idx in use_folds_idx:
    nusers = filter_window_size(folds_dict[idx].train, folds_dict[idx].end, max_ws)['userID'].nunique()
    for bs in (2**x for x in reversed(range(MIN_BATCH_SIZE, MAX_BATCH_SIZE+1))):
        if bs > nusers:
            print(f"On fold {idx} batch_size should be '<={bs/2}' with {max_ws}: {nusers} < {bs}", file=sys.stderr)
            break

print("All folds ok!")

On fold 2023-05-15T00:00:00 WINDOW_SIZE should be at least 21d: 1518 > 1024
On fold 2023-05-22T00:00:00 WINDOW_SIZE should be at least 21d: 2668 > 1024
On fold 2023-05-29T00:00:00 WINDOW_SIZE should be at least 21d: 5046 > 1024
On fold 2023-06-05T00:00:00 WINDOW_SIZE should be at least 21d: 10407 > 1024
On fold 2023-06-12T00:00:00 WINDOW_SIZE should be at least 21d: 15580 > 1024
On fold 2023-06-19T00:00:00 WINDOW_SIZE should be at least 21d: 20468 > 1024
On fold 2023-06-26T00:00:00 WINDOW_SIZE should be at least 21d: 19278 > 1024
On fold 2023-07-03T00:00:00 WINDOW_SIZE should be at least 21d: 33397 > 1024
On fold 2023-07-10T00:00:00 WINDOW_SIZE should be at least 21d: 31431 > 1024
On fold 2023-07-17T00:00:00 WINDOW_SIZE should be at least 21d: 37367 > 1024


All folds ok!


### Number of users with the min window_size

In [11]:
_min_ws = _sorted_ws[0]
_max_ws = _sorted_ws[-1]
for idx in use_folds_idx:
    nusers = filter_window_size(folds_dict[idx].train, folds_dict[idx].end, _min_ws)['userID'].nunique()
    nusers2 = filter_window_size(folds_dict[idx].train, folds_dict[idx].end, _max_ws)['userID'].nunique()
    print(f'On fold {idx} with ws {_min_ws}: {nusers} users, ws {_max_ws}: {nusers2}')

On fold 2023-05-15T00:00:00 with ws 21d: 1518 users, ws 10YE: 1518
On fold 2023-05-22T00:00:00 with ws 21d: 2668 users, ws 10YE: 2668
On fold 2023-05-29T00:00:00 with ws 21d: 5046 users, ws 10YE: 5046
On fold 2023-06-05T00:00:00 with ws 21d: 10407 users, ws 10YE: 11200
On fold 2023-06-12T00:00:00 with ws 21d: 15580 users, ws 10YE: 17262
On fold 2023-06-19T00:00:00 with ws 21d: 20468 users, ws 10YE: 23522
On fold 2023-06-26T00:00:00 with ws 21d: 19278 users, ws 10YE: 25944
On fold 2023-07-03T00:00:00 with ws 21d: 33397 users, ws 10YE: 44192
On fold 2023-07-10T00:00:00 with ws 21d: 31431 users, ws 10YE: 48097
On fold 2023-07-17T00:00:00 with ws 21d: 37367 users, ws 10YE: 55529


## Testing model

In [12]:
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

In [13]:
# hparams = prepare_hparams(
#     model_type='lightgcn',
#     n_layers=3,
#     batch_size=512,
#     embed_size=64,
#     epochs=2,
#     learning_rate=0.001,
#     decay=0.001,
#     metrics=["recall", "ndcg", "precision", "map"],
#     eval_epoch=2,
#     top_k=K_RECOMMENDATIONS[0],
#     save_model=False,
#     MODEL_DIR='./data/model/lightgcn/',
# )
# dataloader = ImplicitCF(train=folds_dict[use_folds_idx[0]].train, test=folds_dict[use_folds_idx[0]].test, seed=SEED)
# print("items:", dataloader.n_items, "user:", dataloader.n_users)
# model = LightGCNCustom(data=dataloader, hparams=hparams)

In [14]:
# model.fit()
# model.run_eval()

In [15]:
# model.recommend_k_items(
#     dataloader.test, 
#     top_k=3, 
#     use_id=True, 
#     remove_seen=True, 
#     recommend_from=folds_dict[use_folds_idx[0]].open_proposals
# )

## Defining trainable

In [16]:
from recsys4daos.evaluation import calculate_all_metrics

In [17]:
class TrainLightGCN(tune.Trainable):
    def setup(
        self,
        config: Dict[str, Any],
        data,
    ):
        self.config = config
        train, test, self.t, self.open_proposals = data
        train_filtered = filter_window_size(train, self.t, config['window_size'])
        
        # Remove users that voted in everything
        # see https://github.com/recommenders-team/recommenders/issues/2163
        msk = train_filtered.groupby('userID').size() == train_filtered['itemID'].nunique()
        train_filtered = train_filtered[train_filtered['userID'].isin(msk[~msk].index)]
        
        self.dataloader = ImplicitCF(train=train_filtered, test=test, seed=SEED)
        # Some experiments will run multiple times, but that's a price to pay for
        # usability
        self.real_batch_size = min(2**config['batch_size'], self.dataloader.n_users_in_train)

        self.hparams = prepare_hparams(
            model_type='lightgcn',
            n_layers=config['conv_layers'],
            batch_size=self.real_batch_size,
            embed_size=config['embedding_dim'],
            epochs=EPOCHS_PER_ITER,
            learning_rate=config['learning_rate'],
            decay=config['l2'],
            metrics=METRICS,
            eval_epoch=-1,
            top_k=K_RECOMMENDATIONS[0],
            save_model=False,
            MODEL_DIR='./data/model/lightgcn/',
        )
        self.model = LightGCNCustom(self.hparams, self.dataloader, seed=SEED)
        self.total_train = 0
        self.total_eval = 0

    @property
    def iteration(self):
        return self.model.epochs_done

    @property
    def training_iteration(self):
        return self.model.epochs_done

    def step(self):
        """
        As a rule of thumb, the execution time of step should be large enough to avoid overheads 
        (i.e. more than a few seconds), but short enough to report progress periodically 
        (i.e. at most a few minutes).
        """
        assert EPOCHS_PER_ITER > 0

        with Timer() as t_train:
            for _ in range(EPOCHS_PER_ITER):
                ret = self.model.fit_epoch()


        with Timer() as t_rec:
            recs = self.model.recommend_k_items(
                self.dataloader.test, # Used only to get user ids
                top_k=max(K_RECOMMENDATIONS),
                use_id=True,
                remove_seen=True,
                recommend_from=self.open_proposals,
            )
        
        eval_dict = {'model_'+k:v for k,v in zip(self.model.metrics, self.model.run_eval())}
        eval_dict |= calculate_all_metrics(self.dataloader.test, recs, K_RECOMMENDATIONS)

        self.total_train += t_train.time
        self.total_eval += eval_dict['time_eval']
        
        return {
            'real_batch_size': self.real_batch_size,
            'iteration': self.iteration,
            'loss': ret[0],
            'mf_loss': ret[1],
            'emb_loss': ret[2],
            **eval_dict,
            'time_train': t_train.time,
            'time_rec': t_rec.time,
            'time_total_train': self.total_train,
            'time_total_test': self.total_eval,
        }

    def save_checkpoint(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model")
        self.model.saver.save(
            sess=self.model.sess,
            save_path=checkpoint_path,
        )
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        self.model.load(checkpoint_path)

## Big experiment

In [18]:
RAY_RESULTS_PATH

PosixPath('/home/daviddavo/ray_results3.11')

In [19]:
print(os.uname().nodename)

### SET TRAINING RESOURCES
if os.uname().nodename == 'lamarck':
    # assert torch.cuda.is_available()

    NUM_SAMPLES = SAMPLES_PER_SPLIT
    # Every run takes approx half a gig of vram (no optimizations)
    # The RTX 4090 has 24GB so we can run the model about 48 times
    resources_per_trial={
        'cpu': 1,
        'gpu': 1 / GPUS,
    }
else:
    NUM_SAMPLES = 1
    resources_per_trial={
        'cpu': 1,
        # It takes about 1.5 GiB with full training data, but I put a bit more because
        # this notebook also takes a bit of memory
        'memory': 2e9,
    }
print(resources_per_trial)

lamarck
{'cpu': 1, 'gpu': 0.0625}


In [20]:
from ray.tune.search.hyperopt import HyperOptSearch

In [21]:
def getTunerOnFold(f_idx, points_to_evaluate = None):    
    name = paths.lightgcn_ray_tune_fname(ORG_NAME, SPLITS_FREQ, SPLITS_NORMALIZE, OPTIM_METRIC, fold=f_idx)
    experiments = list(RAY_RESULTS_PATH.glob(f'{name}_*'))
    last_experiment = max(experiments, key=lambda x: x.stat().st_ctime) if experiments else None
    f = folds_dict[f_idx]

    dftrain,dftest,t,open_proposals = folds_dict[f_idx]
    param_space = dict(
        batch_size=tune.randint(MIN_BATCH_SIZE, MAX_BATCH_SIZE+1), # 64 - 2**MAX_BATCH_SIZE
        embedding_dim=tune.lograndint(1, MAX_EMBEDDING_DIM, base=2),
        conv_layers=tune.randint(1,5),
        learning_rate=tune.qloguniform(MIN_LR, 1, 1e-4),
        l2=tune.loguniform(1e-7, 1e-2, 1e-7),
        window_size=tune.choice(WINDOW_SIZES),
        # Just so it appears on the output
        fold=f_idx,
    )
    
    ### RESTORE EXPERIMENT OR CREATE A NEW ONE
    if last_experiment and tune.Tuner.can_restore(last_experiment):
        print(f"Restoring last experiment: {last_experiment}")
        tuner = tune.Tuner.restore(
            str(last_experiment),
            trainable=tune.with_resources(
                # tune.with_parameters(TrainLightGCN,  train=dftrain, test=dftest, open_proposals=open_proposals),
                tune.with_parameters(TrainLightGCN, data=f),
                resources_per_trial,
            ),
            restart_errored=True,
            param_space=param_space,
        )
    else:
        print(f"No experiment found for fold {f_idx}, creating new tuner with {NUM_SAMPLES} samples")
        search_alg = None
        
        search_alg = HyperOptSearch(
            points_to_evaluate = points_to_evaluate,
            random_state_seed=SEED,
        )
        # search_alg = tune.search.Repeater(search_alg, N_SPLITS-SKIP_SPLIT)
        
        tuner = tune.Tuner(
            tune.with_resources(
                # tune.with_parameters(TrainLightGCN,  train=dftrain, test=dftest, open_proposals=open_proposals),
                tune.with_parameters(TrainLightGCN, data=folds_dict[f_idx]),
                resources_per_trial,
            ),
            run_config=train.RunConfig(
                stop={'training_iteration': MAX_EPOCHS/EPOCHS_PER_ITER, 'time_total_train': MAX_TIME_TOTAL_TRAIN},
                name=name + f'_{dt.datetime.now().isoformat()}',
                storage_path=RAY_RESULTS_PATH,
                # failure_config=train.FailureConfig(fail_fast='raise'),
                failure_config=train.FailureConfig(max_failures=3),
            ),
            param_space=param_space,
            tune_config=tune.TuneConfig(
                search_alg=search_alg,
                num_samples=NUM_SAMPLES,
                metric=OPTIM_METRIC,
                mode='max',
            )
        )

    return tuner

In [22]:
# We need to display the progress bar in another cell because ray tune "overwrites" the previous output
pbar = tqdm(total=len(use_folds_idx), desc='fold')
out = widgets.Output(layout={'border': '1px solid black'})
with out:
    print("In this cell important output from the next cell will be shown")
out

fold:   0%|          | 0/10 [00:00<?, ?it/s]

Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [23]:
import logging
import requests

def findConfig(rg):
    for r in rg:
        if r.config:
            lbrc = last_best_result.config
            if all((r.config[k] == v for k, v in last_best_result.config.items() if k != 'fold')):
                return r
            elif all((r.config[k] == v for k, v in last_best_result.config.items() if k != 'fold' and k != 'window_size')):
                print("Possible coincidence:", r.config, file=sys.stderr)

    return None

tuners = []
results = []
last_best_result = None
pbar.reset()

last_best_fold = None
requests.post("https://ntfy.sh/grasia_notebooks", data=f"Start running microsoft_tuning for {ORG_NAME}")
for i, (prev_f_idx, f_idx) in enumerate(zip(it.chain([None], use_folds_idx), use_folds_idx)):
    try:
        with out:
            best_prev_config = None
            if last_best_result is not None:
                best_prev_config = last_best_result.config.copy()
                assert best_prev_config['fold'] == prev_f_idx
                best_prev_config['fold'] = f_idx
                print(f"Also evaluating best_prev_config ({OPTIM_METRIC}={last_best_result.metrics[OPTIM_METRIC]}): {best_prev_config}")
                best_prev_config = [best_prev_config]
        
        t = getTunerOnFold(f_idx, best_prev_config)
        tuners.append(t)
    
        rg = t.fit()
        
        # FIXME: load results from disk until ray-project/ray#47358 is solved
        # https://github.com/ray-project/ray/issues/47358
        t = getTunerOnFold(f_idx, best_prev_config)
        tuners[-1] = t
        rg = t.fit()
    
        assert rg.num_errors == 0, f"There are {rg.num_errors} errors"
        assert rg.num_terminated >= NUM_SAMPLES, f'Some samples are not terminated ({rg.num_terminated} != {NUM_SAMPLES})'
        assert len(rg.get_dataframe()) >= NUM_SAMPLES
        results.append(rg)
    
        # Assert that the prev config has been tried
        if last_best_result is not None:
            # if not any( 
            #     all((r.config[k] == v for k, v in last_best_result.config.items() if k != 'fold'))
            #     for r in rg if r.config
            # ):
            if not findConfig(rg):
                print("Best config:", last_best_result.config)
                assert False, f"The best config from previous fold has not been tested in fold {f_idx}"    
            else:
                logging.info(f'Fold {f_idx}. Best prev result was {last_best_result.path} and config has been found {findConfig(rg).path}')
        
        last_best_result = rg.get_best_result()
        pbar.update()
    
        print(f"Finished training for fold {f_idx}")
        requests.post("https://ntfy.sh/grasia_notebooks", data=f"Finished running fold {i} for {ORG_NAME}")
    except Exception as e:
        requests.post("https://ntfy.sh/grasia_notebooks", data=f"Error on fold {i} for {ORG_NAME}: {repr(e)}")
        raise

pbar.close()

0,1
Current time:,2024-09-16 11:12:25
Running for:,00:00:00.27
Memory:,13.2/125.6 GiB

Trial name,# failures,error file
TrainLightGCN_48d50df5,2*,"/tmp/ray/session_2024-09-16_07-40-51_621665_1312344/artifacts/2024-09-16_07-41-11/LightGCN_7d_normalize_map@10_fold=2023-07-17T00:00:00_2024-09-16T07:41:11.882650/driver_artifacts/TrainLightGCN_48d50df5_17_batch_size=7,conv_layers=2,embedding_dim=241,fold=2023-07-17T00_00_00,l2=0.0000,learning_rate=0.0017,win_2024-09-16_07-43-51/error.txt"

Trial name,status,loc,batch_size,conv_layers,embedding_dim,fold,l2,learning_rate,window_size,iter,total time (s),real_batch_size,iteration,loss
TrainLightGCN_3672f849,TERMINATED,147.96.81.131:1322217,4,2,2,2023-07-17T00:00:00,0.000426525,0.0012,60d,1,8922.72,16,5,0.160473
TrainLightGCN_bbf9bc21,TERMINATED,147.96.81.131:1335673,5,4,16,2023-07-17T00:00:00,0.00039879,0.038,10YE,1,2904.79,32,5,0.0992758
TrainLightGCN_a0897030,TERMINATED,147.96.81.131:1317883,5,4,97,2023-07-17T00:00:00,6.09932e-05,0.0021,10YE,1,11847.3,32,5,0.00491038
TrainLightGCN_1ce95d57,TERMINATED,147.96.81.131:1325940,4,4,4,2023-07-17T00:00:00,0.0062739,0.0038,21d,1,6282.41,16,5,0.0737498
TrainLightGCN_e8502921,TERMINATED,147.96.81.131:1335138,6,4,6,2023-07-17T00:00:00,0.00114208,0.2629,90d,1,2544.5,64,5,1.96685
TrainLightGCN_c7fb91fd,TERMINATED,147.96.81.131:1322550,4,3,1,2023-07-17T00:00:00,0.00268588,0.0079,21d,1,7671.01,16,5,0.610435
TrainLightGCN_80975fd0,TERMINATED,147.96.81.131:1323598,5,1,552,2023-07-17T00:00:00,0.0020047,0.0029,21d,1,6904.03,32,5,0.0231684
TrainLightGCN_0e404db3,TERMINATED,147.96.81.131:1334556,7,4,2,2023-07-17T00:00:00,0.00640252,0.0951,60d,1,1832.01,128,5,0.657955
TrainLightGCN_06700b4a,TERMINATED,147.96.81.131:1329682,6,2,1,2023-07-17T00:00:00,2.54826e-06,0.2646,10YE,1,3404.08,64,5,0.596523
TrainLightGCN_42e9082b,TERMINATED,147.96.81.131:1334725,6,4,11,2023-07-17T00:00:00,0.00543534,0.4095,21d,1,1709.6,64,5,5.38875


2024-09-16 11:12:24,727	INFO tune_controller.py:444 -- Restoring the run from the latest experiment state file: experiment_state-2024-09-16_07-41-11.json
2024-09-16 11:12:25,154	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-17T00:00:00_2024-09-16T07:41:11.882650' in 0.2676s.
2024-09-16 11:12:25,168	INFO tune.py:1041 -- Total run time: 0.45 seconds (0.00 seconds for the tuning loop).


Finished training for fold 2023-07-17T00:00:00


[36m(TrainLightGCN pid=812333)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_fe7b025e_26_batch_size=8,conv_layers=4,embedding_dim=1,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.0002,windo_2024-09-09_21-10-11/checkpoint_000000)


[36m(TrainLightGCN pid=815229)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_dd0ea308_31_batch_size=9,conv_layers=1,embedding_dim=139,fold=2023-07-03T00_00_00,l2=0.0001,learning_rate=0.0003,win_2024-09-09_21-10-11/checkpoint_000000)


[36m(TrainLightGCN pid=812003)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_dfef496a_13_batch_size=9,conv_layers=2,embedding_dim=364,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.0002,win_2024-09-09_21-10-10/checkpoint_000000)


[36m(TrainLightGCN pid=814725)[0m Number of items is less than top_k, limiting top_k to number of items


[36m(TrainLightGCN pid=814725)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_3a03b874_23_batch_size=9,conv_layers=2,embedding_dim=3,fold=2023-07-03T00_00_00,l2=0.0021,learning_rate=0.2778,windo_2024-09-09_21-10-11/checkpoint_000000)


[36m(TrainLightGCN pid=812280)[0m Number of items is less than top_k, limiting top_k to number of items


[36m(TrainLightGCN pid=812280)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_88f3b33c_24_batch_size=7,conv_layers=1,embedding_dim=19,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.0091,wind_2024-09-09_21-10-11/checkpoint_000000)


[36m(TrainLightGCN pid=812177)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_123b3769_22_batch_size=9,conv_layers=2,embedding_dim=717,fold=2023-07-03T00_00_00,l2=0.0001,learning_rate=0.0145,win_2024-09-09_21-10-11/checkpoint_000000)


[36m(TrainLightGCN pid=812062)[0m Number of items is less than top_k, limiting top_k to number of items


[36m(TrainLightGCN pid=812062)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_500e164d_18_batch_size=6,conv_layers=2,embedding_dim=27,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.2056,wind_2024-09-09_21-10-10/checkpoint_000000)


[36m(TrainLightGCN pid=812057)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_e8316606_17_batch_size=7,conv_layers=2,embedding_dim=241,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.0017,win_2024-09-09_21-10-10/checkpoint_000000)


[36m(TrainLightGCN pid=812501)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_f6134ff3_33_batch_size=5,conv_layers=2,embedding_dim=47,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.0361,wind_2024-09-09_21-10-12/checkpoint_000000)


[36m(TrainLightGCN pid=812066)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_de7141ee_19_batch_size=5,conv_layers=4,embedding_dim=97,fold=2023-07-03T00_00_00,l2=0.0001,learning_rate=0.0021,wind_2024-09-09_21-10-10/checkpoint_000000)


2024-09-09 22:30:44,239	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727' in 0.0262s.


[36m(TrainLightGCN pid=812498)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_7c350a84_32_batch_size=5,conv_layers=4,embedding_dim=323,fold=2023-07-03T00_00_00,l2=0.0004,learning_rate=0.1294,win_2024-09-09_21-10-11/checkpoint_000000)


2024-09-09 22:30:44,255	ERROR tune.py:1037 -- Trials did not complete: [TrainLightGCN_216d91fd, TrainLightGCN_b43ee02b, TrainLightGCN_20ccbb87]


2024-09-09 22:30:44,255	INFO tune.py:1041 -- Total run time: 4834.10 seconds (4833.91 seconds for the tuning loop).


- TrainLightGCN_216d91fd: FileNotFoundError('Could not fetch metrics for TrainLightGCN_216d91fd: both result.json and progress.csv were not found at /home/daviddavo/ray_results3.11/Magic Square/LightGCN_7d_normalize_map@10_fold=2023-07-03T00:00:00_2024-09-09T18:42:44.802727/TrainLightGCN_216d91fd_36_batch_size=6,conv_layers=4,embedding_dim=896,fold=2023-07-03T00_00_00,l2=0.0000,learning_rate=0.4861,win_2024-09-09_21-15-44')


AssertionError: There are 3 errors

In [24]:
requests.post("https://ntfy.sh/grasia_notebooks", data=f"Finished running everything for {ORG_NAME}")
print("Finished!")

Finished!
