# Preparation stuff

## Connect to Drive

In [1]:
# Set to True when running on Google Colab: the following cells then mount Google Drive,
# install the required packages, and use the Drive folder as project folder.
connect_to_drive = False

In [2]:
#Mount Google Drive (only when connect_to_drive is True).
#Running this cell asks for authorization through a popup in another window.
if connect_to_drive:
    from google.colab import drive
    #Mount point used by the Drive project folder paths defined later in the notebook
    drive.mount('/content/gdrive', force_remount=True)

## Install packages

In [3]:
if connect_to_drive:
    #Install FS code (easy_lightning) from GitHub, forcing a fresh reinstall
    #NOTE(review): no commit/tag is pinned, so every run may install a different version
    !pip install  --upgrade --force-reinstall git+https://github.com/federicosiciliano/easy_lightning.git

    #NOTE(review): unpinned as well — consider fixing the version for reproducibility
    !pip install pytorch_lightning

## IMPORTS

In [4]:
#Put all imports here
import numpy as np
from copy import deepcopy
import os
import sys

from ray import tune
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig#, RunConfig, CheckpointConfig

## Define paths

In [5]:
#All paths are built relative to the project folder
if connect_to_drive:
    project_folder = "/content/gdrive/Shareddrives/<SharedDriveName>" #Name of SharedDrive folder
    #project_folder = "/content/gdrive/MyDrive/<MyDriveName>" #Name of MyDrive folder
else:
    project_folder = "../"

#cfg: hyperparameters configurations
cfg_folder = os.path.join(project_folder, "cfg")

#data: raw and preprocessed data, each in its own sub-folder
data_folder = os.path.join(project_folder, "data")
raw_data_folder, processed_data_folder = (
    os.path.join(data_folder, stage) for stage in ("raw", "processed")
)

#src: all the (essential) source code
source_folder = os.path.join(project_folder, "src")

#out: every output (models, results, plots, etc.); images go in the img sub-folder
out_folder = os.path.join(project_folder, "out")
img_folder = os.path.join(out_folder, "img")

## Import own code

In [6]:
#To import from src:

#insert the project folder (not the source folder itself) at the start of sys.path,
#so that `src` can be imported as a package (see the commented import below)
sys.path.insert(0, project_folder)

#import from src directory
# from src import ??? as additional_module
import easy_rec as additional_module #REMOVE THIS LINE IF IMPORTING OWN ADDITIONAL MODULE

import easy_exp, easy_rec, easy_torch #easy_data

# MAIN

## Train

### Data

In [7]:
#Load the experiment configuration named "config_rec"
#(presumably looked up in the cfg folder — confirm against easy_exp)
cfg = easy_exp.cfg.load_configuration("config_rec")

In [9]:
def prepare_raytune_config(cfg):
    """Build the Ray Tune search space from the sweep section of the configuration.

    Every entry of cfg["__exp__"]["__sweep__"]["parameters"] that contains a
    "tune" key is converted by calling the `tune` function named in
    entry["tune"]["name"] with entry["tune"]["params"] as keyword arguments.
    Entries without a "tune" key are ignored.

    Args:
        cfg: configuration holding the sweep parameters.

    Returns:
        dict mapping each tunable parameter name to the object returned by the
        corresponding `tune` function, in the order of the sweep parameters.
    """
    sweep_parameters = cfg["__exp__"]["__sweep__"]["parameters"]
    return {
        name: getattr(tune, spec["tune"]["name"])(**spec["tune"]["params"])
        for name, spec in sweep_parameters.items()
        if "tune" in spec
    }

In [10]:
#Search space later handed to the Tuner under the "train_loop_config" key
raytune_cfg = prepare_raytune_config(cfg)

In [11]:
#from ray.train.lightning import RayDDPStrategy, RayLightningEnvironment, RayTrainReportCallback, prepare_trainer

In [12]:
# cfg["model"]["trainer_params"]["strategy"] = RayDDPStrategy()
# #cfg["model"]["trainer_params"]["callbacks"].append(lambda : RayTrainReportCallback())
# cfg["model"]["trainer_params"]["plugins"] = cfg["model"]["trainer_params"].get("plugins", []) + [RayLightningEnvironment()]

In [13]:
def run_config(cfg, if_exp_found=None, raytune=False):
    """Run a single experiment described by `cfg`: prepare, evaluate, then train.

    Args:
        cfg: experiment configuration.
        if_exp_found: what to do when the experiment is reported as already existing:
            "skip" --> return immediately, nothing is run;
            "load" --> load the stored model before evaluating and training;
            anything else --> rerun the experiment completely.
        raytune: forwarded to the trainer preparation (set to True by run_raytune_cfg).

    Returns:
        None.
    """
    already_run, experiment_id = easy_exp.exp.get_set_experiment_id(cfg)

    # The requested action only matters when the experiment already exists
    action = if_exp_found if already_run else None

    if action == "skip":
        return

    # Save the experiment up front: early stopping with Tune schedulers
    # may not run anything after training
    easy_exp.exp.save_experiment(cfg)

    # Data, dataloaders and recommender module
    rec_data, rec_maps = easy_rec.preparation.prepare_rec_data(cfg)
    dataloaders = easy_rec.preparation.prepare_rec_dataloaders(cfg, rec_data, rec_maps)
    rec_module = easy_rec.preparation.prepare_rec_model(cfg, rec_maps)

    # Trainer and complete model wrapping the recommender module
    pl_trainer = easy_torch.preparation.complete_prepare_trainer(
        cfg, experiment_id, additional_module=easy_rec, raytune=raytune
    )
    full_model = easy_torch.preparation.complete_prepare_model(
        cfg, rec_module, additional_module=easy_rec
    )

    if action == "load":
        easy_torch.process.load_model(pl_trainer, full_model, experiment_id)

    # Evaluate before training on the "val", "test" and "train" loaders
    easy_torch.process.test_model(pl_trainer, full_model, dataloaders, test_key=["val","test","train"])

    # Train, validating on the "val" and "test" loaders.
    # Nothing follows on purpose: code after training may never run under a Tune scheduler.
    easy_torch.process.train_model(pl_trainer, full_model, dataloaders, val_key=["val","test"])

In [14]:
def run_raytune_cfg(raytune_cfg, cfg, if_exp_found=None):
    """Run one Ray Tune trial: apply the trial hyperparameters to a copy of `cfg`.

    Args:
        raytune_cfg: hyperparameter values of the trial, applied with `update`.
        cfg: base configuration; it is deep-copied first, so it is left untouched.
        if_exp_found: forwarded to `run_config`.

    Returns:
        None.
    """
    trial_cfg = deepcopy(cfg)
    trial_cfg.update(raytune_cfg)

    # TODO: save trial_cfg to a file

    run_config(trial_cfg, if_exp_found=if_exp_found, raytune=True)

In [15]:
# checkpoint_data = {
#     "epoch": epoch,
#     "net_state_dict": net.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
# }
# with tempfile.TemporaryDirectory() as checkpoint_dir:
#     data_path = Path(checkpoint_dir) / "data.pkl"
#     with open(data_path, "wb") as fp:
#         pickle.dump(checkpoint_data, fp)

#     checkpoint = Checkpoint.from_directory(checkpoint_dir)
#     train.report(
#         {"loss": val_loss / val_steps, "accuracy": correct / total},
#         checkpoint=checkpoint,
#     )

In [16]:
# Upper bound on the training length of a trial, taken from the trainer parameters
max_num_epochs = cfg["model"]["trainer_params"]["max_epochs"]

# ASHA early-stops the trials that perform worst on the monitored metric.
# The metric is NDCG@10 on dataloader 0 (presumably the "val" loader — confirm against run_config).
scheduler = tune.schedulers.ASHAScheduler(
    metric="val_NDCG_@10/dataloader_idx_0",
    mode="max",  # NDCG is higher-is-better: "min" would keep the worst trials and stop the best ones
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
)

In [17]:
# Resources requested for each trial: 2 workers, each with 4 CPUs and 1 GPU.
# NOTE: currently unused — the scaling_config argument of TorchTrainer below is commented out.
scaling_config = ScalingConfig(
    num_workers=2, use_gpu=True, resources_per_worker={"CPU": 4, "GPU": 1}
)

# NOTE(review): if this is re-enabled, use a different variable name —
# `run_config` would shadow the run_config() function that run_raytune_cfg calls.
# run_config = RunConfig(
#     checkpoint_config=CheckpointConfig(
#         num_to_keep=2,
#         checkpoint_score_attribute="ptl/val_accuracy",
#         checkpoint_score_order="max",
#     ),
# )

# Define a TorchTrainer without hyper-parameters for Tuner
# The training function receives the trial hyperparameters (x) and runs them on top of cfg
ray_trainer = TorchTrainer(
    lambda x: run_raytune_cfg(x, cfg),
    # scaling_config=scaling_config,
    # run_config=run_config,
)



In [18]:
# Presumably needed so that relative paths (e.g. project_folder = "../") keep resolving
# from the notebook's working directory inside the Ray workers — TODO confirm
os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0" #To avoid changing working directory

In [19]:
# Hyperparameter search: configurations are sampled from raytune_cfg and run through ray_trainer
tuner = tune.Tuner(
    ray_trainer,
    param_space={"train_loop_config": raytune_cfg},
    tune_config=tune.TuneConfig(
        # metric and mode are not set here: they are already given to the scheduler
        # metric="val_NDCG_@10/dataloader_idx_0",
        # mode="max",
        num_samples=100,
        scheduler=scheduler,
        # NOTE: the saved output below shows only 10 of the 100 samples run within this budget
        time_budget_s=40, #seconds #May raise WARNING Failed to fetch metrics for
        max_concurrent_trials=2,
    ),
)

# Run the search; the result of the run is kept in `results`
results = tuner.fit()

0,1
Current time:,2024-12-24 13:25:09
Running for:,00:00:40.66
Memory:,13.1/16.0 GiB

Trial name,status,loc,.../data_params.coll ator_params.lookback,...loop_config/model .optimizer.params.lr,..._config/model.rec _model.dropout_input,...oop_config/model. rec_model.num_layers,iter,total time (s),val_loss/dataloader_ idx_0,val_Precision_@5/dat aloader_idx_0,val_Precision_@10/da taloader_idx_0
TorchTrainer_0519a_00000,TERMINATED,127.0.0.1:2994,20,0.0477965,0.4877,1,8.0,12.5336,0.517732,0.0629464,0.0482143
TorchTrainer_0519a_00001,TERMINATED,127.0.0.1:2995,5,0.087581,0.0990131,2,10.0,14.0873,0.514453,0.0524554,0.0402902
TorchTrainer_0519a_00002,TERMINATED,127.0.0.1:2996,5,0.0983281,0.173011,2,10.0,14.164,0.517134,0.0546875,0.0409598
TorchTrainer_0519a_00003,TERMINATED,127.0.0.1:2997,20,0.0558714,0.158131,1,1.0,6.66559,0.884696,0.0252232,0.0229911
TorchTrainer_0519a_00004,TERMINATED,127.0.0.1:3108,5,0.0273396,0.383866,1,10.0,12.7617,0.578544,0.0558036,0.0402902
TorchTrainer_0519a_00005,TERMINATED,127.0.0.1:3195,5,0.0568602,0.256524,2,4.0,8.17168,0.59405,0.0470982,0.0371652
TorchTrainer_0519a_00006,TERMINATED,127.0.0.1:3206,20,0.0974798,0.489921,2,1.0,6.08866,0.707201,0.0375,0.0294643
TorchTrainer_0519a_00007,TERMINATED,127.0.0.1:3207,20,0.0342597,0.0652479,1,4.0,8.35312,0.573536,0.0571429,0.04375
TorchTrainer_0519a_00008,TERMINATED,127.0.0.1:3311,10,0.0615497,0.474354,2,1.0,6.16058,0.821884,0.0229911,0.0215402
TorchTrainer_0519a_00009,TERMINATED,127.0.0.1:3325,5,0.00678983,0.452121,2,1.0,6.39057,0.777003,0.0140625,0.0131696


[36m(TorchTrainer pid=2997)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2997)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3011) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=3011)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=3012)[0m Ratings data already exists. Skip pre-processing
[36m(RayTrainWorker pid=3014)[0m ../out/exp/prova not found --> created
[36m(RayTrainWorker pid=3011)[0m ../out/exp not found --> created
[36m(RayTrainWorker pid=3012)[0m Filtering by minimum number of users per item: 5
[36m(RayTrainWorker pid=3012)[0m Filtering by minimum number of items per user: 5
[36m(RayTrainWorker pid=3012)[0m Densifying index
[36m(RayTrainWorker pid=3012)[0m Splitting: leave_n_out


[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m GPU available: True (mps), used: False
[36m(RayTrainWorker pid=3011)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=3011)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=3011)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3011)[0m [rank: 0] Seed set to 42
[

Testing: |          | 0/? [00:00<?, ?it/s]
Testing DataLoader 0:  14%|█▍        | 1/7 [00:00<00:00, 17.47it/s]
Testing DataLoader 0:  57%|█████▋    | 4/7 [00:00<00:00, 24.98it/s]
Testing DataLoader 0:  71%|███████▏  | 5/7 [00:00<00:00, 19.10it/s]
Testing DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 21.09it/s]
Testing DataLoader 1:  43%|████▎     | 3/7 [00:00<00:00, 37.84it/s]
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 34.95it/s]
Testing DataLoader 2:  29%|██▊       | 2/7 [00:00<00:00, 47.44it/s]
Testing DataLoader 2:  14%|█▍        | 1/7 [00:00<00:00, 80.61it/s]
Testing DataLoader 2:  71%|███████▏  | 5/7 [00:00<00:00, 37.03it/s]


[36m(RayTrainWorker pid=3011)[0m 
[36m(RayTrainWorker pid=3011)[0m   | Name        | Type                        | Params | Mode 
[36m(RayTrainWorker pid=3011)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=3011)[0m 0 | main_module | GRU4Rec                     | 82.8 K | train
[36m(RayTrainWorker pid=3011)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
[36m(RayTrainWorker pid=3011)[0m 2 | metrics     | RobustModuleDict            | 0      | train
[36m(RayTrainWorker pid=3011)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=3011)[0m 82.8 K    Trainable params
[36m(RayTrainWorker pid=3011)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=3011)[0m 82.8 K    Total params
[36m(RayTrainWorker pid=3011)[0m 0.331     Total estimated model params size (MB)
[36m(RayTrainWorker pid=3011)[0m 52        Modules in train mode
[36m(RayTrainWorker pid=30

Testing DataLoader 2: 100%|██████████| 7/7 [00:00<00:00, 35.74it/s]
[36m(RayTrainWorker pid=3012)[0m ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
[36m(RayTrainWorker pid=3012)[0m ┃[1m [0m[1m   Test metric   [0m[1m [0m┃[1m [0m[1m  DataLoader 0   [0m[1m [0m┃[1m [0m[1m  DataLoader 1   [0m[1m [0m┃[1m [0m[1m  DataLoader 2  [0m[1m [0m┃
[36m(RayTrainWorker pid=3012)[0m ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
[36m(RayTrainWorker pid=3012)[0m │[36m [0m[36m  test_MAP_@10   [0m[36m [0m│[35m [0m[35m0.00928013492375…[0m[35m [0m│[35m [0m[35m0.01138906646519…[0m[35m [0m│[35m [0m[35m0.2428633868694…[0m[35m [0m│
[36m(RayTrainWorker pid=3012)[0m │[36m [0m[36m  test_MAP_@20   [0m[36m [0m│[35m [0m[35m0.01000827737152…[0m[35m [0m│[35m [0m[35m0.01081543695181…[0m[35m [0m│[35m [0m[35m0.1548702716827…[0m[35m [0m│
[36m(RayTrainWorker pid=3012)[0m │[36

[36m(RayTrainWorker pid=3011)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3012)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (7) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             
Sanity Checking DataLoader 1:  50%|█████     | 1/2 [00:00<00:00, 40.34it/s]
                                                                           
Epoch 0:  29%|██▊       | 2/7 [00:00<00:00, 14.80it/s, v_num=0]
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s] 
Epoch 0:  14%|█▍        | 1/7 [00:00<00:00, 11.02it/s, v_num=0]
Epoch 0:  71%|███████▏  | 5/7 [00:00<00:00, 21.16it/s, v_num=0]
Epoch 0:  57%|█████▋    | 4/7 [00:00<00:00, 23.07it/s, v_num=0]
Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 23.50it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=3012)[0m 
Validation:   0%|          | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 19.74it/s, v_num=0]
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid

[36m(RayTrainWorker pid=3012)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00000_0_data_params_collator_params_lookback=20,model_optimizer_params_lr=0.0478,model_rec_model_dropout_input=_2024-12-24_13-24-28/checkpoint_000000)


Epoch 1:  57%|█████▋    | 4/7 [00:00<00:00, 18.35it/s, v_num=0]
Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 24.94it/s, v_num=0]
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 1: 100%|██████████| 7/7 [00:00<00:00,  7.07it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
Epoch 1: 100%|██████████| 7/7 [00:00<00:00,  7.00it/s, v_num=0]
[36m(RayTrain

[36m(TorchTrainer pid=2995)[0m Started distributed worker processes: [32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(TorchTrainer pid=2995)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3014) world_rank=0, local_rank=0, node_rank=0[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m Setting up process group for: env:// [rank=0, world_size=1][32m [repeated 3x across cluster][0m


[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3013)[0m Ratings data already exists. Skip pre-processing[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
Epoch 3:  14%|█▍        | 1/7 [00:00<00:00, 14.21it/s, v_num=0]
[36m(RayTrainWorker pid=3013)[0m Filtering by minimum number of users per item: 5[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m Filtering by minimum number of items per user

[36m(RayTrainWorker pid=3013)[0m [rank: 0] Seed set to 42[32m [repeated 118x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m GPU available: True (mps), used: False[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `n

Epoch 4: 100%|██████████| 7/7 [00:00<00:00, 27.13it/s, v_num=0]
Testing DataLoader 0:  71%|███████▏  | 5/7 [00:00<00:00, 19.95it/s][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3012)[0m 
Testing DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 21.69it/s][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3012)[0m 
Testing DataLoader 1:  43%|████▎     | 3/7 [00:00<00:00, 34.31it/s][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Testing DataLoader 2:  14%|█▍        | 1/7 [00:00<00:00, 79.68it/s][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
                                            

[36m(RayTrainWorker pid=3013)[0m   | Name        | Type                        | Params | Mode [32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m --------------------------------------------------------------------[32m [repeated 6x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 0 | main_module | GRU4Rec                     | 98.1 K | train[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 2 | metrics     | RobustModuleDict            | 0      | train[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 98.1 K    Trainable params[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 0         Non-trainable params[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3013)[0m 98.1 K    Total params[32m [repeated 3x across cluster][0m
[36m(RayTrain

Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             
[36m(RayTrainWorker pid=3012)[0m 
Epoch 5:  29%|██▊       | 2/7 [00:00<00:00, 18.14it/s, v_num=0][32m [repeated 5x across cluster][0m
Epoch 1:  71%|███████▏  | 5/7 [00:00<00:00, 20.43it/s, v_num=0][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 5:  71%|███████▏  | 5/7 [00:00<00:00, 24.92it/s, v_num=0][32m [repeated 11x across cluster][0m
Epoch 5:  71%|███████▏  | 5/7 [00:00<00:00, 26.83it/s, v_num=0][32m [repeated 8x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 5: 100%|██████████| 7/7 [00:00<00:00, 28.86it/s, v_num=0][32m [repeated 8x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 18x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 


[36m(RayTrainWorker pid=3014)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (7) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.[32m [repeated 3x across cluster][0m


Epoch 6: 100%|██████████| 7/7 [00:00<00:00, 26.97it/s, v_num=0]
Validation DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 33.24it/s][A[32m [repeated 37x across cluster][0m
[36m(RayTrainWorker pid=3012)[0m 
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]        [A[32m [repeated 18x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 6:   0%|          | 0/7 [00:00<?, ?it/s, v_num=0][32m [repeated 12x across cluster][0m
Epoch 2: 100%|██████████| 7/7 [00:00<00:00,  7.77it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 3:  14%|█▍        | 1/7 [00:00<00:00, 14.44it/s, v_num=0][32m [repeated 2x across cluster][0m
[36m(R

[36m(RayTrainWorker pid=3014)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00001_1_data_params_collator_params_lookback=5,model_optimizer_params_lr=0.0876,model_rec_model_dropout_input=0_2024-12-24_13-24-28/checkpoint_000006)[32m [repeated 19x across cluster][0m


[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 4: 100%|██████████| 7/7 [00:00<00:00,  9.15it/s, v_num=0][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3012)[0m 
[36m(RayTrainWorker pid=3012)[0m 
Epoch 4: 100%|██████████| 7/7 [00:00<00:00,  9.23it/s, v_num=0]
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 


[36m(TorchTrainer pid=3108)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=3108)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3155) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=3155)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3013)[0m 
Epoch 9:  14%|█▍        | 1/7 [00:00<00:00, 14.86it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3014)[0m 
Epoch 8:  14%|█▍        | 1/7 [00:00<00:00, 12.93it/s, v_num=0][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3014)[0m 
[36m(RayTrainWorker pid=3013)[0m 


[36m(RayTrainWorker pid=3014)[0m `Trainer.fit` stopped: `max_epochs=10` reached.


[36m(RayTrainWorker pid=3014)[0m 
Epoch 9: 100%|██████████| 7/7 [00:00<00:00,  8.95it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3013)[0m 
[36m(RayTrainWorker pid=3155)[0m Ratings data already exists. Skip pre-processing
[36m(RayTrainWorker pid=3155)[0m Filtering by minimum number of users per item: 5
[36m(RayTrainWorker pid=3155)[0m Filtering by minimum number of items per user: 5
[36m(RayTrainWorker pid=3155)[0m Densifying index
Epoch 9: 100%|██████████| 7/7 [00:00<00:00, 28.06it/s, v_num=0][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m Splitting: leave_n_out
Testing: |          | 0/? [00:00<?, ?it/s]


[36m(RayTrainWorker pid=3155)[0m [rank: 0] Seed set to 42[32m [repeated 4x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m GPU available: True (mps), used: False
[36m(RayTrainWorker pid=3155)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=3155)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=3155)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
[36m(RayTrainWorker pid=3155)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3155)[0m /Library/Frameworks/Python.framewo

Testing DataLoader 0:  29%|██▊       | 2/7 [00:00<00:00, 34.28it/s]
Testing DataLoader 0:  71%|███████▏  | 5/7 [00:00<00:00, 34.85it/s]
Testing DataLoader 1:   0%|          | 0/7 [00:00<?, ?it/s]        
Testing DataLoader 1:  43%|████▎     | 3/7 [00:00<00:00, 27.74it/s]
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 31.49it/s]
Testing DataLoader 2:  29%|██▊       | 2/7 [00:00<00:00, 55.31it/s]
Epoch 7:  29%|██▊       | 2/7 [00:00<00:00, 20.46it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 7:  86%|████████▌ | 6/7 [00:00<00:00, 26.94it/s, v_num=0][32m [repeated 2x across cluster][0m
Testing DataLoader 2:  86%|████████▌ | 6/7 [00:00<00:00, 42.02it/s]
Epoch 9:  43%|████▎     | 3/7 [00:00<00:00, 21.72it/s, v_num=0][32m [repeated 8x across cluster][0m
Epoch 9:  71%|███████▏  | 5/7 [00:00<00:00, 28.14it/s, v_num=0][32m [repeated 3x across cluster][0m
Testing DataLoader 2: 100%|██████████| 7/7 [00:00<00:00, 39.46it/s]
[36m(RayTrainWorker pid=3155)[0m ┏━━━━━━━━━━━━

[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m   | Name        | Type                        | Params | Mode 
[36m(RayTrainWorker pid=3155)[0m --------------------------------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m 0 | main_module | GRU4Rec                     | 82.8 K | train
[36m(RayTrainWorker pid=3155)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
[36m(RayTrainWorker pid=3155)[0m 2 | metrics     | RobustModuleDict            | 0      | train
[36m(RayTrainWorker pid=3155)[0m 82.8 K    Trainable params
[36m(RayTrainWorker pid=3155)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=3155)[0m 82.8 K    Total params
[36m(RayTrainWorker pid=3155)[0m 0.331     Total estimated model params size (MB)
[36m(RayTrainWorker pid=3155)[0m 52        Modules in train mode
[36m(RayTrainWorker pid=3155)[0m 0         Modules in eval mode
[36m(RayTrainWorker pid=31

Sanity Checking DataLoader 1:   0%|          | 0/2 [00:00<?, ?it/s]        
Validation:   0%|          | 0/7 [00:00<?, ?it/s][A[32m [repeated 10x across cluster][0m
Validation DataLoader 1:   0%|          | 0/7 [00:00<?, ?it/s][A[32m [repeated 20x across cluster][0m
Epoch 7: 100%|██████████| 7/7 [00:00<00:00, 27.97it/s, v_num=0][32m [repeated 3x across cluster][0m
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             
Validation DataLoader 1:  86%|████████▌ | 6/7 [00:00<00:00, 35.12it/s][A[32m [repeated 122x across cluster][0m


[36m(RayTrainWorker pid=3155)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3155)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (7) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Validation DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 33.46it/s][A[32m [repeated 20x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m 
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]        [A[32m [repeated 10x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
Epoch 9:   0%|          | 0/7 [00:00<?, ?it/s, v_num=0][32m [repeated 6x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m 
Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 28.41it/s, v_num=0]
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 


[36m(RayTrainWorker pid=3155)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00004_4_data_params_collator_params_lookback=5,model_optimizer_params_lr=0.0273,model_rec_model_dropout_input=0_2024-12-24_13-24-28/checkpoint_000001)[32m [repeated 11x across cluster][0m


Epoch 7: 100%|██████████| 7/7 [00:00<00:00,  8.89it/s, v_num=0]       [A
Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 30.71it/s, v_num=0]
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 


[36m(TorchTrainer pid=3195)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=3195)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3221) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=3221)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
Epoch 3: 100%|██████████| 7/7 [00:00<00:00,  9.07it/s, v_num=0]       [A


[36m(RayTrainWorker pid=3013)[0m `Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 4:  29%|██▊       | 2/7 [00:00<00:00, 19.60it/s, v_num=0]
Epoch 9: 100%|██████████| 7/7 [00:00<00:00,  9.01it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 30.86it/s, v_num=0]
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 


[36m(RayTrainWorker pid=3155)[0m [rank: 0] Seed set to 42[32m [repeated 35x across cluster][0m


Epoch 5: 100%|██████████| 7/7 [00:00<00:00, 31.98it/s, v_num=0]
[36m(RayTrainWorker pid=3221)[0m Ratings data already exists. Skip pre-processing
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m Filtering by minimum number of users per item: 5
[36m(RayTrainWorker pid=3221)[0m Filtering by minimum number of items per user: 5
[36m(RayTrainWorker pid=3221)[0m Densifying index
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m Splitting: leave_n_out
Epoch 6:  14%|█▍        | 1/7 [00:00<00:00, 14.15it/s, v_num=0]
Epoch 6:  14%|█▍        | 1/7 [00:00<00:00, 14.09it/s, v_num=0]


[36m(RayTrainWorker pid=3221)[0m GPU available: True (mps), used: False
[36m(RayTrainWorker pid=3221)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=3221)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=3221)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
[36m(RayTrainWorker pid=3221)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3221)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.

Epoch 6:  57%|█████▋    | 4/7 [00:00<00:00, 25.49it/s, v_num=0][32m [repeated 5x across cluster][0m
Epoch 4:  86%|████████▌ | 6/7 [00:00<00:00, 28.74it/s, v_num=0][32m [repeated 3x across cluster][0m
Testing: |          | 0/? [00:00<?, ?it/s]
Epoch 6: 100%|██████████| 7/7 [00:00<00:00, 29.12it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m
[36m(RayTrainWorker pid=3155)[0m 
Validation:   0%|          | 0/7 [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s][A[32m [repeated 13x across cluster][0m
Epoch 4: 100%|██████████| 7/7 [00:00<00:00, 29.71it/s, v_num=0][32m [repeated 2x across cluster][0m
Testing DataLoader 0:  29%|██▊       | 2/7 [00:00<00:00, 31.70it/s]
[36m(RayTrainWorker pid=3155)[0m 
Validation DataLoader 0:  86%|████████▌ | 6/7 [00:00<00:00, 35.25it/s][A[32m [repeated 78x across cluster][0m
Testing DataLoader 0: 100%|██████████| 7/7 [00:00<0

[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m   | Name        | Type                        | Params | Mode 
[36m(RayTrainWorker pid=3221)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=3221)[0m 0 | main_module | GRU4Rec                     | 98.1 K | train
[36m(RayTrainWorker pid=3221)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
[36m(RayTrainWorker pid=3221)[0m 2 | metrics     | RobustModuleDict            | 0      | train
[36m(RayTrainWorker pid=3221)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=3221)[0m 98.1 K    Trainable params
[36m(RayTrainWorker pid=3221)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=3221)[0m 98.1 K    Total params
[36m(RayTrainWorker pid=3221)[0m 0.392     Total estimated model params size (MB)
[36m(RayTrainWorker pid=3221)[0m 52        Modules in train mode
[36m(RayTrainWorker pid=32

[36m(RayTrainWorker pid=3155)[0m 
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             
[36m(RayTrainWorker pid=3155)[0m 
Epoch 0:  29%|██▊       | 2/7 [00:00<00:00, 21.61it/s, v_num=0]


[36m(RayTrainWorker pid=3221)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3221)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (7) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
Epoch 8: 100%|██████████| 7/7 [00:00<00:00, 30.27it/s, v_num=0]
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
Epoch 0: 100%|██████████| 7/7 [00:00<00:00,  8.84it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3155)[0m 
Epoch 1:  14%|█▍        | 1/7 [00:00<00:00, 13.82it/s, v_num=0]
Testing DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]
[36m(RayTrainWorker pid=3155)[0m 
Testing DataLoader 0:  43%|████▎     | 3/7 [00:00<00:00, 27.10it/s]


[36m(RayTrainWorker pid=3221)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00005_5_data_params_collator_params_lookback=5,model_optimizer_params_lr=0.0569,model_rec_model_dropout_input=0_2024-12-24_13-24-28/checkpoint_000000)[32m [repeated 7x across cluster][0m


[36m(RayTrainWorker pid=3155)[0m 
Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 24.79it/s, v_num=0]
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
Testing DataLoader 1:  14%|█▍        | 1/7 [00:00<00:00, 39.50it/s]
Testing DataLoader 1:   0%|          | 0/7 [00:00<?, ?it/s]        
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 30.78it/s]
[36m(RayTrainWorker pid=3221)[0m 
Testing DataLoader 2:  29%|██▊       | 2/7 [00:00<00:00, 50.95it/s]
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3155)[0m 
Epoch 2:  29%|██▊       | 2/7 [00:00<00:00, 20.61it/s, v_num=0]
Testing DataLoader 2: 100%|██████████| 7/7 [00:00<00:00, 34.31it/s]
[36m(RayTrainWorker pid=3236)[0m │[36m [0m[36m  test_MAP_@10   [0m[36m [0m│[35m [0m[35m0.00928013492375…[0m[35m [0m│[35m [0m[35m0.01138906646519…[0m[35m [0m│[35m [0m[35m0.2428633868694…

[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 


[36m(RayTrainWorker pid=3155)[0m 
Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 27.82it/s, v_num=0]
                                                                           
[36m(RayTrainWorker pid=3155)[0m 
[36m(RayTrainWorker pid=3221)[0m 
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s] 
[36m(RayTrainWorker pid=3155)[0m 
Epoch 9: 100%|██████████| 7/7 [00:00<00:00,  8.12it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3221)[0m 
Epoch 0:  14%|█▍        | 1/7 [00:00<00:00,  9.52it/s, v_num=0]


[36m(RayTrainWorker pid=3155)[0m `Trainer.fit` stopped: `max_epochs=10` reached.


[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 


[36m(TorchTrainer pid=3206)[0m Started distributed worker processes: [32m [repeated 2x across cluster][0m
[36m(TorchTrainer pid=3206)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3235) world_rank=0, local_rank=0, node_rank=0[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m Setting up process group for: env:// [rank=0, world_size=1][32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3221)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m Ratings data already exists. Skip pre-processing[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=3235)[0m [rank: 0] Seed set to 42[32m [repeated 117x across cluster][0m


[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m Filtering by minimum number of users per item: 5[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m Filtering by minimum number of items per user: 5[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m Densifying index[32m [repeated 2x across cluster][0m
Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 30.43it/s, v_num=0]
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m Splitting: leave_n_out[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3236)[0m 
Epoch 2:  43%|████▎     | 3/7 [00:00<00:00, 23.54it/s, v_num=0][32m [repeated 8x across cluster][0m
Epoch 1:  86%|████████▌ | 6/7 [00:00<00:00, 26.89it/s, v_num=0][32m [repeated 4x across cluster][0m
Testing: |          | 0/? [00:00<?, ?it/s][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3236)[0m 
Validation: |          | 0/? [00:00<?, ?i

[36m(RayTrainWorker pid=3235)[0m GPU available: True (mps), used: False[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m TPU available: False, using: 0 TPU cores[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m HPU available: False, using: 0 HPUs[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.[32m [repeated 

Validation DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 29.75it/s][A[32m [repeated 23x across cluster][0m
Testing DataLoader 1:  57%|█████▋    | 4/7 [00:00<00:00, 34.41it/s]
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]        [A[32m [repeated 11x across cluster][0m
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3236)[0m 
[36m(RayTrainWorker pid=3235)[0m ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m ┃[1m [0m[1m   Test metric   [0m[1m [0m┃[1m [0m[1m  DataLoader 0   [0m[1m [0m┃[1m [0m[1m  DataLoader 1   [0m[1m [0m┃[1m [0m[1m  DataLoader 2  [0m[1m [0m┃[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m │[36m [0m[36m    test_loss    [0m[36

[36m(RayTrainWorker pid=3235)[0m   | Name        | Type                        | Params | Mode [32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m --------------------------------------------------------------------[32m [repeated 4x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 0 | main_module | GRU4Rec                     | 98.1 K | train[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 2 | metrics     | RobustModuleDict            | 0      | train[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 98.1 K    Trainable params[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 0         Non-trainable params[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3235)[0m 98.1 K    Total params[32m [repeated 2x across cluster][0m
[36m(RayTrain

Testing DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]
Testing DataLoader 2:  71%|███████▏  | 5/7 [00:00<00:00, 35.94it/s][32m [repeated 6x across cluster][0m
Epoch 3: 100%|██████████| 7/7 [00:00<00:00, 29.07it/s, v_num=0][32m [repeated 2x across cluster][0m
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 32.92it/s]
Testing DataLoader 2:  29%|██▊       | 2/7 [00:00<00:00, 48.46it/s]
Testing DataLoader 2: 100%|██████████| 7/7 [00:00<00:00, 32.92it/s]
Epoch 3: 100%|██████████| 7/7 [00:00<00:00, 26.53it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 3:  14%|█▍        | 1/7 [00:00<00:00, 11.08it/s, v_num=0][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3337)[0m Ratings data already exists. Skip pre-processing
Epoch 3:  71%|███████▏  | 5/7 [00:00<00:00, 24.97it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=3337)[0m Filtering by minimum number of users per item: 5
[36m(RayTrainWorker pid=3337)[0m Filterin

[36m(TorchTrainer pid=3325)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=3325)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3358) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3337)[0m GPU available: True (mps), used: False
[36m(RayTrainWorker pid=3337)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=3337)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=3337)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36

[36m(RayTrainWorker pid=3337)[0m Splitting: leave_n_out
Validation DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 34.06it/s][A[32m [repeated 2x across cluster][0m
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]        [A
Testing: |          | 0/? [00:00<?, ?it/s]
Testing DataLoader 0:  43%|████▎     | 3/7 [00:00<00:00, 36.44it/s]
Epoch 4:   0%|          | 0/7 [00:00<?, ?it/s, v_num=0]               [A
Testing DataLoader 1:   0%|          | 0/7 [00:00<?, ?it/s]        
Testing DataLoader 1:  57%|█████▋    | 4/7 [00:00<00:00, 35.56it/s]
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 31.52it/s]
Testing DataLoader 2:  29%|██▊       | 2/7 [00:00<00:00, 54.49it/s]
Testing DataLoader 2:  86%|████████▌ | 6/7 [00:00<00:00, 39.68it/s]
Testing DataLoader 2: 100%|██████████| 7/7 [00:00<00:00, 38.08it/s]
[36m(RayTrainWorker pid=3337)[0m ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
[36m(RayTrainWorker pid=3337)[0m ┃[1m [0m[

[36m(RayTrainWorker pid=3337)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=3337)[0m 
[36m(RayTrainWorker pid=3337)[0m   | Name        | Type                        | Params | Mode 
[36m(RayTrainWorker pid=3337)[0m --------------------------------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3337)[0m 0 | main_module | GRU4Rec                     | 98.1 K | train
[36m(RayTrainWorker pid=3337)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
[36m(RayTrainWorker pid=3337)[0m 2 | metrics     | RobustModuleDict            | 0      | train
[36m(RayTrainWorker pid=3337)[0m 98.1 K    Trainable params
[36m(RayTrainWorker pid=3337)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=3337)[0m 98.1 K    Total params
[36m(RayTrainWorker pid=3337)[0m 0.392     Total estimated model params size (MB)
[36m(RayTrainWorker pid=3337)[0m 52        Modules in train mode
[36m(RayTrainWorker pid=3337)[0

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 31.52it/s]
[36m(RayTrainWorker pid=3358)[0m Ratings data already exists. Skip pre-processing
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             


[36m(RayTrainWorker pid=3337)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=3337)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (7) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:  29%|██▊       | 2/7 [00:00<00:00, 15.74it/s, v_num=0]
Epoch 0:  86%|████████▌ | 6/7 [00:00<00:00, 24.93it/s, v_num=0]
Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 26.00it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/7 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=3337)[0m 
[36m(RayTrainWorker pid=3337)[0m 
[36m(RayTrainWorker pid=3337)[0m 
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]        [A
[36m(RayTrainWorker pid=3337)[0m 
[36m(RayTrainWorker pid=3337)[0m 
Epoch 1:   0%|          | 0/7 [00:00<?, ?it/s, v_num=0]               [A
Testing DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s]
Testing DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 26.74it/s]
Testing DataLoader 1:  14%|█▍        | 1/7 [00:00<00:00, 34.50it/s]
Testing DataLoader 2:   0%|          | 0/7 [00:00<?, ?it/s]        
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 33.07it/s]
Testing DataLoader 2:  14%|█▍        | 1/7 [00

[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3358)[0m   | Name        | Type                        | Params | Mode 
[36m(RayTrainWorker pid=3358)[0m 0 | main_module | GRU4Rec                     | 98.1 K | train
[36m(RayTrainWorker pid=3358)[0m 1 | loss        | SequentialBCEWithLogitsLoss | 0      | train
[36m(RayTrainWorker pid=3358)[0m 2 | metrics     | RobustModuleDict            | 0      | train
[36m(RayTrainWorker pid=3358)[0m 98.1 K    Trainable params
[36m(RayTrainWorker pid=3358)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=3358)[0m 98.1 K    Total params
[36m(RayTrainWorker pid=3358)[0m 0.392     Total estimated model params size (MB)
[36m(RayTrainWorker pid=3358)[0m 52        Modules in train mode
[36m(RayTrainWorker pid=3358)[0m 0         Modules in eval mode
[36m(RayTrainWorker pid=3358)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py

                                                                           
Sanity Checking DataLoader 1:  50%|█████     | 1/2 [00:00<00:00, 33.07it/s]
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 0/7 [00:00<?, ?it/s]                             


[36m(RayTrainWorker pid=3337)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00008_8_data_params_collator_params_lookback=10,model_optimizer_params_lr=0.0615,model_rec_model_dropout_input=_2024-12-24_13-24-28/checkpoint_000000)
[36m(RayTrainWorker pid=3365)[0m Setting up process group for: env:// [rank=0, world_size=1][32m [repeated 3x across cluster][0m


Epoch 0:  57%|█████▋    | 4/7 [00:00<00:00, 20.02it/s, v_num=0]
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3357)[0m 
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3357)[0m 
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3357)[0m 
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3357)[0m 
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3357)[0m 


[36m(RayTrainWorker pid=3358)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00009_9_data_params_collator_params_lookback=5,model_optimizer_params_lr=0.0068,model_rec_model_dropout_input=0_2024-12-24_13-24-28/checkpoint_000000)
2024-12-24 13:25:08,944	INFO timeout.py:54 -- Reached timeout of 40 seconds. Stopping all trials.


[36m(RayTrainWorker pid=3358)[0m 
Epoch 0: 100%|██████████| 7/7 [00:00<00:00,  7.26it/s, v_num=0]       [A
[36m(RayTrainWorker pid=3357)[0m 
Epoch 1:   0%|          | 0/7 [00:00<?, ?it/s, v_num=0]        
[36m(RayTrainWorker pid=3357)[0m 


2024-12-24 13:25:09,035	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26' in 0.0316s.


Testing DataLoader 2:  57%|█████▋    | 4/7 [00:00<00:00, 41.78it/s][32m [repeated 4x across cluster][0m
Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 20.75it/s, v_num=0]
Epoch 1:  14%|█▍        | 1/7 [00:00<00:00, 10.55it/s, v_num=0][32m [repeated 2x across cluster][0m
Testing DataLoader 0:  29%|██▊       | 2/7 [00:00<00:00, 21.71it/s]
[36m(RayTrainWorker pid=3358)[0m 
[36m(RayTrainWorker pid=3365)[0m Filtering by minimum number of users per item: 5[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m Filtering by minimum number of items per user: 5[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m Densifying index[32m [repeated 3x across cluster][0m
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s][A[32m [repeated 7x across cluster][0m
Validation DataLoader 0:  57%|█████▋    | 4/7 [00:00<00:00, 35.89it/s][A[32m [repeated 40x across cluster][0m
[36m(RayTrainWorker pid=3358)[0m 
Testing DataLoader 1:  43%|████▎     

[36m(TorchTrainer pid=3338)[0m Started distributed worker processes: [32m [repeated 2x across cluster][0m
[36m(TorchTrainer pid=3338)[0m - (node_id=fed15232356329568052a5376b724ec8d5728378a80c03a3ab3695fd, ip=127.0.0.1, pid=3365) world_rank=0, local_rank=0, node_rank=0[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m [rank: 0] Seed set to 42[32m [repeated 116x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m GPU available: True (mps), used: False[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.[32m [repeated

Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 24.15it/s, v_num=0]
[36m(RayTrainWorker pid=3365)[0m 
Testing DataLoader 1:  71%|███████▏  | 5/7 [00:00<00:00, 33.50it/s][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m 
Testing DataLoader 1: 100%|██████████| 7/7 [00:00<00:00, 37.23it/s][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m 
Testing DataLoader 1:  29%|██▊       | 2/7 [00:00<00:00, 37.19it/s]
[36m(RayTrainWorker pid=3365)[0m 
[36m(RayTrainWorker pid=3365)[0m 
[36m(RayTrainWorker pid=3365)[0m ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m ┃[1m [0m[1m   Test metric   [0m[1m [0m┃[1m [0m[1m  DataLoader 0   [0m[1m [0m┃[1m [0m[1m  DataLoader 1   [0m[1m [0m┃[1m [0m[1m  DataLoader 2  [0m[1m [0m┃[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=3365)[0m ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━

[36m(RayTrainWorker pid=3365)[0m --------------------------------------------------------------------[32m [repeated 6x across cluster][0m
2024-12-24 13:25:10,949	INFO tune.py:1041 -- Total run time: 42.59 seconds (40.63 seconds for the tuning loop).
- TorchTrainer_0519a_00012: FileNotFoundError('Could not fetch metrics for TorchTrainer_0519a_00012: both result.json and progress.csv were not found at /Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00012_12_data_params_collator_params_lookback=5,model_optimizer_params_lr=0.0993,model_rec_model_dropout_input=_2024-12-24_13-24-28')
- TorchTrainer_0519a_00013: FileNotFoundError('Could not fetch metrics for TorchTrainer_0519a_00013: both result.json and progress.csv were not found at /Users/federicosiciliano/ray_results/TorchTrainer_2024-12-24_13-24-26/TorchTrainer_0519a_00013_13_data_params_collator_params_lookback=10,model_optimizer_params_lr=0.0186,model_rec_model_dropout_input_2024-12-24_13-24



In [21]:
#Display the ResultGrid holding one Result per trial
results

ResultGrid<[
  Result(
    metrics={'val_loss/dataloader_idx_0': 0.5177322626113892, 'val_Precision_@5/dataloader_idx_0': 0.06294643133878708, 'val_Precision_@10/dataloader_idx_0': 0.04821429029107094, 'val_Precision_@20/dataloader_idx_0': 0.03231026604771614, 'val_Recall_@5/dataloader_idx_0': 0.3147321343421936, 'val_Recall_@10/dataloader_idx_0': 0.4821428656578064, 'val_Recall_@20/dataloader_idx_0': 0.6462053656578064, 'val_MAP_@5/dataloader_idx_0': 0.0751078799366951, 'val_MAP_@10/dataloader_idx_0': 0.0640486404299736, 'val_MAP_@20/dataloader_idx_0': 0.05102028697729111, 'val_NDCG_@5/dataloader_idx_0': 0.20437857508659363, 'val_NDCG_@10/dataloader_idx_0': 0.258093923330307, 'val_NDCG_@20/dataloader_idx_0': 0.29930517077445984, 'val_MRR_@5/dataloader_idx_0': 0.16845238208770752, 'val_MRR_@10/dataloader_idx_0': 0.19038008153438568, 'val_MRR_@20/dataloader_idx_0': 0.20154595375061035, 'val_loss/dataloader_idx_1': 0.5141581296920776, 'val_Precision_@5/dataloader_idx_1': 0.05625000223517

In [23]:
#Show the Result that maximizes the "val_NDCG_@10/dataloader_idx_0" metric
results.get_best_result(
    metric="val_NDCG_@10/dataloader_idx_0",
    mode="max",
)

Result(
  metrics={'val_loss/dataloader_idx_0': 0.5177322626113892, 'val_Precision_@5/dataloader_idx_0': 0.06294643133878708, 'val_Precision_@10/dataloader_idx_0': 0.04821429029107094, 'val_Precision_@20/dataloader_idx_0': 0.03231026604771614, 'val_Recall_@5/dataloader_idx_0': 0.3147321343421936, 'val_Recall_@10/dataloader_idx_0': 0.4821428656578064, 'val_Recall_@20/dataloader_idx_0': 0.6462053656578064, 'val_MAP_@5/dataloader_idx_0': 0.0751078799366951, 'val_MAP_@10/dataloader_idx_0': 0.0640486404299736, 'val_MAP_@20/dataloader_idx_0': 0.05102028697729111, 'val_NDCG_@5/dataloader_idx_0': 0.20437857508659363, 'val_NDCG_@10/dataloader_idx_0': 0.258093923330307, 'val_NDCG_@20/dataloader_idx_0': 0.29930517077445984, 'val_MRR_@5/dataloader_idx_0': 0.16845238208770752, 'val_MRR_@10/dataloader_idx_0': 0.19038008153438568, 'val_MRR_@20/dataloader_idx_0': 0.20154595375061035, 'val_loss/dataloader_idx_1': 0.5141581296920776, 'val_Precision_@5/dataloader_idx_1': 0.05625000223517418, 'val_Precisi

In [20]:
#Problems with parallel execution:
# May generate the same experiment id (very unlikely)
### By default, there are N = 16^62 possible experiment ids
### L = already run experiments
### M = N - L possible experiment ids
### If generating K experiment ids concurrently, probability is sti
# TODO: reuse already-run experiments; when using a time budget, sleep for the previous run time (for fairness in comparison)